1444 changed files with 181338 additions and 83483 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,2 +1 @@
-SOURCES/glibc-2.34.tar.xz
-SOURCES/glibc-upstream-2.34-373.patch
+SOURCES/glibc-2.28.tar.xz
--- a/.glibc.metadata
+++ b/.glibc.metadata
@ -1,2 +1 @@
-7c3b8890a6346793b6334cc5f2fea5d437d307b8 SOURCES/glibc-2.34.tar.xz
-6022f103e5596ad229f22bc966327d71208f7016 SOURCES/glibc-upstream-2.34-373.patch
+ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
--- a/SOURCES/ChangeLog.old
+++ b/SOURCES/ChangeLog.old
--- a/SOURCES/SUPPORTED
+++ b/SOURCES/SUPPORTED
@ -0,0 +1,496 @@
+# This file names the currently supported and somewhat tested locales.
+# If you have any additions please file a glibc bug report.
+SUPPORTED-LOCALES=\
+C.UTF-8/UTF-8 \
+aa_DJ.UTF-8/UTF-8 \
+aa_DJ/ISO-8859-1 \
+aa_ER/UTF-8 \
+aa_ER@saaho/UTF-8 \
+aa_ET/UTF-8 \
+af_ZA.UTF-8/UTF-8 \
+af_ZA/ISO-8859-1 \
+agr_PE/UTF-8 \
+ak_GH/UTF-8 \
+am_ET/UTF-8 \
+an_ES.UTF-8/UTF-8 \
+an_ES/ISO-8859-15 \
+anp_IN/UTF-8 \
+ar_AE.UTF-8/UTF-8 \
+ar_AE/ISO-8859-6 \
+ar_BH.UTF-8/UTF-8 \
+ar_BH/ISO-8859-6 \
+ar_DZ.UTF-8/UTF-8 \
+ar_DZ/ISO-8859-6 \
+ar_EG.UTF-8/UTF-8 \
+ar_EG/ISO-8859-6 \
+ar_IN/UTF-8 \
+ar_IQ.UTF-8/UTF-8 \
+ar_IQ/ISO-8859-6 \
+ar_JO.UTF-8/UTF-8 \
+ar_JO/ISO-8859-6 \
+ar_KW.UTF-8/UTF-8 \
+ar_KW/ISO-8859-6 \
+ar_LB.UTF-8/UTF-8 \
+ar_LB/ISO-8859-6 \
+ar_LY.UTF-8/UTF-8 \
+ar_LY/ISO-8859-6 \
+ar_MA.UTF-8/UTF-8 \
+ar_MA/ISO-8859-6 \
+ar_OM.UTF-8/UTF-8 \
+ar_OM/ISO-8859-6 \
+ar_QA.UTF-8/UTF-8 \
+ar_QA/ISO-8859-6 \
+ar_SA.UTF-8/UTF-8 \
+ar_SA/ISO-8859-6 \
+ar_SD.UTF-8/UTF-8 \
+ar_SD/ISO-8859-6 \
+ar_SS/UTF-8 \
+ar_SY.UTF-8/UTF-8 \
+ar_SY/ISO-8859-6 \
+ar_TN.UTF-8/UTF-8 \
+ar_TN/ISO-8859-6 \
+ar_YE.UTF-8/UTF-8 \
+ar_YE/ISO-8859-6 \
+ayc_PE/UTF-8 \
+az_AZ/UTF-8 \
+az_IR/UTF-8 \
+as_IN/UTF-8 \
+ast_ES.UTF-8/UTF-8 \
+ast_ES/ISO-8859-15 \
+be_BY.UTF-8/UTF-8 \
+be_BY/CP1251 \
+be_BY@latin/UTF-8 \
+bem_ZM/UTF-8 \
+ber_DZ/UTF-8 \
+ber_MA/UTF-8 \
+bg_BG.UTF-8/UTF-8 \
+bg_BG/CP1251 \
+bhb_IN.UTF-8/UTF-8 \
+bho_IN/UTF-8 \
+bho_NP/UTF-8 \
+bi_VU/UTF-8 \
+bn_BD/UTF-8 \
+bn_IN/UTF-8 \
+bo_CN/UTF-8 \
+bo_IN/UTF-8 \
+br_FR.UTF-8/UTF-8 \
+br_FR/ISO-8859-1 \
+br_FR@euro/ISO-8859-15 \
+brx_IN/UTF-8 \
+bs_BA.UTF-8/UTF-8 \
+bs_BA/ISO-8859-2 \
+byn_ER/UTF-8 \
+ca_AD.UTF-8/UTF-8 \
+ca_AD/ISO-8859-15 \
+ca_ES.UTF-8/UTF-8 \
+ca_ES/ISO-8859-1 \
+ca_ES@euro/ISO-8859-15 \
+ca_ES@valencia/UTF-8 \
+ca_FR.UTF-8/UTF-8 \
+ca_FR/ISO-8859-15 \
+ca_IT.UTF-8/UTF-8 \
+ca_IT/ISO-8859-15 \
+ce_RU/UTF-8 \
+chr_US/UTF-8 \
+cmn_TW/UTF-8 \
+crh_UA/UTF-8 \
+cs_CZ.UTF-8/UTF-8 \
+cs_CZ/ISO-8859-2 \
+csb_PL/UTF-8 \
+cv_RU/UTF-8 \
+cy_GB.UTF-8/UTF-8 \
+cy_GB/ISO-8859-14 \
+da_DK.UTF-8/UTF-8 \
+da_DK/ISO-8859-1 \
+da_DK.ISO-8859-15/ISO-8859-15 \
+de_AT.UTF-8/UTF-8 \
+de_AT/ISO-8859-1 \
+de_AT@euro/ISO-8859-15 \
+de_BE.UTF-8/UTF-8 \
+de_BE/ISO-8859-1 \
+de_BE@euro/ISO-8859-15 \
+de_CH.UTF-8/UTF-8 \
+de_CH/ISO-8859-1 \
+de_DE.UTF-8/UTF-8 \
+de_DE/ISO-8859-1 \
+de_DE@euro/ISO-8859-15 \
+de_IT.UTF-8/UTF-8 \
+de_IT/ISO-8859-1 \
+de_LI.UTF-8/UTF-8 \
+de_LU.UTF-8/UTF-8 \
+de_LU/ISO-8859-1 \
+de_LU@euro/ISO-8859-15 \
+doi_IN/UTF-8 \
+dsb_DE/UTF-8 \
+dv_MV/UTF-8 \
+dz_BT/UTF-8 \
+el_GR.UTF-8/UTF-8 \
+el_GR/ISO-8859-7 \
+el_GR@euro/ISO-8859-7 \
+el_CY.UTF-8/UTF-8 \
+el_CY/ISO-8859-7 \
+en_AG/UTF-8 \
+en_AU.UTF-8/UTF-8 \
+en_AU/ISO-8859-1 \
+en_BW.UTF-8/UTF-8 \
+en_BW/ISO-8859-1 \
+en_CA.UTF-8/UTF-8 \
+en_CA/ISO-8859-1 \
+en_DK.UTF-8/UTF-8 \
+en_DK/ISO-8859-1 \
+en_GB.UTF-8/UTF-8 \
+en_GB/ISO-8859-1 \
+en_GB.ISO-8859-15/ISO-8859-15 \
+en_HK.UTF-8/UTF-8 \
+en_HK/ISO-8859-1 \
+en_IE.UTF-8/UTF-8 \
+en_IE/ISO-8859-1 \
+en_IE@euro/ISO-8859-15 \
+en_IL/UTF-8 \
+en_IN/UTF-8 \
+en_NG/UTF-8 \
+en_NZ.UTF-8/UTF-8 \
+en_NZ/ISO-8859-1 \
+en_PH.UTF-8/UTF-8 \
+en_PH/ISO-8859-1 \
+en_SC.UTF-8/UTF-8 \
+en_SG.UTF-8/UTF-8 \
+en_SG/ISO-8859-1 \
+en_US.UTF-8/UTF-8 \
+en_US/ISO-8859-1 \
+en_US.ISO-8859-15/ISO-8859-15 \
+en_US@ampm/UTF-8 \
+en_US.UTF-8@ampm/UTF-8 \
+en_ZA.UTF-8/UTF-8 \
+en_ZA/ISO-8859-1 \
+en_ZM/UTF-8 \
+en_ZW.UTF-8/UTF-8 \
+en_ZW/ISO-8859-1 \
+eo/UTF-8 \
+es_AR.UTF-8/UTF-8 \
+es_AR/ISO-8859-1 \
+es_BO.UTF-8/UTF-8 \
+es_BO/ISO-8859-1 \
+es_CL.UTF-8/UTF-8 \
+es_CL/ISO-8859-1 \
+es_CO.UTF-8/UTF-8 \
+es_CO/ISO-8859-1 \
+es_CR.UTF-8/UTF-8 \
+es_CR/ISO-8859-1 \
+es_CU/UTF-8 \
+es_DO.UTF-8/UTF-8 \
+es_DO/ISO-8859-1 \
+es_EC.UTF-8/UTF-8 \
+es_EC/ISO-8859-1 \
+es_ES.UTF-8/UTF-8 \
+es_ES/ISO-8859-1 \
+es_ES@euro/ISO-8859-15 \
+es_GT.UTF-8/UTF-8 \
+es_GT/ISO-8859-1 \
+es_HN.UTF-8/UTF-8 \
+es_HN/ISO-8859-1 \
+es_MX.UTF-8/UTF-8 \
+es_MX/ISO-8859-1 \
+es_NI.UTF-8/UTF-8 \
+es_NI/ISO-8859-1 \
+es_PA.UTF-8/UTF-8 \
+es_PA/ISO-8859-1 \
+es_PE.UTF-8/UTF-8 \
+es_PE/ISO-8859-1 \
+es_PR.UTF-8/UTF-8 \
+es_PR/ISO-8859-1 \
+es_PY.UTF-8/UTF-8 \
+es_PY/ISO-8859-1 \
+es_SV.UTF-8/UTF-8 \
+es_SV/ISO-8859-1 \
+es_US.UTF-8/UTF-8 \
+es_US/ISO-8859-1 \
+es_UY.UTF-8/UTF-8 \
+es_UY/ISO-8859-1 \
+es_VE.UTF-8/UTF-8 \
+es_VE/ISO-8859-1 \
+et_EE.UTF-8/UTF-8 \
+et_EE/ISO-8859-1 \
+et_EE.ISO-8859-15/ISO-8859-15 \
+eu_ES.UTF-8/UTF-8 \
+eu_ES/ISO-8859-1 \
+eu_ES@euro/ISO-8859-15 \
+fa_IR/UTF-8 \
+ff_SN/UTF-8 \
+fi_FI.UTF-8/UTF-8 \
+fi_FI/ISO-8859-1 \
+fi_FI@euro/ISO-8859-15 \
+fil_PH/UTF-8 \
+fo_FO.UTF-8/UTF-8 \
+fo_FO/ISO-8859-1 \
+fr_BE.UTF-8/UTF-8 \
+fr_BE/ISO-8859-1 \
+fr_BE@euro/ISO-8859-15 \
+fr_CA.UTF-8/UTF-8 \
+fr_CA/ISO-8859-1 \
+fr_CH.UTF-8/UTF-8 \
+fr_CH/ISO-8859-1 \
+fr_FR.UTF-8/UTF-8 \
+fr_FR/ISO-8859-1 \
+fr_FR@euro/ISO-8859-15 \
+fr_LU.UTF-8/UTF-8 \
+fr_LU/ISO-8859-1 \
+fr_LU@euro/ISO-8859-15 \
+fur_IT/UTF-8 \
+fy_NL/UTF-8 \
+fy_DE/UTF-8 \
+ga_IE.UTF-8/UTF-8 \
+ga_IE/ISO-8859-1 \
+ga_IE@euro/ISO-8859-15 \
+gd_GB.UTF-8/UTF-8 \
+gd_GB/ISO-8859-15 \
+gez_ER/UTF-8 \
+gez_ER@abegede/UTF-8 \
+gez_ET/UTF-8 \
+gez_ET@abegede/UTF-8 \
+gl_ES.UTF-8/UTF-8 \
+gl_ES/ISO-8859-1 \
+gl_ES@euro/ISO-8859-15 \
+gu_IN/UTF-8 \
+gv_GB.UTF-8/UTF-8 \
+gv_GB/ISO-8859-1 \
+ha_NG/UTF-8 \
+hak_TW/UTF-8 \
+he_IL.UTF-8/UTF-8 \
+he_IL/ISO-8859-8 \
+hi_IN/UTF-8 \
+hif_FJ/UTF-8 \
+hne_IN/UTF-8 \
+hr_HR.UTF-8/UTF-8 \
+hr_HR/ISO-8859-2 \
+hsb_DE/ISO-8859-2 \
+hsb_DE.UTF-8/UTF-8 \
+ht_HT/UTF-8 \
+hu_HU.UTF-8/UTF-8 \
+hu_HU/ISO-8859-2 \
+hy_AM/UTF-8 \
+hy_AM.ARMSCII-8/ARMSCII-8 \
+ia_FR/UTF-8 \
+id_ID.UTF-8/UTF-8 \
+id_ID/ISO-8859-1 \
+ig_NG/UTF-8 \
+ik_CA/UTF-8 \
+is_IS.UTF-8/UTF-8 \
+is_IS/ISO-8859-1 \
+it_CH.UTF-8/UTF-8 \
+it_CH/ISO-8859-1 \
+it_IT.UTF-8/UTF-8 \
+it_IT/ISO-8859-1 \
+it_IT@euro/ISO-8859-15 \
+iu_CA/UTF-8 \
+ja_JP.EUC-JP/EUC-JP \
+ja_JP.UTF-8/UTF-8 \
+ka_GE.UTF-8/UTF-8 \
+ka_GE/GEORGIAN-PS \
+kab_DZ/UTF-8 \
+kk_KZ.UTF-8/UTF-8 \
+kk_KZ/PT154 \
+kl_GL.UTF-8/UTF-8 \
+kl_GL/ISO-8859-1 \
+km_KH/UTF-8 \
+kn_IN/UTF-8 \
+ko_KR.EUC-KR/EUC-KR \
+ko_KR.UTF-8/UTF-8 \
+kok_IN/UTF-8 \
+ks_IN/UTF-8 \
+ks_IN@devanagari/UTF-8 \
+ku_TR.UTF-8/UTF-8 \
+ku_TR/ISO-8859-9 \
+kw_GB.UTF-8/UTF-8 \
+kw_GB/ISO-8859-1 \
+ky_KG/UTF-8 \
+lb_LU/UTF-8 \
+lg_UG.UTF-8/UTF-8 \
+lg_UG/ISO-8859-10 \
+li_BE/UTF-8 \
+li_NL/UTF-8 \
+lij_IT/UTF-8 \
+ln_CD/UTF-8 \
+lo_LA/UTF-8 \
+lt_LT.UTF-8/UTF-8 \
+lt_LT/ISO-8859-13 \
+lv_LV.UTF-8/UTF-8 \
+lv_LV/ISO-8859-13 \
+lzh_TW/UTF-8 \
+mag_IN/UTF-8 \
+mai_IN/UTF-8 \
+mai_NP/UTF-8 \
+mfe_MU/UTF-8 \
+mg_MG.UTF-8/UTF-8 \
+mg_MG/ISO-8859-15 \
+mhr_RU/UTF-8 \
+mi_NZ.UTF-8/UTF-8 \
+mi_NZ/ISO-8859-13 \
+miq_NI/UTF-8 \
+mjw_IN/UTF-8 \
+mk_MK.UTF-8/UTF-8 \
+mk_MK/ISO-8859-5 \
+ml_IN/UTF-8 \
+mn_MN/UTF-8 \
+mni_IN/UTF-8 \
+mr_IN/UTF-8 \
+ms_MY.UTF-8/UTF-8 \
+ms_MY/ISO-8859-1 \
+mt_MT.UTF-8/UTF-8 \
+mt_MT/ISO-8859-3 \
+my_MM/UTF-8 \
+nan_TW/UTF-8 \
+nan_TW@latin/UTF-8 \
+nb_NO.UTF-8/UTF-8 \
+nb_NO/ISO-8859-1 \
+nds_DE/UTF-8 \
+nds_NL/UTF-8 \
+ne_NP/UTF-8 \
+nhn_MX/UTF-8 \
+niu_NU/UTF-8 \
+niu_NZ/UTF-8 \
+nl_AW/UTF-8 \
+nl_BE.UTF-8/UTF-8 \
+nl_BE/ISO-8859-1 \
+nl_BE@euro/ISO-8859-15 \
+nl_NL.UTF-8/UTF-8 \
+nl_NL/ISO-8859-1 \
+nl_NL@euro/ISO-8859-15 \
+nn_NO.UTF-8/UTF-8 \
+nn_NO/ISO-8859-1 \
+nr_ZA/UTF-8 \
+nso_ZA/UTF-8 \
+oc_FR.UTF-8/UTF-8 \
+oc_FR/ISO-8859-1 \
+om_ET/UTF-8 \
+om_KE.UTF-8/UTF-8 \
+om_KE/ISO-8859-1 \
+or_IN/UTF-8 \
+os_RU/UTF-8 \
+pa_IN/UTF-8 \
+pa_PK/UTF-8 \
+pap_AW/UTF-8 \
+pap_CW/UTF-8 \
+pl_PL.UTF-8/UTF-8 \
+pl_PL/ISO-8859-2 \
+ps_AF/UTF-8 \
+pt_BR.UTF-8/UTF-8 \
+pt_BR/ISO-8859-1 \
+pt_PT.UTF-8/UTF-8 \
+pt_PT/ISO-8859-1 \
+pt_PT@euro/ISO-8859-15 \
+quz_PE/UTF-8 \
+raj_IN/UTF-8 \
+ro_RO.UTF-8/UTF-8 \
+ro_RO/ISO-8859-2 \
+ru_RU.KOI8-R/KOI8-R \
+ru_RU.UTF-8/UTF-8 \
+ru_RU/ISO-8859-5 \
+ru_UA.UTF-8/UTF-8 \
+ru_UA/KOI8-U \
+rw_RW/UTF-8 \
+sa_IN/UTF-8 \
+sah_RU/UTF-8 \
+sat_IN/UTF-8 \
+sc_IT/UTF-8 \
+sd_IN/UTF-8 \
+sd_IN@devanagari/UTF-8 \
+se_NO/UTF-8 \
+sgs_LT/UTF-8 \
+shn_MM/UTF-8 \
+shs_CA/UTF-8 \
+si_LK/UTF-8 \
+sid_ET/UTF-8 \
+sk_SK.UTF-8/UTF-8 \
+sk_SK/ISO-8859-2 \
+sl_SI.UTF-8/UTF-8 \
+sl_SI/ISO-8859-2 \
+sm_WS/UTF-8 \
+so_DJ.UTF-8/UTF-8 \
+so_DJ/ISO-8859-1 \
+so_ET/UTF-8 \
+so_KE.UTF-8/UTF-8 \
+so_KE/ISO-8859-1 \
+so_SO.UTF-8/UTF-8 \
+so_SO/ISO-8859-1 \
+sq_AL.UTF-8/UTF-8 \
+sq_AL/ISO-8859-1 \
+sq_MK/UTF-8 \
+sr_ME/UTF-8 \
+sr_RS/UTF-8 \
+sr_RS@latin/UTF-8 \
+ss_ZA/UTF-8 \
+st_ZA.UTF-8/UTF-8 \
+st_ZA/ISO-8859-1 \
+sv_FI.UTF-8/UTF-8 \
+sv_FI/ISO-8859-1 \
+sv_FI@euro/ISO-8859-15 \
+sv_SE.UTF-8/UTF-8 \
+sv_SE/ISO-8859-1 \
+sv_SE.ISO-8859-15/ISO-8859-15 \
+sw_KE/UTF-8 \
+sw_TZ/UTF-8 \
+szl_PL/UTF-8 \
+ta_IN/UTF-8 \
+ta_LK/UTF-8 \
+tcy_IN.UTF-8/UTF-8 \
+te_IN/UTF-8 \
+tg_TJ.UTF-8/UTF-8 \
+tg_TJ/KOI8-T \
+th_TH.UTF-8/UTF-8 \
+th_TH/TIS-620 \
+the_NP/UTF-8 \
+ti_ER/UTF-8 \
+ti_ET/UTF-8 \
+tig_ER/UTF-8 \
+tk_TM/UTF-8 \
+tl_PH.UTF-8/UTF-8 \
+tl_PH/ISO-8859-1 \
+tn_ZA/UTF-8 \
+to_TO/UTF-8 \
+tpi_PG/UTF-8 \
+tr_CY.UTF-8/UTF-8 \
+tr_CY/ISO-8859-9 \
+tr_TR.UTF-8/UTF-8 \
+tr_TR/ISO-8859-9 \
+ts_ZA/UTF-8 \
+tt_RU/UTF-8 \
+tt_RU@iqtelif/UTF-8 \
+ug_CN/UTF-8 \
+uk_UA.UTF-8/UTF-8 \
+uk_UA/KOI8-U \
+unm_US/UTF-8 \
+ur_IN/UTF-8 \
+ur_PK/UTF-8 \
+uz_UZ.UTF-8/UTF-8 \
+uz_UZ/ISO-8859-1 \
+uz_UZ@cyrillic/UTF-8 \
+ve_ZA/UTF-8 \
+vi_VN/UTF-8 \
+wa_BE/ISO-8859-1 \
+wa_BE@euro/ISO-8859-15 \
+wa_BE.UTF-8/UTF-8 \
+wae_CH/UTF-8 \
+wal_ET/UTF-8 \
+wo_SN/UTF-8 \
+xh_ZA.UTF-8/UTF-8 \
+xh_ZA/ISO-8859-1 \
+yi_US.UTF-8/UTF-8 \
+yi_US/CP1255 \
+yo_NG/UTF-8 \
+yue_HK/UTF-8 \
+yuw_PG/UTF-8 \
+zh_CN.GB18030/GB18030 \
+zh_CN.GBK/GBK \
+zh_CN.UTF-8/UTF-8 \
+zh_CN/GB2312 \
+zh_HK.UTF-8/UTF-8 \
+zh_HK/BIG5-HKSCS \
+zh_SG.UTF-8/UTF-8 \
+zh_SG.GBK/GBK \
+zh_SG/GB2312 \
+zh_TW.EUC-TW/EUC-TW \
+zh_TW.UTF-8/UTF-8 \
+zh_TW/BIG5 \
+zu_ZA.UTF-8/UTF-8 \
+zu_ZA/ISO-8859-1 \
--- a/SOURCES/build-locale-archive.c
+++ b/SOURCES/build-locale-archive.c
@ -0,0 +1,862 @@
+#define _GNU_SOURCE
+#include <assert.h>
+#include <dirent.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <locale.h>
+#include <stdarg.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <getopt.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#include "../locale/hashval.h"
+#define __LC_LAST 13
+#include "../locale/locarchive.h"
+#include "../crypt/md5.h"
+
+/* Default file locations.  DATADIR and PREFIX are string macros that are not
+   defined in the visible part of this file -- presumably supplied by the
+   build (e.g. via -D flags); TODO confirm.  */
+const char *alias_file = DATADIR "/locale/locale.alias";
+const char *locar_file = PREFIX "/lib/locale/locale-archive";
+const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
+const char *loc_path = PREFIX "/lib/locale/";
+/* Flags set by `--verbose` option.  */
+int be_quiet = 1;
+int verbose = 0;
+/* NOTE(review): the two variables below are not referenced by the code
+   visible here; presumably they are read by the shared locarchive code this
+   program is linked with -- confirm.  */
+int max_locarchive_open_retry = 10;
+const char *output_prefix;
+
+/* Endianness should have been taken care of by localedef.  We don't need to do
+   additional swapping.  We need this variable exported however, since
+   locarchive.c uses it to determine if it needs to swap endianness of a value
+   before writing to or reading from the archive.  */
+bool swap_endianness_p = false;
+
+/* Table of category names indexed by category number.  Each
+   DEFINE_CATEGORY entry pulled in from categories.def expands to one
+   designated initializer `[category] = category_name'.  fill_archive
+   compares directory entries against these names and uses them to build
+   the per-category file names.  */
+static const char *locnames[] =
+  {
+#define DEFINE_CATEGORY(category, category_name, items, a) \
+  [category] = category_name,
+#include "../locale/categories.def"
+#undef  DEFINE_CATEGORY
+  };
+
+/* Trial-division primality test.  Returns non-zero when CANDIDATE has no
+   odd divisor in the range tried, zero otherwise.  Callers are expected to
+   pass only odd numbers of at least 10; smaller or even inputs are not
+   classified correctly.  */
+static int
+is_prime (unsigned long candidate)
+{
+  unsigned long int divisor = 3;
+  /* Always equal to divisor * divisor, maintained incrementally.  */
+  unsigned long int square = 9;
+
+  for (;;)
+    {
+      /* Stop once the divisor reaches the square root or divides evenly.  */
+      if (square >= candidate || candidate % divisor == 0)
+        break;
+
+      /* (d + 2)^2 == d^2 + 4 * (d + 1): step to the next odd divisor.  */
+      square += 4 * (divisor + 1);
+      divisor += 2;
+    }
+
+  return candidate % divisor != 0;
+}
+
+/* Return the first number accepted by is_prime that is not smaller than
+   SEED, considering odd numbers only.  */
+unsigned long
+next_prime (unsigned long seed)
+{
+  unsigned long candidate;
+
+  /* Force the starting point to be odd, then walk the odd numbers.  */
+  for (candidate = seed | 1; !is_prime (candidate); candidate += 2)
+    continue;
+
+  return candidate;
+}
+
+/* Print "PROGRAM: MESSAGE" (printf-style) to stderr, followed by
+   ": strerror (ERRNUM)" when ERRNUM is non-zero, and a newline.  When
+   STATUS is non-zero the process exits: with status 0 if ERRNUM is EROFS,
+   with STATUS otherwise.  When STATUS is zero the function returns.  */
+void
+error (int status, int errnum, const char *message, ...)
+{
+  va_list ap;
+
+  /* Push out pending standard output before writing the diagnostic.  */
+  fflush (stdout);
+
+  fprintf (stderr, "%s: ", program_invocation_name);
+  va_start (ap, message);
+  vfprintf (stderr, message, ap);
+  va_end (ap);
+
+  if (errnum != 0)
+    fprintf (stderr, ": %s", strerror (errnum));
+  fputc ('\n', stderr);
+  fflush (stderr);
+
+  if (status == 0)
+    return;
+
+  /* A read-only file system is reported but exits with status 0.  */
+  exit (errnum != EROFS ? status : 0);
+}
+
+/* Allocate SIZE bytes with malloc.  On failure a diagnostic is reported
+   through error () with a non-zero status, which terminates the process,
+   so a returned pointer is never NULL as a result of an allocation
+   failure.  */
+void *
+xmalloc (size_t size)
+{
+  void *p = malloc (size);
+  if (p == NULL)
+    /* SIZE is a size_t, so the matching conversion is %zu (not %zd).  */
+    error (EXIT_FAILURE, errno, "could not allocate %zu bytes of memory", size);
+  return p;
+}
+
+/* Open and map the locale archive template for reading.
+
+   The file name is AH->fname, or tmpl_file when AH->fname is NULL.  On
+   success AH->fd holds the open descriptor, AH->addr a read-only shared
+   mapping of the whole file, and AH->mmaped / AH->reserved the file size.
+   Every failure is reported through error () with EXIT_FAILURE.  */
+static void
+open_tmpl_archive (struct locarhandle *ah)
+{
+  struct stat64 st;
+  int fd;
+  struct locarhead head;
+  const char *archivefname = ah->fname == NULL ? tmpl_file : ah->fname;
+
+  /* Open the template archive.  It is only read, never written.  */
+  fd = open64 (archivefname, O_RDONLY);
+  if (fd == -1)
+    error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
+	   archivefname);
+
+  if (fstat64 (fd, &st) < 0)
+    error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
+	   archivefname);
+
+  /* Read the header.  */
+  if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
+    error (EXIT_FAILURE, errno, "cannot read archive header");
+
+  ah->fd = fd;
+  /* Temporarily store the end of the checksum hash table as described by
+     the header; it is only used for the truncation check below.  */
+  ah->mmaped = (head.sumhash_offset
+		+ head.sumhash_size * sizeof (struct sumhashent));
+  if (ah->mmaped > (unsigned long) st.st_size)
+    error (EXIT_FAILURE, 0, "locale archive template file truncated");
+  /* From here on both sizes refer to the complete file.  */
+  ah->mmaped = st.st_size;
+  ah->reserved = st.st_size;
+
+  /* The file is at least as large as the header says it must be.
+     Map all of it.  */
+  ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
+  if (ah->addr == MAP_FAILED)
+    error (EXIT_FAILURE, errno, "cannot map archive header");
+}
+
+/* The functions declared below are not defined in this file.
+   NOTE(review): presumably they come from the shared locarchive code this
+   program is linked with -- confirm.  */
+
+/* Open the locale archive.  */
+extern void open_archive (struct locarhandle *ah, bool readonly);
+
+/* Close the locale archive.  */
+extern void close_archive (struct locarhandle *ah);
+
+/* Add given locale data to the archive.  */
+extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
+				  locale_data_t data, bool replace);
+
+/* Register ALIAS as another name for the locale OLDNAME.  fill_archive
+   passes the address of the record offset obtained from insert_name as
+   LOCREC_OFFSET_P.  */
+extern void add_alias (struct locarhandle *ah, const char *alias,
+		       bool replace, const char *oldname,
+		       uint32_t *locrec_offset_p);
+
+/* Return the name hash entry for NAME.  fill_archive relies on the entry
+   of an already added locale having non-zero name_offset and
+   locrec_offset.  */
+extern struct namehashent *
+insert_name (struct locarhandle *ah,
+	     const char *name, size_t name_len, bool replace);
+
+/* One locale name selected from the template archive's name hash table.  */
+struct nameent
+{
+  /* Locale name; points into the mapped template archive.  */
+  char *name;
+  /* Locale record of that name; also points into the mapped template.
+     Several names may share one record.  */
+  struct locrecent *locrec;
+};
+
+/* One used entry of the template archive's checksum hash table.  */
+struct dataent
+{
+  /* Checksum bytes of the data; points at the `sum' member of the
+     template's hash table entry.  */
+  const unsigned char *sum;
+  /* Offset of the data in the archive; the sort and lookup key.  */
+  uint32_t file_offset;
+};
+
+/* Compute the range of archive offsets covered by the category records of
+   REC, ignoring LC_ALL.  *START receives the smallest record offset and
+   *END the largest offset + len.  */
+static void
+locrec_extent (const struct locrecent *rec, uint32_t *start, uint32_t *end)
+{
+  uint32_t lo = (uint32_t) -1;
+  uint32_t hi = 0;
+  int cnt;
+
+  for (cnt = 0; cnt < __LC_LAST; ++cnt)
+    if (cnt != LC_ALL)
+      {
+        if (rec->record[cnt].offset < lo)
+          lo = rec->record[cnt].offset;
+        if (rec->record[cnt].offset + rec->record[cnt].len > hi)
+          hi = rec->record[cnt].offset + rec->record[cnt].len;
+      }
+  assert (lo != (uint32_t) -1);
+  assert (hi != 0);
+
+  *start = lo;
+  *end = hi;
+}
+
+/* qsort comparison function for `struct nameent'.  Entries are ordered by
+   the lowest archive offset used by their locale record and, for equal
+   starts, by the highest end offset.  Names sharing a record therefore
+   end up adjacent.
+
+   The offsets are compared as unsigned values.  Returning the difference
+   of the values cast to int would yield the wrong sign for offsets or
+   differences that do not fit in an int.  */
+static int
+nameentcmp (const void *a, const void *b)
+{
+  const struct locrecent *la = ((const struct nameent *) a)->locrec;
+  const struct locrecent *lb = ((const struct nameent *) b)->locrec;
+  uint32_t start_a, end_a;
+  uint32_t start_b, end_b;
+
+  locrec_extent (la, &start_a, &end_a);
+  locrec_extent (lb, &start_b, &end_b);
+
+  if (start_a != start_b)
+    return start_a < start_b ? -1 : 1;
+  if (end_a != end_b)
+    return end_a < end_b ? -1 : 1;
+  return 0;
+}
+
+/* qsort comparison function ordering `struct dataent' elements by
+   ascending file_offset.  Returns -1, 0 or 1.  */
+static int
+dataentcmp (const void *a, const void *b)
+{
+  const struct dataent *lhs = (const struct dataent *) a;
+  const struct dataent *rhs = (const struct dataent *) b;
+
+  /* Difference of two boolean results: 1 - 0, 0 - 1 or 0 - 0.  */
+  return (lhs->file_offset > rhs->file_offset)
+         - (lhs->file_offset < rhs->file_offset);
+}
+
+/* bsearch comparison function: KEY points to a uint32_t file offset, ENT
+   to a `struct dataent'.  Returns -1, 0 or 1 depending on how the key
+   compares with the entry's file_offset.  */
+static int
+sumsearchfn (const void *key, const void *ent)
+{
+  uint32_t wanted = *(uint32_t *) key;
+  uint32_t have = ((struct dataent *) ent)->file_offset;
+
+  if (wanted == have)
+    return 0;
+  return wanted < have ? -1 : 1;
+}
+
+/* Fill DATA with the address, size and checksum of every category of the
+   locale NAME as stored in the mapped archive AH.
+
+   A category whose bytes lie completely inside the LC_ALL record gets its
+   checksum computed with __md5_buffer.  For any other category the
+   checksum is copied from the entry of FILES (SUMUSED elements, sorted by
+   file offset) that has the same offset; if there is none the template is
+   reported as inconsistent and the program exits.  */
+static void
+compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
+	      struct dataent *files, locale_data_t data)
+{
+  struct locrecent *rec = name->locrec;
+  int cat;
+
+  data[LC_ALL].addr = ((char *) ah->addr) + rec->record[LC_ALL].offset;
+  data[LC_ALL].size = rec->record[LC_ALL].len;
+
+  for (cat = 0; cat < __LC_LAST; ++cat)
+    {
+      struct dataent *hit;
+
+      if (cat == LC_ALL)
+        continue;
+
+      data[cat].addr = ((char *) ah->addr) + rec->record[cat].offset;
+      data[cat].size = rec->record[cat].len;
+
+      if (data[cat].addr < data[LC_ALL].addr
+          || data[cat].addr + data[cat].size
+             > data[LC_ALL].addr + data[LC_ALL].size)
+        {
+          /* Stored outside the LC_ALL block: look the checksum up.  */
+          hit = bsearch (&rec->record[cat].offset, files, sumused,
+                         sizeof (*files), sumsearchfn);
+          if (hit == NULL)
+            error (EXIT_FAILURE, 0, "inconsistent template file");
+          memcpy (data[cat].sum, hit->sum, sizeof (data[cat].sum));
+        }
+      else
+        /* Part of the LC_ALL block: compute the checksum directly.  */
+        __md5_buffer (data[cat].addr, data[cat].size, data[cat].sum);
+    }
+}
+
+/* Create the locale archive from the mapped template archive TMPL_AH.
+
+   FNAME is stored as the file name of the archive handle passed to
+   open_archive; it may be NULL.
+   INSTALL_LANGS_LIST holds INSTALL_LANGS_COUNT locale name prefixes.  If
+   the count is zero every name of the template is used.  Otherwise a name
+   is used only if it starts with one of the prefixes, where a prefix
+   without "_" must be followed by "_" in the name.
+   LIST holds NLIST directory names; each directory containing a complete
+   set of locale category files is added under its base name.
+   PRIMARY, if not NULL, names the locale that is added first.
+
+   Returns the bitwise OR of all add_locale_to_archive results.  */
+static int
+fill_archive (struct locarhandle *tmpl_ah,
+              const char *fname,
+              size_t install_langs_count, char *install_langs_list[],
+              size_t nlist, char *list[],
+              const char *primary)
+{
+  struct locarhandle ah;
+  struct locarhead *head;
+  int result = 0;
+  struct nameent *names;
+  struct namehashent *namehashtab;
+  size_t cnt, used;
+  struct dataent *files;
+  struct sumhashent *sumhashtab;
+  size_t sumused;
+  struct locrecent *primary_locrec = NULL;
+  struct nameent *primary_nameent = NULL;
+
+  head = tmpl_ah->addr;
+  names = (struct nameent *) malloc (head->namehash_used
+                                     * sizeof (struct nameent));
+  files = (struct dataent *) malloc (head->sumhash_used
+                                     * sizeof (struct dataent));
+  if (names == NULL || files == NULL)
+    error (EXIT_FAILURE, errno, "could not allocate tables");
+
+  namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
+                                        + head->namehash_offset);
+  sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
+                                      + head->sumhash_offset);
+
+  /* Collect the names of the template that are to be installed.  */
+  for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
+    if (namehashtab[cnt].locrec_offset != 0)
+      {
+        char *name;
+
+        assert (used < head->namehash_used);
+        name = (char *) tmpl_ah->addr + namehashtab[cnt].name_offset;
+        if (install_langs_count == 0)
+          {
+            /* Always install the entry.  */
+            names[used].name = name;
+            names[used++].locrec
+              = (struct locrecent *) ((char *) tmpl_ah->addr
+                                      + namehashtab[cnt].locrec_offset);
+          }
+        else
+          {
+            /* Only install the entry if the user asked for it via
+               --install-langs.  */
+            size_t i;
+
+            for (i = 0; i < install_langs_count; i++)
+              {
+                const char *lang = install_langs_list[i];
+                size_t lang_len = strlen (lang);
+
+                if (strncmp (name, lang, lang_len) != 0)
+                  continue;
+                /* A prefix without "_" must be followed by "_" in the
+                   name, so that e.g. "ko" does not select "kok_IN".  The
+                   successful strncmp guarantees NAME has at least
+                   LANG_LEN characters, so name[lang_len] is readable.  */
+                if (strchr (lang, '_') == NULL && name[lang_len] != '_')
+                  continue;
+
+                names[used].name = name;
+                names[used++].locrec
+                  = (struct locrecent *) ((char *) tmpl_ah->addr
+                                          + namehashtab[cnt].locrec_offset);
+                /* Record each name at most once, even if several of the
+                   requested prefixes match it.  Otherwise the name would
+                   be entered repeatedly and NAMES could overflow.  */
+                break;
+              }
+          }
+      }
+
+  /* Sort the names.  */
+  qsort (names, used, sizeof (struct nameent), nameentcmp);
+
+  for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
+    if (sumhashtab[cnt].file_offset != 0)
+      {
+        assert (sumused < head->sumhash_used);
+        files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
+        files[sumused++].file_offset = sumhashtab[cnt].file_offset;
+      }
+
+  /* Sort by file locations.  */
+  qsort (files, sumused, sizeof (struct dataent), dataentcmp);
+
+  /* Open the archive.  This call never returns if we cannot
+     successfully open the archive.  */
+  ah.fname = NULL;
+  if (fname != NULL)
+    ah.fname = fname;
+  open_archive (&ah, false);
+
+  /* Add the primary locale, if requested and present, before all others.  */
+  if (primary != NULL)
+    {
+      for (cnt = 0; cnt < used; ++cnt)
+        if (strcmp (names[cnt].name, primary) == 0)
+          break;
+      if (cnt < used)
+        {
+          locale_data_t data;
+
+          compute_data (tmpl_ah, &names[cnt], sumused, files, data);
+          result |= add_locale_to_archive (&ah, primary, data, 0);
+          primary_locrec = names[cnt].locrec;
+          primary_nameent = &names[cnt];
+        }
+    }
+
+  /* Add the remaining names.  A name whose record equals that of the
+     preceding name or of the primary locale becomes an alias.  */
+  for (cnt = 0; cnt < used; ++cnt)
+    if (&names[cnt] == primary_nameent)
+      continue;
+    else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
+             || names[cnt].locrec == primary_locrec)
+      {
+        const char *oldname;
+        struct namehashent *namehashent;
+        uint32_t locrec_offset;
+
+        if (names[cnt].locrec == primary_locrec)
+          oldname = primary;
+        else
+          oldname = names[cnt - 1].name;
+        namehashent = insert_name (&ah, oldname, strlen (oldname), true);
+        assert (namehashent->name_offset != 0);
+        assert (namehashent->locrec_offset != 0);
+        locrec_offset = namehashent->locrec_offset;
+        add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
+      }
+    else
+      {
+        locale_data_t data;
+
+        compute_data (tmpl_ah, &names[cnt], sumused, files, data);
+        result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
+      }
+
+  /* Add the locales given as directories.  */
+  while (nlist-- > 0)
+    {
+      const char *locdir = *list++;
+      size_t locdirlen = strlen (locdir);
+      struct stat64 st;
+      DIR *dirp;
+      struct dirent64 *d;
+      int seen;
+      locale_data_t data;
+      int cat;
+
+      /* First see whether this really is a directory and whether it
+         contains all the required locale category files.  */
+      if (stat64 (locdir, &st) < 0)
+        {
+          error (0, 0, "stat of \"%s\" failed: %s: ignored", locdir,
+                 strerror (errno));
+          continue;
+        }
+      if (!S_ISDIR (st.st_mode))
+        {
+          error (0, 0, "\"%s\" is no directory; ignored", locdir);
+          continue;
+        }
+
+      dirp = opendir (locdir);
+      if (dirp == NULL)
+        {
+          error (0, 0, "cannot open directory \"%s\": %s: ignored",
+                 locdir, strerror (errno));
+          continue;
+        }
+
+      seen = 0;
+      while ((d = readdir64 (dirp)) != NULL)
+        {
+          for (cat = 0; cat < __LC_LAST; ++cat)
+            if (cat != LC_ALL)
+              if (strcmp (d->d_name, locnames[cat]) == 0)
+                {
+                  unsigned char d_type;
+
+                  /* We have an object of the required name.  If it's
+                     a directory we have to look at a file with the
+                     prefix "SYS_".  Otherwise we have found what we
+                     are looking for.  */
+#ifdef _DIRENT_HAVE_D_TYPE
+                  d_type = d->d_type;
+
+                  if (d_type != DT_REG)
+#endif
+                    {
+                      /* Room for "DIR/NAME/SYS_NAME" plus the NUL.  */
+                      char fullname[locdirlen + 2 * strlen (d->d_name) + 7];
+
+#ifdef _DIRENT_HAVE_D_TYPE
+                      if (d_type == DT_UNKNOWN || d_type == DT_LNK)
+#endif
+                        {
+                          strcpy (stpcpy (stpcpy (fullname, locdir), "/"),
+                                  d->d_name);
+
+                          if (stat64 (fullname, &st) == -1)
+                            /* We cannot stat the file, ignore it.  */
+                            break;
+
+                          d_type = IFTODT (st.st_mode);
+                        }
+
+                      if (d_type == DT_DIR)
+                        {
+                          /* We have to do more tests.  The file is a
+                             directory and it therefore must contain a
+                             regular file with the same name except a
+                             "SYS_" prefix.  */
+                          char *t = stpcpy (stpcpy (fullname, locdir), "/");
+                          strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
+                                  d->d_name);
+
+                          if (stat64 (fullname, &st) == -1)
+                            /* There is no SYS_* file or we cannot
+                               access it.  */
+                            break;
+
+                          d_type = IFTODT (st.st_mode);
+                        }
+                    }
+
+                  /* If we found a regular file (eventually after
+                     following a symlink) we are successful.  */
+                  if (d_type == DT_REG)
+                    ++seen;
+                  break;
+                }
+        }
+
+      closedir (dirp);
+
+      if (seen != __LC_LAST - 1)
+        {
+          /* We don't have all locale category files.  Ignore the name.  */
+          error (0, 0, "incomplete set of locale files in \"%s\"",
+                 locdir);
+          continue;
+        }
+
+      /* Add the files to the archive.  To do this we first compute
+         sizes and the MD5 sums of all the files.  */
+      for (cat = 0; cat < __LC_LAST; ++cat)
+        if (cat != LC_ALL)
+          {
+            char fullname[locdirlen + 2 * strlen (locnames[cat]) + 7];
+            int fd;
+
+            strcpy (stpcpy (stpcpy (fullname, locdir), "/"), locnames[cat]);
+            fd = open64 (fullname, O_RDONLY);
+            if (fd == -1 || fstat64 (fd, &st) == -1)
+              {
+                /* Cannot read the file.  */
+                if (fd != -1)
+                  close (fd);
+                break;
+              }
+
+            if (S_ISDIR (st.st_mode))
+              {
+                char *t;
+                close (fd);
+                t = stpcpy (stpcpy (fullname, locdir), "/");
+                strcpy (stpcpy (stpcpy (t, locnames[cat]), "/SYS_"),
+                        locnames[cat]);
+
+                fd = open64 (fullname, O_RDONLY);
+                if (fd == -1 || fstat64 (fd, &st) == -1
+                    || !S_ISREG (st.st_mode))
+                  {
+                    if (fd != -1)
+                      close (fd);
+                    break;
+                  }
+              }
+
+            /* Map the file.  */
+            data[cat].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
+                                     fd, 0);
+            if (data[cat].addr == MAP_FAILED)
+              {
+                /* Cannot map it.  */
+                close (fd);
+                break;
+              }
+
+            data[cat].size = st.st_size;
+            __md5_buffer (data[cat].addr, st.st_size, data[cat].sum);
+
+            /* We don't need the file descriptor anymore.  */
+            close (fd);
+          }
+
+      if (cat != __LC_LAST)
+        {
+          /* The loop stopped early at category CAT, which was not
+             mapped.  Undo the mappings of all categories before it.  */
+          while (cat-- > 0)
+            if (cat != LC_ALL)
+              munmap (data[cat].addr, data[cat].size);
+
+          error (0, 0, "cannot read all files in \"%s\": ignored", locdir);
+
+          continue;
+        }
+
+      result |= add_locale_to_archive (&ah, basename (locdir), data, 0);
+
+      for (cat = 0; cat < __LC_LAST; ++cat)
+        if (cat != LC_ALL)
+          munmap (data[cat].addr, data[cat].size);
+    }
+
+  /* We are done.  */
+  close_archive (&ah);
+
+  /* The tables only reference the template mapping and are no longer
+     needed.  */
+  free (files);
+  free (names);
+
+  return result;
+}
+
+/* Print the command line help text to standard output.  */
+void usage (void)
+{
+  printf ("\
+Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
+ Builds a locale archive from a template file.\n\
+ Options:\n\
+  -h, --help                 Print this usage message.\n\
+  -v, --verbose              Verbose execution.\n\
+  -l, --install-langs=LIST   Only include locales given in LIST into the \n\
+                             locale archive.  LIST is a colon separated list\n\
+                             of locale prefixes, for example \"de:en:ja\".\n\
+                             The special argument \"all\" means to install\n\
+                             all languages and it must be present by itself.\n\
+                             If \"all\" is present with any other language it\n\
+                             will be treated as the name of a locale.\n\
+                             If the --install-langs option is missing, all\n\
+                             locales are installed. The colon separated list\n\
+                             can contain any strings matching the beginning of\n\
+                             locale names.\n\
+                             If a string does not contain a \"_\", it is added.\n\
+                             Examples:\n\
+                               --install-langs=\"en\"\n\
+                                 installs en_US, en_US.iso88591,\n\
+                                 en_US.iso885915, en_US.utf8,\n\
+                                 en_GB ...\n\
+                               --install-langs=\"en_US.utf8\"\n\
+                                 installs only en_US.utf8.\n\
+                               --install-langs=\"ko\"\n\
+                                 installs ko_KR, ko_KR.euckr,\n\
+                                 ko_KR.utf8 but *not* kok_IN\n\
+                                 because \"ko\" does not contain\n\
+                                 \"_\" and it is silently added\n\
+                               --install-langs=\"ko:kok\"\n\
+                                 installs ko_KR, ko_KR.euckr,\n\
+                                 ko_KR.utf8, kok_IN, and\n\
+                                 kok_IN.utf8.\n\
+                               --install-langs=\"POSIX\" will\n\
+                                 install *no* locales at all\n\
+                                 because POSIX matches none of\n\
+                                 the locales. Actually, any string\n\
+                                 matching nothing will do that.\n\
+                                 POSIX and C will always be\n\
+                                 available because they are\n\
+                                 builtin.\n\
+                             Aliases are installed as well,\n\
+                             i.e. --install-langs=\"de\"\n\
+                             will install not only every locale starting with\n\
+                             \"de\" but also the aliases \"deutsch\"\n\
+                             and \"german\" although the latter does not\n\
+                             start with \"de\".\n\
+\n\
+  If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
+  where the glibc used expects these files are used by default.\n\
+");
+}
+
+int main (int argc, char *argv[])
+{
+  char path[4096];
+  DIR *dirp;
+  struct dirent64 *d;
+  struct stat64 st;
+  char *list[16384], *primary;
+  char *lang;
+  int install_langs_count = 0;
+  int i;
+  char *install_langs_arg, *ila_start;
+  char **install_langs_list = NULL;
+  unsigned int cnt = 0;
+  struct locarhandle tmpl_ah;
+  char *new_locar_fname = NULL;
+  size_t loc_path_len = strlen (loc_path);
+
+  while (1)
+    {
+      int c;
+
+      static struct option long_options[] =
+        {
+            {"help",            no_argument,       0, 'h'},
+            {"verbose",         no_argument,       0, 'v'},
+            {"install-langs",   required_argument, 0, 'l'},
+            {0, 0, 0, 0}
+        };
+      /* getopt_long stores the option index here. */
+      int option_index = 0;
+
+      c = getopt_long (argc, argv, "vhl:",
+                       long_options, &option_index);
+
+      /* Detect the end of the options. */
+      if (c == -1)
+        break;
+
+      switch (c)
+        {
+        case 0:
+          printf ("unknown option %s", long_options[option_index].name);
+          if (optarg)
+            printf (" with arg %s", optarg);
+          printf ("\n");
+          usage ();
+          exit (1);
+
+        case 'v':
+          verbose = 1;
+          be_quiet = 0;
+          break;
+
+        case 'h':
+          usage ();
+          exit (0);
+
+        case 'l':
+          install_langs_arg = ila_start = strdup (optarg);
+          /* If the argument to --install-lang is "all", do
+             not limit the list of languages to install and install
+             them all.  We do not support installing a single locale
+	     called "all".  */
+#define MAGIC_INSTALL_ALL "all"
+          if (install_langs_arg != NULL
+	      && install_langs_arg[0] != '\0'
+	      && !(strncmp(install_langs_arg, MAGIC_INSTALL_ALL,
+			   strlen(MAGIC_INSTALL_ALL)) == 0
+		   && strlen (install_langs_arg) == 3))
+            {
+	      /* Count the number of languages we will install.  */
+              while (true)
+                {
+                  lang = strtok(install_langs_arg, ":;,");
+                  if (lang == NULL)
+                    break;
+                  install_langs_count++;
+                  install_langs_arg = NULL;
+                }
+	      free (ila_start);
+
+	      /* Reject an entire string made up of delimiters.  */
+	      if (install_langs_count == 0)
+		break;
+
+	      /* Copy the list.  */
+	      install_langs_list = (char **)xmalloc (sizeof(char *) * install_langs_count);
+	      install_langs_arg = ila_start = strdup (optarg);
+	      install_langs_count = 0;
+	      while (true)
+                {
+                  lang = strtok(install_langs_arg, ":;,");
+                  if (lang == NULL)
+                    break;
+                  install_langs_list[install_langs_count] = lang;
+		  install_langs_count++;
+                  install_langs_arg = NULL;
+                }
+            }
+          break;
+
+        case '?':
+          /* getopt_long already printed an error message. */
+          usage ();
+          exit (0);
+
+        default:
+          abort ();
+        }
+    }
+  tmpl_ah.fname = NULL;
+  if (optind < argc)
+    tmpl_ah.fname = argv[optind];
+  if (optind + 1 < argc)
+    new_locar_fname = argv[optind + 1];
+  if (verbose)
+    {
+      if (tmpl_ah.fname)
+        printf("input archive file specified on command line: %s\n",
+               tmpl_ah.fname);
+      else
+        printf("using default input archive file.\n");
+      if (new_locar_fname)
+        printf("output archive file specified on command line: %s\n",
+               new_locar_fname);
+      else
+        printf("using default output archive file.\n");
+    }
+
+  dirp = opendir (loc_path);
+  if (dirp == NULL)
+    error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);
+
+  open_tmpl_archive (&tmpl_ah);
+
+  if (new_locar_fname)
+    unlink (new_locar_fname);
+  else
+    unlink (locar_file);
+  primary = getenv ("LC_ALL");
+  if (primary == NULL)
+    primary = getenv ("LANG");
+  if (primary != NULL)
+    {
+      if (strncmp (primary, "ja", 2) != 0
+	  && strncmp (primary, "ko", 2) != 0
+	  && strncmp (primary, "zh", 2) != 0)
+	{
+	  char *ptr = malloc (strlen (primary) + strlen (".utf8") + 1), *p, *q;
+	  /* This leads to invalid locales sometimes:
+	     de_DE.iso885915@euro -> de_DE.utf8@euro */
+	  if (ptr != NULL)
+	    {
+	      p = ptr;
+	      q = primary;
+	      while (*q && *q != '.' && *q != '@')
+		*p++ = *q++;
+	      if (*q == '.')
+		while (*q && *q != '@')
+		  q++;
+	      p = stpcpy (p, ".utf8");
+	      strcpy (p, q);
+	      primary = ptr;
+	    }
+	  else
+	    primary = NULL;
+	}
+    }
+
+  memcpy (path, loc_path, loc_path_len);
+
+  while ((d = readdir64 (dirp)) != NULL)
+    {
+      if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
+	continue;
+      if (strchr (d->d_name, '_') == NULL)
+	continue;
+
+      size_t d_name_len = strlen (d->d_name);
+      if (loc_path_len + d_name_len + 1 > sizeof (path))
+	{
+	  error (0, 0, "too long filename \"%s\"", d->d_name);
+	  continue;
+	}
+
+      memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
+      if (stat64 (path, &st) < 0)
+	{
+	  error (0, errno, "cannot stat \"%s\"", path);
+	  continue;
+	}
+      if (! S_ISDIR (st.st_mode))
+	continue;
+      if (cnt == 16384)
+	{
+	  error (0, 0, "too many directories in \"%s\"", loc_path);
+	  break;
+	}
+      list[cnt] = strdup (path);
+      if (list[cnt] == NULL)
+	{
+	  error (0, errno, "cannot add file to list \"%s\"", path);
+	  continue;
+	}
+      if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
+	{
+	  char *p = list[0];
+	  list[0] = list[cnt];
+	  list[cnt] = p;
+	}
+      cnt++;
+    }
+  closedir (dirp);
+  /* Store the archive to the file specified as the second argument on the
+     command line or the default locale archive.  */
+  fill_archive (&tmpl_ah, new_locar_fname,
+                install_langs_count, install_langs_list,
+                cnt, list, primary);
+  close_archive (&tmpl_ah);
+  truncate (tmpl_file, 0);
+  if (install_langs_count > 0)
+    {
+      free (ila_start);
+      free (install_langs_list);
+    }
+  char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
+  execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
+  exit (0);
+}
--- a/SOURCES/glibc-RHEL-1017-1.patch
+++ b/SOURCES/glibc-RHEL-1017-1.patch
@ -1,432 +0,0 @@
-From e4ca6de1bc5e4ba3f94cf0c501a293c5bc827b10 Mon Sep 17 00:00:00 2001
-From: Anton Blanchard <anton@ozlabs.org>
-Date: Tue, 27 Jul 2021 15:47:49 +1000
-Subject: powerpc64: Replace some PPC_FEATURE_HAS_VSX with
- PPC_FEATURE_ARCH_2_06
-
-We use PPC_FEATURE_HAS_VSX to select a number of POWER7 optimised
-functions. These functions don't use any VSX instructions, so
-PPC_FEATURE_ARCH_2_06 seems like a better fit.
-
-Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
-
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-index 0acdf22ba3..32564c8f1f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-@@ -95,7 +95,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- #endif
- 	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __memset_power8)
-	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memset_power7)
- 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
- 			      __memset_power6)
-@@ -139,7 +139,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- #endif
- 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strlen_power8)
-	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strlen_power7)
- 	      IFUNC_IMPL_ADD (array, i, strlen, 1,
- 			      __strlen_ppc))
-@@ -152,7 +152,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- #endif
- 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strncmp_power8)
-	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strncmp_power7)
- 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
- 			      __strncmp_power4)
-@@ -165,7 +165,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, strchr,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strchr_power7)
- 	      IFUNC_IMPL_ADD (array, i, strchr, 1,
- 			      __strchr_ppc))
-@@ -176,7 +176,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strchrnul_power8)
- 	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strchrnul_power7)
- 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1,
- 			      __strchrnul_ppc))
-@@ -192,7 +192,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- #endif
- 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __memcmp_power8)
-	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memcmp_power7)
- 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_POWER4,
- 			      __memcmp_power4)
-@@ -244,7 +244,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __memchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, memchr,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memchr_power7)
- 	      IFUNC_IMPL_ADD (array, i, memchr, 1,
- 			      __memchr_ppc))
-@@ -255,7 +255,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __memrchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, memrchr,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memrchr_power7)
- 	      IFUNC_IMPL_ADD (array, i, memrchr, 1,
- 			      __memrchr_ppc))
-@@ -272,7 +272,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      __rawmemchr_power9)
- #endif
- 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __rawmemchr_power7)
- 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1,
- 			      __rawmemchr_ppc))
-@@ -282,7 +282,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 	      IFUNC_IMPL_ADD (array, i, strnlen,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strnlen_power8)
-	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strnlen_power7)
- 	      IFUNC_IMPL_ADD (array, i, strnlen, 1,
- 			      __strnlen_ppc))
-@@ -293,14 +293,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strcasecmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strcasecmp_power7)
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
- 
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c.  */
-   IFUNC_IMPL (i, name, strcasecmp_l,
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strcasecmp_l_power7)
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
- 			      __strcasecmp_l_ppc))
-@@ -311,14 +311,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strncasecmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strncasecmp_power7)
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
- 
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase_l.c.  */
-   IFUNC_IMPL (i, name, strncasecmp_l,
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strncasecmp_l_power7)
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
- 			      __strncasecmp_l_ppc))
-@@ -329,7 +329,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strrchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, strrchr,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strrchr_power7)
- 	      IFUNC_IMPL_ADD (array, i, strrchr, 1,
- 			      __strrchr_ppc))
-@@ -357,7 +357,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strncpy_power8)
- 	      IFUNC_IMPL_ADD (array, i, strncpy,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strncpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
- 			     __strncpy_ppc))
-@@ -374,7 +374,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __stpncpy_power8)
- 	      IFUNC_IMPL_ADD (array, i, stpncpy,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __stpncpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
- 			     __stpncpy_ppc))
-@@ -390,7 +390,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
- 			      __strcmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, strcmp,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strcmp_power7)
- 	      IFUNC_IMPL_ADD (array, i, strcmp, 1,
- 			     __strcmp_ppc))
-@@ -425,7 +425,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c.  */
-   IFUNC_IMPL (i, name, strstr,
-              IFUNC_IMPL_ADD (array, i, strstr,
-                             hwcap & PPC_FEATURE_HAS_VSX,
-+                             hwcap & PPC_FEATURE_ARCH_2_06,
-                              __strstr_power7)
-              IFUNC_IMPL_ADD (array, i, strstr, 1,
-                              __strstr_ppc))
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-index 0c718d4f15..c24186689e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-@@ -30,7 +30,7 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
- libc_ifunc (__memchr,
- 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 	    ? __memchr_power8 :
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-+	    (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __memchr_power7
-             : __memchr_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-index 4fd089aba7..99559bce26 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
- #endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __memcmp_power8 :
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-+		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __memcmp_power7
- 		       : (hwcap & PPC_FEATURE_POWER4)
- 			 ? __memcmp_power4
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-index e06d6468b8..16bb6f0042 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-@@ -30,7 +30,7 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
- libc_ifunc (__memrchr,
- 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 	    ? __memrchr_power8 :
-	      (hwcap & PPC_FEATURE_HAS_VSX)
-+	      (hwcap & PPC_FEATURE_ARCH_2_06)
- 	      ? __memrchr_power7
- 	    : __memrchr_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-index 5994bf02e6..c1aa143f60 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-@@ -48,7 +48,7 @@ libc_ifunc (__libc_memset,
- # endif
-             (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-             ? __memset_power8 :
-	      (hwcap & PPC_FEATURE_HAS_VSX)
-+	      (hwcap & PPC_FEATURE_ARCH_2_06)
- 	      ? __memset_power7 :
- 		(hwcap & PPC_FEATURE_ARCH_2_05)
- 		? __memset_power6 :
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-index c0ffea2b93..b5d2d3a635 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-@@ -41,7 +41,7 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
- 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
- 		       ? __rawmemchr_power9 :
- # endif
-		         (hwcap & PPC_FEATURE_HAS_VSX)
-+		         (hwcap & PPC_FEATURE_ARCH_2_06)
- 		         ? __rawmemchr_power7
- 		       : __rawmemchr_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
-index bebd377fd9..e7035761a7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
-@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __stpncpy_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __stpncpy_power7
- 			 : __stpncpy_ppc);
- weak_alias (__stpncpy, stpncpy)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-index dcd7774403..55ca6c85c4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-@@ -29,7 +29,7 @@ extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
- libc_ifunc (__libc_strcasecmp,
- 	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-              ? __strcasecmp_power8:
-	     (hwcap & PPC_FEATURE_HAS_VSX)
-+	     (hwcap & PPC_FEATURE_ARCH_2_06)
-              ? __strcasecmp_power7
-              : __strcasecmp_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
-index 96a70b8b11..1afee5d7fd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
-@@ -32,7 +32,7 @@ extern __typeof (__strcasecmp_l) __strcasecmp_l_power7 attribute_hidden;
- 
- extern __typeof (__strcasecmp_l) __libc_strcasecmp_l;
- libc_ifunc (__libc_strcasecmp_l,
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-+	    (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __strcasecmp_l_power7
-             : __strcasecmp_l_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-index ea9ac1134f..27c794c6b7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-@@ -35,7 +35,7 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
- libc_ifunc_redirected (__redirect_strchr, strchr,
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strchr_power8 :
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-+		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __strchr_power7
- 		       : __strchr_ppc);
- weak_alias (strchr, index)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-index 4688e7c3f0..4a07b4a242 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-@@ -30,7 +30,7 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
- libc_ifunc (__strchrnul,
- 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 	    ? __strchrnul_power8 :
-	    (hwcap & PPC_FEATURE_HAS_VSX)
-+	    (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __strchrnul_power7
-             : __strchrnul_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-index 72f9a639bf..4b0b25fff6 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_strcmp, strcmp,
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strcmp_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __strcmp_power7
- 			 : __strcmp_ppc);
- #endif
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-index 109c8a90bd..0cd1c6faff 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-@@ -42,7 +42,7 @@ libc_ifunc (__libc_strlen,
- # endif
- 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 	    ? __strlen_power8 :
-	      (hwcap & PPC_FEATURE_HAS_VSX)
-+	      (hwcap & PPC_FEATURE_ARCH_2_06)
- 	      ? __strlen_power7
- 	      : __strlen_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-index 2013a5d75a..644046bd74 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-@@ -29,7 +29,7 @@ extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
- libc_ifunc (__libc_strncasecmp,
- 	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-              ? __strncasecmp_power8:
-	     (hwcap & PPC_FEATURE_HAS_VSX)
-+	     (hwcap & PPC_FEATURE_ARCH_2_06)
-              ? __strncasecmp_power7
-              : __strncasecmp_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
-index cad6da302d..d2d761af72 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
-@@ -34,7 +34,7 @@ extern __typeof (__strncasecmp_l) __strncasecmp_l_power7 attribute_hidden;
-    ifunc symbol properly.  */
- extern __typeof (__strncasecmp_l) __libc_strncasecmp_l;
- libc_ifunc (__libc_strncasecmp_l,
-	     (hwcap & PPC_FEATURE_HAS_VSX)
-+	     (hwcap & PPC_FEATURE_ARCH_2_06)
-              ? __strncasecmp_l_power7
-              : __strncasecmp_l_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-index eef524ddfb..1f689e5c05 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncmp, strncmp,
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strncmp_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __strncmp_power7
- 			 : (hwcap & PPC_FEATURE_POWER4)
- 			   ? __strncmp_power4
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
-index 7da9def358..d4d3463bd1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
-@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncpy, strncpy,
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strncpy_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __strncpy_power7
- 			 : __strncpy_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-index 264b7a752d..baf375a75a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-@@ -31,7 +31,7 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
- libc_ifunc_redirected (__redirect___strnlen, __strnlen,
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strnlen_power8 :
-			 (hwcap & PPC_FEATURE_HAS_VSX)
-+			 (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __strnlen_power7
- 			 : __strnlen_ppc);
- weak_alias (__strnlen, strnlen)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-index bb06b93d19..1c9eea1817 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-@@ -33,7 +33,7 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
- libc_ifunc_redirected (__redirect_strrchr, strrchr,
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
- 		       ? __strrchr_power8 :
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-+		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __strrchr_power7
- 		       : __strrchr_ppc);
- weak_alias (strrchr, rindex)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr.c b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
-index bb0588844e..6582798dda 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strstr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
-@@ -30,7 +30,7 @@ extern __typeof (strstr) __strstr_power7 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc_redirected (__redirect_strstr, strstr,
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-+		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __strstr_power7
- 		       : __strstr_ppc);
- #endif
--- a/SOURCES/glibc-RHEL-1017-2.patch
+++ b/SOURCES/glibc-RHEL-1017-2.patch
@ -1,83 +0,0 @@
-From f2a15dd668913c5a1388ba7e1131b25162b2ea75 Mon Sep 17 00:00:00 2001
-From: Anton Blanchard <anton@ozlabs.org>
-Date: Tue, 27 Jul 2021 15:47:50 +1000
-Subject: powerpc64: Check cacheline size before using optimised memset
- routines
-
-A number of optimised memset routines assume the cacheline size is 128B,
-so we better check before using them.
-
-Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
-
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-index 32564c8f1f..a3fdcd43bd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-@@ -35,6 +35,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 
-   unsigned long int hwcap = GLRO(dl_hwcap);
-   unsigned long int hwcap2 = GLRO(dl_hwcap2);
-+#ifdef SHARED
-+  int cacheline_size = GLRO(dl_cache_line_size);
-+#endif
- 
-   /* hwcap contains only the latest supported ISA, the code checks which is
-      and fills the previous supported ones.  */
-@@ -90,16 +93,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 	      IFUNC_IMPL_ADD (array, i, memset,
- 			      hwcap2 & PPC_FEATURE2_ARCH_3_1
- 			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
-			      && hwcap & PPC_FEATURE_HAS_VSX,
-+			      && hwcap & PPC_FEATURE_HAS_VSX
-+			      && cacheline_size == 128,
- 			      __memset_power10)
- #endif
-	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && cacheline_size == 128,
- 			      __memset_power8)
-	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
-+	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
-+			      && cacheline_size == 128,
- 			      __memset_power7)
-	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
-+	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05
-+			      && cacheline_size == 128,
- 			      __memset_power6)
-	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4,
-+	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4
-+			      && cacheline_size == 128,
- 			      __memset_power4)
- 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc))
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-index c1aa143f60..056e911699 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-@@ -43,16 +43,21 @@ libc_ifunc (__libc_memset,
- # ifdef __LITTLE_ENDIAN__
- 	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
- 	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
-	     && hwcap & PPC_FEATURE_HAS_VSX)
-+	     && hwcap & PPC_FEATURE_HAS_VSX
-+	     && GLRO(dl_cache_line_size) == 128)
- 	    ? __memset_power10 :
- # endif
-            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+            (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && GLRO(dl_cache_line_size) == 128)
-             ? __memset_power8 :
-	      (hwcap & PPC_FEATURE_ARCH_2_06)
-+	      (hwcap & PPC_FEATURE_ARCH_2_06
-+	       && GLRO(dl_cache_line_size) == 128)
- 	      ? __memset_power7 :
-		(hwcap & PPC_FEATURE_ARCH_2_05)
-+		(hwcap & PPC_FEATURE_ARCH_2_05
-+	         && GLRO(dl_cache_line_size) == 128)
- 		? __memset_power6 :
-		  (hwcap & PPC_FEATURE_POWER4)
-+		  (hwcap & PPC_FEATURE_POWER4
-+	           && GLRO(dl_cache_line_size) == 128)
- 		  ? __memset_power4
-             : __memset_ppc);
- 
--- a/SOURCES/glibc-RHEL-1017-3.patch
+++ b/SOURCES/glibc-RHEL-1017-3.patch
@ -1,703 +0,0 @@
-From 60b4dd25790342b40e8942e3a4115f511a6b6911 Mon Sep 17 00:00:00 2001
-From: Anton Blanchard <anton@ozlabs.org>
-Date: Tue, 27 Jul 2021 15:47:51 +1000
-Subject: powerpc64: Add checks for Altivec and VSX in ifunc selection
-
-We'd like to support processors without Altivec or VSX, so check
-the relevant hwcap bits before selecting them.
-
-Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
-
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
-index 660d7dc686..c8ffbea01c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
-@@ -38,11 +38,13 @@ libc_ifunc (__bzero,
- 	     && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __bzero_power10 :
- # endif
-            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
-             ? __bzero_power8 :
- 	      (hwcap & PPC_FEATURE_HAS_VSX)
- 	      ? __bzero_power7 :
-		(hwcap & PPC_FEATURE_ARCH_2_05)
-+		(hwcap & PPC_FEATURE_ARCH_2_05
-+		 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		? __bzero_power6 :
- 		  (hwcap & PPC_FEATURE_POWER4)
- 		  ? __bzero_power4
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-index a3fdcd43bd..c3e25c5981 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
-@@ -60,9 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __memcpy_power10)
- #endif
-	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memcpy_power8_cached)
-	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memcpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memcpy_a2)
-@@ -83,7 +85,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __memmove_power10)
- #endif
-	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memmove_power7)
- 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
- 
-@@ -98,6 +101,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      __memset_power10)
- #endif
- 	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC
- 			      && cacheline_size == 128,
- 			      __memset_power8)
- 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
-@@ -114,12 +118,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
-   IFUNC_IMPL (i, name, strcpy,
- #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strcpy_power9)
- #endif
-	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strcpy_power8)
-	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strcpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, strcpy, 1,
- 			      __strcpy_ppc))
-@@ -127,12 +134,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c.  */
-   IFUNC_IMPL (i, name, stpcpy,
- #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __stpcpy_power9)
- #endif
-	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __stpcpy_power8)
-	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
-+	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __stpcpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1,
- 			      __stpcpy_ppc))
-@@ -140,12 +150,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
-   IFUNC_IMPL (i, name, strlen,
- #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
-+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strlen_power10)
-	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strlen_power9)
- #endif
-	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strlen_power8)
- 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strlen_power7)
-@@ -155,7 +168,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
-   IFUNC_IMPL (i, name, strncmp,
- #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strncmp_power9)
- #endif
- 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-@@ -170,7 +184,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strchr.c.  */
-   IFUNC_IMPL (i, name, strchr,
- 	      IFUNC_IMPL_ADD (array, i, strchr,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, strchr,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -181,7 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strchrnul.c.  */
-   IFUNC_IMPL (i, name, strchrnul,
- 	      IFUNC_IMPL_ADD (array, i, strchrnul,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strchrnul_power8)
- 	      IFUNC_IMPL_ADD (array, i, strchrnul,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -198,7 +214,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-             && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __memcmp_power10)
- #endif
-	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memcmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __memcmp_power7)
-@@ -215,11 +232,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
- 			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __bzero_power10)
- #endif
-	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __bzero_power8)
- 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
- 			      __bzero_power7)
-	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
-+	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __bzero_power6)
- 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4,
- 			      __bzero_power4)
-@@ -241,7 +260,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/mempcpy.c.  */
-   IFUNC_IMPL (i, name, mempcpy,
- 	      IFUNC_IMPL_ADD (array, i, mempcpy,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __mempcpy_power7)
- 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
- 			      __mempcpy_ppc))
-@@ -249,7 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c.  */
-   IFUNC_IMPL (i, name, memchr,
- 	      IFUNC_IMPL_ADD (array, i, memchr,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, memchr,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -260,7 +281,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/memrchr.c.  */
-   IFUNC_IMPL (i, name, memrchr,
- 	      IFUNC_IMPL_ADD (array, i, memrchr,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __memrchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, memrchr,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -276,7 +298,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-                               && (hwcap & PPC_FEATURE_HAS_VSX),
-                               __rawmemchr_power10)
- 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
-			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+			      hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __rawmemchr_power9)
- #endif
- 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
-@@ -288,7 +311,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strnlen.c.  */
-   IFUNC_IMPL (i, name, strnlen,
- 	      IFUNC_IMPL_ADD (array, i, strnlen,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strnlen_power8)
- 	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
- 			      __strnlen_power7)
-@@ -298,7 +322,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
-   IFUNC_IMPL (i, name, strcasecmp,
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strcasecmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -316,7 +341,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
-   IFUNC_IMPL (i, name, strncasecmp,
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			       && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strncasecmp_power8)
- 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -334,7 +360,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c.  */
-   IFUNC_IMPL (i, name, strrchr,
- 	      IFUNC_IMPL_ADD (array, i, strrchr,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strrchr_power8)
- 	      IFUNC_IMPL_ADD (array, i, strrchr,
- 			      hwcap & PPC_FEATURE_ARCH_2_06,
-@@ -345,10 +372,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strncat.c.  */
-   IFUNC_IMPL (i, name, strncat,
- 	      IFUNC_IMPL_ADD (array, i, strncat,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strncat_power8)
- 	      IFUNC_IMPL_ADD (array, i, strncat,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strncat_power7)
- 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
- 			      __strncat_ppc))
-@@ -391,7 +420,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   IFUNC_IMPL (i, name, strcmp,
- #ifdef __LITTLE_ENDIAN__
- 	      IFUNC_IMPL_ADD (array, i, strcmp,
-			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
-+			      hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strcmp_power9)
- #endif
- 	      IFUNC_IMPL_ADD (array, i, strcmp,
-@@ -406,10 +436,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
-   IFUNC_IMPL (i, name, strcat,
- 	      IFUNC_IMPL_ADD (array, i, strcat,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strcat_power8)
- 	      IFUNC_IMPL_ADD (array, i, strcat,
-			      hwcap & PPC_FEATURE_HAS_VSX,
-+			      hwcap & PPC_FEATURE_ARCH_2_06
-+			      && hwcap & PPC_FEATURE_HAS_VSX,
- 			      __strcat_power7)
- 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
- 			     __strcat_ppc))
-@@ -417,7 +449,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
-   IFUNC_IMPL (i, name, strspn,
-              IFUNC_IMPL_ADD (array, i, strspn,
-                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+                             hwcap2 & PPC_FEATURE2_ARCH_2_07
-+                             && hwcap & PPC_FEATURE_HAS_VSX,
-                              __strspn_power8)
-              IFUNC_IMPL_ADD (array, i, strspn, 1,
-                              __strspn_ppc))
-@@ -425,7 +458,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c.  */
-   IFUNC_IMPL (i, name, strcspn,
-              IFUNC_IMPL_ADD (array, i, strcspn,
-                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+                             hwcap2 & PPC_FEATURE2_ARCH_2_07
-+                             && hwcap & PPC_FEATURE_HAS_VSX,
-                              __strcspn_power8)
-              IFUNC_IMPL_ADD (array, i, strcspn, 1,
-                              __strcspn_ppc))
-@@ -442,7 +476,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
-   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasestr.c.  */
-   IFUNC_IMPL (i, name, strcasestr,
- 	      IFUNC_IMPL_ADD (array, i, strcasestr,
-			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
-+			      hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
- 			      __strcasestr_power8)
-              IFUNC_IMPL_ADD (array, i, strcasestr, 1,
-                              __strcasestr_ppc))
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-index c24186689e..f40013e061 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
-@@ -28,7 +28,8 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc (__memchr,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 	    ? __memchr_power8 :
- 	    (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __memchr_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-index 99559bce26..89b56c103b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
-@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
- 				 && hwcap & PPC_FEATURE_HAS_VSX)
- 				 ? __memcmp_power10 :
- #endif
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __memcmp_power8 :
- 		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __memcmp_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
-index 53ab32ef26..684ee064f2 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
-@@ -45,9 +45,12 @@ libc_ifunc (__libc_memcpy,
- 	    (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __memcpy_power10 :
- # endif
-	    ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC
-+	     && use_cached_memopt)
- 	    ? __memcpy_power8_cached :
-	      (hwcap & PPC_FEATURE_HAS_VSX)
-+	      (hwcap & PPC_FEATURE_ARCH_2_06
-+	       && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 	      ? __memcpy_power7 :
- 		(hwcap & PPC_FEATURE_ARCH_2_06)
- 		? __memcpy_a2 :
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
-index 637b2cbf7f..50253b4554 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
-@@ -41,7 +41,8 @@ libc_ifunc (__libc_memmove,
- 	     && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __memmove_power10 :
- #endif
-		     (hwcap & PPC_FEATURE_HAS_VSX)
-+		     (hwcap & PPC_FEATURE_ARCH_2_06
-+		      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		     ? __memmove_power7
- 		     : __memmove_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
-index b37e0f35b5..563095a5ec 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
-@@ -33,7 +33,8 @@ extern __typeof (__mempcpy) __mempcpy_power7 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc_redirected (__redirect___mempcpy, __mempcpy,
-		       (hwcap & PPC_FEATURE_HAS_VSX)
-+		       (hwcap & PPC_FEATURE_ARCH_2_06
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __mempcpy_power7
- 		       : __mempcpy_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-index 16bb6f0042..a8b985b06a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
-@@ -28,7 +28,8 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc (__memrchr,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 	    ? __memrchr_power8 :
- 	      (hwcap & PPC_FEATURE_ARCH_2_06)
- 	      ? __memrchr_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-index 056e911699..a2bc223bcc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
-@@ -48,6 +48,7 @@ libc_ifunc (__libc_memset,
- 	    ? __memset_power10 :
- # endif
-             (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC
- 	     && GLRO(dl_cache_line_size) == 128)
-             ? __memset_power8 :
- 	      (hwcap & PPC_FEATURE_ARCH_2_06
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-index b5d2d3a635..43eb459e02 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
-@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
- 		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
- 		     && (hwcap & PPC_FEATURE_HAS_VSX)
- 		     ? __rawmemchr_power10 :
-		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			&& hwcap & PPC_FEATURE_HAS_VSX)
- 		       ? __rawmemchr_power9 :
- # endif
- 		         (hwcap & PPC_FEATURE_ARCH_2_06)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
-index d4eb4285fc..5be413405e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
-@@ -32,12 +32,15 @@ extern __typeof (__stpcpy) __stpcpy_power9 attribute_hidden;
- 
- libc_ifunc_hidden (__stpcpy, __stpcpy,
- # ifdef __LITTLE_ENDIAN__
-		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+		   (hwcap2 & PPC_FEATURE2_ARCH_3_00
-+		    && hwcap & PPC_FEATURE_HAS_VSX)
- 		   ? __stpcpy_power9 :
- # endif
-		     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		     (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+		      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		     ? __stpcpy_power8
-		     : (hwcap & PPC_FEATURE_HAS_VSX)
-+		     : (hwcap & PPC_FEATURE_ARCH_2_06
-+		        && hwcap & PPC_FEATURE_HAS_VSX)
- 		       ? __stpcpy_power7
- 		       : __stpcpy_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-index 55ca6c85c4..21ce2d279b 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
-@@ -27,7 +27,8 @@ extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
- extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
- 
- libc_ifunc (__libc_strcasecmp,
-	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
-              ? __strcasecmp_power8:
- 	     (hwcap & PPC_FEATURE_ARCH_2_06)
-              ? __strcasecmp_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
-index 7e4bd3b5ac..5bb3016022 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
-@@ -27,7 +27,8 @@ extern __typeof (__strcasestr) __strcasestr_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc (__strcasestr,
-		(hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		(hwcap2 & PPC_FEATURE2_ARCH_2_07
-+		 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		? __strcasestr_power8
- 		: __strcasestr_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
-index 6d342324c4..d8d9870824 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
-@@ -28,9 +28,11 @@ extern __typeof (strcat) __strcat_power8 attribute_hidden;
- # undef strcat
- 
- libc_ifunc_redirected (__redirect_strcat, strcat,
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_VSX)
- 		       ? __strcat_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06
-+			  && hwcap & PPC_FEATURE_HAS_VSX)
- 			 ? __strcat_power7
- 			 : __strcat_ppc);
- #endif
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-index 27c794c6b7..62b202baf9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
-@@ -33,7 +33,8 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc_redirected (__redirect_strchr, strchr,
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __strchr_power8 :
- 		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __strchr_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-index 4a07b4a242..40e529b9d9 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
-@@ -28,7 +28,8 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc (__strchrnul,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 	    ? __strchrnul_power8 :
- 	    (hwcap & PPC_FEATURE_ARCH_2_06)
-             ? __strchrnul_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-index 4b0b25fff6..8132682a99 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
-@@ -35,7 +35,8 @@ extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
- 
- libc_ifunc_redirected (__redirect_strcmp, strcmp,
- # ifdef __LITTLE_ENDIAN__
-			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+			(hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 			? __strcmp_power9 :
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
-index b733fa5a23..5af1d45cc1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
-@@ -32,12 +32,15 @@ extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
- 
- libc_ifunc_redirected (__redirect_strcpy, strcpy,
- # ifdef __LITTLE_ENDIAN__
-			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+			(hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			 && hwcap & PPC_FEATURE_HAS_VSX)
- 			? __strcpy_power9 :
- # endif
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __strcpy_power8
-		       : (hwcap & PPC_FEATURE_HAS_VSX)
-+		       : (hwcap & PPC_FEATURE_ARCH_2_06
-+		          && hwcap & PPC_FEATURE_HAS_VSX)
- 			 ? __strcpy_power7
- 			 : __strcpy_ppc);
- #endif
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
-index 683aa104d7..8ba01c13b1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
-@@ -27,7 +27,8 @@ extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
- extern __typeof (strcspn) __strcspn_power8 attribute_hidden;
- 
- libc_ifunc (__libc_strcspn,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __strcspn_power8
- 	    : __strcspn_ppc);
- 
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-index 0cd1c6faff..f1e28414e0 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
-@@ -35,12 +35,15 @@ extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;
- 
- libc_ifunc (__libc_strlen,
- # ifdef __LITTLE_ENDIAN__
-	(hwcap2 & PPC_FEATURE2_ARCH_3_1)
-+	(hwcap2 & PPC_FEATURE2_ARCH_3_1
-+	 && hwcap & PPC_FEATURE_HAS_VSX)
- 	? __strlen_power10 :
-	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+	  (hwcap2 & PPC_FEATURE2_ARCH_3_00
-+	   && hwcap & PPC_FEATURE_HAS_VSX)
- 	  ? __strlen_power9 :
- # endif
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 	    ? __strlen_power8 :
- 	      (hwcap & PPC_FEATURE_ARCH_2_06)
- 	      ? __strlen_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-index 644046bd74..2802cf2c3f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
-@@ -27,7 +27,8 @@ extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
- extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
- 
- libc_ifunc (__libc_strncasecmp,
-	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	     (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
-              ? __strncasecmp_power8:
- 	     (hwcap & PPC_FEATURE_ARCH_2_06)
-              ? __strncasecmp_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat.c b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
-index 0036fca91a..9ea294a72d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncat.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
-@@ -26,9 +26,11 @@ extern __typeof (strncat) __strncat_power7 attribute_hidden;
- extern __typeof (strncat) __strncat_power8 attribute_hidden;
- 
- libc_ifunc (strncat,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __strncat_power8
-	    : (hwcap & PPC_FEATURE_HAS_VSX)
-+	    : (hwcap & PPC_FEATURE_ARCH_2_06
-+	       && hwcap & PPC_FEATURE_HAS_VSX)
-             ? __strncat_power7
-             : __strncat_ppc);
- #endif
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-index 1f689e5c05..2d21122854 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
-@@ -38,7 +38,8 @@ extern __typeof (strncmp) __strncmp_power9 attribute_hidden;
-    ifunc symbol properly.  */
- libc_ifunc_redirected (__redirect_strncmp, strncmp,
- # ifdef __LITTLE_ENDIAN__
-			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
-+			(hwcap2 & PPC_FEATURE2_ARCH_3_00
-+			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 			? __strncmp_power9 :
- # endif
- 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-index baf375a75a..e68e9d9f88 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
-@@ -29,7 +29,8 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
- # undef strnlen
- # undef __strnlen
- libc_ifunc_redirected (__redirect___strnlen, __strnlen,
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __strnlen_power8 :
- 			 (hwcap & PPC_FEATURE_ARCH_2_06)
- 			 ? __strnlen_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-index 1c9eea1817..7f0cf2a1b7 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
-@@ -31,7 +31,8 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
- /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
-    ifunc symbol properly.  */
- libc_ifunc_redirected (__redirect_strrchr, strrchr,
-		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
- 		       ? __strrchr_power8 :
- 		       (hwcap & PPC_FEATURE_ARCH_2_06)
- 		       ? __strrchr_power7
-diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
-index 70167a176b..7613ab3d55 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
-+++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
-@@ -27,7 +27,8 @@ extern __typeof (strspn) __strspn_ppc attribute_hidden;
- extern __typeof (strspn) __strspn_power8 attribute_hidden;
- 
- libc_ifunc (__libc_strspn,
-	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
-+	     && hwcap & PPC_FEATURE_HAS_VSX)
- 	    ? __strspn_power8
- 	    : __strspn_ppc);
- 
--- a/SOURCES/glibc-RHEL-1017-4.patch
+++ b/SOURCES/glibc-RHEL-1017-4.patch
@ -1,652 +0,0 @@
-From 21841f0d562f0e944c4d267a28cc3ebd19c847e9 Mon Sep 17 00:00:00 2001
-From: Mahesh Bodapati <bmahi496@linux.ibm.com>
-Date: Tue, 1 Aug 2023 07:41:17 -0500
-Subject: PowerPC: Influence cpu/arch hwcap features via GLIBC_TUNABLES
-
-This patch enables the option to influence hwcaps used by PowerPC.
-The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz....,
-can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx
-and zzz, where the feature name is case-sensitive and has to match the ones
-mentioned in the file{sysdeps/powerpc/dl-procinfo.c}.
-
-Note that the hwcap tunables only used in the IFUNC selection.
-Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
-
-[rebased to c9s by DJ]
-
-diff -rupN a/manual/tunables.texi b/manual/tunables.texi
--- a/manual/tunables.texi	2023-09-13 01:16:19.979884270 -0400
-+++ b/manual/tunables.texi	2023-09-13 01:17:19.217179994 -0400
-@@ -476,7 +476,10 @@ On s390x, the supported HWCAP and STFLE
- @code{sysdeps/s390/cpu-features.c}.  In addition the user can also set
- a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features.
- 
-This tunable is specific to i386, x86-64 and s390x.
-+On powerpc, the supported HWCAP and HWCAP2 features can be found in
-+@code{sysdeps/powerpc/dl-procinfo.c}.
-+
-+This tunable is specific to i386, x86-64, s390x and powerpc.
- @end deftp
- 
- @deftp Tunable glibc.cpu.cached_memopt
-diff -rupN a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c
--- a/sysdeps/powerpc/cpu-features.c	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/cpu-features.c	1969-12-31 19:00:00.000000000 -0500
-@@ -1,39 +0,0 @@
-/* Initialize cpu feature data.  PowerPC version.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <stdint.h>
-#include <cpu-features.h>
-
-#if HAVE_TUNABLES
-# include <elf/dl-tunables.h>
-#endif
-
-static inline void
-init_cpu_features (struct cpu_features *cpu_features)
-{
-  /* Default is to use aligned memory access on optimized function unless
-     tunables is enable, since for this case user can explicit disable
-     unaligned optimizations.  */
-#if HAVE_TUNABLES
-  int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
-					NULL);
-  cpu_features->use_cached_memopt = (cached_memfunc > 0);
-#else
-  cpu_features->use_cached_memopt = false;
-#endif
-}
-diff -rupN a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h
--- a/sysdeps/powerpc/cpu-features.h	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/cpu-features.h	1969-12-31 19:00:00.000000000 -0500
-@@ -1,28 +0,0 @@
-/* Initialize cpu feature data.  PowerPC version.
-   Copyright (C) 2017-2021 Free Software Foundation, Inc.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#ifndef __CPU_FEATURES_POWERPC_H
-# define __CPU_FEATURES_POWERPC_H
-
-#include <stdbool.h>
-
-struct cpu_features
-{
-  bool use_cached_memopt;
-};
-
-#endif /* __CPU_FEATURES_H  */
-diff -rupN a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list
--- a/sysdeps/powerpc/dl-tunables.list	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/dl-tunables.list	2023-09-13 01:17:19.226180343 -0400
-@@ -24,5 +24,8 @@ glibc {
-       maxval: 1
-       default: 0
-     }
-+    hwcaps {
-+      type: STRING
-+    }
-   }
- }
-diff -rupN a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c
--- a/sysdeps/powerpc/hwcapinfo.c	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/hwcapinfo.c	2023-09-13 01:17:19.229180459 -0400
-@@ -19,6 +19,7 @@
- #include <unistd.h>
- #include <shlib-compat.h>
- #include <dl-procinfo.h>
-+#include <cpu-features.c>
- 
- uint64_t __tcb_hwcap __attribute__ ((visibility ("hidden")));
- uint32_t __tcb_platform __attribute__ ((visibility ("hidden")));
-@@ -64,6 +65,9 @@ __tcb_parse_hwcap_and_convert_at_platfor
-   else if (h1 & PPC_FEATURE_POWER5)
-     h1 |= PPC_FEATURE_POWER4;
- 
-+  uint64_t array_hwcaps[] = { h1, h2 };
-+  init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps);
-+
-   /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that
-      we can read both in a single load later.  */
-   __tcb_hwcap = h2;
-diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
--- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2023-09-13 01:17:19.232180575 -0400
-@@ -21,6 +21,7 @@
- #include <wchar.h>
- #include <ldsodefs.h>
- #include <ifunc-impl-list.h>
-+#include <cpu-features.h>
- 
- /* Maximum number of IFUNC implementations.  */
- #define MAX_IFUNC	6
-@@ -33,7 +34,8 @@ __libc_ifunc_impl_list (const char *name
- 
-   size_t i = 0;
- 
-  unsigned long int hwcap = GLRO(dl_hwcap);
-+  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
-+  unsigned long int hwcap = features->hwcap;
-   /* hwcap contains only the latest supported ISA, the code checks which is
-      and fills the previous supported ones.  */
-   if (hwcap & PPC_FEATURE_ARCH_2_06)
-diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
--- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h	2023-09-13 01:17:19.232180575 -0400
-@@ -16,6 +16,7 @@
-    <https://www.gnu.org/licenses/>.  */
- 
- #include <ldsodefs.h>
-+#include <cpu-features.h>
- 
- /* The code checks if _rtld_global_ro was realocated before trying to access
-    the dl_hwcap field. The assembly is to make the compiler not optimize the
-@@ -32,11 +33,12 @@
- # define __GLRO(value)  GLRO(value)
- #endif
- 
-/* dl_hwcap contains only the latest supported ISA, the macro checks which is
-   and fills the previous ones.  */
-+/* Get the hardware information post the tunables set, the macro checks
-+   it and fills the previous ones.  */
- #define INIT_ARCH() \
-  unsigned long int hwcap = __GLRO(dl_hwcap); 			\
-  unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \
-+  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);	\
-+  unsigned long int hwcap = features->hwcap;				\
-+  unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \
-   bool __attribute__((unused)) use_cached_memopt =		\
-     __GLRO(dl_powerpc_cpu_features.use_cached_memopt);		\
-   if (hwcap & PPC_FEATURE_ARCH_2_06)				\
-diff -rupN a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
--- a/sysdeps/powerpc/powerpc64/dl-machine.h	2023-09-13 01:16:17.582791395 -0400
-+++ b/sysdeps/powerpc/powerpc64/dl-machine.h	2023-09-13 01:17:19.236180730 -0400
-@@ -27,7 +27,6 @@
- #include <dl-tls.h>
- #include <sysdep.h>
- #include <hwcapinfo.h>
-#include <cpu-features.c>
- #include <dl-static-tls.h>
- #include <dl-funcdesc.h>
- #include <dl-machine-rel.h>
-@@ -293,7 +292,6 @@ static inline void __attribute__ ((unuse
- dl_platform_init (void)
- {
-   __tcb_parse_hwcap_and_convert_at_platform ();
-  init_cpu_features (&GLRO(dl_powerpc_cpu_features));
- }
- #endif
- 
-diff -rupN a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2023-09-13 01:16:20.219893569 -0400
-+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2023-09-13 01:19:17.169756083 -0400
-@@ -17,6 +17,7 @@
-    <https://www.gnu.org/licenses/>.  */
- 
- #include <assert.h>
-+#include <cpu-features.h>
- #include <string.h>
- #include <wchar.h>
- #include <ldsodefs.h>
-@@ -32,9 +33,9 @@ __libc_ifunc_impl_list (const char *name
-   assert (max >= MAX_IFUNC);
- 
-   size_t i = 0;
-
-  unsigned long int hwcap = GLRO(dl_hwcap);
-  unsigned long int hwcap2 = GLRO(dl_hwcap2);
-+  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
-+  unsigned long int hwcap = features->hwcap;
-+  unsigned long int hwcap2 = features->hwcap2;
- #ifdef SHARED
-   int cacheline_size = GLRO(dl_cache_line_size);
- #endif
-diff -rupN a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile
--- a/sysdeps/unix/sysv/linux/powerpc/Makefile	2021-08-01 21:33:43.000000000 -0400
-+++ b/sysdeps/unix/sysv/linux/powerpc/Makefile	2023-09-13 01:17:19.243181002 -0400
-@@ -21,7 +21,12 @@ ifeq ($(subdir),misc)
- sysdep_headers += bits/ppc.h
- sysdep_routines += get_timebase_freq
- tests-static += test-gettimebasefreq-static
-tests += $(tests-static)
-tests += test-gettimebasefreq
-tests += test-powerpc-linux-sysconf
-+tests += \
-+  $(tests-static) \
-+  test-gettimebasefreq \
-+  test-powerpc-linux-sysconf \
-+  tst-hwcap-tunables \
-+  # tests
-+
-+tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
- endif
-diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c	1969-12-31 19:00:00.000000000 -0500
-+++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c	2023-09-13 01:17:19.247181157 -0400
-@@ -0,0 +1,124 @@
-+/* Initialize cpu feature data.  PowerPC version.
-+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#include <array_length.h>
-+#include <stdint.h>
-+#include <cpu-features.h>
-+#include <elf/dl-tunables.h>
-+#include <unistd.h>
-+#include <string.h>
-+
-+static void
-+TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
-+{
-+  /* The current IFUNC selection is always using the most recent
-+     features which are available via AT_HWCAP or AT_HWCAP2.  But in
-+     some scenarios it is useful to adjust this selection.
-+
-+     The environment variable:
-+
-+     GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,....
-+
-+     Can be used to enable HWCAP/HWCAP2 feature yyy, disable HWCAP/HWCAP2
-+     feature xxx, where the feature name is case-sensitive and has to match
-+     the ones mentioned in the file{sysdeps/powerpc/dl-procinfo.c}. */
-+
-+  /* Copy the features from dl_powerpc_cpu_features, which contains the
-+     features provided by AT_HWCAP and AT_HWCAP2.  */
-+  struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features);
-+  unsigned long int tcbv_hwcap = cpu_features->hwcap;
-+  unsigned long int tcbv_hwcap2 = cpu_features->hwcap2;
-+  const char *token = valp->strval;
-+  do
-+    {
-+      const char *token_end, *feature;
-+      bool disable;
-+      size_t token_len, i, feature_len, offset = 0;
-+      /* Find token separator or end of string.  */
-+      for (token_end = token; *token_end != ','; token_end++)
-+	if (*token_end == '\0')
-+	  break;
-+
-+      /* Determine feature.  */
-+      token_len = token_end - token;
-+      if (*token == '-')
-+	{
-+	  disable = true;
-+	  feature = token + 1;
-+	  feature_len = token_len - 1;
-+	}
-+      else
-+	{
-+	  disable = false;
-+	  feature = token;
-+	  feature_len = token_len;
-+	}
-+      for (i = 0; i < array_length (hwcap_tunables); ++i)
-+	{
-+	  const char *hwcap_name = hwcap_names + offset;
-+	  size_t hwcap_name_len = strlen (hwcap_name);
-+	  /* Check the tunable name on the supported list.  */
-+	  if (hwcap_name_len == feature_len
-+	      && memcmp (feature, hwcap_name, feature_len) == 0)
-+	    {
-+	      /* Update the hwcap and hwcap2 bits.  */
-+	      if (disable)
-+		{
-+		  /* Id is 1 for hwcap2 tunable.  */
-+		  if (hwcap_tunables[i].id)
-+		    cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask);
-+		  else
-+		    cpu_features->hwcap &= ~(hwcap_tunables[i].mask);
-+		}
-+	      else
-+		{
-+		  /* Enable the features and also check that no unsupported
-+		     features were enabled by user.  */
-+		  if (hwcap_tunables[i].id)
-+		    cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask);
-+		  else
-+		    cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask);
-+		}
-+	      break;
-+	    }
-+	  offset += hwcap_name_len + 1;
-+	}
-+	token += token_len;
-+	/* ... and skip token separator for next round.  */
-+	if (*token == ',')
-+	  token++;
-+    }
-+  while (*token != '\0');
-+}
-+
-+static inline void
-+init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[])
-+{
-+  /* Fill the cpu_features with the supported hwcaps
-+     which are set by __tcb_parse_hwcap_and_convert_at_platform.  */
-+  cpu_features->hwcap = hwcaps[0];
-+  cpu_features->hwcap2 = hwcaps[1];
-+  /* Default is to use aligned memory access on optimized function unless
-+     tunables is enable, since for this case user can explicit disable
-+     unaligned optimizations.  */
-+  int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
-+					NULL);
-+  cpu_features->use_cached_memopt = (cached_memfunc > 0);
-+  TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *,
-+	       TUNABLE_CALLBACK (set_hwcaps));
-+}
-diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
--- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h	1969-12-31 19:00:00.000000000 -0500
-+++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h	2023-09-13 01:17:19.251181312 -0400
-@@ -0,0 +1,130 @@
-+/* Initialize cpu feature data.  PowerPC version.
-+   Copyright (C) 2017-2023 Free Software Foundation, Inc.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#ifndef __CPU_FEATURES_POWERPC_H
-+# define __CPU_FEATURES_POWERPC_H
-+
-+#include <stdbool.h>
-+#include <sys/auxv.h>
-+
-+struct cpu_features
-+{
-+  bool use_cached_memopt;
-+  unsigned long int hwcap;
-+  unsigned long int hwcap2;
-+};
-+
-+static const char hwcap_names[] = {
-+  "4xxmac\0"
-+  "altivec\0"
-+  "arch_2_05\0"
-+  "arch_2_06\0"
-+  "archpmu\0"
-+  "booke\0"
-+  "cellbe\0"
-+  "dfp\0"
-+  "efpdouble\0"
-+  "efpsingle\0"
-+  "fpu\0"
-+  "ic_snoop\0"
-+  "mmu\0"
-+  "notb\0"
-+  "pa6t\0"
-+  "power4\0"
-+  "power5\0"
-+  "power5+\0"
-+  "power6x\0"
-+  "ppc32\0"
-+  "ppc601\0"
-+  "ppc64\0"
-+  "ppcle\0"
-+  "smt\0"
-+  "spe\0"
-+  "true_le\0"
-+  "ucache\0"
-+  "vsx\0"
-+  "arch_2_07\0"
-+  "dscr\0"
-+  "ebb\0"
-+  "htm\0"
-+  "htm-nosc\0"
-+  "htm-no-suspend\0"
-+  "isel\0"
-+  "tar\0"
-+  "vcrypto\0"
-+  "arch_3_00\0"
-+  "ieee128\0"
-+  "darn\0"
-+  "scv\0"
-+  "arch_3_1\0"
-+  "mma\0"
-+};
-+
-+static const struct
-+{
-+  unsigned int mask;
-+  bool id;
-+} hwcap_tunables[] = {
-+   /* AT_HWCAP tunable masks.  */
-+   { PPC_FEATURE_HAS_4xxMAC,                 0 },
-+   { PPC_FEATURE_HAS_ALTIVEC,                0 },
-+   { PPC_FEATURE_ARCH_2_05,                  0 },
-+   { PPC_FEATURE_ARCH_2_06,                  0 },
-+   { PPC_FEATURE_PSERIES_PERFMON_COMPAT,     0 },
-+   { PPC_FEATURE_BOOKE,                      0 },
-+   { PPC_FEATURE_CELL_BE,                    0 },
-+   { PPC_FEATURE_HAS_DFP,                    0 },
-+   { PPC_FEATURE_HAS_EFP_DOUBLE,             0 },
-+   { PPC_FEATURE_HAS_EFP_SINGLE,             0 },
-+   { PPC_FEATURE_HAS_FPU,                    0 },
-+   { PPC_FEATURE_ICACHE_SNOOP,               0 },
-+   { PPC_FEATURE_HAS_MMU,                    0 },
-+   { PPC_FEATURE_NO_TB,                      0 },
-+   { PPC_FEATURE_PA6T,                       0 },
-+   { PPC_FEATURE_POWER4,                     0 },
-+   { PPC_FEATURE_POWER5,                     0 },
-+   { PPC_FEATURE_POWER5_PLUS,                0 },
-+   { PPC_FEATURE_POWER6_EXT,                 0 },
-+   { PPC_FEATURE_32,                         0 },
-+   { PPC_FEATURE_601_INSTR,                  0 },
-+   { PPC_FEATURE_64,                         0 },
-+   { PPC_FEATURE_PPC_LE,                     0 },
-+   { PPC_FEATURE_SMT,                        0 },
-+   { PPC_FEATURE_HAS_SPE,                    0 },
-+   { PPC_FEATURE_TRUE_LE,                    0 },
-+   { PPC_FEATURE_UNIFIED_CACHE,              0 },
-+   { PPC_FEATURE_HAS_VSX,                    0 },
-+
-+   /* AT_HWCAP2 tunable masks.  */
-+   { PPC_FEATURE2_ARCH_2_07,                 1 },
-+   { PPC_FEATURE2_HAS_DSCR,                  1 },
-+   { PPC_FEATURE2_HAS_EBB,                   1 },
-+   { PPC_FEATURE2_HAS_HTM,                   1 },
-+   { PPC_FEATURE2_HTM_NOSC,                  1 },
-+   { PPC_FEATURE2_HTM_NO_SUSPEND,            1 },
-+   { PPC_FEATURE2_HAS_ISEL,                  1 },
-+   { PPC_FEATURE2_HAS_TAR,                   1 },
-+   { PPC_FEATURE2_HAS_VEC_CRYPTO,            1 },
-+   { PPC_FEATURE2_ARCH_3_00,                 1 },
-+   { PPC_FEATURE2_HAS_IEEE128,               1 },
-+   { PPC_FEATURE2_DARN,                      1 },
-+   { PPC_FEATURE2_SCV,                       1 },
-+   { PPC_FEATURE2_ARCH_3_1,                  1 },
-+   { PPC_FEATURE2_MMA,                       1 },
-+};
-+
-+#endif /* __CPU_FEATURES_H  */
-diff -rupN a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list
--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list	2023-09-13 01:16:19.989884657 -0400
-+++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list	2023-09-13 01:17:19.254181428 -0400
-@@ -28,3 +28,4 @@
- @order glibc.malloc.check
- @order glibc.gmon.minarcs
- @order glibc.gmon.maxarcs
-+@order glibc.cpu.hwcaps
-diff -rupN a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c
--- a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c	1969-12-31 19:00:00.000000000 -0500
-+++ b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c	2023-09-13 01:17:19.258181583 -0400
-@@ -0,0 +1,128 @@
-+/* Tests for powerpc GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
-+   Copyright (C) 2023 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <http://www.gnu.org/licenses/>.  */
-+
-+#include <array_length.h>
-+#include <getopt.h>
-+#include <ifunc-impl-list.h>
-+#include <spawn.h>
-+#include <stdio.h>
-+#include <stdlib.h>
-+#include <string.h>
-+#include <support/check.h>
-+#include <support/support.h>
-+#include <support/xunistd.h>
-+#include <sys/auxv.h>
-+#include <sys/wait.h>
-+
-+/* Nonzero if the program gets called via `exec'.  */
-+#define CMDLINE_OPTIONS \
-+  { "restart", no_argument, &restart, 1 },
-+static int restart;
-+
-+/* Hold the four initial argument used to respawn the process, plus the extra
-+   '--direct', '--restart', and the function to check  */
-+static char *spargs[8];
-+static int fc;
-+
-+/* Called on process re-execution.  */
-+_Noreturn static void
-+handle_restart (int argc, char *argv[])
-+{
-+  TEST_VERIFY_EXIT (argc == 1);
-+  const char *funcname = argv[0];
-+
-+  struct libc_ifunc_impl impls[32];
-+  int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
-+  if (cnt == 0)
-+    _exit (EXIT_SUCCESS);
-+  TEST_VERIFY_EXIT (cnt >= 1);
-+  for (int i = 0; i < cnt; i++) {
-+    if (strcmp (impls[i].name, funcname) == 0)
-+      {
-+	TEST_COMPARE (impls[i].usable, false);
-+	break;
-+      }
-+  }
-+
-+  _exit (EXIT_SUCCESS);
-+}
-+
-+static void
-+run_test (const char *filter, const char *funcname)
-+{
-+  printf ("info: checking filter %s (expect %s ifunc selection to be removed)\n",
-+	  filter, funcname);
-+  char *tunable = xasprintf ("GLIBC_TUNABLES=glibc.cpu.hwcaps=%s", filter);
-+  char *const newenvs[] = { (char*) tunable, NULL };
-+  spargs[fc] = (char *) funcname;
-+
-+  pid_t pid;
-+  TEST_COMPARE (posix_spawn (&pid, spargs[0], NULL, NULL, spargs, newenvs), 0);
-+  int status;
-+  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
-+  TEST_VERIFY (WIFEXITED (status));
-+  TEST_VERIFY (!WIFSIGNALED (status));
-+  TEST_COMPARE (WEXITSTATUS (status), 0);
-+
-+  free (tunable);
-+}
-+
-+static int
-+do_test (int argc, char *argv[])
-+{
-+  if (restart)
-+    handle_restart (argc - 1, &argv[1]);
-+
-+  TEST_VERIFY_EXIT (argc == 2 || argc == 5);
-+
-+  int i;
-+  for (i = 0; i < argc - 1; i++)
-+    spargs[i] = argv[i + 1];
-+  spargs[i++] = (char *) "--direct";
-+  spargs[i++] = (char *) "--restart";
-+  fc = i++;
-+  spargs[i] = NULL;
-+
-+  unsigned long int hwcap = getauxval (AT_HWCAP);
-+  unsigned long int hwcap2 = getauxval (AT_HWCAP2);
-+  if (__WORDSIZE == 64)
-+    {
-+      if (hwcap2 & PPC_FEATURE2_ARCH_3_1)
-+	run_test ("-arch_3_1", "__memcpy_power10");
-+      if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
-+	run_test ("-arch_2_07", "__memcpy_power8_cached");
-+      if (hwcap & PPC_FEATURE_ARCH_2_06)
-+	run_test ("-arch_2_06", "__memcpy_power7");
-+      if (hwcap & PPC_FEATURE_ARCH_2_05)
-+	run_test ("-arch_2_06,-arch_2_05","__memcpy_power6");
-+      run_test ("-arch_2_06,-arch_2_05,-power5+,-power5,-power4", "__memcpy_power4");
-+    }
-+  else
-+    {
-+      if (hwcap & PPC_FEATURE_HAS_VSX)
-+	run_test ("-vsx", "__memcpy_power7");
-+      if (hwcap & PPC_FEATURE_ARCH_2_06)
-+	run_test ("-arch_2_06", "__memcpy_a2");
-+      if (hwcap & PPC_FEATURE_ARCH_2_05)
-+	run_test ("-arch_2_05", "__memcpy_power6");
-+    }
-+  return 0;
-+}
-+
-+#define TEST_FUNCTION_ARGV do_test
-+#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-10481.patch
+++ b/SOURCES/glibc-RHEL-10481.patch
@ -16,7 +16,7 @@ Date:   Thu Nov 16 19:55:35 2023 +0100
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>

 diff --git a/elf/dl-close.c b/elf/dl-close.c
-index 95a03c9616c6a786..9d158c25498fd8ae 100644
+index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
 --- a/elf/dl-close.c
 +++ b/elf/dl-close.c
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
@ -64,7 +64,7 @@ index 95a03c9616c6a786..9d158c25498fd8ae 100644
   dl_close_state = not_pending;
 }
 diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
-index e8ef5e8b3588ab53..937feb6e7008bc62 100644
+index aeb79b40b45054c0..c17ac325eca658ef 100644
 --- a/elf/dl-sort-maps.c
 +++ b/elf/dl-sort-maps.c
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
--- a/SOURCES/glibc-RHEL-1191.patch
+++ b/SOURCES/glibc-RHEL-1191.patch
@ -1,69 +0,0 @@
-commit 1493622f4f9048ffede3fbedb64695efa49d662a
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Mon Aug 28 12:08:14 2023 -0700
-
-    x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
-    
-    The old Intel software developer manual specified that the low byte of
-    EAX of CPUID leaf 2 returned 1 which indicated the number of rounds of
-    CPUDID leaf 2 was needed to retrieve the complete cache information. The
-    newer Intel manual has been changed to that it should always return 1
-    and be ignored.  If the lower byte isn't 1, CPUID leaf 2 can't be used.
-    In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead.  If
-    CPUID leaf 4 doesn't contain the cache information, cache information
-    isn't available at all.  This addresses BZ #30643.
-
-diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
-index f950e488cfbe42dd..bd2f2b65f78056ca 100644
--- a/sysdeps/x86/dl-cacheinfo.h
-+++ b/sysdeps/x86/dl-cacheinfo.h
-@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
- 	      ++round;
- 	    }
- 	  /* There is no other cache information anywhere else.  */
-	  break;
-+	  return -1;
- 	}
-       else
- 	{
-@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
- 
-   /* OK, we can use the CPUID instruction to get all info about the
-      caches.  */
-  unsigned int cnt = 0;
-  unsigned int max = 1;
-   long int result = 0;
-   bool no_level_2_or_3 = false;
-   bool has_level_2 = false;
-+  unsigned int eax;
-+  unsigned int ebx;
-+  unsigned int ecx;
-+  unsigned int edx;
-+  __cpuid (2, eax, ebx, ecx, edx);
- 
-  while (cnt++ < max)
-+  /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
-+     should be ignored.  If it isn't 1, use CPUID leaf 4 instead.  */
-+  if ((eax & 0xff) != 1)
-+    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
-+			     cpu_features);
-+  else
-     {
-      unsigned int eax;
-      unsigned int ebx;
-      unsigned int ecx;
-      unsigned int edx;
-      __cpuid (2, eax, ebx, ecx, edx);
-
-      /* The low byte of EAX in the first round contain the number of
-	 rounds we have to make.  At least one, the one we are already
-	 doing.  */
-      if (cnt == 1)
-	{
-	  max = eax & 0xff;
-	  eax &= 0xffffff00;
-	}
-+      eax &= 0xffffff00;
- 
-       /* Process the individual registers' value.  */
-       result = intel_check_word (name, eax, &has_level_2,
--- a/SOURCES/glibc-RHEL-1192.patch
+++ b/SOURCES/glibc-RHEL-1192.patch
@ -21,7 +21,7 @@ Date:   Tue Aug 29 08:28:31 2023 +0200
    Reviewed-by: DJ Delorie <dj@redhat.com>

 diff --git a/nscd/cache.c b/nscd/cache.c
-index 78b2269788699e6f..ac5902ae10b791bb 100644
+index efe4214d953edb30..2fd3f78ebb567bbe 100644
 --- a/nscd/cache.c
 +++ b/nscd/cache.c
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
--- a/SOURCES/glibc-RHEL-13720-1.patch
+++ b/SOURCES/glibc-RHEL-13720-1.patch
@ -14,10 +14,10 @@ Conflicts:
 	  (missing alloca removal downstream)

 diff --git a/elf/ldconfig.c b/elf/ldconfig.c
-index be47ad8c2d7f89f3..f0c811001965cc46 100644
+index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
 --- a/elf/ldconfig.c
 +++ b/elf/ldconfig.c
-@@ -778,6 +778,31 @@ struct dlib_entry
+@@ -771,6 +771,31 @@ struct dlib_entry
   struct dlib_entry *next;
 };
 
@ -49,7 +49,7 @@ index be47ad8c2d7f89f3..f0c811001965cc46 100644
 
 static void
 search_dir (const struct dir_entry *entry)
-@@ -854,18 +879,8 @@ search_dir (const struct dir_entry *entry)
+@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
 	continue;
 
       size_t len = strlen (direntry->d_name);
--- a/SOURCES/glibc-RHEL-13720-2.patch
+++ b/SOURCES/glibc-RHEL-13720-2.patch
@ -14,10 +14,10 @@ Date:   Mon Oct 23 12:53:16 2023 +0200
    temporary files created by package managers").

 diff --git a/elf/ldconfig.c b/elf/ldconfig.c
-index f0c811001965cc46..4a96c409994d96c8 100644
+index 51de08f91fbaf093..fb19dd68d41c07a4 100644
 --- a/elf/ldconfig.c
 +++ b/elf/ldconfig.c
-@@ -778,6 +778,17 @@ struct dlib_entry
+@@ -771,6 +771,17 @@ struct dlib_entry
   struct dlib_entry *next;
 };
 
@ -35,7 +35,7 @@ index f0c811001965cc46..4a96c409994d96c8 100644
 /* Skip some temporary DSO files.  These files may be partially written
    and lead to ldconfig crashes when examined.  */
 static bool
-@@ -787,8 +798,7 @@ skip_dso_based_on_name (const char *name, size_t len)
+@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
      names like these are never really DSOs we want to look at.  */
   if (len >= sizeof (".#prelink#") - 1)
     {
@ -45,7 +45,7 @@ index f0c811001965cc46..4a96c409994d96c8 100644
 	return true;
       if (len >= sizeof (".#prelink#.XXXXXX") - 1
 	  && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
-@@ -796,10 +806,11 @@ skip_dso_based_on_name (const char *name, size_t len)
+@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
 	return true;
     }
   /* Skip temporary files created by RPM.  */
--- a/SOURCES/glibc-RHEL-15343-1.patch
+++ b/SOURCES/glibc-RHEL-15343-1.patch
@ -1,26 +0,0 @@
-commit 1626d8a521c7c771d4118b1328421fea113cab64
-Author: Joe Simmons-Talbott <josimmon@redhat.com>
-Date:   Fri Apr 21 09:24:22 2023 -0400
-
-    string: Allow use of test-string.h for non-ifunc implementations.
-    
-    Mark two variables as unused to silence warning when using
-    test-string.h for non-ifunc implementations.
-    
-    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
-
-diff --git a/string/test-string.h b/string/test-string.h
-index 41de973479..8bcb8afd0a 100644
--- a/string/test-string.h
-+++ b/string/test-string.h
-@@ -130,8 +130,8 @@ cmdline_process_function (int c)
- /* Increase size of FUNC_LIST if assert is triggered at run-time.  */
- static struct libc_ifunc_impl func_list[32];
- static int func_count;
-static int impl_count = -1;
-static impl_t *impl_array;
-+static int impl_count __attribute__ ((unused)) = -1;
-+static impl_t *impl_array __attribute__ ((unused));
- 
- # define FOR_EACH_IMPL(impl, notall) \
-   impl_t *impl;								\
--- a/SOURCES/glibc-RHEL-15343-2.patch
+++ b/SOURCES/glibc-RHEL-15343-2.patch
@ -1,233 +0,0 @@
-commit eaaad78db41724e5a18a42becb238bfc4e683998
-Author: Joe Simmons-Talbott <josimmon@redhat.com>
-Date:   Fri Apr 21 09:24:23 2023 -0400
-
-    string: Add tests for strdup (BZ #30266)
-    
-    Copy strcpy tests for strdup.  Covers some basic testcases with random
-    strings.  Add a zero-length string testcase.
-    
-    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
-    Conflicts:
-        string/Makefile
-          (different test backport order)
-
-
-diff -Nrup a/string/Makefile b/string/Makefile
--- a/string/Makefile	2023-11-30 10:59:16.400251685 -0500
-+++ b/string/Makefile	2023-11-30 11:16:42.829613344 -0500
-@@ -63,7 +63,8 @@ tests		:= tester inl-tester noinl-tester
- 		   tst-strtok_r bug-strcoll2 tst-cmp tst-xbzero-opt	\
- 		   test-endian-types test-endian-file-scope		\
- 		   test-endian-sign-conversion tst-memmove-overflow	\
-		   test-sig_np tst-strerror-fail
-+		   test-sig_np tst-strerror-fail			\
-+		   test-strdup
- 
- # Both tests require the .mo translation files generated by msgfmt.
- tests-translation := tst-strsignal					\
-diff -Nrup a/string/test-strdup.c b/string/test-strdup.c
--- a/string/test-strdup.c	1969-12-31 19:00:00.000000000 -0500
-+++ b/string/test-strdup.c	2023-11-30 11:11:32.850447614 -0500
-@@ -0,0 +1,201 @@
-+/* Test and measure strdup functions.
-+   Copyright (C) 2023 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#include <support/check.h>
-+
-+#ifdef WIDE
-+# include <wchar.h>
-+# define CHAR wchar_t
-+# define sfmt "ls"
-+# define BIG_CHAR WCHAR_MAX
-+# define SMALL_CHAR 1273
-+# define STRCMP wcscmp
-+# define MEMCMP wmemcmp
-+# define MEMSET wmemset
-+# define TCS TEST_COMPARE_STRING_WIDE
-+#else
-+# define CHAR char
-+# define sfmt "s"
-+# define BIG_CHAR CHAR_MAX
-+# define SMALL_CHAR 127
-+# define STRCMP strcmp
-+# define MEMCMP memcmp
-+# define MEMSET memset
-+# define TCS TEST_COMPARE_STRING
-+#endif
-+
-+#ifndef STRDUP_RESULT
-+# define STRDUP_RESULT(dst, len) dst
-+# define TEST_MAIN
-+# ifndef WIDE
-+#  define TEST_NAME "strdup"
-+# else
-+#  define TEST_NAME "wcsdup"
-+# endif
-+# include "test-string.h"
-+# ifndef WIDE
-+#  define STRDUP strdup
-+# else
-+#  define STRDUP wcsdup
-+# endif
-+#endif
-+
-+typedef CHAR *(*proto_t) (const CHAR *);
-+
-+static void
-+do_zero_len_test (void)
-+{
-+  CHAR src[1] = { '\0' };
-+  CHAR *dst = STRDUP (src);
-+
-+  TCS (dst, src);
-+  free (dst);
-+}
-+
-+static void
-+do_one_test (const CHAR *src,
-+	     size_t len __attribute__((unused)))
-+{
-+  CHAR *dst = STRDUP (src);
-+
-+  if (STRCMP (dst, src) != 0)
-+    {
-+      error (0, 0,
-+	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
-+	     TEST_NAME, dst, src);
-+      ret = 1;
-+      free (dst);
-+      return;
-+    }
-+  free (dst);
-+}
-+
-+static void
-+do_test (size_t align1, size_t align2, size_t len, int max_char)
-+{
-+  size_t i;
-+  CHAR *s1;
-+/* For wcsdup: align1 and align2 here mean alignment not in bytes,
-+   but in wchar_ts, in bytes it will equal to align * (sizeof (wchar_t))
-+   len for wcschr here isn't in bytes but it's number of wchar_t symbols.  */
-+  align1 &= 7;
-+  if ((align1 + len) * sizeof (CHAR) >= page_size)
-+    return;
-+
-+  align2 &= 7;
-+  if ((align2 + len) * sizeof (CHAR) >= page_size)
-+    return;
-+
-+  s1 = (CHAR *) (buf1) + align1;
-+
-+  for (i = 0; i < len; i++)
-+    s1[i] = 32 + 23 * i % (max_char - 32);
-+  s1[len] = 0;
-+
-+  do_one_test (s1, len);
-+}
-+
-+static void
-+do_random_tests (void)
-+{
-+  size_t i, j, n, align1, align2, len;
-+  CHAR *p1 = (CHAR *)(buf1 + page_size) - 512;
-+  CHAR *res;
-+
-+  for (n = 0; n < ITERATIONS; n++)
-+    {
-+      /* align1 and align2 are expressed as wchar_t and not in bytes for wide
-+     char test, and thus it will be equal to align times wchar_t size.
-+
-+     For non wide version we need to check all alignments from 0 to 63
-+     since some assembly implementations have separate prolog for alignments
-+     more 48.  */
-+
-+      align1 = random () & (63 / sizeof (CHAR));
-+      if (random () & 1)
-+	align2 = random () & (63 / sizeof (CHAR));
-+      else
-+	align2 = align1 + (random () & 24);
-+      len = random () & 511;
-+      j = align1;
-+      if (align2 > j)
-+	j = align2;
-+      if (len + j >= 511)
-+	len = 510 - j - (random () & 7);
-+      j = len + align1 + 64;
-+      if (j > 512)
-+	j = 512;
-+      for (i = 0; i < j; i++)
-+	{
-+	  if (i == len + align1)
-+	    p1[i] = 0;
-+	  else
-+	    {
-+	      p1[i] = random () & BIG_CHAR;
-+	      if (i >= align1 && i < len + align1 && !p1[i])
-+		p1[i] = (random () & SMALL_CHAR) + 3;
-+	    }
-+	}
-+
-+      res =  STRDUP(p1 + align1);
-+      TCS (res, (p1 + align1));
-+      free (res);
-+    }
-+}
-+
-+
-+int
-+test_main (void)
-+{
-+  size_t i;
-+
-+  test_init ();
-+
-+  printf ("%23s", "");
-+  printf ("\t%s", TEST_NAME);
-+  putchar ('\n');
-+
-+  for (i = 0; i < 16; ++i)
-+    {
-+      do_test (0, 0, i, SMALL_CHAR);
-+      do_test (0, 0, i, BIG_CHAR);
-+      do_test (0, i, i, SMALL_CHAR);
-+      do_test (i, 0, i, BIG_CHAR);
-+    }
-+
-+  for (i = 1; i < 8; ++i)
-+    {
-+      do_test (0, 0, 8 << i, SMALL_CHAR);
-+      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
-+    }
-+
-+  for (i = 1; i < 8; ++i)
-+    {
-+      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
-+      do_test (2 * i, i, 8 << i, BIG_CHAR);
-+      do_test (i, i, 8 << i, SMALL_CHAR);
-+      do_test (i, i, 8 << i, BIG_CHAR);
-+    }
-+
-+  do_zero_len_test ();
-+  do_random_tests ();
-+
-+  return ret;
-+}
-+
-+#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-15343-3.patch
+++ b/SOURCES/glibc-RHEL-15343-3.patch
@ -1,232 +0,0 @@
-commit 0c48aa0551151ea201f7f528492e89a0b08a6890
-Author: Joe Simmons-Talbott <josimmon@redhat.com>
-Date:   Fri Apr 21 09:24:24 2023 -0400
-
-    string: Add tests for strndup (BZ #30266)
-    
-    Copy strncpy tests for strndup.  Covers some basic testcases with random
-    strings.  Remove tests that set the destination's bytes and checked the
-    resulting buffer's bytes.  Remove wide character test support since
-    wcsndup() doesn't exist.
-    
-    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
-    Conflicts:
-	string/Makefile
-	  (different test backport order)
-
-diff -Nrup a/string/Makefile b/string/Makefile
--- a/string/Makefile	2023-11-30 11:55:02.263010916 -0500
-+++ b/string/Makefile	2023-11-30 11:58:29.238954539 -0500
-@@ -64,7 +64,7 @@ tests		:= tester inl-tester noinl-tester
- 		   test-endian-types test-endian-file-scope		\
- 		   test-endian-sign-conversion tst-memmove-overflow	\
- 		   test-sig_np tst-strerror-fail			\
-		   test-strdup
-+		   test-strdup test-strndup
- 
- # Both tests require the .mo translation files generated by msgfmt.
- tests-translation := tst-strsignal					\
-diff -Nrup a/string/test-strndup.c b/string/test-strndup.c
--- a/string/test-strndup.c	1969-12-31 19:00:00.000000000 -0500
-+++ b/string/test-strndup.c	2023-11-30 11:56:24.986388053 -0500
-@@ -0,0 +1,200 @@
-+/* Test strndup functions.
-+   Copyright (C) 2023 Free Software Foundation, Inc.
-+   This file is part of the GNU C Library.
-+
-+   The GNU C Library is free software; you can redistribute it and/or
-+   modify it under the terms of the GNU Lesser General Public
-+   License as published by the Free Software Foundation; either
-+   version 2.1 of the License, or (at your option) any later version.
-+
-+   The GNU C Library is distributed in the hope that it will be useful,
-+   but WITHOUT ANY WARRANTY; without even the implied warranty of
-+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-+   Lesser General Public License for more details.
-+
-+   You should have received a copy of the GNU Lesser General Public
-+   License along with the GNU C Library; if not, see
-+   <https://www.gnu.org/licenses/>.  */
-+
-+#include <support/check.h>
-+
-+#define TEST_MAIN
-+#include "test-string.h"
-+
-+static void
-+do_one_test (const char *src, size_t len, size_t n)
-+{
-+  char *dst = strndup (src, n);
-+  size_t s = (len > n ? n: len) * sizeof (char);
-+
-+  TEST_COMPARE_BLOB (dst, s, src, s);
-+
-+  free (dst);
-+}
-+
-+static void
-+do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
-+{
-+  size_t i;
-+  char *s1;
-+
-+  align1 &= 7;
-+  if ((align1 + len) * sizeof (char) >= page_size)
-+    return;
-+
-+  align2 &= 7;
-+  if ((align2 + len) * sizeof (char) >= page_size)
-+    return;
-+
-+  s1 = (char *) (buf1) + align1;
-+
-+  for (i = 0; i < len; ++i)
-+    s1[i] = 32 + 23 * i % (max_char - 32);
-+  s1[len] = 0;
-+  for (i = len + 1; (i + align1) * sizeof (char) < page_size && i < len + 64;
-+       ++i)
-+    s1[i] = 32 + 32 * i % (max_char - 32);
-+
-+  do_one_test (s1, len, n);
-+}
-+
-+static void
-+do_page_tests (void)
-+{
-+  char *s1;
-+  const size_t maxoffset = 64;
-+
-+  /* Put s1 at the maxoffset from the edge of buf1's last page.  */
-+  s1 = (char *) buf1 + BUF1PAGES * page_size / sizeof (char) - maxoffset;
-+
-+  memset (s1, 'a', maxoffset - 1);
-+  s1[maxoffset - 1] = '\0';
-+
-+  /* Both strings are bounded to a page with read/write access and the next
-+     page is protected with PROT_NONE (meaning that any access outside of the
-+     page regions will trigger an invalid memory access).
-+
-+     The loop copies the string s1 for all possible offsets up to maxoffset
-+     for both inputs with a size larger than s1 (so memory access outside the
-+     expected memory regions might trigger invalid access).  */
-+
-+  for (size_t off1 = 0; off1 < maxoffset; off1++)
-+    for (size_t off2 = 0; off2 < maxoffset; off2++)
-+      do_one_test (s1 + off1, maxoffset - off1 - 1,
-+		   maxoffset + (maxoffset - off2));
-+}
-+
-+static void
-+do_random_tests (void)
-+{
-+  size_t i, j, n, align1, align2, len, size, mode;
-+  char *p1 = (char *) (buf1 + page_size) - 512;
-+  char *res;
-+
-+  for (n = 0; n < ITERATIONS; n++)
-+    {
-+      mode = random ();
-+      if (mode & 1)
-+	{
-+	  size = random () & 255;
-+	  align1 = 512 - size - (random () & 15);
-+	  if (mode & 2)
-+	    align2 = align1 - (random () & 24);
-+	  else
-+	    align2 = align1 - (random () & 31);
-+	  if (mode & 4)
-+	    {
-+	      j = align1;
-+	      align1 = align2;
-+	      align2 = j;
-+	    }
-+	  if (mode & 8)
-+	    len = size - (random () & 31);
-+	  else
-+	    len = 512;
-+	  if (len >= 512)
-+	    len = random () & 511;
-+	}
-+      else
-+	{
-+	  align1 = random () & 31;
-+	  if (mode & 2)
-+	    align2 = random () & 31;
-+	  else
-+	    align2 = align1 + (random () & 24);
-+	  len = random () & 511;
-+	  j = align1;
-+	  if (align2 > j)
-+	    j = align2;
-+	  if (mode & 4)
-+	    {
-+	      size = random () & 511;
-+	      if (size + j > 512)
-+		size = 512 - j - (random () & 31);
-+	    }
-+	  else
-+	    size = 512 - j;
-+	  if ((mode & 8) && len + j >= 512)
-+	    len = 512 - j - (random () & 7);
-+	}
-+      j = len + align1 + 64;
-+      if (j > 512)
-+	j = 512;
-+      for (i = 0; i < j; i++)
-+	{
-+	  if (i == len + align1)
-+	    p1[i] = 0;
-+	  else
-+	    {
-+	      p1[i] = random () & CHAR_MAX;
-+	      if (i >= align1 && i < len + align1 && !p1[i])
-+		p1[i] = (random () & 127) + 3;
-+	    }
-+	}
-+
-+	res = (char *) strndup ((char *) (p1 + align1), size);
-+	j = len + 1;
-+	if (size < j)
-+	  j = size;
-+	TEST_COMPARE_BLOB (res, j, (char *) (p1 + align1), j);
-+	free (res);
-+    }
-+}
-+
-+int
-+test_main (void)
-+{
-+  size_t i;
-+
-+  test_init ();
-+
-+  printf ("%28s", "");
-+  printf ("\t%s", "strndup");
-+  putchar ('\n');
-+
-+  for (i = 1; i < 8; ++i)
-+    {
-+      do_test (i, i, 16, 16, 127);
-+      do_test (i, i, 16, 16, CHAR_MAX);
-+      do_test (i, 2 * i, 16, 16, 127);
-+      do_test (2 * i, i, 16, 16, CHAR_MAX);
-+      do_test (8 - i, 2 * i, 1 << i, 2 << i, 127);
-+      do_test (2 * i, 8 - i, 2 << i, 1 << i, 127);
-+      do_test (8 - i, 2 * i, 1 << i, 2 << i, CHAR_MAX);
-+      do_test (2 * i, 8 - i, 2 << i, 1 << i, CHAR_MAX);
-+    }
-+
-+  for (i = 1; i < 8; ++i)
-+    {
-+      do_test (0, 0, 4 << i, 8 << i, 127);
-+      do_test (0, 0, 16 << i, 8 << i, 127);
-+      do_test (8 - i, 2 * i, 4 << i, 8 << i, 127);
-+      do_test (8 - i, 2 * i, 16 << i, 8 << i, 127);
-+    }
-+
-+  do_random_tests ();
-+  do_page_tests ();
-+  return ret;
-+}
-+
-+#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-15343-4.patch
+++ b/SOURCES/glibc-RHEL-15343-4.patch
@ -1,33 +0,0 @@
-commit 0aa5b28a504c6f1f17b387d8147715d1496fff62
-Author: Joe Simmons-Talbott <josimmon@redhat.com>
-Date:   Fri Apr 21 09:24:25 2023 -0400
-
-    wcsmbs: Add wcsdup() tests. (BZ #30266)
-    
-    Enable wide character testcases for wcsdup().
-    
-    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
-    Conflicts:
-	wcsmbs/Makefile
-          (different test backport order)
-
-
-diff -Nrup a/wcsmbs/Makefile b/wcsmbs/Makefile
--- a/wcsmbs/Makefile	2023-11-30 14:14:18.755010508 -0500
-+++ b/wcsmbs/Makefile	2023-11-30 14:38:18.511131851 -0500
-@@ -53,7 +53,8 @@ tests := tst-wcstof wcsmbs-tst1 tst-wcsn
- 	 tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \
- 	 tst-wcstod-round test-char-types tst-fgetwc-after-eof \
- 	 tst-wcstod-nan-sign tst-c16-surrogate tst-c32-state \
-	 $(addprefix test-,$(strop-tests)) tst-mbstowcs
-+	 $(addprefix test-,$(strop-tests)) tst-mbstowcs \
-+	 test-wcsdup
- 
- include ../Rules
- 
-diff -Nrup a/wcsmbs/test-wcsdup.c b/wcsmbs/test-wcsdup.c
--- a/wcsmbs/test-wcsdup.c	1969-12-31 19:00:00.000000000 -0500
-+++ b/wcsmbs/test-wcsdup.c	2023-11-30 14:14:48.869138712 -0500
-@@ -0,0 +1,2 @@
-+#define WIDE 1
-+#include "../string/test-strdup.c"
--- a/SOURCES/glibc-RHEL-15696-1.patch
+++ b/SOURCES/glibc-RHEL-15696-1.patch
@ -0,0 +1,259 @@
+From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:23:59 -0800
+Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
+	upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/test-size_t.h: New file.
+	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
+---
+ sysdeps/x86_64/memchr.S                 | 10 ++--
+ sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
+ sysdeps/x86_64/x32/Makefile             |  8 +++
+ sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
+ 6 files changed, 148 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/test-size_t.h
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	NEWS
+	(removed)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index feef5d4f..cb320257 100644
+--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
+@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
+ 	mov	%edi, %ecx
+ 
+ #ifdef USE_AS_WMEMCHR
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ #else
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+ 	punpcklbw %xmm1, %xmm1
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+ 	punpcklbw %xmm1, %xmm1
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index 5f5e7725..c81da19b 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -40,16 +40,20 @@
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM0.  */
+ 	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ 	vpbroadcastd %xmm0, %ymm0
+ # else
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#  endif
+ 	vpbroadcastb %xmm0, %ymm0
+ # endif
+ 	/* Check if we may cross page boundary with one vector load.  */
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index f2ebc24f..7d528889 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
+ # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
+ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
+diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
+new file mode 100644
+index 00000000..78a94086
+--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
+@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+   64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+   field in the lower 32 bits.  When the LEN field of 64-bit register
+   is passed to string/memory function as the size_t parameter, only
+   the lower 32 bits can be used.  */
+typedef struct
+{
+  union
+    {
+      size_t len;
+      void (*fn) (void);
+    };
+  void *p;
+} parameter_t;
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+new file mode 100644
+index 00000000..29a3daf1
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
+@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *res = do_memchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+new file mode 100644
+index 00000000..877801d6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
+@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-10.patch
+++ b/SOURCES/glibc-RHEL-15696-10.patch
@ -0,0 +1,41 @@
+From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:21 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_avx2. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 156c1949..8fb8eedc 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -83,6 +83,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+#  ifndef __ILP32__
+	movq	%rdx, %rcx
+	/* Check if length could overflow when multiplied by
+	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+	   overflow cases as well as redirect cases where its impossible to
+	   length to bound a valid memory region. In these cases just use
+	   'wcscmp'.  */
+	shrq	$56, %rcx
+	jnz	__wcscmp_avx2
+#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-230.patch
+++ b/SOURCES/glibc-upstream-2.34-230.patch
@ -1,25 +1,26 @@
-commit 4ff6ae069b7caacd5f99088abd755717b994f660
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Fri Mar 25 17:13:33 2022 -0500
+From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 25 Mar 2022 17:13:33 -0500
+Subject: [PATCH] x86: Small improvements for wcslen
+Content-type: text/plain; charset=UTF-8

-    x86: Small improvements for wcslen
-    
-    Just a few QOL changes.
-        1. Prefer `add` > `lea` as it has high execution units it can run
-           on.
-        2. Don't break macro-fusion between `test` and `jcc`
-        3. Reduce code size by removing gratuitous padding bytes (-90
-           bytes).
-    
-    geometric_mean(N=20) of all benchmarks New / Original: 0.959
-    
-    All string/memory tests pass.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit 244b415d386487521882debb845a040a4758cb18)
+Just a few QOL changes.
+    1. Prefer `add` > `lea` as it has more execution units it can run
+       on.
+    2. Don't break macro-fusion between `test` and `jcc`
+    3. Reduce code size by removing gratuitous padding bytes (-90
+       bytes).
+
+geometric_mean(N=20) of all benchmarks New / Original: 0.959
+
+All string/memory tests pass.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
+ 1 file changed, 41 insertions(+), 45 deletions(-)

 diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
-index 61edea1d14d454c6..ad066863a44ea0a5 100644
+index 9f5f7232..254bb030 100644
 --- a/sysdeps/x86_64/wcslen.S
 +++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
@ -251,3 +252,6 @@ index 61edea1d14d454c6..ad066863a44ea0a5 100644
 	ret
 
 END (__wcslen)
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-231.patch
+++ b/SOURCES/glibc-upstream-2.34-231.patch
@ -1,87 +1,92 @@
-commit ffe75982cc0bb2d25d55ed566a3731b9c3017e6f
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Fri Apr 15 12:28:00 2022 -0500
+From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:00 -0500
+Subject: [PATCH] x86: Remove memcmp-sse4.S
+Content-type: text/plain; charset=UTF-8

-    x86: Remove memcmp-sse4.S
-    
-    Code didn't actually use any sse4 instructions since `ptest` was
-    removed in:
-    
-    commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
-    Author: Noah Goldstein <goldstein.w.n@gmail.com>
-    Date:   Wed Nov 10 16:18:56 2021 -0600
-    
-        x86: Shrink memcmp-sse4.S code size
-    
-    The new memcmp-sse2 implementation is also faster.
-    
-    geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
-    
-    Note there are two regressions preferring SSE2 for Size = 1 and Size =
-    65.
-    
-    Size = 1:
-    size, align0, align1, ret, New Time/Old Time
-       1,      1,      1,   0,               1.2
-       1,      1,      1,   1,             1.197
-       1,      1,      1,  -1,               1.2
-    
-    This is intentional. Size == 1 is significantly less hot based on
-    profiles of GCC11 and Python3 than sizes [4, 8] (which is made
-    hotter).
-    
-    Python3 Size = 1        -> 13.64%
-    Python3 Size = [4, 8]   -> 60.92%
-    
-    GCC11   Size = 1        ->  1.29%
-    GCC11   Size = [4, 8]   -> 33.86%
-    
-    size, align0, align1, ret, New Time/Old Time
-       4,      4,      4,   0,             0.622
-       4,      4,      4,   1,             0.797
-       4,      4,      4,  -1,             0.805
-       5,      5,      5,   0,             0.623
-       5,      5,      5,   1,             0.777
-       5,      5,      5,  -1,             0.802
-       6,      6,      6,   0,             0.625
-       6,      6,      6,   1,             0.813
-       6,      6,      6,  -1,             0.788
-       7,      7,      7,   0,             0.625
-       7,      7,      7,   1,             0.799
-       7,      7,      7,  -1,             0.795
-       8,      8,      8,   0,             0.625
-       8,      8,      8,   1,             0.848
-       8,      8,      8,  -1,             0.914
-       9,      9,      9,   0,             0.625
-    
-    Size = 65:
-    size, align0, align1, ret, New Time/Old Time
-      65,      0,      0,   0,             1.103
-      65,      0,      0,   1,             1.216
-      65,      0,      0,  -1,             1.227
-      65,     65,      0,   0,             1.091
-      65,      0,     65,   1,              1.19
-      65,     65,     65,  -1,             1.215
-    
-    This is because A) the checks in range [65, 96] are now unrolled 2x
-    and B) because smaller values <= 16 are now given a hotter path. By
-    contrast the SSE4 version has a branch for Size = 80. The unrolled
-    version has get better performance for returns which need both
-    comparisons.
-    
-    size, align0, align1, ret, New Time/Old Time
-     128,      4,      8,   0,             0.858
-     128,      4,      8,   1,             0.879
-     128,      4,      8,  -1,             0.888
-    
-    As well, out of microbenchmark environments that are not full
-    predictable the branch will have a real-cost.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e)
+Code didn't actually use any sse4 instructions since `ptest` was
+removed in:
+
+commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Nov 10 16:18:56 2021 -0600
+
+    x86: Shrink memcmp-sse4.S code size
+
+The new memcmp-sse2 implementation is also faster.
+
+geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
+
+Note there are two regressions preferring SSE2 for Size = 1 and Size =
+65.
+
+Size = 1:
+size, align0, align1, ret, New Time/Old Time
+   1,      1,      1,   0,               1.2
+   1,      1,      1,   1,             1.197
+   1,      1,      1,  -1,               1.2
+
+This is intentional. Size == 1 is significantly less hot based on
+profiles of GCC11 and Python3 than sizes [4, 8] (which is made
+hotter).
+
+Python3 Size = 1        -> 13.64%
+Python3 Size = [4, 8]   -> 60.92%
+
+GCC11   Size = 1        ->  1.29%
+GCC11   Size = [4, 8]   -> 33.86%
+
+size, align0, align1, ret, New Time/Old Time
+   4,      4,      4,   0,             0.622
+   4,      4,      4,   1,             0.797
+   4,      4,      4,  -1,             0.805
+   5,      5,      5,   0,             0.623
+   5,      5,      5,   1,             0.777
+   5,      5,      5,  -1,             0.802
+   6,      6,      6,   0,             0.625
+   6,      6,      6,   1,             0.813
+   6,      6,      6,  -1,             0.788
+   7,      7,      7,   0,             0.625
+   7,      7,      7,   1,             0.799
+   7,      7,      7,  -1,             0.795
+   8,      8,      8,   0,             0.625
+   8,      8,      8,   1,             0.848
+   8,      8,      8,  -1,             0.914
+   9,      9,      9,   0,             0.625
+
+Size = 65:
+size, align0, align1, ret, New Time/Old Time
+  65,      0,      0,   0,             1.103
+  65,      0,      0,   1,             1.216
+  65,      0,      0,  -1,             1.227
+  65,     65,      0,   0,             1.091
+  65,      0,     65,   1,              1.19
+  65,     65,     65,  -1,             1.215
+
+This is because A) the checks in range [65, 96] are now unrolled 2x
+and B) because smaller values <= 16 are now given a hotter path. By
+contrast the SSE4 version has a branch for Size = 80. The unrolled
+version gets better performance for returns which need both
+comparisons.
+
+size, align0, align1, ret, New Time/Old Time
+ 128,      4,      8,   0,             0.858
+ 128,      4,      8,   1,             0.879
+ 128,      4,      8,  -1,             0.888
+
+As well, outside of microbenchmarks, in environments that are not fully
+predictable, the branch will have a real cost.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |   2 -
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |   4 -
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h    |   4 -
+ sysdeps/x86_64/multiarch/memcmp-sse4.S     | 804 ---------------------
+ 4 files changed, 814 deletions(-)
+ delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S

 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
-index bca82e38d86cc440..b503e4b81e92a11c 100644
+index bca82e38..b503e4b8 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
@ -101,7 +106,7 @@ index bca82e38d86cc440..b503e4b81e92a11c 100644
 # sysdep_routines
 endif
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
-index 4c7834dd0b951fa4..e5e48b36c3175e68 100644
+index 14314367..450a2917 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
@ -123,7 +128,7 @@ index 4c7834dd0b951fa4..e5e48b36c3175e68 100644
 			      __wmemcmp_ssse3)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
-index 89e2129968e1e49c..5b92594093c1e0bb 100644
+index 690dffe8..0bc47a7f 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
@ -146,12 +151,12 @@ index 89e2129968e1e49c..5b92594093c1e0bb 100644
 
 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
 deleted file mode 100644
-index 97c102a9c5ab2b91..0000000000000000
+index 50060006..00000000
 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
 +++ /dev/null
@@ -1,804 +0,0 @@
 -/* memcmp with SSE4.1, wmemcmp with SSE4.1
-   Copyright (C) 2010-2021 Free Software Foundation, Inc.
+-   Copyright (C) 2010-2018 Free Software Foundation, Inc.
 -   Contributed by Intel Corporation.
 -   This file is part of the GNU C Library.
 -
@ -167,7 +172,7 @@ index 97c102a9c5ab2b91..0000000000000000
 -
 -   You should have received a copy of the GNU Lesser General Public
 -   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
+-   <http://www.gnu.org/licenses/>.  */
 -
 -#if IS_IN (libc)
 -
@ -954,3 +959,6 @@ index 97c102a9c5ab2b91..0000000000000000
 -	ret
 -END (MEMCMP)
 -#endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-102.patch
+++ b/SOURCES/glibc-RHEL-15696-102.patch
@ -0,0 +1,263 @@
+From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 15 Apr 2022 12:28:01 -0500
+Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+Old code was both inefficient and wasted code size. New code is smaller
+(-62 bytes) and has comparable or better performance in the page cross case.
+
+geometric_mean(N=20) of page cross cases New / Original: 0.960
+
+size, align0, align1, ret, New Time/Old Time
+   1,   4095,      0,   0,             1.001
+   1,   4095,      0,   1,             0.999
+   1,   4095,      0,  -1,               1.0
+   2,   4094,      0,   0,               1.0
+   2,   4094,      0,   1,               1.0
+   2,   4094,      0,  -1,               1.0
+   3,   4093,      0,   0,               1.0
+   3,   4093,      0,   1,               1.0
+   3,   4093,      0,  -1,               1.0
+   4,   4092,      0,   0,             0.987
+   4,   4092,      0,   1,               1.0
+   4,   4092,      0,  -1,               1.0
+   5,   4091,      0,   0,             0.984
+   5,   4091,      0,   1,             1.002
+   5,   4091,      0,  -1,             1.005
+   6,   4090,      0,   0,             0.993
+   6,   4090,      0,   1,             1.001
+   6,   4090,      0,  -1,             1.003
+   7,   4089,      0,   0,             0.991
+   7,   4089,      0,   1,               1.0
+   7,   4089,      0,  -1,             1.001
+   8,   4088,      0,   0,             0.875
+   8,   4088,      0,   1,             0.881
+   8,   4088,      0,  -1,             0.888
+   9,   4087,      0,   0,             0.872
+   9,   4087,      0,   1,             0.879
+   9,   4087,      0,  -1,             0.883
+  10,   4086,      0,   0,             0.878
+  10,   4086,      0,   1,             0.886
+  10,   4086,      0,  -1,             0.873
+  11,   4085,      0,   0,             0.878
+  11,   4085,      0,   1,             0.881
+  11,   4085,      0,  -1,             0.879
+  12,   4084,      0,   0,             0.873
+  12,   4084,      0,   1,             0.889
+  12,   4084,      0,  -1,             0.875
+  13,   4083,      0,   0,             0.873
+  13,   4083,      0,   1,             0.863
+  13,   4083,      0,  -1,             0.863
+  14,   4082,      0,   0,             0.838
+  14,   4082,      0,   1,             0.869
+  14,   4082,      0,  -1,             0.877
+  15,   4081,      0,   0,             0.841
+  15,   4081,      0,   1,             0.869
+  15,   4081,      0,  -1,             0.876
+  16,   4080,      0,   0,             0.988
+  16,   4080,      0,   1,              0.99
+  16,   4080,      0,  -1,             0.989
+  17,   4079,      0,   0,             0.978
+  17,   4079,      0,   1,             0.981
+  17,   4079,      0,  -1,              0.98
+  18,   4078,      0,   0,             0.981
+  18,   4078,      0,   1,              0.98
+  18,   4078,      0,  -1,             0.985
+  19,   4077,      0,   0,             0.977
+  19,   4077,      0,   1,             0.979
+  19,   4077,      0,  -1,             0.986
+  20,   4076,      0,   0,             0.977
+  20,   4076,      0,   1,             0.986
+  20,   4076,      0,  -1,             0.984
+  21,   4075,      0,   0,             0.977
+  21,   4075,      0,   1,             0.983
+  21,   4075,      0,  -1,             0.988
+  22,   4074,      0,   0,             0.983
+  22,   4074,      0,   1,             0.994
+  22,   4074,      0,  -1,             0.993
+  23,   4073,      0,   0,              0.98
+  23,   4073,      0,   1,             0.992
+  23,   4073,      0,  -1,             0.995
+  24,   4072,      0,   0,             0.989
+  24,   4072,      0,   1,             0.989
+  24,   4072,      0,  -1,             0.991
+  25,   4071,      0,   0,              0.99
+  25,   4071,      0,   1,             0.999
+  25,   4071,      0,  -1,             0.996
+  26,   4070,      0,   0,             0.993
+  26,   4070,      0,   1,             0.995
+  26,   4070,      0,  -1,             0.998
+  27,   4069,      0,   0,             0.993
+  27,   4069,      0,   1,             0.999
+  27,   4069,      0,  -1,               1.0
+  28,   4068,      0,   0,             0.997
+  28,   4068,      0,   1,               1.0
+  28,   4068,      0,  -1,             0.999
+  29,   4067,      0,   0,             0.996
+  29,   4067,      0,   1,             0.999
+  29,   4067,      0,  -1,             0.999
+  30,   4066,      0,   0,             0.991
+  30,   4066,      0,   1,             1.001
+  30,   4066,      0,  -1,             0.999
+  31,   4065,      0,   0,             0.988
+  31,   4065,      0,   1,             0.998
+  31,   4065,      0,  -1,             0.998
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
+ 1 file changed, 61 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 16fc673e..99258cf5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -429,22 +429,21 @@ L(page_cross_less_vec):
+ # ifndef USE_AS_WMEMCMP
+ 	cmpl	$8, %edx
+ 	jae	L(between_8_15)
+	/* Fall through for [4, 7].  */
+ 	cmpl	$4, %edx
+-	jae	L(between_4_7)
+	jb	L(between_2_3)
+ 
+-	/* Load as big endian to avoid branches.  */
+-	movzwl	(%rdi), %eax
+-	movzwl	(%rsi), %ecx
+-	shll	$8, %eax
+-	shll	$8, %ecx
+-	bswap	%eax
+-	bswap	%ecx
+-	movzbl	-1(%rdi, %rdx), %edi
+-	movzbl	-1(%rsi, %rdx), %esi
+-	orl	%edi, %eax
+-	orl	%esi, %ecx
+-	/* Subtraction is okay because the upper 8 bits are zero.  */
+-	subl	%ecx, %eax
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+@@ -457,9 +456,33 @@ L(one_or_less):
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+	.p2align 4,, 5
+L(ret_nonzero):
+	sbbl	%eax, %eax
+	orl	$1, %eax
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+	ret
+
+ 	.p2align 4
+ L(between_8_15):
+-# endif
+	movbe	(%rdi), %rax
+	movbe	(%rsi), %rcx
+	subq	%rcx, %rax
+	jnz	L(ret_nonzero)
+	movbe	-8(%rdi, %rdx), %rax
+	movbe	-8(%rsi, %rdx), %rcx
+	subq	%rcx, %rax
+	/* Fast path for return zero.  */
+	jnz	L(ret_nonzero)
+	/* No ymm register was touched.  */
+	ret
+# else
+ 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+@@ -475,16 +498,13 @@ L(between_8_15):
+ 	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
+# endif
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-
+-	.p2align 4
+	.p2align 4,, 10
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+@@ -501,11 +521,17 @@ L(between_16_31):
+ 	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+ 	subl	$0xffff, %eax
+	/* Fast path for return zero.  */
+ 	jnz	L(return_vec_0)
+ 	/* No ymm register was touched.  */
+ 	ret
+ 
+ # ifdef USE_AS_WMEMCMP
+	.p2align 4,, 2
+L(zero):
+	xorl	%eax, %eax
+	ret
+
+ 	.p2align 4
+ L(one_or_less):
+ 	jb	L(zero)
+@@ -520,22 +546,20 @@ L(one_or_less):
+ # else
+ 
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.
+-	 */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	jz	L(zero_4_7)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-L(zero_4_7):
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	bswap	%eax
+	bswap	%ecx
+	shrl	%eax
+	shrl	%ecx
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+	/* Subtraction is okay because the upper bit is zero.  */
+	subl	%ecx, %eax
+ 	/* No ymm register was touched.  */
+ 	ret
+ # endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-233.patch
+++ b/SOURCES/glibc-upstream-2.34-233.patch
@ -1,26 +1,34 @@
-commit 0a11305416e287d85c64f04337cfd64b6b350e0c
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Thu Apr 21 20:52:28 2022 -0500
+From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:28 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize {str|wcs}rchr-sse2
-    
-    The new code unrolls the main loop slightly without adding too much
-    overhead and minimizes the comparisons for the search CHAR.
-    
-    Geometric Mean of all benchmarks New / Old: 0.741
-    See email for all results.
-    
-    Full xcheck passes on x86_64 with and without multiarch enabled.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c)
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.741
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-sse2.S |   2 +-
+ sysdeps/x86_64/multiarch/wcsrchr-sse2.S |   3 +-
+ sysdeps/x86_64/strrchr.S                | 510 +++++++++++++++---------
+ sysdeps/x86_64/wcsrchr.S                | 266 +-----------
+ 4 files changed, 338 insertions(+), 443 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/wcsrchr.S
+	(copyright header)

 diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
-index 67c30d0260cef8a3..a56300bc1830dedd 100644
+index 0ec76fe9..6bb1284b 100644
 --- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
 +++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
-    <https://www.gnu.org/licenses/>.  */
+    <http://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
 -# define strrchr __strrchr_sse2
@ -29,11 +37,11 @@ index 67c30d0260cef8a3..a56300bc1830dedd 100644
 # undef weak_alias
 # define weak_alias(strrchr, rindex)
 diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
-index a36034b40afe8d3d..00f69f2be77a43a0 100644
+index d015e953..f26d53b5 100644
 --- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
 +++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
-    <https://www.gnu.org/licenses/>.  */
+    <http://www.gnu.org/licenses/>.  */
 
 #if IS_IN (libc)
 -# define wcsrchr __wcsrchr_sse2
@ -42,7 +50,7 @@ index a36034b40afe8d3d..00f69f2be77a43a0 100644
 -
 #include "../wcsrchr.S"
 diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
-index dfd09fe9508cb5bc..fc1598bb11417fd5 100644
+index aca98e7e..a58cc220 100644
 --- a/sysdeps/x86_64/strrchr.S
 +++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
@ -587,12 +595,12 @@ index dfd09fe9508cb5bc..fc1598bb11417fd5 100644
 +	libc_hidden_builtin_def (STRRCHR)
 +#endif
 diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
-index 6b318d3f29de9a9e..9006f2220963d76c 100644
+index 2f388537..ae3cfa7d 100644
 --- a/sysdeps/x86_64/wcsrchr.S
 +++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
    License along with the GNU C Library; if not, see
-    <https://www.gnu.org/licenses/>.  */
+    <http://www.gnu.org/licenses/>.  */
 
 -#include <sysdep.h>
 
@ -863,3 +871,6 @@ index 6b318d3f29de9a9e..9006f2220963d76c 100644
 -
 -END (wcsrchr)
 +#include "../strrchr.S"
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-234.patch
+++ b/SOURCES/glibc-upstream-2.34-234.patch
@ -1,22 +1,23 @@
-commit 00f09a14d2818f438959e764834abb3913f2b20a
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Thu Apr 21 20:52:29 2022 -0500
+From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:29 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize {str|wcs}rchr-avx2
-    
-    The new code unrolls the main loop slightly without adding too much
-    overhead and minimizes the comparisons for the search CHAR.
-    
-    Geometric Mean of all benchmarks New / Old: 0.832
-    See email for all results.
-    
-    Full xcheck passes on x86_64 with and without multiarch enabled.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.832
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
+ 1 file changed, 269 insertions(+), 157 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
-index 0deba97114d3b83d..b8dec737d5213b25 100644
+index c949410b..3d26fad4 100644
 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
@ -495,3 +496,6 @@ index 0deba97114d3b83d..b8dec737d5213b25 100644
 +	VZEROUPPER_RETURN
 +END(STRRCHR)
 #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-235.patch
+++ b/SOURCES/glibc-upstream-2.34-235.patch
@ -1,22 +1,23 @@
-commit 596c9a32cc5d5eb82587e92d1e66c9ecb7668456
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Thu Apr 21 20:52:30 2022 -0500
+From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 21 Apr 2022 20:52:30 -0500
+Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize {str|wcs}rchr-evex
-    
-    The new code unrolls the main loop slightly without adding too much
-    overhead and minimizes the comparisons for the search CHAR.
-    
-    Geometric Mean of all benchmarks New / Old: 0.755
-    See email for all results.
-    
-    Full xcheck passes on x86_64 with and without multiarch enabled.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d)
+The new code unrolls the main loop slightly without adding too much
+overhead and minimizes the comparisons for the search CHAR.
+
+Geometric Mean of all benchmarks New / Old: 0.755
+See email for all results.
+
+Full xcheck passes on x86_64 with and without multiarch enabled.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
+ 1 file changed, 290 insertions(+), 181 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
-index f920b5a584edd293..f5b6d755ceb85ae2 100644
+index f920b5a5..f5b6d755 100644
 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S
 +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
@ -552,3 +553,6 @@ index f920b5a584edd293..f5b6d755ceb85ae2 100644
 -END (STRRCHR)
 +END(STRRCHR)
 #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-335.patch
+++ b/SOURCES/glibc-upstream-2.34-335.patch
@ -1,26 +1,28 @@
-commit 04efdcfac405723c23b25d124817bcfc1697e2d8
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Wed Apr 27 15:13:02 2022 -0500
+From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 27 Apr 2022 15:13:02 -0500
+Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
+Content-type: text/plain; charset=UTF-8

-    sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
-    
-    'get_fast_jitter' is meant to be used purely for performance
-    purposes. In all cases it's used it should be acceptable to get no
-    randomness (see default case). An example use case is in setting
-    jitter for retries between threads at a lock. There is a
-    performance benefit to having jitter, but only if the jitter can
-    be generated very quickly and ultimately there is no serious issue
-    if no jitter is generated.
-    
-    The implementation generally uses 'HP_TIMING_NOW' iff it is
-    inlined (avoid any potential syscall paths).
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit 911c63a51c690dd1a97dfc587097277029baf00f)
+'get_fast_jitter' is meant to be used purely for performance
+purposes. In all cases it's used it should be acceptable to get no
+randomness (see default case). An example use case is in setting
+jitter for retries between threads at a lock. There is a
+performance benefit to having jitter, but only if the jitter can
+be generated very quickly and ultimately there is no serious issue
+if no jitter is generated.
+
+The implementation generally uses 'HP_TIMING_NOW' iff it is
+inlined (avoid any potential syscall paths).
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
+ 1 file changed, 42 insertions(+)
+ create mode 100644 sysdeps/generic/fast-jitter.h

 diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
 new file mode 100644
-index 0000000000000000..4dd53e3475c3dfe6
+index 00000000..4dd53e34
 --- /dev/null
 +++ b/sysdeps/generic/fast-jitter.h
@@ -0,0 +1,42 @@
@ -66,3 +68,6 @@ index 0000000000000000..4dd53e3475c3dfe6
 +}
 +
 +#endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-107.patch
+++ b/SOURCES/glibc-RHEL-15696-107.patch
@ -0,0 +1,226 @@
+From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
+From: Wangyang Guo <wangyang.guo@intel.com>
+Date: Fri, 6 May 2022 01:50:10 +0000
+Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+When multiple threads are waiting for a lock at the same time, once the lock
+owner releases it, the waiters will see the lock as available and all try to
+lock it, which may cause an expensive CAS storm.
+
+Binary exponential backoff with random jitter is introduced. As the number
+of try-lock attempts increases, it is more likely that a larger number of
+threads are competing for the adaptive mutex lock, so the wait time is
+increased exponentially. A random jitter is also added to avoid synchronous
+try-lock attempts from other threads.
+
+v2: Remove read-check before try-lock for performance.
+
+v3:
+1. Restore read-check since it works well on some platforms.
+2. Make backoff arch dependent, and enable it for x86_64.
+3. Limit max backoff to reduce latency in large critical section.
+
+v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
+
+v5: Commit log updated for regression in large critical section.
+
+Result of pthread-mutex-locks bench
+
+Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
+First Row: thread number
+First Col: critical section length
+Values: backoff vs upstream, time based, low is better
+
+non-critical-length: 1
+	1	2	4	8	16	32	64	112	140
+0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54
+1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57
+2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61
+4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65
+8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71
+16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80
+32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90
+64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99
+128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02
+
+non-critical-length: 32
+	1	2	4	8	16	32	64	112	140
+0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70
+1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72
+2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74
+4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77
+8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80
+16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84
+32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91
+64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99
+128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99
+
+non-critical-length: 128
+	1	2	4	8	16	32	64	112	140
+0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73
+1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74
+2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76
+4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77
+8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80
+16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84
+32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91
+64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99
+128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98
+
+There is a regression for large critical sections, but adaptive mutexes are
+aimed at "quick" locks. Small critical sections are more common when
+users choose to use an adaptive pthread_mutex.
+
+Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	pthreadP.h
+	(had been moved)
+	nptl/pthread_mutex_lock.c
+	(max_adaptive_count renamed)
+
+---
+ nptl/pthreadP.h                             |  1 +
+ nptl/pthread_mutex_lock.c                   | 16 +++++++--
+ sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++
+ sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
+ 4 files changed, 89 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
+ create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+
+diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
+index 7ddc166c..1550e3b6 100644
+--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
+@@ -33,6 +33,7 @@
+ #include <kernel-features.h>
+ #include <errno.h>
+ #include <internal-signals.h>
+#include <pthread_mutex_backoff.h>
+ 
+ 
+ /* Atomic operations on TLS memory.  */
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index d96a9933..c7770fc9 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 	  int cnt = 0;
+ 	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
+ 			     mutex->__data.__spins * 2 + 10);
+	  int spin_count, exp_backoff = 1;
+	  unsigned int jitter = get_jitter ();
+ 	  do
+ 	    {
+-	      if (cnt++ >= max_cnt)
+	      /* In each loop, spin count is exponential backoff plus
+		 random jitter, random range is [0, exp_backoff-1].  */
+	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+	      cnt += spin_count;
+	      if (cnt >= max_cnt)
+ 		{
+		  /* If cnt exceeds max spin count, just go to wait
+		     queue.  */
+ 		  LLL_MUTEX_LOCK (mutex);
+ 		  break;
+ 		}
+-	      atomic_spin_nop ();
+	      do
+		atomic_spin_nop ();
+	      while (--spin_count > 0);
+	      /* Prepare for next loop.  */
+	      exp_backoff = get_next_backoff (exp_backoff);
+ 	    }
+ 	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+ 		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..5b26c22a
+--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+  /* Arch dependent random jitter, return 0 disables random.  */
+  return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Next backoff, return 1 disables mutex backoff.  */
+  return 1;
+}
+
+#endif
+diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+new file mode 100644
+index 00000000..ec74c3d9
+--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
+@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+  return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+  /* Binary expontial backoff. Limiting max backoff
+     can reduce latency in large critical section.  */
+  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-108.patch
+++ b/SOURCES/glibc-RHEL-15696-108.patch
@ -0,0 +1,55 @@
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8
+
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. These would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
+ 1 file changed, 2 insertions(+), 6 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 28cc98b6..e267c6cb 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -345,10 +345,10 @@ L(one_or_less):
+ 	movq	%LOCALE_REG, %rdx
+ #  endif
+ 	jb	L(ret_zero)
+-#  ifdef USE_AS_WCSCMP
+ 	/* 'nbe' covers the case where length is negative (large
+ 	   unsigned).  */
+-	jnbe	__wcscmp_avx2
+	jnbe	OVERFLOW_STRCMP
+#  ifdef USE_AS_WCSCMP
+ 	movl	(%rdi), %edx
+ 	xorl	%eax, %eax
+ 	cmpl	(%rsi), %edx
+@@ -357,10 +357,6 @@ L(one_or_less):
+ 	negl	%eax
+ 	orl	$1, %eax
+ #  else
+-	/* 'nbe' covers the case where length is negative (large
+-	   unsigned).  */
+-
+-	jnbe	__strcmp_avx2
+ 	movzbl	(%rdi), %eax
+ 	movzbl	(%rsi), %ecx
+ 	TOLOWER_gpr (%rax, %eax)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-109.patch
+++ b/SOURCES/glibc-RHEL-15696-109.patch
@ -0,0 +1,60 @@
+From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
+From: Stefan Liebler <stli@linux.ibm.com>
+Date: Mon, 28 Jun 2021 13:01:07 +0200
+Subject: s390x: Update math: redirect roundeven function
+
+After recent commit
+447954a206837b5f153869cfeeeab44631c3fac9
+"math: redirect roundeven function", building on
+s390x fails with:
+Error: symbol `__roundevenl' is already defined
+
+Similar to aarch64/riscv fix, this patch redirects target
+specific functions for s390x:
+commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
+"Update math: redirect roundeven function"
+
+diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
+index 40b07e054b..0773adfed0 100644
+--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-double.h>
+ 
+@@ -31,7 +32,6 @@ __roundeven (double x)
+   __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
+   return y;
+ }
+-hidden_def (__roundeven)
+ libm_alias_double (__roundeven, roundeven)
+ 
+ #else
+diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
+index d2fbf3d2b6..289785bc4a 100644
+--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <libm-alias-float.h>
+ 
+diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
+index 29ab7a8616..94b6459ab4 100644
+--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
+@@ -18,6 +18,7 @@
+    <https://www.gnu.org/licenses/>.  */
+ 
+ #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
+ # include <math.h>
+ # include <math_private.h>
+ # include <libm-alias-ldouble.h>
--- a/SOURCES/glibc-RHEL-15696-11.patch
+++ b/SOURCES/glibc-RHEL-15696-11.patch
@ -0,0 +1,74 @@
+From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 26 Feb 2021 05:36:59 -0800
+Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
+Content-type: text/plain; charset=UTF-8
+
+1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
+by VZEROUPPER inside a transactionally executing RTM region.
+2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
+loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
+1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add
+Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
+---
+ sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
+ sysdeps/x86/cpu-tunables.c                    |  2 ++
+ ...cpu-features-preferred_feature_index_1.def |  1 +
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 91042505..3610ee5c 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
+ 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ 	  |= bit_arch_Prefer_No_VZEROUPPER;
+       else
+-	cpu_features->preferred[index_arch_Prefer_No_AVX512]
+-	  |= bit_arch_Prefer_No_AVX512;
+	{
+	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
+	    |= bit_arch_Prefer_No_AVX512;
+
+	  /* Avoid RTM abort triggered by VZEROUPPER inside a
+	     transactionally executing RTM region.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+	      |= bit_arch_Prefer_No_VZEROUPPER;
+
+	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+	     AVX2 strcmp is faster than EVEX strcmp.  */
+	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+	      |= bit_arch_Prefer_AVX2_STRCMP;
+	}
+     }
+   /* This spells out "AuthenticAMD".  */
+   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
+diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
+index 3173b2b9..73adbaba 100644
+--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
+@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
+ 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
+ 						Fast_Copy_Backward,
+ 						disable, 18);
+	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
+ 	    }
+ 	  break;
+ 	case 19:
+diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+index 17a5cc42..4ca70b40 100644
+--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
+ BIT (Prefer_FSRM)
+ BIT (Prefer_No_AVX512)
+ BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-110.patch
+++ b/SOURCES/glibc-RHEL-15696-110.patch
@ -0,0 +1,26 @@
+From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 23 Jun 2021 13:29:41 -0700
+Subject: Update math: redirect roundeven function
+
+Redirect target specific roundeven functions for aarch64, ldbl-128ibm
+and riscv.
+
+Conflicts:
+	sysdeps/aarch64/*
+	(not needed)
+	sysdeps/riscv/*
+	(not supported)
+
+diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+index 6701970f4a..90eecf496b 100644
+--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ 
--- a/SOURCES/glibc-RHEL-15696-12.patch
+++ b/SOURCES/glibc-RHEL-15696-12.patch
--- a/SOURCES/glibc-RHEL-15696-13.patch
+++ b/SOURCES/glibc-RHEL-15696-13.patch
--- a/SOURCES/glibc-RHEL-15696-14.patch
+++ b/SOURCES/glibc-RHEL-15696-14.patch
@ -0,0 +1,242 @@
+From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 06:46:08 -0800
+Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
+ .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
+ .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
+ 5 files changed, 104 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 46783cd1..4563fc56 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
+		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 082e4da3..6bd3abfc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_chk_ssse3_back)
+@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memmove_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memmove,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memmove_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_chk_ssse3_back)
+@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __memcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __memcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
+@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_chk_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_chk_ssse3_back)
+@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+ 			      __mempcpy_avx_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, mempcpy,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __mempcpy_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+ 			      __mempcpy_ssse3_back)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index 5e5f0299..6f8bce5f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
+   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx_unaligned_erms);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx_unaligned_erms);
+ 
+-      return OPTIMIZE (avx_unaligned);
+	  return OPTIMIZE (avx_unaligned);
+	}
+     }
+ 
+   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
+diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..0cbce8f9
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
+@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		ymm16
+# define VEC1		ymm17
+# define VEC2		ymm18
+# define VEC3		ymm19
+# define VEC4		ymm20
+# define VEC5		ymm21
+# define VEC6		ymm22
+# define VEC7		ymm23
+# define VEC8		ymm24
+# define VEC9		ymm25
+# define VEC10		ymm26
+# define VEC11		ymm27
+# define VEC12		ymm28
+# define VEC13		ymm29
+# define VEC14		ymm30
+# define VEC15		ymm31
+# define VEC(i)		VEC##i
+# define VMOVNT		vmovntdq
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p)		p##.evex
+# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 274aa1c7..08e21692 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -48,6 +48,14 @@
+ # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
+ #endif
+ 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER vzeroupper
+@@ -277,20 +285,20 @@ L(less_vec):
+ #if VEC_SIZE > 32
+ L(between_32_63):
+ 	/* From 32 to 63.  No branch when size == 32.  */
+-	vmovdqu	(%rsi), %ymm0
+-	vmovdqu	-32(%rsi,%rdx), %ymm1
+-	vmovdqu	%ymm0, (%rdi)
+-	vmovdqu	%ymm1, -32(%rdi,%rdx)
+	VMOVU	(%rsi), %YMM0
+	VMOVU	-32(%rsi,%rdx), %YMM1
+	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM1, -32(%rdi,%rdx)
+ 	VZEROUPPER
+ 	ret
+ #endif
+ #if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	(%rsi), %xmm0
+-	vmovdqu	-16(%rsi,%rdx), %xmm1
+-	vmovdqu	%xmm0, (%rdi)
+-	vmovdqu	%xmm1, -16(%rdi,%rdx)
+	VMOVU	(%rsi), %XMM0
+	VMOVU	-16(%rsi,%rdx), %XMM1
+	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM1, -16(%rdi,%rdx)
+ 	ret
+ #endif
+ L(between_8_15):
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-15.patch
+++ b/SOURCES/glibc-RHEL-15696-15.patch
@ -0,0 +1,254 @@
+From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:15:03 -0800
+Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
+abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |  1 +
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
+ sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
+ .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
+ .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
+ 6 files changed, 90 insertions(+), 14 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 4563fc56..1cc0a10e 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memchr-evex \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+		   memset-evex-unaligned-erms \
+ 		   rawmemchr-evex \
+ 		   stpcpy-evex \
+ 		   stpncpy-evex \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 6bd3abfc..7cf83485 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_chk_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_chk_avx512_unaligned_erms)
+@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __memset_avx2_unaligned_erms)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned)
+	      IFUNC_IMPL_ADD (array, i, memset,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memset_avx512_unaligned_erms)
+@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, wmemset,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_avx512_unaligned))
+@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+			      CPU_FEATURE_USABLE (AVX512VL),
+			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __wmemset_chk_avx512_unaligned))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 708bd72e..6f31f4dc 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
+   attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+  attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+  attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
+   attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
+@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx2_unaligned_erms);
+-      else
+-	return OPTIMIZE (avx2_unaligned);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (evex_unaligned_erms);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx2_unaligned_erms);
+
+	  return OPTIMIZE (avx2_unaligned);
+	}
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index eb242210..9290c4bf 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -20,6 +20,7 @@
+ 
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
+ 
+ static inline void *
+@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx512_unaligned);
+-      else
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	return OPTIMIZE (evex_unaligned);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ 	return OPTIMIZE (avx2_unaligned);
+     }
+ 
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+new file mode 100644
+index 00000000..ae0a4d6e
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE	32
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		ymm16
+# define VEC(i)		VEC##i
+# define VMOVU		vmovdqu64
+# define VMOVA		vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+  movq r, %rax; \
+  vpbroadcastd d, %VEC0
+
+# define SECTION(p)		p##.evex
+# define MEMSET_SYMBOL(p,s)	p##_evex_##s
+# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 9a0fd818..71e91a8f 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -34,6 +34,14 @@
+ # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
+ #endif
+ 
+#ifndef XMM0
+# define XMM0				xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0				ymm0
+#endif
+
+ #ifndef VZEROUPPER
+ # if VEC_SIZE > 16
+ #  define VZEROUPPER			vzeroupper
+@@ -67,7 +75,7 @@
+ ENTRY (__bzero)
+ 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+ 	mov	%RSI_LP, %RDX_LP /* Set n.  */
+-	pxor	%xmm0, %xmm0
+	pxor	%XMM0, %XMM0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+ weak_alias (__bzero, bzero)
+@@ -223,7 +231,7 @@ L(less_vec):
+ 	cmpb	$16, %dl
+ 	jae	L(between_16_31)
+ # endif
+-	MOVQ	%xmm0, %rcx
+	MOVQ	%XMM0, %rcx
+ 	cmpb	$8, %dl
+ 	jae	L(between_8_15)
+ 	cmpb	$4, %dl
+@@ -238,16 +246,16 @@ L(less_vec):
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	vmovdqu	%ymm0, -32(%rdi,%rdx)
+-	vmovdqu	%ymm0, (%rdi)
+	VMOVU	%YMM0, -32(%rdi,%rdx)
+	VMOVU	%YMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	vmovdqu	%xmm0, -16(%rdi,%rdx)
+-	vmovdqu	%xmm0, (%rdi)
+	VMOVU	%XMM0, -16(%rdi,%rdx)
+	VMOVU	%XMM0, (%rdi)
+ 	VZEROUPPER
+ 	ret
+ # endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-16.patch
+++ b/SOURCES/glibc-RHEL-15696-16.patch
@ -0,0 +1,561 @@
+From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 5 Mar 2021 07:20:28 -0800
+Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
+instructions using YMM16-YMM31 registers to avoid RTM abort with usable
+AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
+exit.
+---
+ sysdeps/x86_64/multiarch/Makefile             |   4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
+ sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
+ 5 files changed, 467 insertions(+), 4 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 1cc0a10e..9d79b138 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   memset-avx2-unaligned-erms \
+ 		   memset-avx512-unaligned-erms \
+ 		   memchr-evex \
+		   memcmp-evex-movbe \
+ 		   memmove-evex-unaligned-erms \
+ 		   memrchr-evex \
+ 		   memset-evex-unaligned-erms \
+@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsncmp-evex \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+-		   wmemchr-evex
+		   wmemchr-evex \
+		   wmemcmp-evex-movbe
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 7cf83485..c8da910e 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, memcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __memcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
+@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (MOVBE)),
+			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+ 			      __wmemcmp_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 6c1f3153..3ca1f0a6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
+ 
+ static inline void *
+ IFUNC_SELECTOR (void)
+ {
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+-  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
+-      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    return OPTIMIZE (avx2_movbe);
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex_movbe);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2_movbe);
+    }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+     return OPTIMIZE (sse4_1);
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+new file mode 100644
+index 00000000..9c093972
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+      to avoid branches.
+   2. Use overlapping compare to avoid branch.
+   3. Use vector compare when size >= 8 bytes for both memcmp and
+      wmemcmp; smaller sizes take the scalar paths.
+   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+      area.
+   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+#  define MEMCMP	__memcmp_evex_movbe
+# endif
+
+# define VMOVU		vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+#  define VPCMPEQ	vpcmpeqd
+# else
+#  define VPCMPEQ	vpcmpeqb
+# endif
+
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM1		ymm17
+# define YMM2		ymm18
+# define YMM3		ymm19
+# define YMM4		ymm20
+# define YMM5		ymm21
+# define YMM6		ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+#  define VEC_MASK 0xff
+#  define XMM_MASK 0xf
+# else
+#  define VEC_MASK 0xffffffff
+#  define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+           wmemcmp has to use SIGNED comparison for elements.
+           memcmp has to use UNSIGNED comparison for elements.
+*/
+
+	.section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+	jb	L(less_vec)
+
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_vec)
+
+	/* More than 2 * VEC.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jb	L(last_4x_vec)
+
+	/* From 4 * VEC to 8 * VEC, inclusively. */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+	kandd	%k1, %k2, %k5
+	kandd	%k3, %k4, %k6
+	kandd	%k5, %k6, %k6
+
+	kmovd	%k6, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k1, %k2, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)
+	xorl	%eax, %eax
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+L(last_vec):
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-VEC_SIZE(%rdi, %rdx), %rdi	/* Last VEC of the first area.  */
+	leaq	-VEC_SIZE(%rsi, %rdx), %rsi	/* Last VEC of the second area.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret	/* %eax is zero here: no mismatch was found.  */
+
+	.p2align 4
+L(first_vec):
+	/* A byte or int32 is different within 16 or 32 bytes.  */
+	tzcntl	%eax, %ecx	/* Index of the first differing element.  */
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(%rdi, %rcx, 4), %edx
+	cmpl	(%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+	setl	%al	/* Signed compare: 1 if the first area is less.  */
+	negl	%eax
+	orl	$1, %eax	/* Result is -1 (less) or 1 (greater).  */
+# else
+	movzbl	(%rdi, %rcx), %eax
+	movzbl	(%rsi, %rcx), %edx
+	sub	%edx, %eax	/* Difference of the two unsigned bytes.  */
+# endif
+	ret
+
+# ifdef USE_AS_WMEMCMP
+	.p2align 4
+L(4):
+	xorl	%eax, %eax
+	movl	(%rdi), %edx
+	cmpl	(%rsi), %edx
+	jne	L(wmemcmp_return)	/* Flags from cmpl are consumed there.  */
+	ret	/* The single elements are equal: return 0.  */
+# else
+	.p2align 4
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.  */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax	/* First 4 bytes become the high half.  */
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi	/* Last 4 bytes; may overlap the first.  */
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	je	L(exit)
+	sbbl	%eax, %eax	/* -1 if the subq borrowed, else 0.  */
+	orl	$1, %eax	/* Result is -1 or 1.  */
+	ret
+
+	.p2align 4
+L(exit):
+	ret	/* Reached when subq gave zero, so %eax is already 0.  */
+
+	.p2align 4
+L(between_2_3):
+	/* Load as big endian to avoid branches.  */
+	movzwl	(%rdi), %eax
+	movzwl	(%rsi), %ecx
+	shll	$8, %eax
+	shll	$8, %ecx
+	bswap	%eax	/* Now byte 0 is in bits 23:16, byte 1 in 15:8.  */
+	bswap	%ecx
+	movb	-1(%rdi, %rdx), %al	/* Last byte goes into bits 7:0.  */
+	movb	-1(%rsi, %rdx), %cl
+	/* Subtraction is okay because the upper 8 bits are zero.  */
+	subl	%ecx, %eax
+	ret
+
+	.p2align 4
+L(1):
+	movzbl	(%rdi), %eax
+	movzbl	(%rsi), %ecx
+	subl	%ecx, %eax	/* Difference of the two unsigned bytes.  */
+	ret
+# endif
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax	/* Length is zero: return 0.  */
+	ret
+
+	.p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+	cmpb	$4, %dl
+	je	L(4)
+	jb	L(zero)
+# else
+	cmpb	$1, %dl
+	je	L(1)
+	jb	L(zero)
+	cmpb	$4, %dl
+	jb	L(between_2_3)
+	cmpb	$8, %dl
+	jb	L(between_4_7)
+# endif
+	cmpb	$16, %dl
+	jae	L(between_16_31)
+	/* It is between 8 and 15 bytes.  */
+	vmovq	(%rdi), %XMM1	/* First 8 bytes of each area.  */
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-8(%rdi, %rdx), %rdi
+	leaq	-8(%rsi, %rdx), %rsi
+	vmovq	(%rdi), %XMM1	/* Last 8 bytes of each area.  */
+	vmovq	(%rsi), %XMM2
+	VPCMPEQ %XMM1, %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret	/* %eax is zero here: no mismatch was found.  */
+
+	.p2align 4
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-16(%rdi, %rdx), %rdi	/* Last 16 bytes of the first area.  */
+	leaq	-16(%rsi, %rdx), %rsi	/* Last 16 bytes of the second area.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMPEQ (%rdi), %XMM2, %k2
+	kmovw	%k2, %eax
+	subl    $XMM_MASK, %eax
+	jnz	L(first_vec)
+	ret	/* %eax is zero here: no mismatch was found.  */
+
+	.p2align 4
+L(more_8x_vec):
+	/* More than 8 * VEC.  Check the first VEC.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Align the first memory area for aligned loads in the loop.
+	   Compute how much the first memory area is misaligned.  */
+	movq	%rdi, %rcx
+	andl	$(VEC_SIZE - 1), %ecx
+	/* Get the negative of offset for alignment.  */
+	subq	$VEC_SIZE, %rcx	/* %rcx is now in [-VEC_SIZE, -1].  */
+	/* Adjust the second memory area.  */
+	subq	%rcx, %rsi
+	/* Adjust the first memory area which should be aligned now.  */
+	subq	%rcx, %rdi
+	/* Adjust length.  */
+	addq	%rcx, %rdx	/* Shrink by the bytes just skipped.  */
+
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	VMOVU	(%rsi), %YMM1
+	VPCMPEQ (%rdi), %YMM1, %k1
+
+	VMOVU	VEC_SIZE(%rsi), %YMM2
+	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	kandd	%k2, %k1, %k5
+
+	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	kandd	%k3, %k5, %k5
+
+	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	kandd	%k4, %k5, %k5	/* %k5: AND of the four per-VEC masks.  */
+
+	kmovd	%k5, %eax
+	cmpl	$VEC_MASK, %eax
+	jne	L(4x_vec_end)	/* %k1-%k4 tell which VEC differs.  */
+
+	addq	$(VEC_SIZE * 4), %rdi
+	addq	$(VEC_SIZE * 4), %rsi
+
+	subq	$(VEC_SIZE * 4), %rdx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jae	L(loop_4x_vec)	/* At least 4 * VEC still remain.  */
+
+	/* Less than 4 * VEC.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(last_vec)
+	cmpq	$(VEC_SIZE * 2), %rdx
+	jbe	L(last_2x_vec)	/* Otherwise fall through to L(last_4x_vec).  */
+
+L(last_4x_vec):
+	/* From 2 * VEC to 4 * VEC. */
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi	/* Step to the second VEC.  */
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	/* Use overlapping loads to avoid branches.  */
+	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi	/* 2 * VEC before the end.  */
+	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+
+	addq	$VEC_SIZE, %rdi	/* Step to the final VEC.  */
+	addq	$VEC_SIZE, %rsi
+	VMOVU	(%rsi), %YMM2
+	VPCMPEQ (%rdi), %YMM2, %k2
+	kmovd	%k2, %eax
+	subl    $VEC_MASK, %eax
+	jnz	L(first_vec)
+	ret	/* %eax is zero here: no mismatch was found.  */
+
+	.p2align 4
+L(4x_vec_end):
+	kmovd	%k1, %eax	/* Check the four saved masks in order.  */
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec)
+	kmovd	%k2, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x1)
+	kmovd	%k3, %eax
+	subl	$VEC_MASK, %eax
+	jnz	L(first_vec_x2)
+	kmovd	%k4, %eax
+	subl	$VEC_MASK, %eax	/* Not tested: the mismatch must be here.  */
+	tzcntl	%eax, %ecx
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+	sub	%edx, %eax	/* Difference of the two unsigned bytes.  */
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x1):
+	tzcntl	%eax, %ecx	/* Index of the mismatch in the second VEC.  */
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+	sub	%edx, %eax	/* Difference of the two unsigned bytes.  */
+# endif
+	ret
+
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %ecx	/* Index of the mismatch in the third VEC.  */
+# ifdef USE_AS_WMEMCMP
+	xorl	%eax, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+	jmp	L(wmemcmp_return)
+# else
+	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+	sub	%edx, %eax	/* Difference of the two unsigned bytes.  */
+# endif
+	ret
+END (MEMCMP)
+#endif
+diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+new file mode 100644
+index 00000000..4726d74a
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
+@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-17.patch
+++ b/SOURCES/glibc-RHEL-15696-17.patch
--- a/SOURCES/glibc-RHEL-15696-18.patch
+++ b/SOURCES/glibc-RHEL-15696-18.patch
@ -0,0 +1,735 @@
+From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 23 Feb 2021 06:33:10 -0800
+Subject: [PATCH] x86: Add string/memory function tests in RTM region
+Content-type: text/plain; charset=UTF-8
+
+At function exit, AVX optimized string/memory functions have VZEROUPPER
+which triggers RTM abort.   When such functions are called inside a
+transactionally executing RTM region, RTM abort causes severe performance
+degradation.  Add tests to verify that string/memory functions won't
+cause RTM abort in RTM region.
+---
+ sysdeps/x86/Makefile          | 23 +++++++++++
+ sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
+ sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
+ sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
+ sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
+ sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
+ 12 files changed, 618 insertions(+)
+ create mode 100644 sysdeps/x86/tst-memchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-memmove-rtm.c
+ create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-memset-rtm.c
+ create mode 100644 sysdeps/x86/tst-strchr-rtm.c
+ create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
+ create mode 100644 sysdeps/x86/tst-string-rtm.h
+ create mode 100644 sysdeps/x86/tst-strlen-rtm.c
+ create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
+ create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
+
+diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
+index 59e928e9..5be71ada 100644
+--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
+@@ -17,6 +17,29 @@ endif
+ 
+ ifeq ($(subdir),string)
+ sysdep_routines += cacheinfo
+
+tests += \
+  tst-memchr-rtm \
+  tst-memcmp-rtm \
+  tst-memmove-rtm \
+  tst-memrchr-rtm \
+  tst-memset-rtm \
+  tst-strchr-rtm \
+  tst-strcpy-rtm \
+  tst-strlen-rtm \
+  tst-strncmp-rtm \
+  tst-strrchr-rtm
+# The tests above use the RTM intrinsics (_xbegin/_xend), hence -mrtm.
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
+ endif
+ 
+ ifneq ($(enable-cet),no)
+diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
+new file mode 100644
+index 00000000..e4749401
+--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';  /* First 'c': the match memchr must return.  */
+  string1[STRING_SIZE - 100] = 'c';  /* Later 'c' that must be ignored.  */
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if memchr finds the first 'c', else 1.  */
+{
+  char *p = memchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("memchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
+new file mode 100644
+index 00000000..e4c8a623
+--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
+@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  memset (string2, 'a', STRING_SIZE);  /* Both buffers now identical.  */
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if memcmp reports equality, else 1.  */
+{
+  if (memcmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("memcmp", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
+new file mode 100644
+index 00000000..4bf97ef1
+--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  if (memmove (string2, string1, STRING_SIZE) == string2  /* Returns dest.  */
+      && memcmp (string2, string1, STRING_SIZE) == 0)  /* Copy is exact.  */
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if memmove copies string1 intact, else 1.  */
+{
+  if (memmove (string2, string1, STRING_SIZE) == string2
+      && memcmp (string2, string1, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("memmove", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
+new file mode 100644
+index 00000000..a57a5a8e
+--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  string1[100] = 'c';  /* Earlier 'c' that must be ignored.  */
+  string1[STRING_SIZE - 100] = 'c';  /* Last 'c': the expected match.  */
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if memrchr finds the last 'c', else 1.  */
+{
+  char *p = memrchr (string1, 'c', STRING_SIZE);
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("memrchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
+new file mode 100644
+index 00000000..bf343a4d
+--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
+@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  return EXIT_SUCCESS;  /* Nothing is verified here.  */
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Only exercises memset; the result is not checked.  */
+{
+  memset (string1, 'a', STRING_SIZE);
+  return 0;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("memset", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
+new file mode 100644
+index 00000000..a82e29c0
+--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
+@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE - 1);  /* Last byte stays 0 (global).  */
+  string1[100] = 'c';  /* First 'c': the match strchr must return.  */
+  string1[STRING_SIZE - 100] = 'c';  /* Later 'c' that must be ignored.  */
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if strchr finds the first 'c', else 1.  */
+{
+  char *p = strchr (string1, 'c');
+  if (p == &string1[100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("strchr", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
+new file mode 100644
+index 00000000..2b2a583f
+--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE - 1);  /* Last byte stays 0 (global).  */
+  if (strcpy (string2, string1) == string2  /* Returns dest.  */
+      && strcmp (string2, string1) == 0)  /* Copy is exact.  */
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if strcpy copies string1 intact, else 1.  */
+{
+  if (strcpy (string2, string1) == string2
+      && strcmp (string2, string1) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("strcpy", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
+new file mode 100644
+index 00000000..d2470afa
+--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
+@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+/* Run PREPARE once, then call FUNCTION LOOP times inside an RTM region.
+   Fail if FUNCTION misbehaves or more than 5% of transactions abort.  */
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+	   int (*function) (void))
+{
+  if (!CPU_FEATURE_USABLE (RTM))
+    return EXIT_UNSUPPORTED;
+  int status = prepare ();
+  if (status != EXIT_SUCCESS)
+    return status;
+
+  unsigned int naborts = 0;
+  unsigned int failed = 0;
+  for (unsigned int i = 0; i < loop; i++)
+    {
+      failed |= function ();  /* Call outside the transaction first.  */
+      if (_xbegin() == _XBEGIN_STARTED)
+	{
+	  failed |= function ();  /* Transactional call.  */
+	  _xend();
+	}
+      else
+	{
+	  failed |= function ();  /* The transaction aborted: count it.  */
+	  ++naborts;
+	}
+    }
+
+  if (failed)
+    FAIL_EXIT1 ("%s() failed", name);
+
+  if (naborts)
+    {
+      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+	 TSX.  */
+      double rate = 100 * ((double) naborts) / ((double) loop);
+      if (rate > 5)
+	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%u out of %u)",
+		    rate, naborts, loop);
+    }
+
+  return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
+new file mode 100644
+index 00000000..0dcf14db
+--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE - 1);
+  string1[STRING_SIZE - 100] = '\0';  /* Terminator fixes the length.  */
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if strlen gives the expected length, else 1.  */
+{
+  size_t len = strlen (string1);
+  if (len == STRING_SIZE - 100)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("strlen", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+new file mode 100644
+index 00000000..236ad951
+--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE - 1);  /* Last byte stays 0 (global).  */
+  memset (string2, 'a', STRING_SIZE - 1);  /* Both strings now identical.  */
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if strncmp reports equality, else 1.  */
+{
+  if (strncmp (string1, string2, STRING_SIZE) == 0)
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("strncmp", LOOP, prepare, function);
+}
+diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
+new file mode 100644
+index 00000000..e32bfaf5
+--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
+@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)  /* One-time setup; do_test_1 runs it before the loop.  */
+{
+  memset (string1, 'a', STRING_SIZE - 1);  /* Last byte stays 0 (global).  */
+  string1[STRING_SIZE - 100] = 'c';  /* The only 'c': the expected match.  */
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return EXIT_SUCCESS;
+  else
+    return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)  /* Returns 0 if strrchr finds the 'c', else 1.  */
+{
+  char *p = strrchr (string1, 'c');
+  if (p == &string1[STRING_SIZE - 100])
+    return 0;
+  else
+    return 1;
+}
+
+static int
+do_test (void)  /* Defines the do_test declared in tst-string-rtm.h.  */
+{
+  return do_test_1 ("strrchr", LOOP, prepare, function);
+}
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-19.patch
+++ b/SOURCES/glibc-RHEL-15696-19.patch
@ -0,0 +1,148 @@
+From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:44:18 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
+with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
+with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
+function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------
+ .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++--------
+ 4 files changed, 31 insertions(+), 24 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index c1efeec0..d969a156 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 6f3375cc..19795938 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+index bdc94c6c..98c5d406 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+-	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_unaligned);
+-
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+-	return OPTIMIZE (evex_unaligned);
+	{
+	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+	    return OPTIMIZE (avx512_unaligned);
+
+	  return OPTIMIZE (evex_unaligned);
+	}
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ 	return OPTIMIZE (avx2_unaligned_rtm);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 0783979c..22e7b187 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -1,22 +1,22 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
+# define XMM0		xmm16
+# define YMM0		ymm16
+# define VEC0		zmm16
+# define VEC(i)		VEC##i
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+# define VZEROUPPER
+ 
+ # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastb %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastb d, %VEC0
+ 
+ # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+-  vmovd d, %xmm0; \
+   movq r, %rax; \
+-  vpbroadcastd %xmm0, %xmm0; \
+-  vpbroadcastq %xmm0, %zmm0
+  vpbroadcastd d, %VEC0
+ 
+-# define SECTION(p)		p##.avx512
+# define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+ 
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-2.patch
+++ b/SOURCES/glibc-RHEL-15696-2.patch
@ -0,0 +1,230 @@
+From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:25:56 -0800
+Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
+	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
+	tst-size_t-wmemcmp.
+	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
+ sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
+ sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
+ sysdeps/x86_64/x32/Makefile                  |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
+ 6 files changed, 114 insertions(+), 9 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 30f764c3..e3a35b89 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -58,9 +58,12 @@
+ 	.section .text.avx,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+ # endif
+-	cmpq	$VEC_SIZE, %rdx
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+index 8e164f2c..302900f5 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
+@@ -42,13 +42,16 @@
+ 	.section .text.sse4.1,"ax",@progbits
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ # endif
+ 	pxor	%xmm0, %xmm0
+-	cmp	$79, %rdx
+	cmp	$79, %RDX_LP
+ 	ja	L(79bytesormore)
+ # ifndef USE_AS_WMEMCMP
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	L(firstbyte)
+ # endif
+ 	add	%rdx, %rsi
+diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+index 6f76c641..69d030fc 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+@@ -33,9 +33,12 @@
+ 	atom_text_section
+ ENTRY (MEMCMP)
+ # ifdef USE_AS_WMEMCMP
+-	shl	$2, %rdx
+-	test	%rdx, %rdx
+	shl	$2, %RDX_LP
+	test	%RDX_LP, %RDX_LP
+ 	jz	L(equal)
+# elif defined __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ # endif
+ 	mov	%rdx, %rcx
+ 	mov	%rdi, %rdx
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 7d528889..ddec7f04 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+new file mode 100644
+index 00000000..9bd6fdb4
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
+@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  memcpy (buf1, buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_memcmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+new file mode 100644
+index 00000000..e8b5ffd0
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
+@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-20.patch
+++ b/SOURCES/glibc-RHEL-15696-20.patch
@ -0,0 +1,164 @@
+From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sun, 7 Mar 2021 09:45:23 -0800
+Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
+Content-type: text/plain; charset=UTF-8
+
+Update ifunc-memmove.h to select the function optimized with AVX512
+instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
+AVX512VL since VZEROUPPER isn't needed at function exit.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++---------
+ sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++----
+ .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
+ 3 files changed, 42 insertions(+), 19 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d969a156..fec384f6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memmove_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memmove,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memmove_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
+ 			      __memmove_ssse3_back)
+@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __memcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __memcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
+@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_chk_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ 			      CPU_FEATURE_USABLE (AVX),
+@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+ 			      __mempcpy_avx512_no_vzeroupper)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      CPU_FEATURE_USABLE (AVX512VL),
+ 			      __mempcpy_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, mempcpy,
+ 			      CPU_FEATURE_USABLE (AVX),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+index fa09b9fb..014e95c7 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
+@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+-      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx512_no_vzeroupper);
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+	    return OPTIMIZE (avx512_unaligned_erms);
+ 
+-      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-	return OPTIMIZE (avx512_unaligned_erms);
+	  return OPTIMIZE (avx512_unaligned);
+	}
+ 
+-      return OPTIMIZE (avx512_unaligned);
+      return OPTIMIZE (avx512_no_vzeroupper);
+     }
+ 
+   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+index aac1515c..848848ab 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+@@ -1,11 +1,32 @@
+ #if IS_IN (libc)
+ # define VEC_SIZE	64
+-# define VEC(i)		zmm##i
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define YMM0		ymm16
+# define YMM1		ymm17
+# define VEC0		zmm16
+# define VEC1		zmm17
+# define VEC2		zmm18
+# define VEC3		zmm19
+# define VEC4		zmm20
+# define VEC5		zmm21
+# define VEC6		zmm22
+# define VEC7		zmm23
+# define VEC8		zmm24
+# define VEC9		zmm25
+# define VEC10		zmm26
+# define VEC11		zmm27
+# define VEC12		zmm28
+# define VEC13		zmm29
+# define VEC14		zmm30
+# define VEC15		zmm31
+# define VEC(i)		VEC##i
+ # define VMOVNT		vmovntdq
+ # define VMOVU		vmovdqu64
+ # define VMOVA		vmovdqa64
+# define VZEROUPPER
+ 
+-# define SECTION(p)		p##.avx512
+# define SECTION(p)		p##.evex512
+ # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
+ 
+ # include "memmove-vec-unaligned-erms.S"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-21.patch
+++ b/SOURCES/glibc-RHEL-15696-21.patch
@ -0,0 +1,71 @@
+From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
+From: Sunil K Pandey <skpgkp2@gmail.com>
+Date: Thu, 1 Apr 2021 15:47:04 -0700
+Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+Fix some indentations of ifdef in file strlen-evex.S which are off by 1
+and confusing to read.
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
+ 1 file changed, 8 insertions(+), 8 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index cd022509..05838190 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -276,10 +276,10 @@ L(last_2x_vec):
+ 	.p2align 4
+ L(first_vec_x0_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -293,10 +293,10 @@ L(first_vec_x0_check):
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -311,10 +311,10 @@ L(first_vec_x1_check):
+ 	.p2align 4
+ L(first_vec_x2_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+@@ -329,10 +329,10 @@ L(first_vec_x2_check):
+ 	.p2align 4
+ L(first_vec_x3_check):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+ 	sall	$2, %eax
+-# endif
+#  endif
+ 	/* Check the end of data.  */
+ 	cmpq	%rax, %rsi
+ 	jbe	L(max)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-22.patch
+++ b/SOURCES/glibc-RHEL-15696-22.patch
@ -0,0 +1,51 @@
+From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 07:07:21 -0700
+Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
+Content-type: text/plain; charset=UTF-8
+
+Since __strlen_evex and __strnlen_evex added by
+
+commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Fri Mar 5 06:24:52 2021 -0800
+
+    x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
+
+use sarx:
+
+c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax
+
+require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
+ifunc-avx2.h already requires BMI2 for EVEX implementation.
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index fec384f6..cbfc1a5d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
+ 
+@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_evex)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
+ 
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-23.patch
+++ b/SOURCES/glibc-RHEL-15696-23.patch
@ -0,0 +1,584 @@
+From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:01:58 -0400
+Subject: [PATCH] x86: Optimize memchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-avx2.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+and saving a few instructions in the loop return path. test-memchr,
+test-rawmemchr, and test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
+ 1 file changed, 247 insertions(+), 178 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index cf893e77..b377f22e 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -26,8 +26,22 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPCMPEQ	vpcmpeqd
+#  define VPBROADCAST	vpbroadcastd
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+#  define VPBROADCAST	vpbroadcastb
+#  define CHAR_SIZE	1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+#  define ERAW_PTR_REG	ecx
+#  define RRAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define ERAW_PTR_REG	edi
+#  define RRAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -39,6 +53,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(null)
+ # endif
+-	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+-	vmovd	%esi, %xmm0
+ # ifdef USE_AS_WMEMCHR
+ 	shl	$2, %RDX_LP
+-	vpbroadcastd %xmm0, %ymm0
+ # else
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ #  endif
+-	vpbroadcastb %xmm0, %ymm0
+ # endif
+	/* Broadcast CHAR to YMMMATCH.  */
+	vmovd	%esi, %xmm0
+	VPBROADCAST %xmm0, %ymm0
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$VEC_SIZE, %rdx
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax), %rax
+	cmovle	%rcx, %rax
+	VZEROUPPER_RETURN
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+L(null):
+	xorl	%eax, %eax
+	ret
+ # endif
+-	jmp	L(more_4x_vec)
+-
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is necessary
+	   for computer return address if byte is found or adjusting length
+	   if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+	   rdi for rawmemchr.  */
+	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Calculate length until end of page (length checked for a
+	   match).  */
+	leaq	1(%ALGN_PTR_REG), %rsi
+	subq	%RRAW_PTR_REG, %rsi
+# endif
+ 	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+	sarxl	%ERAW_PTR_REG, %eax, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+	addq	%RRAW_PTR_REG, %rax
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	incq	%rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
+	.p2align 4
+L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+	/* Align data to VEC_SIZE - 1.  */
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	orq	$(VEC_SIZE - 1), %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+ # ifndef USE_AS_RAWMEMCHR
+	/* Check if at last VEC_SIZE * 4 length.  */
+ 	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+	   length.  */
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+ 	addq	%rcx, %rdx
+# else
+	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+ 
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+-
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
+ 	vpor	%ymm1, %ymm2, %ymm5
+ 	vpor	%ymm3, %ymm4, %ymm6
+ 	vpor	%ymm5, %ymm6, %ymm5
+ 
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+	vpmovmskb %ymm5, %ecx
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+-	ja	L(loop_4x_vec)
+	testl	%ecx, %ecx
+	jnz	L(loop_4x_vec_end)
+ 
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+	.p2align 4
+L(last_4x_vec_or_less):
+	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	jnz	L(first_vec_x1_check)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+	/* If remaining length > VEC_SIZE * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jg	L(last_4x_vec)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+L(last_2x_vec):
+	/* If remaining length < VEC_SIZE.  */
+	addl	$VEC_SIZE, %edx
+	jle	L(zero_end)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	xorl	%eax, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	addq	$(VEC_SIZE + 1), %rdi
+	addq	%rdi, %rax
+L(zero_end):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+-	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	vpmovmskb %ymm3, %eax
+	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
+ 
+ 	.p2align 4
+ L(first_vec_x1_check):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
+	/* Adjust length.  */
+	subl	$-(VEC_SIZE * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	incq	%rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+	.p2align 4
+L(set_zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+# else
+	incq	%rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+# else
+	subq	$-(VEC_SIZE + 1), %rdi
+# endif
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+# ifndef USE_AS_RAWMEMCHR
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	jmp     L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+ 
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(VEC_SIZE * 2), %edx
+	jle	L(last_2x_vec)
+ 	.p2align 4
+-L(null):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+L(last_4x_vec):
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
+ 
+-	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	VZEROUPPER_RETURN
+	/* Create mask for possible matches within remaining length.  */
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+ 
+-	.p2align 4
+-L(first_vec_x2):
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= VEC_SIZE * 3 (Note this is after
+	   remaining length was found to be > VEC_SIZE * 2.  */
+	subl	$VEC_SIZE, %edx
+	jbe	L(zero_end2)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Shift remaining length mask for last VEC.  */
+	shrq	$32, %rcx
+	andl	%ecx, %eax
+	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+ 	addq	%rdi, %rax
+L(zero_end2):
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	vpmovmskb %ymm2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+	subq	$-(VEC_SIZE * 2 + 1), %rdi
+ 	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-24.patch
+++ b/SOURCES/glibc-RHEL-15696-24.patch
@ -0,0 +1,388 @@
+From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 9 Jun 2021 16:25:32 -0400
+Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wmemchr in these files relied on
+n * sizeof(wchar_t) not overflowing, which the standard does not guarantee.
+
+The new overflow tests added in the previous commit now
+pass (as well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
+ 2 files changed, 98 insertions(+), 37 deletions(-)
+
+diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
+index cb320257..24f9a0c5 100644
+--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
+@@ -21,9 +21,11 @@
+ #ifdef USE_AS_WMEMCHR
+ # define MEMCHR		wmemchr
+ # define PCMPEQ		pcmpeqd
+# define CHAR_PER_VEC	4
+ #else
+ # define MEMCHR		memchr
+ # define PCMPEQ		pcmpeqb
+# define CHAR_PER_VEC	16
+ #endif
+ 
+ /* fast SSE2 version with using pmaxub and 64 byte loop */
+@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
+ 	movd	%esi, %xmm1
+ 	mov	%edi, %ecx
+ 
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+#endif
+ #ifdef USE_AS_WMEMCHR
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+-	shl	$2, %RDX_LP
+ #else
+-# ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+-# endif
+ 	punpcklbw %xmm1, %xmm1
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(return_null)
+@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
+ 	test	%eax, %eax
+ 
+ 	jnz	L(matches_1)
+-	sub	$16, %rdx
+	sub	$CHAR_PER_VEC, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+ 	and	$15, %ecx
+ 	and	$-16, %rdi
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	add	%rcx, %rdx
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	jmp	L(loop_prolog)
+ 
+@@ -77,16 +81,21 @@ L(crosscache):
+ 	movdqa	(%rdi), %xmm0
+ 
+ 	PCMPEQ	%xmm1, %xmm0
+-/* Check if there is a match.  */
+	/* Check if there is a match.  */
+ 	pmovmskb %xmm0, %eax
+-/* Remove the leading bytes.  */
+	/* Remove the leading bytes.  */
+ 	sar	%cl, %eax
+ 	test	%eax, %eax
+ 	je	L(unaligned_no_match)
+-/* Check which byte is a match.  */
+	/* Check which byte is a match.  */
+ 	bsf	%eax, %eax
+-
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	add	%rcx, %rax
+@@ -94,15 +103,18 @@ L(crosscache):
+ 
+ 	.p2align 4
+ L(unaligned_no_match):
+-        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
+ 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
+ 	   possible addition overflow.  */
+ 	neg	%rcx
+ 	add	$16, %rcx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	sub	%rcx, %rdx
+ 	jbe	L(return_null)
+ 	add	$16, %rdi
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	.p2align 4
+@@ -135,7 +147,7 @@ L(loop_prolog):
+ 	test	$0x3f, %rdi
+ 	jz	L(align64_loop)
+ 
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -167,11 +179,14 @@ L(loop_prolog):
+ 	mov	%rdi, %rcx
+ 	and	$-64, %rdi
+ 	and	$63, %ecx
+#ifdef USE_AS_WMEMCHR
+	shr	$2, %ecx
+#endif
+ 	add	%rcx, %rdx
+ 
+ 	.p2align 4
+ L(align64_loop):
+-	sub	$64, %rdx
+	sub	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(exit_loop)
+ 	movdqa	(%rdi), %xmm0
+ 	movdqa	16(%rdi), %xmm2
+@@ -218,7 +233,7 @@ L(align64_loop):
+ 
+ 	.p2align 4
+ L(exit_loop):
+-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
+ 	jle	L(exit_loop_32)
+ 
+ 	movdqa	(%rdi), %xmm0
+@@ -238,7 +253,7 @@ L(exit_loop):
+ 	pmovmskb %xmm3, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches32_1)
+-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
+ 	jle	L(return_null)
+ 
+ 	PCMPEQ	48(%rdi), %xmm1
+@@ -250,13 +265,13 @@ L(exit_loop):
+ 
+ 	.p2align 4
+ L(exit_loop_32):
+-	add	$32, %edx
+	add	$(CHAR_PER_VEC * 2), %edx
+ 	movdqa	(%rdi), %xmm0
+ 	PCMPEQ	%xmm1, %xmm0
+ 	pmovmskb %xmm0, %eax
+ 	test	%eax, %eax
+ 	jnz	L(matches_1)
+-	sub	$16, %edx
+	sub	$CHAR_PER_VEC, %edx
+ 	jbe	L(return_null)
+ 
+ 	PCMPEQ	16(%rdi), %xmm1
+@@ -293,7 +308,13 @@ L(matches32):
+ 	.p2align 4
+ L(matches_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	add	%rdi, %rax
+ 	ret
+@@ -301,7 +322,13 @@ L(matches_1):
+ 	.p2align 4
+ L(matches16_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	16(%rdi, %rax), %rax
+ 	ret
+@@ -309,7 +336,13 @@ L(matches16_1):
+ 	.p2align 4
+ L(matches32_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	32(%rdi, %rax), %rax
+ 	ret
+@@ -317,7 +350,13 @@ L(matches32_1):
+ 	.p2align 4
+ L(matches48_1):
+ 	bsf	%eax, %eax
+#ifdef USE_AS_WMEMCHR
+	mov	%eax, %esi
+	shr	$2, %esi
+	sub	%rsi, %rdx
+#else
+ 	sub	%rax, %rdx
+#endif
+ 	jbe	L(return_null)
+ 	lea	48(%rdi, %rax), %rax
+ 	ret
+diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
+index b377f22e..16027abb 100644
+--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
+@@ -54,21 +54,19 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+-	test	%RDX_LP, %RDX_LP
+-	jz	L(null)
+-# endif
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+ #  ifdef __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%edx, %edx
+	/* Clear upper bits.  */
+	and	%RDX_LP, %RDX_LP
+#  else
+	test	%RDX_LP, %RDX_LP
+ #  endif
+	jz	L(null)
+ # endif
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	vmovd	%esi, %xmm0
+@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
+ 	vpmovmskb %ymm1, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* If length < CHAR_PER_VEC handle special.  */
+-	cmpq	$VEC_SIZE, %rdx
+	cmpq	$CHAR_PER_VEC, %rdx
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	testl	%eax, %eax
+@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
+ L(first_vec_x0):
+ 	/* Check if first match was before length.  */
+ 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	xorl	%ecx, %ecx
+ 	cmpl	%eax, %edx
+ 	leaq	(%rdi, %rax), %rax
+@@ -110,12 +112,12 @@ L(null):
+ # endif
+ 	.p2align 4
+ L(cross_page_boundary):
+-	/* Save pointer before aligning as its original value is necessary
+-	   for computer return address if byte is found or adjusting length
+-	   if it is not and this is memchr.  */
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+ 	movq	%rdi, %rcx
+-	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+-	   rdi for rawmemchr.  */
+	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+	   and rdi for rawmemchr.  */
+ 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+@@ -124,6 +126,10 @@ L(cross_page_boundary):
+ 	   match).  */
+ 	leaq	1(%ALGN_PTR_REG), %rsi
+ 	subq	%RRAW_PTR_REG, %rsi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %esi
+#  endif
+ # endif
+ 	/* Remove the leading bytes.  */
+ 	sarxl	%ERAW_PTR_REG, %eax, %eax
+@@ -181,6 +187,10 @@ L(cross_page_continue):
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	/* esi is for adjusting length to see if near the end.  */
+ 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+ # else
+ 	orq	$(VEC_SIZE - 1), %rdi
+ L(cross_page_continue):
+@@ -213,7 +223,7 @@ L(cross_page_continue):
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check if at last VEC_SIZE * 4 length.  */
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+ 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ 	   length.  */
+@@ -221,6 +231,10 @@ L(cross_page_continue):
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ 	addq	%rcx, %rdx
+ # else
+ 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
+@@ -250,15 +264,19 @@ L(loop_4x_vec):
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+-	/* Fall through into less than 4 remaining vectors of length case.
+-	 */
+	/* Fall through into less than 4 remaining vectors of length
+	   case.  */
+ 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	.p2align 4
+ L(last_4x_vec_or_less):
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1_check)
+@@ -355,6 +373,10 @@ L(last_vec_x2_return):
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ 	vpmovmskb %ymm1, %eax
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %edx
+#  endif
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	/* Check first VEC regardless.  */
+ 	testl	%eax, %eax
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-25.patch
+++ b/SOURCES/glibc-RHEL-15696-25.patch
@ -0,0 +1,767 @@
+From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:07 -0400
+Subject: [PATCH] x86: Optimize strlen-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-avx2.S. The optimizations are
+mostly small things but they add up to roughly 10-30% performance
+improvement for strlen. The results for strnlen are a bit more
+ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
+are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
+ sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
+ 2 files changed, 334 insertions(+), 214 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index cbfc1a5d..f1a6460a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+   IFUNC_IMPL (i, name, strlen,
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strlen,
+@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
+   IFUNC_IMPL (i, name, strnlen,
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strnlen,
+@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
+   IFUNC_IMPL (i, name, wcslen,
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcslen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen,
+@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+   IFUNC_IMPL (i, name, wcsnlen,
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcsnlen_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcsnlen_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index 82826e10..be8a5db5 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -27,9 +27,11 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMPEQ	vpcmpeqd
+ #  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMPEQ	vpcmpeqb
+ #  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # ifndef VZEROUPPER
+@@ -41,349 +43,459 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
+	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+	mov	%RSI_LP, %R8_LP
+ #  ifdef USE_AS_WCSLEN
+ 	shl	$2, %RSI_LP
+ #  elif defined __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+-	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+ 	vpxor	%xmm0, %xmm0, %xmm0
+-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-
+	VPCMPEQ	(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < VEC_SIZE handle special.  */
+	cmpq	$VEC_SIZE, %rsi
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	/* If empty continue to aligned_more. Otherwise return bit
+	   position of first match.  */
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+	VZEROUPPER_RETURN
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 4 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	incl	%edi
+	addl	%edi, %eax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	shrl	$2, %eax
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(aligned_more):
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 3 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE + 1), %edi
+	addl	%edi, %eax
+ # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE * 2 + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 2 + 1), %edi
+	addl	%edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	subl	$(VEC_SIZE + 1), %ecx
+	addl	%ecx, %eax
+# else
+	subl	%edx, %edi
+	addl	$(VEC_SIZE * 3 + 1), %edi
+	addl	%edi, %eax
+ # endif
+# ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
+	   code on the x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+	   it simplies the logic in last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+	subq	%rdx, %rcx
+# endif
+	/* Load first VEC regardless.  */
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	incq	%rdi
+	movl	%edi, %ecx
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+# else
+	incq	%rdi
+	orq	$(VEC_SIZE * 4 - 1), %rdi
+ # endif
+-
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa (%rdi), %ymm1
+-	vmovdqa	VEC_SIZE(%rdi), %ymm2
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
+-	VPMINU	%ymm1, %ymm2, %ymm5
+-	VPMINU	%ymm3, %ymm4, %ymm6
+-	VPMINU	%ymm5, %ymm6, %ymm5
+-
+-	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+ 	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+-
+-L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	vmovdqa	1(%rdi), %ymm1
+	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+	VPMINU	%ymm2, %ymm4, %ymm5
+	VPCMPEQ	%ymm5, %ymm0, %ymm5
+	vpmovmskb	%ymm5, %ecx
+ 
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+ 
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+	jnz	L(last_vec_return_x0)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+-
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	VPCMPEQ	%ymm2, %ymm0, %ymm2
+	vpmovmskb	%ymm2, %eax
+ 	testl	%eax, %eax
+-
+-	jnz	L(first_vec_x3_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+	jnz	L(last_vec_return_x1)
+
+	/* Combine last 2 VEC.  */
+	VPCMPEQ	%ymm3, %ymm0, %ymm3
+	vpmovmskb	%ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used if
+	   the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2 - 1), %rdi
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+
+# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+-	VPCMPEQ (%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	vpmovmskb	%ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(VEC_SIZE * 2), %esi
+	jnz	L(last_4x_vec)
+ 
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+	/* length may have been negative or positive by an offset of
+	   VEC_SIZE * 4 depending on where this was called from. This fixes
+	   that.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	VZEROUPPER_RETURN
+	jnz	L(last_vec_x1_check)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	subl	$VEC_SIZE, %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_return_x0):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_return_x1):
+ 	tzcntl	%eax, %eax
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-#  endif
+# endif
+ 	VZEROUPPER_RETURN
+ 
+# ifdef USE_AS_STRNLEN
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+ L(max):
+ 	movq	%r8, %rax
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+	/* Normalize length.  */
+	andl	$(VEC_SIZE * 4 - 1), %esi
+	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x3)
+
+	subl	$(VEC_SIZE * 3), %esi
+	jb	L(max)
+
+	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 3 + 1), %eax
+	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+ 
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(last_vec_x1):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+	incl	%eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
+L(last_vec_x2):
+	/* essentially duplicates of first_vec_x1 but use 64 bit
+	   instructions.  */
+ 	tzcntl	%eax, %eax
+-	addq	$VEC_SIZE, %rax
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 2), %rax
+	subl	$(VEC_SIZE * 2), %esi
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+#  ifdef USE_AS_WCSLEN
+ 	shrq	$2, %rax
+-# endif
+#  endif
+	VZEROUPPER_RETURN
+L(max_end):
+	movq	%r8, %rax
+ 	VZEROUPPER_RETURN
+# endif
+ 
+	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(4x_vec_end):
+-	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMPEQ %ymm2, %ymm0, %ymm2
+-	vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+	/* Align data to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+	vpmovmskb	%ymm1, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod rdx.  */
+	sarxl	%edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMPEQ %ymm3, %ymm0, %ymm3
+-	vpmovmskb %ymm3, %eax
+	jnz	L(cross_page_less_vec)
+	leaq	1(%rdi), %rcx
+	subq	%rdx, %rcx
+	/* Check length.  */
+	cmpq	%rsi, %rcx
+	jb	L(cross_page_continue)
+	movq	%r8, %rax
+# else
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMPEQ %ymm4, %ymm0, %ymm4
+-	vpmovmskb %ymm4, %eax
+-L(first_vec_x3):
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+ # endif
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+	.p2align 4
+L(cross_page_less_vec):
+	tzcntl	%eax, %eax
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+#  ifdef USE_AS_WCSLEN
+	shrl	$2, %eax
+#  endif
+ 	VZEROUPPER_RETURN
+# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-26.patch
+++ b/SOURCES/glibc-RHEL-15696-26.patch
@ -0,0 +1,701 @@
+From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 3 May 2021 03:03:19 -0400
+Subject: [PATCH] x86: Optimize memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memchr-evex.S. The optimizations include
+replacing some branches with cmovcc, avoiding some branches entirely
+in the less_4x_vec case, making the page cross logic less strict,
+saving some ALU in the alignment process, and most importantly
+increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
+test-wmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
+ 1 file changed, 322 insertions(+), 225 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 6dd5d67b..81d5cd64 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -26,14 +26,28 @@
+ 
+ # ifdef USE_AS_WMEMCHR
+ #  define VPBROADCAST	vpbroadcastd
+-#  define VPCMP		vpcmpd
+-#  define SHIFT_REG	r8d
+#  define VPMINU	vpminud
+#  define VPCMP	vpcmpd
+#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+-#  define VPCMP		vpcmpb
+-#  define SHIFT_REG	ecx
+#  define VPMINU	vpminub
+#  define VPCMP	vpcmpb
+#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+ # endif
+ 
+# ifdef USE_AS_RAWMEMCHR
+#  define RAW_PTR_REG	rcx
+#  define ALGN_PTR_REG	rdi
+# else
+#  define RAW_PTR_REG	rdi
+#  define ALGN_PTR_REG	rcx
+# endif
+
+# define XMMZERO	xmm23
+# define YMMZERO	ymm23
+ # define XMMMATCH	xmm16
+ # define YMMMATCH	ymm16
+ # define YMM1		ymm17
+@@ -44,6 +58,8 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCHR)
+@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
+ 	/* Check for zero length.  */
+ 	test	%RDX_LP, %RDX_LP
+ 	jz	L(zero)
+-# endif
+-	movl	%edi, %ecx
+-# ifdef USE_AS_WMEMCHR
+-	shl	$2, %RDX_LP
+-# else
+
+ #  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
+ 	/* Broadcast CHAR to YMMMATCH.  */
+ 	VPBROADCAST %esi, %YMMMATCH
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-
+	VPCMP	$0, (%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ # ifndef USE_AS_RAWMEMCHR
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rdx
+-	jbe	L(zero)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	jnz	L(first_vec_x0)
+	addq	%rdi, %rax
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	ret
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-	jmp	L(more_4x_vec)
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+	.p2align 5
+L(first_vec_x0):
+	/* Check if first match was before length.  */
+	tzcntl	%eax, %eax
+	xorl	%ecx, %ecx
+	cmpl	%eax, %edx
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	cmovle	%rcx, %rax
+	ret
+# else
+	/* NB: first_vec_x0 is 17 bytes which will leave
+	   cross_page_boundary (which is relatively cold) close enough
+	   to ideal alignment. So only realign L(cross_page_boundary) if
+	   rawmemchr.  */
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+	/* Save pointer before aligning as its original value is
+	   necessary for computer return address if byte is found or
+	   adjusting length if it is not and this is memchr.  */
+	movq	%rdi, %rcx
+	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+	   for rawmemchr.  */
+	andq	$-VEC_SIZE, %ALGN_PTR_REG
+	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+	kmovd	%k0, %r8d
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
+	sarl	$2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
+	subl	%eax, %esi
+ # endif
+-	andq	$-VEC_SIZE, %rdi
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+	andl	$(CHAR_PER_VEC - 1), %eax
+ # endif
+	/* Remove the leading bytes.  */
+	sarxl	%eax, %r8d, %eax
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+	cmpq	%rsi, %rdx
+	jbe	L(first_vec_x0)
+# endif
+	testl	%eax, %eax
+	jz	L(cross_page_continue)
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+	addq	%RAW_PTR_REG, %rax
+ # endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+-# ifndef USE_AS_RAWMEMCHR
+-        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
+-	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
+-	   overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Check the end of data.  */
+-	subq	%rcx, %rdx
+-	jbe	L(zero)
+-# endif
+	.p2align 4
+L(first_vec_x2):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+	.p2align 4
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-# ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Align data to VEC_SIZE.  */
+L(cross_page_continue):
+	xorl	%ecx, %ecx
+	subl	%edi, %ecx
+	andq	$-VEC_SIZE, %rdi
+	/* esi is for adjusting length to see if near the end.  */
+	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %esi
+#  endif
+# else
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+	/* Load first VEC regardless.  */
+	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+	/* Adjust length. If near end handle specially.  */
+	subq	%rsi, %rdx
+	jbe	L(last_4x_vec_or_less)
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+
+ 
+ # ifndef USE_AS_RAWMEMCHR
+-	subq	$(VEC_SIZE * 4), %rdx
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+	/* Check if at last CHAR_PER_VEC * 4 length.  */
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_4x_vec_or_less_cmpeq)
+	addq	$VEC_SIZE, %rdi
+ 
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+	 */
+#  ifdef USE_AS_WMEMCHR
+	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-
+-# ifndef USE_AS_RAWMEMCHR
+-	/* Adjust length.  */
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+#  else
+	addq	%rdi, %rdx
+	andq	$-(4 * VEC_SIZE), %rdi
+	subq	%rdi, %rdx
+#  endif
+# else
+	addq	$VEC_SIZE, %rdi
+	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+ 
+	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
+-	kord	%k1, %k2, %k5
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
+-
+-	kord	%k3, %k4, %k6
+-	kortestd %k5, %k6
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+	/* It would be possible to save some instructions using 4x VPCMP
+	   but bottleneck on port 5 makes it not woth it.  */
+	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+	/* xor will set bytes match esi to zero.  */
+	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+-	jmp	L(loop_4x_vec)
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd %k2, %k3
+	jz	L(loop_4x_vec)
+ # else
+-	subq	$(VEC_SIZE * 4), %rdx
+	kortestd %k2, %k3
+	jnz	L(loop_4x_vec_end)
+
+	subq	$-(VEC_SIZE * 4), %rdi
+
+	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	ja	L(loop_4x_vec)
+ 
+	/* Fall through into less than 4 remaining vectors of length case.
+	 */
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	addq	$(VEC_SIZE * 3), %rdi
+	.p2align 4
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %edx
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(first_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	/* If remaining length > CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jg	L(last_4x_vec)
+ 
+-	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+L(last_2x_vec):
+	/* If remaining length < CHAR_PER_VEC.  */
+	addl	$CHAR_PER_VEC, %edx
+	jle	L(zero_end)
+ 
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+	/* Check VEC2 and compare any match with remaining length.  */
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+	ret
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+ 
+-	jnz	L(first_vec_x3_check)
+	.p2align 4
+L(first_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Adjust length.  */
+	subl	$-(CHAR_PER_VEC * 4), %edx
+	/* Check if match within remaining length.  */
+	cmpl	%eax, %edx
+	jbe	L(set_zero_end)
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+L(set_zero_end):
+ 	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %edx
+-	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+	/* rawmemchr will fall through into this if match was found in
+	   loop.  */
+
+	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-	testl	%eax, %eax
+# ifdef USE_AS_WMEMCHR
+	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+# else
+	incl	%eax
+# endif
+	jnz	L(last_vec_x1_return)
+ 
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %edx
+-	jle	L(zero)
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2_return)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+-	kmovd	%k1, %eax
+	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	xorl	%eax, %eax
+-	ret
+	jnz	L(last_vec_x3_return)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+#  ifdef USE_AS_WMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+#  else
+ 	addq	%rdi, %rax
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x2_check):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+#  endif
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rdx
+-	jbe	L(zero)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(first_vec_x0):
+L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+ # endif
+ 	ret
+ 
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check first VEC regardless.  */
+	testl	%eax, %eax
+	jnz	L(first_vec_x1_check)
+
+	/* If remaining length <= CHAR_PER_VEC * 2.  */
+	addl	$(CHAR_PER_VEC * 2), %edx
+	jle	L(last_2x_vec)
+
+ 	.p2align 4
+-L(first_vec_x1):
+L(last_4x_vec):
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(last_vec_x2)
+
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Create mask for possible matches within remaining length.  */
+#  ifdef USE_AS_WMEMCHR
+	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+	bzhil	%edx, %ecx, %ecx
+#  else
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+#  endif
+	/* Test matches in data against length match.  */
+	andl	%ecx, %eax
+	jnz	L(last_vec_x3)
+
+	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+	   remaining length was found to be > CHAR_PER_VEC * 2.  */
+	subl	$CHAR_PER_VEC, %edx
+	jbe	L(zero_end2)
+
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	kmovd	%k0, %eax
+	/* Shift remaining length mask for last VEC.  */
+#  ifdef USE_AS_WMEMCHR
+	shrl	$CHAR_PER_VEC, %ecx
+#  else
+	shrq	$CHAR_PER_VEC, %rcx
+#  endif
+	andl	%ecx, %eax
+	jz	L(zero_end2)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x2):
+L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(4x_vec_end):
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	testl	%eax, %eax
+-L(first_vec_x3):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WMEMCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-# endif
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+# endif
+ 
+ END (MEMCHR)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-27.patch
+++ b/SOURCES/glibc-RHEL-15696-27.patch
@ -0,0 +1,30 @@
+From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
+From: Alice Xu <alice.d.xu@gmail.com>
+Date: Fri, 7 May 2021 19:03:21 -0700
+Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+An unknown vector operation occurred in commit 2a76821c308. Fix it by
+using "ymm{k1}{z}" instead of "ymm {k1} {z}".
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index 81d5cd64..f3fdad4f 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -271,7 +271,7 @@ L(loop_4x_vec):
+ 	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ 	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+-	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
+	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-28.patch
+++ b/SOURCES/glibc-RHEL-15696-28.patch
@ -0,0 +1,566 @@
+From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 22 Jun 2021 20:42:10 -0700
+Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
+Content-type: text/plain; charset=UTF-8
+
+Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
+version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
+and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
+This also removes the unused symbols, __GI___strlen_sse2 and
+__GI___wcsnlen_sse4_1.
+---
+ sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
+ sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
+ sysdeps/x86_64/strlen.S                   | 243 +-------------------
+ 4 files changed, 262 insertions(+), 242 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
+
+Conflicts:
+	sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+	(Copyright dates, URL)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
+index 7bc57b8d..449c8a7f 100644
+--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
+@@ -20,4 +20,4 @@
+ # define strlen __strlen_sse2
+ #endif
+ 
+-#include "../strlen.S"
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+new file mode 100644
+index 00000000..8f660bb9
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+   Copyright (C) 2012-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU		pminud
+# define PCMPEQ		pcmpeqd
+# define SHIFT_RETURN	shrq $2, %rax
+#else
+# define PMINU		pminub
+# define PCMPEQ		pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+	%rdi   - s
+	%r10  (s+n) & (~(64-1))
+	%r11   s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+#define FIND_ZERO	\
+	PCMPEQ	(%rax), %xmm0;	\
+	PCMPEQ	16(%rax), %xmm1;	\
+	PCMPEQ	32(%rax), %xmm2;	\
+	PCMPEQ	48(%rax), %xmm3;	\
+	pmovmskb	%xmm0, %esi;	\
+	pmovmskb	%xmm1, %edx;	\
+	pmovmskb	%xmm2, %r8d;	\
+	pmovmskb	%xmm3, %ecx;	\
+	salq	$16, %rdx;	\
+	salq	$16, %rcx;	\
+	orq	%rsi, %rdx;	\
+	orq	%r8, %rcx;	\
+	salq	$32, %rcx;	\
+	orq	%rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0.  */
+	test	%RSI_LP, %RSI_LP
+	jne	L(n_nonzero)
+	xor	%rax, %rax
+	ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+	shl	$2, %RSI_LP
+# endif
+
+/* Initialize long lived registers.  */
+
+	add	%RDI_LP, %RSI_LP
+	mov	%RSI_LP, %R10_LP
+	and	$-64, %R10_LP
+	mov	%RSI_LP, %R11_LP
+#endif
+
+	pxor	%xmm0, %xmm0
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+	movq	%rdi, %rax
+	movq	%rdi, %rcx
+	andq	$4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+	cmpq	$4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower.  */
+	ja	L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes.  */
+# define STRNLEN_PROLOG	\
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG  andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string.  */
+#define PROLOG(lab)	\
+	movq	%rdi, %rcx;	\
+	xorq	%rax, %rcx;	\
+	STRNLEN_PROLOG;	\
+	sarq	%cl, %rdx;	\
+	test	%rdx, %rdx;	\
+	je	L(lab);	\
+	bsfq	%rdx, %rax;	\
+	SHIFT_RETURN;		\
+	ret
+
+#ifdef AS_STRNLEN
+	andq	$-16, %rax
+	FIND_ZERO
+#else
+	/* Test first 16 bytes unaligned.  */
+	movdqu	(%rax), %xmm4
+	PCMPEQ	%xmm0, %xmm4
+	pmovmskb	%xmm4, %edx
+	test	%edx, %edx
+	je 	L(next48_bytes)
+	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+	SHIFT_RETURN
+	ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+	andq	$-16, %rax
+	PCMPEQ 16(%rax), %xmm1
+	PCMPEQ 32(%rax), %xmm2
+	PCMPEQ 48(%rax), %xmm3
+	pmovmskb	%xmm1, %edx
+	pmovmskb	%xmm2, %r8d
+	pmovmskb	%xmm3, %ecx
+	salq	$16, %rdx
+	salq	$16, %rcx
+	orq	%r8, %rcx
+	salq	$32, %rcx
+	orq	%rcx, %rdx
+#endif
+
+	/* When no zero byte is found xmm1-3 are zero so we do not have to
+	   zero them.  */
+	PROLOG(loop)
+
+	.p2align 4
+L(cross_page):
+	andq	$-64, %rax
+	FIND_ZERO
+	PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1).  */
+L(strnlen_ret):
+	bts	%rsi, %rdx
+	sarq	%cl, %rdx
+	test	%rdx, %rdx
+	je	L(loop_init)
+	bsfq	%rdx, %rax
+	SHIFT_RETURN
+	ret
+#endif
+	.p2align 4
+L(loop_init):
+	pxor	%xmm1, %xmm1
+	pxor	%xmm2, %xmm2
+	pxor	%xmm3, %xmm3
+#ifdef AS_STRNLEN
+	.p2align 4
+L(loop):
+
+	addq	$64, %rax
+	cmpq	%rax, %r10
+	je	L(exit_end)
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit_end):
+	cmp	%rax, %r11
+	je	L(first) /* Do not read when end is at page boundary.  */
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+L(first):
+	bts	%r11, %rdx
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+	.p2align 4
+L(exit):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#else
+
+	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+	.p2align 4
+L(loop):
+
+	movdqa	64(%rax), %xmm0
+	PMINU	80(%rax), %xmm0
+	PMINU	96(%rax), %xmm0
+	PMINU	112(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit64)
+
+	subq	$-128, %rax
+
+	movdqa	(%rax), %xmm0
+	PMINU	16(%rax), %xmm0
+	PMINU	32(%rax), %xmm0
+	PMINU	48(%rax), %xmm0
+	PCMPEQ	%xmm3, %xmm0
+	pmovmskb	%xmm0, %edx
+	testl	%edx, %edx
+	jne	L(exit0)
+	jmp	L(loop)
+
+	.p2align 4
+L(exit64):
+	addq	$64, %rax
+L(exit0):
+	pxor	%xmm0, %xmm0
+	FIND_ZERO
+
+	bsfq	%rdx, %rdx
+	addq	%rdx, %rax
+	subq	%rdi, %rax
+	SHIFT_RETURN
+	ret
+
+#endif
+
+END(strlen)
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+index a8cab0cb..5fa51fe0 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+@@ -2,4 +2,4 @@
+ #define AS_STRNLEN
+ #define strlen	__wcsnlen_sse4_1
+ 
+-#include "../strlen.S"
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
+index f845f3d4..ad047d84 100644
+--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
+@@ -1,5 +1,5 @@
+-/* SSE2 version of strlen/wcslen.
+-   Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+ 
+    The GNU C Library is free software; you can redistribute it and/or
+@@ -16,243 +16,6 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
+ 
+-#ifdef AS_WCSLEN
+-# define PMINU		pminud
+-# define PCMPEQ		pcmpeqd
+-# define SHIFT_RETURN	shrq $2, %rax
+-#else
+-# define PMINU		pminub
+-# define PCMPEQ		pcmpeqb
+-# define SHIFT_RETURN
+-#endif
+-
+-/* Long lived register in strlen(s), strnlen(s, n) are:
+-
+-	%xmm3 - zero
+-	%rdi   - s
+-	%r10  (s+n) & (~(64-1))
+-	%r11   s+n
+-*/
+-
+-
+-.text
+-ENTRY(strlen)
+-
+-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
+-#define FIND_ZERO	\
+-	PCMPEQ	(%rax), %xmm0;	\
+-	PCMPEQ	16(%rax), %xmm1;	\
+-	PCMPEQ	32(%rax), %xmm2;	\
+-	PCMPEQ	48(%rax), %xmm3;	\
+-	pmovmskb	%xmm0, %esi;	\
+-	pmovmskb	%xmm1, %edx;	\
+-	pmovmskb	%xmm2, %r8d;	\
+-	pmovmskb	%xmm3, %ecx;	\
+-	salq	$16, %rdx;	\
+-	salq	$16, %rcx;	\
+-	orq	%rsi, %rdx;	\
+-	orq	%r8, %rcx;	\
+-	salq	$32, %rcx;	\
+-	orq	%rcx, %rdx;
+-
+-#ifdef AS_STRNLEN
+-/* Do not read anything when n==0.  */
+-	test	%RSI_LP, %RSI_LP
+-	jne	L(n_nonzero)
+-	xor	%rax, %rax
+-	ret
+-L(n_nonzero):
+-# ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+-# endif
+-
+-/* Initialize long lived registers.  */
+-
+-	add	%RDI_LP, %RSI_LP
+-	mov	%RSI_LP, %R10_LP
+-	and	$-64, %R10_LP
+-	mov	%RSI_LP, %R11_LP
+-#endif
+-
+-	pxor	%xmm0, %xmm0
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-	movq	%rdi, %rax
+-	movq	%rdi, %rcx
+-	andq	$4095, %rcx
+-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
+-	cmpq	$4047, %rcx
+-/* We cannot unify this branching as it would be ~6 cycles slower.  */
+-	ja	L(cross_page)
+-
+-#ifdef AS_STRNLEN
+-/* Test if end is among first 64 bytes.  */
+-# define STRNLEN_PROLOG	\
+-	mov	%r11, %rsi;	\
+-	subq	%rax, %rsi;	\
+-	andq	$-64, %rax;	\
+-	testq	$-64, %rsi;	\
+-	je	L(strnlen_ret)
+-#else
+-# define STRNLEN_PROLOG  andq $-64, %rax;
+-#endif
+-
+-/* Ignore bits in mask that come before start of string.  */
+-#define PROLOG(lab)	\
+-	movq	%rdi, %rcx;	\
+-	xorq	%rax, %rcx;	\
+-	STRNLEN_PROLOG;	\
+-	sarq	%cl, %rdx;	\
+-	test	%rdx, %rdx;	\
+-	je	L(lab);	\
+-	bsfq	%rdx, %rax;	\
+-	SHIFT_RETURN;		\
+-	ret
+-
+-#ifdef AS_STRNLEN
+-	andq	$-16, %rax
+-	FIND_ZERO
+-#else
+-	/* Test first 16 bytes unaligned.  */
+-	movdqu	(%rax), %xmm4
+-	PCMPEQ	%xmm0, %xmm4
+-	pmovmskb	%xmm4, %edx
+-	test	%edx, %edx
+-	je 	L(next48_bytes)
+-	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
+-	SHIFT_RETURN
+-	ret
+-
+-L(next48_bytes):
+-/* Same as FIND_ZERO except we do not check first 16 bytes.  */
+-	andq	$-16, %rax
+-	PCMPEQ 16(%rax), %xmm1
+-	PCMPEQ 32(%rax), %xmm2
+-	PCMPEQ 48(%rax), %xmm3
+-	pmovmskb	%xmm1, %edx
+-	pmovmskb	%xmm2, %r8d
+-	pmovmskb	%xmm3, %ecx
+-	salq	$16, %rdx
+-	salq	$16, %rcx
+-	orq	%r8, %rcx
+-	salq	$32, %rcx
+-	orq	%rcx, %rdx
+-#endif
+-
+-	/* When no zero byte is found xmm1-3 are zero so we do not have to
+-	   zero them.  */
+-	PROLOG(loop)
+-
+-	.p2align 4
+-L(cross_page):
+-	andq	$-64, %rax
+-	FIND_ZERO
+-	PROLOG(loop_init)
+-
+-#ifdef AS_STRNLEN
+-/* We must do this check to correctly handle strnlen (s, -1).  */
+-L(strnlen_ret):
+-	bts	%rsi, %rdx
+-	sarq	%cl, %rdx
+-	test	%rdx, %rdx
+-	je	L(loop_init)
+-	bsfq	%rdx, %rax
+-	SHIFT_RETURN
+-	ret
+-#endif
+-	.p2align 4
+-L(loop_init):
+-	pxor	%xmm1, %xmm1
+-	pxor	%xmm2, %xmm2
+-	pxor	%xmm3, %xmm3
+-#ifdef AS_STRNLEN
+-	.p2align 4
+-L(loop):
+-
+-	addq	$64, %rax
+-	cmpq	%rax, %r10
+-	je	L(exit_end)
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit_end):
+-	cmp	%rax, %r11
+-	je	L(first) /* Do not read when end is at page boundary.  */
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-L(first):
+-	bts	%r11, %rdx
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-	.p2align 4
+-L(exit):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#else
+-
+-	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
+-	.p2align 4
+-L(loop):
+-
+-	movdqa	64(%rax), %xmm0
+-	PMINU	80(%rax), %xmm0
+-	PMINU	96(%rax), %xmm0
+-	PMINU	112(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit64)
+-
+-	subq	$-128, %rax
+-
+-	movdqa	(%rax), %xmm0
+-	PMINU	16(%rax), %xmm0
+-	PMINU	32(%rax), %xmm0
+-	PMINU	48(%rax), %xmm0
+-	PCMPEQ	%xmm3, %xmm0
+-	pmovmskb	%xmm0, %edx
+-	testl	%edx, %edx
+-	jne	L(exit0)
+-	jmp	L(loop)
+-
+-	.p2align 4
+-L(exit64):
+-	addq	$64, %rax
+-L(exit0):
+-	pxor	%xmm0, %xmm0
+-	FIND_ZERO
+-
+-	bsfq	%rdx, %rdx
+-	addq	%rdx, %rax
+-	subq	%rdi, %rax
+-	SHIFT_RETURN
+-	ret
+-
+-#endif
+-
+-END(strlen)
+ libc_hidden_builtin_def (strlen)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-29.patch
+++ b/SOURCES/glibc-RHEL-15696-29.patch
@ -0,0 +1,181 @@
+From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:19:34 -0400
+Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds the ifunc / build infrastructure
+necessary for wcslen to prefer the sse4.1 implementation
+in strlen-vec.S. test-wcslen.c is passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile          |  4 +-
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++
+ sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++
+ sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++
+ sysdeps/x86_64/multiarch/wcslen.c          |  2 +-
+ sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +-------------
+ 6 files changed, 63 insertions(+), 36 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
+ create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 491c7698..65fde4eb 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcscpy-ssse3 wcscpy-c \
+ 		   wcschr-sse2 wcschr-avx2 \
+ 		   wcsrchr-sse2 wcsrchr-avx2 \
+-		   wcsnlen-sse4_1 wcsnlen-c \
+-		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
+ 		   wcschr-avx2-rtm \
+ 		   wcscmp-avx2-rtm \
+ 		   wcslen-avx2-rtm \
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index f1a6460a..580913ca 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      CPU_FEATURE_USABLE (SSE4_1),
+			      __wcsnlen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+new file mode 100644
+index 00000000..39e33473
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
+@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	return OPTIMIZE (evex);
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+    return OPTIMIZE (sse4_1);
+
+  return OPTIMIZE (sse2);
+}
+diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+new file mode 100644
+index 00000000..7e62621a
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen	__wcslen_sse4_1
+
+#include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
+index 6d06e47c..3b04b75b 100644
+--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
+@@ -24,7 +24,7 @@
+ # undef __wcslen
+ 
+ # define SYMBOL_NAME wcslen
+-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
+ weak_alias (__wcslen, wcslen);
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
+index 20b731ae..06736410 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
+@@ -24,39 +24,7 @@
+ # undef __wcsnlen
+ 
+ # define SYMBOL_NAME wcsnlen
+-# include <init-arch.h>
+-
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+-
+-static inline void *
+-IFUNC_SELECTOR (void)
+-{
+-  const struct cpu_features* cpu_features = __get_cpu_features ();
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+-      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+-    {
+-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+-	return OPTIMIZE (evex);
+-
+-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-	return OPTIMIZE (avx2_rtm);
+-
+-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+-	return OPTIMIZE (avx2);
+-    }
+-
+-  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+-    return OPTIMIZE (sse4_1);
+-
+-  return OPTIMIZE (sse2);
+-}
+# include "ifunc-wcslen.h"
+ 
+ libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
+ weak_alias (__wcsnlen, wcsnlen);
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-3.patch
+++ b/SOURCES/glibc-RHEL-15696-3.patch
@ -0,0 +1,396 @@
+From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:27:25 -0800
+Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
+	length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
+	Likewise.
+	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
+	Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
+	tst-size_t-wmemchr.
+	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
+---
+ sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++--
+ sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++--
+ .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++--
+ .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++--------
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++
+ 6 files changed, 122 insertions(+), 42 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+index 3cd11233..568eebd3 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+index 0240bfa3..0bd5ee99 100644
+--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+@@ -45,28 +45,33 @@
+ 	.section .text.ssse3,"ax",@progbits
+ #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
+ ENTRY (MEMPCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMPCPY_CHK)
+ 
+ ENTRY (MEMPCPY)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY)
+ #endif
+ 
+ #if !defined USE_AS_BCOPY
+ ENTRY (MEMCPY_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMCPY_CHK)
+ #endif
+ 
+ ENTRY (MEMCPY)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ #ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+ #endif
+ 
+ #ifdef USE_AS_MEMMOVE
+diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+index effc3ac2..6ca2bbc9 100644
+--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+@@ -24,27 +24,31 @@
+ 
+ 	.section .text.avx512,"ax",@progbits
+ ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__mempcpy_avx512_no_vzeroupper)
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (__mempcpy_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_chk_avx512_no_vzeroupper)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_avx512_no_vzeroupper)
+ 
+ ENTRY (__memmove_avx512_no_vzeroupper)
+-	mov	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ # ifdef USE_AS_MEMPCPY
+-	add	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+ # endif
+ L(start):
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ 	lea	(%rsi, %rdx), %rcx
+ 	lea	(%rdi, %rdx), %r9
+ 	cmp	$512, %rdx
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c952576c..274aa1c7 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -95,20 +95,20 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start)
+ END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ #endif
+@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 	movq	%rdi, %rax
+ L(start):
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(more_2x_vec)
+ #if !defined USE_MULTIARCH || !IS_IN (libc)
+ L(last_2x_vec):
+@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__mempcpy_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__mempcpy_chk_erms)
+ 
+ /* Only used to measure performance of REP MOVSB.  */
+ ENTRY (__mempcpy_erms)
+-	movq	%rdi, %rax
+	mov	%RDI_LP, %RAX_LP
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+-	addq	%rdx, %rax
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_movsb)
+ END (__mempcpy_erms)
+ 
+ ENTRY (__memmove_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memmove_chk_erms)
+ 
+ ENTRY (__memmove_erms)
+ 	movq	%rdi, %rax
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jz	2f
+ L(start_movsb):
+-	movq	%rdx, %rcx
+-	cmpq	%rsi, %rdi
+	mov	%RDX_LP, %RCX_LP
+	cmp	%RSI_LP, %RDI_LP
+ 	jb	1f
+ 	/* Source == destination is less common.  */
+ 	je	2f
+-	leaq	(%rsi,%rcx), %rdx
+-	cmpq	%rdx, %rdi
+	lea	(%rsi,%rcx), %RDX_LP
+	cmp	%RDX_LP, %RDI_LP
+ 	jb	L(movsb_backward)
+ 1:
+ 	rep movsb
+@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+-	movq	%rdi, %rax
+-	addq	%rdx, %rax
+	mov	%RDI_LP, %RAX_LP
+	add	%RDX_LP, %RAX_LP
+ 	jmp	L(start_erms)
+ END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
+ 
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ # endif
+@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
+ ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+ 	movq	%rdi, %rax
+ L(start_erms):
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(movsb_more_2x_vec)
+ L(last_2x_vec):
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
+@@ -236,7 +244,7 @@ L(movsb):
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+ 1:
+-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+ 	ret
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index ddec7f04..2fe1e5ac 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+new file mode 100644
+index 00000000..66b71e17
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
+@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_memcpy (dest, src);
+      int res = memcmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-30.patch
+++ b/SOURCES/glibc-RHEL-15696-30.patch
@ -0,0 +1,497 @@
+From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 01:56:29 -0400
+Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
+ #27974]
+Content-type: text/plain; charset=UTF-8
+
+This commit fixes the bug mentioned in the previous commit.
+
+The previous implementations of wcsnlen in these files relied on maxlen *
+sizeof(wchar_t) not overflowing, which is not guaranteed by the standard.
+
+The new overflow tests added in the previous commit now
+pass (As well as all the other tests).
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
+ sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
+ 2 files changed, 107 insertions(+), 38 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
+index be8a5db5..37688966 100644
+--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
+@@ -44,21 +44,21 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+ 	/* Check zero length.  */
+#  ifdef __ILP32__
+	/* Clear upper bits.  */
+	and	%RSI_LP, %RSI_LP
+#  else
+ 	test	%RSI_LP, %RSI_LP
+#  endif
+ 	jz	L(zero)
+ 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
+ 	mov	%RSI_LP, %R8_LP
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+-	/* Clear the upper 32 bits.  */
+-	movl	%esi, %esi
+-#  endif
+ # endif
+ 	movl	%edi, %eax
+ 	movq	%rdi, %rdx
+@@ -72,10 +72,10 @@ ENTRY (STRLEN)
+ 
+ 	/* Check the first VEC_SIZE bytes.  */
+ 	VPCMPEQ	(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ # ifdef USE_AS_STRNLEN
+ 	/* If length < VEC_SIZE handle special.  */
+-	cmpq	$VEC_SIZE, %rsi
+	cmpq	$CHAR_PER_VEC, %rsi
+ 	jbe	L(first_vec_x0)
+ # endif
+ 	/* If empty continue to aligned_more. Otherwise return bit
+@@ -84,6 +84,7 @@ ENTRY (STRLEN)
+ 	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -97,9 +98,14 @@ L(zero):
+ L(first_vec_x0):
+ 	/* Set bit for max len so that tzcnt will return min of max len
+ 	   and position of first match.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+ 	btsq	%rsi, %rax
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -113,14 +119,19 @@ L(first_vec_x1):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 4 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	incl	%edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -133,14 +144,19 @@ L(first_vec_x2):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 3 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -153,14 +169,19 @@ L(first_vec_x3):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE * 2 + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 2 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -173,14 +194,19 @@ L(first_vec_x4):
+ # ifdef USE_AS_STRNLEN
+ 	/* Use ecx which was computed earlier to compute correct value.
+ 	 */
+#  ifdef USE_AS_WCSLEN
+	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+#  else
+ 	subl	$(VEC_SIZE + 1), %ecx
+ 	addl	%ecx, %eax
+#  endif
+ # else
+ 	subl	%edx, %edi
+ 	addl	$(VEC_SIZE * 3 + 1), %edi
+ 	addl	%edi, %eax
+ # endif
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -195,10 +221,14 @@ L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+ # ifdef USE_AS_STRNLEN
+-	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+-	   it simplies the logic in last_4x_vec_or_less.  */
+	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+	   because it simplies the logic in last_4x_vec_or_less.  */
+ 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ # endif
+ 	/* Load first VEC regardless.  */
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+@@ -207,34 +237,38 @@ L(cross_page_continue):
+ 	subq	%rcx, %rsi
+ 	jb	L(last_4x_vec_or_less)
+ # endif
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x4)
+ 
+ 	/* Align data to VEC_SIZE * 4 - 1.  */
+ # ifdef USE_AS_STRNLEN
+ 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
+-	cmpq	$(VEC_SIZE * 4 - 1), %rsi
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+ 	jbe	L(last_4x_vec_or_less_load)
+ 	incq	%rdi
+ 	movl	%edi, %ecx
+ 	orq	$(VEC_SIZE * 4 - 1), %rdi
+ 	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+ 	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # else
+@@ -246,13 +280,13 @@ L(cross_page_continue):
+ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	/* Break if at end of length.  */
+-	subq	$(VEC_SIZE * 4), %rsi
+	subq	$(CHAR_PER_VEC * 4), %rsi
+ 	jb	L(last_4x_vec_or_less_cmpeq)
+ # endif
+-	/* Save some code size by microfusing VPMINU with the load. Since
+-	   the matches in ymm2/ymm4 can only be returned if there where no
+-	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+-	 */
+	/* Save some code size by microfusing VPMINU with the load.
+	   Since the matches in ymm2/ymm4 can only be returned if there
+	   where no matches in ymm1/ymm3 respectively there is no issue
+	   with overlap.  */
+ 	vmovdqa	1(%rdi), %ymm1
+ 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
+@@ -260,7 +294,7 @@ L(loop_4x_vec):
+ 
+ 	VPMINU	%ymm2, %ymm4, %ymm5
+ 	VPCMPEQ	%ymm5, %ymm0, %ymm5
+-	vpmovmskb	%ymm5, %ecx
+	vpmovmskb %ymm5, %ecx
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ 	testl	%ecx, %ecx
+@@ -268,27 +302,28 @@ L(loop_4x_vec):
+ 
+ 
+ 	VPCMPEQ	%ymm1, %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	subq	%rdx, %rdi
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x0)
+ 
+ 	VPCMPEQ	%ymm2, %ymm0, %ymm2
+-	vpmovmskb	%ymm2, %eax
+	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_return_x1)
+ 
+ 	/* Combine last 2 VEC.  */
+ 	VPCMPEQ	%ymm3, %ymm0, %ymm3
+-	vpmovmskb	%ymm3, %eax
+-	/* rcx has combined result from all 4 VEC. It will only be used if
+-	   the first 3 other VEC all did not contain a match.  */
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+ 	salq	$32, %rcx
+ 	orq	%rcx, %rax
+ 	tzcntq	%rax, %rax
+ 	subq	$(VEC_SIZE * 2 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -297,15 +332,19 @@ L(loop_4x_vec):
+ # ifdef USE_AS_STRNLEN
+ 	.p2align 4
+ L(last_4x_vec_or_less_load):
+-	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
+	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
+	 */
+ 	subq	$-(VEC_SIZE * 4), %rdi
+ L(last_4x_vec_or_less_cmpeq):
+ 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
+ L(last_4x_vec_or_less):
+-
+-	vpmovmskb	%ymm1, %eax
+-	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+-	   VEC_SIZE * 4.  */
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+	vpmovmskb %ymm1, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off
+	   by VEC_SIZE * 4.  */
+ 	testl	$(VEC_SIZE * 2), %esi
+ 	jnz	L(last_4x_vec)
+ 
+@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -340,6 +380,7 @@ L(last_vec_return_x0):
+ 	subq	$(VEC_SIZE * 4 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -350,6 +391,7 @@ L(last_vec_return_x1):
+ 	subq	$(VEC_SIZE * 3 - 1), %rdi
+ 	addq	%rdi, %rax
+ # ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -366,6 +408,7 @@ L(last_vec_x1_check):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -381,14 +424,14 @@ L(last_4x_vec):
+ 	jnz	L(last_vec_x1)
+ 
+ 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2)
+ 
+ 	/* Normalize length.  */
+ 	andl	$(VEC_SIZE * 4 - 1), %esi
+ 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3)
+ 
+@@ -396,7 +439,7 @@ L(last_4x_vec):
+ 	jb	L(max)
+ 
+ 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	tzcntl	%eax, %eax
+ 	/* Check the end of data.  */
+ 	cmpl	%eax, %esi
+@@ -405,6 +448,7 @@ L(last_4x_vec):
+ 	addl	$(VEC_SIZE * 3 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -419,6 +463,7 @@ L(last_vec_x1):
+ 	incl	%eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -432,6 +477,7 @@ L(last_vec_x2):
+ 	addl	$(VEC_SIZE + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -447,6 +493,7 @@ L(last_vec_x3):
+ 	addl	$(VEC_SIZE * 2 + 1), %eax
+ 	addq	%rdi, %rax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+ 	shrq	$2, %rax
+ #  endif
+ 	VZEROUPPER_RETURN
+@@ -455,13 +502,13 @@ L(max_end):
+ 	VZEROUPPER_RETURN
+ # endif
+ 
+-	/* Cold case for crossing page with first load.	 */
+	/* Cold case for crossing page with first load.  */
+ 	.p2align 4
+ L(cross_page_boundary):
+ 	/* Align data to VEC_SIZE - 1.  */
+ 	orq	$(VEC_SIZE - 1), %rdi
+ 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+-	vpmovmskb	%ymm1, %eax
+	vpmovmskb %ymm1, %eax
+ 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ 	   so no need to manually mod rdx.  */
+ 	sarxl	%edx, %eax, %eax
+@@ -470,6 +517,10 @@ L(cross_page_boundary):
+ 	jnz	L(cross_page_less_vec)
+ 	leaq	1(%rdi), %rcx
+ 	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get wchar_t count.  */
+	shrl	$2, %ecx
+#  endif
+ 	/* Check length.  */
+ 	cmpq	%rsi, %rcx
+ 	jb	L(cross_page_continue)
+@@ -479,6 +530,7 @@ L(cross_page_boundary):
+ 	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+ #  ifdef USE_AS_WCSLEN
+	/* NB: Divide length by 4 to get wchar_t count.  */
+ 	shrl	$2, %eax
+ #  endif
+ # endif
+@@ -489,6 +541,10 @@ L(return_vzeroupper):
+ 	.p2align 4
+ L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+#  ifdef USE_AS_WCSLEN
+	/* NB: Multiply length by 4 to get byte count.  */
+	sall	$2, %esi
+#  endif
+ 	cmpq	%rax, %rsi
+ 	cmovb	%esi, %eax
+ #  ifdef USE_AS_WCSLEN
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 8f660bb9..439e486a 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -65,12 +65,25 @@ ENTRY(strlen)
+ 	ret
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+-	shl	$2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+   overflow the only way this program doesn't have undefined behavior 
+   is if there is a null terminator in valid memory so wcslen will 
+   suffice.  */
+	mov	%RSI_LP, %R10_LP
+	sar	$62, %R10_LP
+	test	%R10_LP, %R10_LP
+	jnz	__wcslen_sse4_1
+	sal	$2, %RSI_LP
+ # endif
+ 
+
+ /* Initialize long lived registers.  */
+ 
+ 	add	%RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+	jbe	__wcslen_sse4_1
+# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-31.patch
+++ b/SOURCES/glibc-RHEL-15696-31.patch
@ -0,0 +1,745 @@
+From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 19:36:06 -0400
+Subject: [PATCH] x86: Optimize strlen-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strlen-evex.S. The
+optimizations are mostly small things but they add up to roughly
+10-30% performance improvement for strlen. The results for strnlen are
+a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
+test-wcsnlen are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
+ 1 file changed, 317 insertions(+), 264 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
+index 05838190..4bf6874b 100644
+--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
+@@ -29,11 +29,13 @@
+ # ifdef USE_AS_WCSLEN
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+-#  define SHIFT_REG	r9d
+#  define SHIFT_REG ecx
+#  define CHAR_SIZE	4
+ # else
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+-#  define SHIFT_REG	ecx
+#  define SHIFT_REG edx
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -46,132 +48,165 @@
+ # define YMM6		ymm22
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRLEN)
+ # ifdef USE_AS_STRNLEN
+-	/* Check for zero length.  */
+	/* Check zero length.  */
+ 	test	%RSI_LP, %RSI_LP
+ 	jz	L(zero)
+-#  ifdef USE_AS_WCSLEN
+-	shl	$2, %RSI_LP
+-#  elif defined __ILP32__
+#  ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%esi, %esi
+ #  endif
+ 	mov	%RSI_LP, %R8_LP
+ # endif
+-	movl	%edi, %ecx
+-	movq	%rdi, %rdx
+	movl	%edi, %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+-
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check.  */
+	andl	$(PAGE_SIZE - 1), %eax
+ 	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
+ 	   null byte.  */
+ 	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+ # ifdef USE_AS_STRNLEN
+-	jnz	L(first_vec_x0_check)
+-	/* Adjust length and check the end of data.  */
+-	subq	$VEC_SIZE, %rsi
+-	jbe	L(max)
+-# else
+-	jnz	L(first_vec_x0)
+	/* If length < CHAR_PER_VEC handle special.  */
+	cmpq	$CHAR_PER_VEC, %rsi
+	jbe	L(first_vec_x0)
+ # endif
+-
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+	testl	%eax, %eax
+	jz	L(aligned_more)
+	tzcntl	%eax, %eax
+	ret
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+-	addq	%rcx, %rsi
+L(zero):
+	xorl	%eax, %eax
+	ret
+ 
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	.p2align 4
+L(first_vec_x0):
+	/* Set bit for max len so that tzcnt will return min of max len
+	   and position of first match.  */
+	btsq	%rsi, %rax
+	tzcntl	%eax, %eax
+	ret
+ # endif
+-	jmp	L(more_4x_vec)
+ 
+ 	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+-	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl	$2, %SHIFT_REG
+L(first_vec_x1):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+# ifdef USE_AS_STRNLEN
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	CHAR_PER_VEC(%rdi, %rax), %eax
+ # endif
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+	ret
+ 
+-	/* Remove the leading bytes.  */
+-	sarxl	%SHIFT_REG, %eax, %eax
+-	testl	%eax, %eax
+-	jz	L(aligned_more)
+	.p2align 4
+L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-# endif
+-	addq	%rdi, %rax
+-	addq	%rcx, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+-L(aligned_more):
+L(first_vec_x3):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
+-	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
+-	    to void possible addition overflow.  */
+-	negq	%rcx
+-	addq	$VEC_SIZE, %rcx
+-
+-	/* Check the end of data.  */
+-	subq	%rcx, %rsi
+-	jbe	L(max)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
+ # endif
+	ret
+ 
+-	addq	$VEC_SIZE, %rdi
+-
+	.p2align 4
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	/* Safe to use 32 bit instructions as these are only called for
+	   size = [1, 159].  */
+ # ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+	/* Use ecx which was computed earlier to compute correct value.
+	 */
+	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+	subl	%edx, %edi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %edi
+#  endif
+	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
+ # endif
+	ret
+ 
+-L(more_4x_vec):
+	.p2align 5
+L(aligned_more):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-(VEC_SIZE), %rdi
+L(cross_page_continue):
+ 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+ 	   since data is only aligned to VEC_SIZE.  */
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+# ifdef USE_AS_STRNLEN
+	/* + CHAR_SIZE because it simplies the logic in
+	   last_4x_vec_or_less.  */
+	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+	subq	%rdx, %rcx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+# endif
+	/* Load first VEC regardless.  */
+ 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+	/* Adjust length. If near end handle specially.  */
+	subq	%rcx, %rsi
+	jb	L(last_4x_vec_or_less)
+# endif
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+	test	%eax, %eax
+ 	jnz	L(first_vec_x2)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+@@ -179,258 +214,276 @@ L(more_4x_vec):
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x3)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-# ifdef USE_AS_STRNLEN
+-	subq	$(VEC_SIZE * 4), %rsi
+-	jbe	L(last_4x_vec_or_less)
+-# endif
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+ 
+	addq	$VEC_SIZE, %rdi
+ # ifdef USE_AS_STRNLEN
+-	/* Adjust length.  */
+	/* Check if at last VEC_SIZE * 4 length.  */
+	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
+	jbe	L(last_4x_vec_or_less_load)
+	movl	%edi, %ecx
+	andl	$(VEC_SIZE * 4 - 1), %ecx
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarl	$2, %ecx
+#  endif
+	/* Readjust length.  */
+ 	addq	%rcx, %rsi
+ # endif
+	/* Align data to VEC_SIZE * 4.  */
+	andq	$-(VEC_SIZE * 4), %rdi
+ 
+	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVA	(%rdi), %YMM1
+-	VMOVA	VEC_SIZE(%rdi), %YMM2
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
+-
+-	VPMINU	%YMM1, %YMM2, %YMM5
+-	VPMINU	%YMM3, %YMM4, %YMM6
+	/* Load first VEC regardless.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+	/* Break if at end of length.  */
+	subq	$(CHAR_PER_VEC * 4), %rsi
+	jb	L(last_4x_vec_or_less_cmpeq)
+# endif
+	/* Save some code size by microfusing VPMINU with the load. Since
+	   the matches in ymm2/ymm4 can only be returned if there where no
+	   matches in ymm1/ymm3 respectively there is no issue with overlap.
+	 */
+	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+	VPCMP	$0, %YMM2, %YMMZERO, %k0
+	VPCMP	$0, %YMM4, %YMMZERO, %k1
+	subq	$-(VEC_SIZE * 4), %rdi
+	kortestd	%k0, %k1
+	jz	L(loop_4x_vec)
+
+	/* Check if end was in first half.  */
+	kmovd	%k0, %eax
+	subq	%rdx, %rdi
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rdi
+# endif
+	testl	%eax, %eax
+	jz	L(second_vec_return)
+ 
+-	VPMINU	%YMM5, %YMM6, %YMM5
+-	VPCMP	$0, %YMM5, %YMMZERO, %k0
+-	ktestd	%k0, %k0
+-	jnz	L(4x_vec_end)
+	VPCMP	$0, %YMM1, %YMMZERO, %k2
+	kmovd	%k2, %edx
+	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
+# ifdef USE_AS_WCSLEN
+	sall	$CHAR_PER_VEC, %eax
+	orl	%edx, %eax
+	tzcntl	%eax, %eax
+# else
+	salq	$CHAR_PER_VEC, %rax
+	orq	%rdx, %rax
+	tzcntq	%rax, %rax
+# endif
+	addq	%rdi, %rax
+	ret
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+ 
+-# ifndef USE_AS_STRNLEN
+-	jmp	L(loop_4x_vec)
+-# else
+-	subq	$(VEC_SIZE * 4), %rsi
+-	ja	L(loop_4x_vec)
+# ifdef USE_AS_STRNLEN
+ 
+L(last_4x_vec_or_less_load):
+	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+	VPCMP	$0, %YMM1, %YMMZERO, %k0
+	addq	$(VEC_SIZE * 3), %rdi
+ L(last_4x_vec_or_less):
+-	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
+-	addl	$(VEC_SIZE * 2), %esi
+-	jle	L(last_2x_vec)
+-
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
+	   VEC_SIZE * 4.  */
+	testl	$(CHAR_PER_VEC * 2), %esi
+	jnz	L(last_4x_vec)
+
+	/* length may have been negative or positive by an offset of
+	   CHAR_PER_VEC * 4 depending on where this was called from. This
+	   fixes that.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1_check)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	/* Check the end of data.  */
+	subl	$CHAR_PER_VEC, %esi
+	jb	L(max)
+ 
+ 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+ 
+-	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x3_check)
+	subq	%rdx, %rdi
+#  ifdef USE_AS_WCSLEN
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+#  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+L(max):
+ 	movq	%r8, %rax
+	ret
+# endif
+
+	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
+	   in the 4x VEC loop can use 2 byte encoding.  */
+	.p2align 4
+L(second_vec_return):
+	VPCMP	$0, %YMM3, %YMMZERO, %k0
+	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
+# ifdef USE_AS_WCSLEN
+	kunpckbw	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+# else
+	kunpckdq	%k0, %k1, %k0
+	kmovq	%k0, %rax
+	tzcntq	%rax, %rax
+# endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+	ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+	tzcntl	%eax, %eax
+	/* Check the end of data.  */
+	cmpl	%eax, %esi
+	jb	L(max)
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	addl	$(VEC_SIZE * 2), %esi
+L(last_4x_vec):
+	/* Test first 2x VEC normally.  */
+	testl	%eax, %eax
+	jnz	L(last_vec_x1)
+ 
+-	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0_check)
+-	subl	$VEC_SIZE, %esi
+-	jle	L(max)
+	jnz	L(last_vec_x2)
+ 
+-	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
+	/* Normalize length.  */
+	andl	$(CHAR_PER_VEC * 4 - 1), %esi
+	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1_check)
+-	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+	jnz	L(last_vec_x3)
+ 
+-	.p2align 4
+-L(first_vec_x0_check):
+	/* Check the end of data.  */
+	subl	$(CHAR_PER_VEC * 3), %esi
+	jb	L(max)
+
+	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x1_check):
+L(last_vec_x1):
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x2_check):
+L(last_vec_x2):
+ 	tzcntl	%eax, %eax
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+-	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ 	ret
+ 
+ 	.p2align 4
+-L(first_vec_x3_check):
+L(last_vec_x3):
+ 	tzcntl	%eax, %eax
+-#  ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-#  endif
+	subl	$(CHAR_PER_VEC * 2), %esi
+ 	/* Check the end of data.  */
+-	cmpq	%rax, %rsi
+-	jbe	L(max)
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+	cmpl	%eax, %esi
+	jb	L(max_end)
+	subq	%rdx, %rdi
+ #  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+	sarq	$2, %rdi
+ #  endif
+	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
+ 	ret
+-
+-	.p2align 4
+-L(max):
+L(max_end):
+ 	movq	%r8, %rax
+-#  ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-#  endif
+-	ret
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+ 	ret
+ # endif
+ 
+	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+-L(first_vec_x0):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+	VPCMP	$0, (%rdi), %YMMZERO, %k0
+	kmovd	%k0, %eax
+	/* Remove the leading bytes.  */
+ # ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+	/* NB: Divide shift count by 4 since each bit in K0 represent 4
+	   bytes.  */
+	movl	%edx, %ecx
+	shrl	$2, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+ # endif
+-	ret
+-
+-	.p2align 4
+-L(first_vec_x1):
+	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
+	sarxl	%SHIFT_REG, %eax, %eax
+	testl	%eax, %eax
+# ifndef USE_AS_STRNLEN
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+ 	ret
+-
+-	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+# else
+	jnz	L(cross_page_less_vec)
+#  ifndef USE_AS_WCSLEN
+	movl	%edx, %ecx
+	andl	$(CHAR_PER_VEC - 1), %ecx
+#  endif
+	movl	$CHAR_PER_VEC, %eax
+	subl	%ecx, %eax
+	/* Check the end of data.  */
+	cmpq	%rax, %rsi
+	ja	L(cross_page_continue)
+	movl	%esi, %eax
+ 	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	VPCMP	$0, %YMM1, %YMMZERO, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-	VPCMP	$0, %YMM2, %YMMZERO, %k1
+-	kmovd	%k1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-	VPCMP	$0, %YMM3, %YMMZERO, %k2
+-	kmovd	%k2, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-	VPCMP	$0, %YMM4, %YMMZERO, %k3
+-	kmovd	%k3, %eax
+-L(first_vec_x3):
+L(cross_page_less_vec):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_WCSLEN
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	sall	$2, %eax
+-# endif
+-	addq	$(VEC_SIZE * 3), %rax
+-	addq	%rdi, %rax
+-	subq	%rdx, %rax
+-# ifdef USE_AS_WCSLEN
+-	shrq	$2, %rax
+-# endif
+	/* Select min of length and position of first null.  */
+	cmpq	%rax, %rsi
+	cmovb	%esi, %eax
+ 	ret
+# endif
+ 
+ END (STRLEN)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-32.patch
+++ b/SOURCES/glibc-RHEL-15696-32.patch
@ -0,0 +1,158 @@
+From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Wed, 30 Jun 2021 10:47:06 -0700
+Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
+Content-type: text/plain; charset=UTF-8
+
+From
+
+https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+
+* Intel TSX will be disabled by default.
+* The processor will force abort all Restricted Transactional Memory (RTM)
+  transactions by default.
+* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
+  which is set to indicate to updated software that the loaded microcode is
+  forcing RTM abort.
+* On processors that enumerate support for RTM, the CPUID enumeration bits
+  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
+  be set by default after microcode update.
+* Workloads that were benefited from Intel TSX might experience a change
+  in performance.
+* System software may use a new bit in Model-Specific Register (MSR) 0x10F
+  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
+  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
+  disabled.
+
+1. Add RTM_ALWAYS_ABORT to CPUID features.
+2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
+string/tst-memchr-rtm etc. testcases on the affected processors, which
+always fail after a microcode update.
+3. Check RTM feature, instead of usability, against /proc/cpuinfo.
+
+This fixes BZ #28033.
+---
+ manual/platform.texi                    | 3 +++
+ sysdeps/x86/cpu-features.c              | 5 ++++-
+ sysdeps/x86/sys/platform/x86.h          | 6 +++---
+ sysdeps/x86/tst-cpu-features-supports.c | 2 +-
+ sysdeps/x86/tst-get-cpu-features.c      | 2 ++
+ 5 files changed, 13 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86/bits/platform/x86.h
+	(doesn't exist)
+	sysdeps/x86/bits/platform/x86.h
+	(account for lack of upstream renames)
+
+diff --git a/manual/platform.texi b/manual/platform.texi
+index 8fec2933..b7e8aef7 100644
+--- a/manual/platform.texi
+++ b/manual/platform.texi
+@@ -510,6 +510,9 @@ capability.
+ @item
+ @code{RTM} -- RTM instruction extensions.
+ 
+@item
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
+
+ @item
+ @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
+ 
+diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
+index 3610ee5c..4889f062 100644
+--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
+@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
+   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
+   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
+-  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
+   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
+   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
+@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
+   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
+   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
+  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
+   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
+   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
+   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
+@@ -779,6 +779,9 @@ no_cpuid:
+     GLRO(dl_platform) = "i586";
+ #endif
+ 
+  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+
+ #if CET_ENABLED
+ # if HAVE_TUNABLES
+   TUNABLE_GET (x86_ibt, tunable_val_t *,
+diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
+index e5cc7c68..7a434926 100644
+--- a/sysdeps/x86/sys/platform/x86.h
+++ b/sysdeps/x86/sys/platform/x86.h
+@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
+ #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
+ #define bit_cpu_MD_CLEAR	(1u << 10)
+-#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
+ #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
+ #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
+ #define bit_cpu_SERIALIZE	(1u << 14)
+@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7
+ #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7
+-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
+ #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
+ #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7
+@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
+ #define reg_AVX512_VP2INTERSECT	edx
+ #define reg_INDEX_7_EDX_9	edx
+ #define reg_MD_CLEAR		edx
+-#define reg_INDEX_7_EDX_11	edx
+#define reg_RTM_ALWAYS_ABORT	edx
+ #define reg_INDEX_7_EDX_12	edx
+ #define reg_INDEX_7_EDX_13	edx
+ #define reg_SERIALIZE		edx
+diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
+index 287cf01f..8100a319 100644
+--- a/sysdeps/x86/tst-cpu-features-supports.c
+++ b/sysdeps/x86/tst-cpu-features-supports.c
+@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
+   fails += CHECK_SUPPORTS (rdpid, RDPID);
+   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
+   fails += CHECK_SUPPORTS (rdseed, RDSEED);
+-  fails += CHECK_SUPPORTS (rtm, RTM);
+  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
+   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
+   fails += CHECK_SUPPORTS (sha, SHA);
+   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
+diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
+index 2763deb6..0717e5d8 100644
+--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
+@@ -183,6 +183,7 @@ do_test (void)
+   CHECK_CPU_FEATURE (UINTR);
+   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE (MD_CLEAR);
+  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE (SERIALIZE);
+   CHECK_CPU_FEATURE (HYBRID);
+   CHECK_CPU_FEATURE (TSXLDTRK);
+@@ -344,6 +345,7 @@ do_test (void)
+   CHECK_CPU_FEATURE_USABLE (FSRM);
+   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
+   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
+  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
+   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
+   CHECK_CPU_FEATURE_USABLE (HYBRID);
+   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-33.patch
+++ b/SOURCES/glibc-RHEL-15696-33.patch
@ -0,0 +1,51 @@
+From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 8 Jul 2021 16:13:19 -0400
+Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
+ #28064]
+Content-type: text/plain; charset=UTF-8
+
+The following commit
+
+commit 6f573a27b6c8b4236445810a44660612323f5a73
+Author: Noah Goldstein <goldstein.w.n@gmail.com>
+Date:   Wed Jun 23 01:19:34 2021 -0400
+
+    x86-64: Add wcslen optimize for sse4.1
+
+Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
+not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
+fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
+implementation list and adding wcslen-sse4.1 to the ifunc
+implementation list.
+
+Testing:
+test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
+well as all other tests in wcsmbs and string.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 580913ca..695cdba6 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcslen_evex)
+-	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+ 			      CPU_FEATURE_USABLE (SSE4_1),
+-			      __wcsnlen_sse4_1)
+			      __wcslen_sse4_1)
+ 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-112.patch
+++ b/SOURCES/glibc-upstream-2.34-112.patch
@ -1,26 +1,38 @@
-commit 38e0d2479413ccdbc02b4c9e9e246eca31e956c9
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Tue Feb 15 08:18:15 2022 -0600
+From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 15 Feb 2022 08:18:15 -0600
+Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
+ #28896]
+Content-type: text/plain; charset=UTF-8

-    x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #28896]
-    
-    In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
-    call strcmp-avx2 and wcscmp-avx2 respectively. This would have
-    not checks around vzeroupper and would trigger spurious
-    aborts. This commit fixes that.
-    
-    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
-    AVX2 machines with and without RTM.
-    
-    Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf)
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. This would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+
+Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile                        |  2 +-
+ sysdeps/x86/tst-strncmp-rtm.c               | 17 ++++++++++++++++-
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  2 +-
+ sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S |  1 +
+ sysdeps/x86_64/multiarch/strncmp-avx2.S     |  1 +
+ sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S |  2 +-
+ sysdeps/x86_64/multiarch/wcsncmp-avx2.S     |  2 +-
+ 7 files changed, 22 insertions(+), 5 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strcmp-avx2.S
+	(split into two patches due to upstream bug differences)

 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
-index 36ca1a7126047b86..af934d6ccf1fa337 100644
+index 5be71ada..2d814915 100644
 --- a/sysdeps/x86/Makefile
 +++ b/sysdeps/x86/Makefile
-@@ -105,7 +105,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
+@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
 CFLAGS-tst-strchr-rtm.c += -mrtm
 CFLAGS-tst-strcpy-rtm.c += -mrtm
 CFLAGS-tst-strlen-rtm.c += -mrtm
@ -30,7 +42,7 @@ index 36ca1a7126047b86..af934d6ccf1fa337 100644
 endif
 
 diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
-index 236ad951b5b59cd1..4d0004b58aae428d 100644
+index 236ad951..4d0004b5 100644
 --- a/sysdeps/x86/tst-strncmp-rtm.c
 +++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -16,6 +16,7 @@
@ -66,7 +78,7 @@ index 236ad951b5b59cd1..4d0004b58aae428d 100644
 +  return status;
 }
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
-index 3dfcb1bf803cf9ec..fa70c994fc25dfd8 100644
+index 5d1c9d90..433ae047 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -95,7 +95,7 @@ ENTRY (STRCMP)
@ -79,7 +91,7 @@ index 3dfcb1bf803cf9ec..fa70c994fc25dfd8 100644
 	/* Convert units: from wide to byte char.  */
 	shl	$2, %RDX_LP
 diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
-index 37d1224bb9b7056b..68bad365ba728eec 100644
+index 37d1224b..68bad365 100644
 --- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
 +++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -1,3 +1,4 @@
@ -88,7 +100,7 @@ index 37d1224bb9b7056b..68bad365ba728eec 100644
 +#define OVERFLOW_STRCMP	__strcmp_avx2_rtm
 #include "strcmp-avx2-rtm.S"
 diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
-index 1678bcc235a4bc6a..f138e9f1fdcf277c 100644
+index 1678bcc2..f138e9f1 100644
 --- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -1,3 +1,4 @@
@ -97,7 +109,7 @@ index 1678bcc235a4bc6a..f138e9f1fdcf277c 100644
 +#define OVERFLOW_STRCMP __strcmp_avx2
 #include "strcmp-avx2.S"
 diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
-index 4e88c70cc696b82d..f467582cbedd4535 100644
+index 4e88c70c..f467582c 100644
 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
 +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -1,5 +1,5 @@
@ -108,7 +120,7 @@ index 4e88c70cc696b82d..f467582cbedd4535 100644
 +#define OVERFLOW_STRCMP	__wcscmp_avx2_rtm
 #include "strcmp-avx2-rtm.S"
 diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
-index 4fa1de4d3f1f97ff..e9ede522b8bde27d 100644
+index 4fa1de4d..e9ede522 100644
 --- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
@@ -1,5 +1,5 @@
@ -118,3 +130,6 @@ index 4fa1de4d3f1f97ff..e9ede522b8bde27d 100644
 -
 +#define OVERFLOW_STRCMP	__wcscmp_avx2
 #include "strcmp-avx2.S"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-35.patch
+++ b/SOURCES/glibc-RHEL-15696-35.patch
@ -0,0 +1,51 @@
+From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 9 May 2020 12:04:23 -0700
+Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
+ #25966]
+Content-type: text/plain; charset=UTF-8
+
+Since __x86_shared_non_temporal_threshold is defined as
+
+long int __x86_shared_non_temporal_threshold;
+
+and long int is 4 bytes for x32, use RDX_LP to compare against
+__x86_shared_non_temporal_threshold in assembly code.
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 71f5954d..673b73aa 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -245,7 +245,7 @@ L(return):
+ #endif
+ 
+ L(movsb):
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	jae	L(more_8x_vec)
+ 	cmpq	%rsi, %rdi
+ 	jb	1f
+@@ -397,7 +397,7 @@ L(more_8x_vec):
+ 	addq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_forward)
+ #endif
+ L(loop_4x_vec_forward):
+@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %rdx
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ 	/* Check non-temporal store threshold.  */
+-	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
+	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ 	ja	L(large_backward)
+ #endif
+ L(loop_4x_vec_backward):
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-36.patch
+++ b/SOURCES/glibc-RHEL-15696-36.patch
@ -0,0 +1,44 @@
+From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Jun 2020 12:41:18 -0700
+Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
+Content-type: text/plain; charset=UTF-8
+
+Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
+%xmmN, instead of %ymmN, with vpxor to clear a vector register.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
+ sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
+ 2 files changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 433ae047..70d8499b 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -105,8 +105,8 @@ ENTRY (STRCMP)
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+-	/* Make %ymm7 all zeros in this function.  */
+-	vpxor	%ymm7, %ymm7, %ymm7
+	/* Make %xmm7 (%ymm7) all zeros in this function.  */
+	vpxor	%xmm7, %xmm7, %xmm7
+ 	orl	%esi, %eax
+ 	andl	$(PAGE_SIZE - 1), %eax
+ 	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
+diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+index 9f22a15e..c949410b 100644
+--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
+@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
+ 	movl	%edi, %ecx
+ 	/* Broadcast CHAR to YMM4.  */
+ 	VPBROADCAST %xmm4, %ymm4
+-	vpxor	%ymm0, %ymm0, %ymm0
+	vpxor	%xmm0, %xmm0, %xmm0
+ 
+ 	/* Check if we may cross page boundary with one vector load.  */
+ 	andl	$(2 * VEC_SIZE - 1), %ecx
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-37.patch
+++ b/SOURCES/glibc-RHEL-15696-37.patch
@ -0,0 +1,359 @@
+From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Wed, 3 Feb 2021 00:38:59 -0500
+Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. Just seemed the performance could be improved a bit. Observed
+and expected behavior are unchanged. Optimized body of main
+loop. Updated page cross logic and optimized accordingly. Made a few
+minor instruction selection modifications. No regressions in test
+suite. Both test-strchrnul and test-strchr passed.
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
+ sysdeps/x86_64/multiarch/strchr.c      |   4 +-
+ 2 files changed, 114 insertions(+), 115 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr.c
+	(account for missing upstream macros)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index da7d2620..919d256c 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -27,10 +27,12 @@
+ # ifdef USE_AS_WCSCHR
+ #  define VPBROADCAST	vpbroadcastd
+ #  define VPCMPEQ	vpcmpeqd
+#  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMPEQ	vpcmpeqb
+#  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+ # endif
+ 
+@@ -43,71 +45,54 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+ 	movl	%edi, %ecx
+-	/* Broadcast CHAR to YMM0.  */
+# ifndef USE_AS_STRCHRNUL
+	xorl	%edx, %edx
+# endif
+
+	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+ 	VPBROADCAST %xmm0, %ymm0
+-	/* Check if we may cross page boundary with one vector load.  */
+-	andl	$(2 * VEC_SIZE - 1), %ecx
+-	cmpl	$VEC_SIZE, %ecx
+-	ja	L(cros_page_boundary)
+ 
+-	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
+-	   null byte.  */
+-	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	/* Check if we cross page boundary with one vector load.  */
+	andl	$(PAGE_SIZE - 1), %ecx
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	ja  L(cross_page_boundary)
+ 
+-	/* Align data for aligned loads in the loop.  */
+-	addq	$VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+-
+-	jmp	L(more_4x_vec)
+-
+-	.p2align 4
+-L(cros_page_boundary):
+-	andl	$(VEC_SIZE - 1), %ecx
+-	andq	$-VEC_SIZE, %rdi
+	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bytes.  */
+-	sarl	%cl, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+-	/* Found CHAR or the null byte.  */
+	jz	L(more_vecs)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rax
+-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+L(more_vecs):
+	/* Align data for aligned loads in the loop.  */
+	andq	$-VEC_SIZE, %rdi
+ L(aligned_more):
+-	addq	$VEC_SIZE, %rdi
+ 
+-L(more_4x_vec):
+-	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.  */
+-	vmovdqa	(%rdi), %ymm8
+	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.	*/
+	vmovdqa	VEC_SIZE(%rdi), %ymm8
+	addq	$VEC_SIZE, %rdi
+ 	VPCMPEQ %ymm8, %ymm0, %ymm1
+ 	VPCMPEQ %ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+@@ -137,61 +122,24 @@ L(more_4x_vec):
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x3)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	/* Align data to 4 * VEC_SIZE.  */
+-	movq	%rdi, %rcx
+-	andl	$(4 * VEC_SIZE - 1), %ecx
+-	andq	$-(4 * VEC_SIZE), %rdi
+-
+-	.p2align 4
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(%rdi), %ymm5
+-	vmovdqa	VEC_SIZE(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-
+-	VPCMPEQ %ymm5, %ymm0, %ymm1
+-	VPCMPEQ %ymm6, %ymm0, %ymm2
+-	VPCMPEQ %ymm7, %ymm0, %ymm3
+-	VPCMPEQ %ymm8, %ymm0, %ymm4
+-
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	VPCMPEQ %ymm6, %ymm9, %ymm6
+-	VPCMPEQ %ymm7, %ymm9, %ymm7
+-	VPCMPEQ %ymm8, %ymm9, %ymm8
+-
+-	vpor	%ymm1, %ymm5, %ymm1
+-	vpor	%ymm2, %ymm6, %ymm2
+-	vpor	%ymm3, %ymm7, %ymm3
+-	vpor	%ymm4, %ymm8, %ymm4
+-
+-	vpor	%ymm1, %ymm2, %ymm5
+-	vpor	%ymm3, %ymm4, %ymm6
+-
+-	vpor	%ymm5, %ymm6, %ymm5
+-
+-	vpmovmskb %ymm5, %eax
+-	testl	%eax, %eax
+-	jnz	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+	jz	L(prep_loop_4x)
+ 
+-	jmp	L(loop_4x_vec)
+	tzcntl	%eax, %eax
+	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+ 
+ 	.p2align 4
+ L(first_vec_x0):
+-	/* Found CHAR or the null byte.  */
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -199,13 +147,9 @@ L(first_vec_x0):
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$VEC_SIZE, %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+ 	leaq	VEC_SIZE(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+@@ -213,42 +157,97 @@ L(first_vec_x1):
+ 	.p2align 4
+ L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 2), %rax
+-	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+	/* Found CHAR or the null byte.	 */
+ 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+L(prep_loop_4x):
+	/* Align data to 4 * VEC_SIZE.	*/
+	andq	$-(VEC_SIZE * 4), %rdi
+
+ 	.p2align 4
+-L(4x_vec_end):
+L(loop_4x_vec):
+	/* Compare 4 * VEC at a time forward.  */
+	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+
+	/* Leaves only CHARS matching esi as 0.	 */
+	vpxor	%ymm5, %ymm0, %ymm1
+	vpxor	%ymm6, %ymm0, %ymm2
+	vpxor	%ymm7, %ymm0, %ymm3
+	vpxor	%ymm8, %ymm0, %ymm4
+
+	VPMINU	%ymm1, %ymm5, %ymm1
+	VPMINU	%ymm2, %ymm6, %ymm2
+	VPMINU	%ymm3, %ymm7, %ymm3
+	VPMINU	%ymm4, %ymm8, %ymm4
+
+	VPMINU	%ymm1, %ymm2, %ymm5
+	VPMINU	%ymm3, %ymm4, %ymm6
+
+	VPMINU	%ymm5, %ymm6, %ymm5
+
+	VPCMPEQ %ymm5, %ymm9, %ymm5
+	vpmovmskb %ymm5, %eax
+
+	addq	$(VEC_SIZE * 4), %rdi
+	testl	%eax, %eax
+	jz  L(loop_4x_vec)
+
+	VPCMPEQ %ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x0)
+
+	VPCMPEQ %ymm2, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+
+	VPCMPEQ %ymm3, %ymm9, %ymm3
+	VPCMPEQ %ymm4, %ymm9, %ymm4
+	vpmovmskb %ymm3, %ecx
+ 	vpmovmskb %ymm4, %eax
+	salq	$32, %rax
+	orq %rcx, %rax
+	tzcntq  %rax, %rax
+	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+	cmovne	%rdx, %rax
+# endif
+	VZEROUPPER
+	ret
+
+	/* Cold case for crossing page with first load.	 */
+	.p2align 4
+L(cross_page_boundary):
+	andq	$-VEC_SIZE, %rdi
+	andl	$(VEC_SIZE - 1), %ecx
+
+	vmovdqa	(%rdi), %ymm8
+	VPCMPEQ %ymm8, %ymm0, %ymm1
+	VPCMPEQ %ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Remove the leading bits.	 */
+	sarxl	%ecx, %eax, %eax
+ 	testl	%eax, %eax
+-L(first_vec_x3):
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_STRCHRNUL
+-	addq	$(VEC_SIZE * 3), %rax
+	addq	%rcx, %rdi
+ 	addq	%rdi, %rax
+-# else
+-	xorl	%edx, %edx
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+-	cmp	(%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+	cmp (%rax), %CHAR_REG
+ 	cmovne	%rdx, %rax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+ END (STRCHR)
+-#endif
+# endif
+diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
+index 7e582f02..5225bd4f 100644
+--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
+@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-38.patch
+++ b/SOURCES/glibc-RHEL-15696-38.patch
@ -0,0 +1,67 @@
+From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Sat, 25 Jan 2020 14:19:40 -0800
+Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
+Content-type: text/plain; charset=UTF-8
+
+When copying with "rep movsb", if the distance between source and
+destination is N*4GB + [1..63] with N >= 0, performance may be very
+slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
+AVX512 versions with the distance in RCX:
+
+	cmpl	$63, %ecx
+	// Don't use "rep movsb" if ECX <= 63
+	jbe	L(Don't use "rep movsb")
+	Use "rep movsb"
+
+Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
+and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
+performance impact is within noise range as "rep movsb" is only used for
+data size >= 4KB.
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++
+ 1 file changed, 21 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 673b73aa..c475fed4 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -64,6 +64,13 @@
+ # endif
+ #endif
+ 
+/* Avoid short distance rep movsb only with non-SSE vector.  */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
+ #ifndef PREFETCH
+ # define PREFETCH(addr) prefetcht0 addr
+ #endif
+@@ -255,7 +262,21 @@ L(movsb):
+ 	cmpq	%r9, %rdi
+ 	/* Avoid slow backward REP MOVSB.  */
+ 	jb	L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rdi, %rcx
+	subq	%rsi, %rcx
+	jmp	2f
+# endif
+ 1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+	movq	%rsi, %rcx
+	subq	%rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+   is N*4GB + [1..63] with N >= 0.  */
+	cmpl	$63, %ecx
+	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
+# endif
+ 	mov	%RDX_LP, %RCX_LP
+ 	rep movsb
+ L(nop):
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-39.patch
+++ b/SOURCES/glibc-RHEL-15696-39.patch
@ -0,0 +1,449 @@
+From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
+From: noah <goldstein.w.n@gmail.com>
+Date: Sat, 3 Apr 2021 04:12:15 -0400
+Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No Bug. This commit updates the large memcpy case (no overlap). The
+update is to perform memcpy on either 2 or 4 contiguous pages at
+once. This 1) helps to alleviate the effects of false memory aliasing
+when destination and source have a close 4k alignment and 2) In most
+cases and for most DRAM units is a modestly more efficient access
+pattern. These changes are a clear performance improvement for
+VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
+test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
+pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
+ 1 file changed, 265 insertions(+), 73 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+	(different number of sections)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index c475fed4..3e2dd6bc 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -32,7 +32,16 @@
+       overlapping addresses.
+    6. If size >= __x86_shared_non_temporal_threshold and there is no
+       overlap between destination and source, use non-temporal store
+-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - sources' page
+      alignment is less than 8 * VEC_SIZE.
+   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias copy from 4 pages at once using
+      non-temporal stores.  */
+ 
+ #include <sysdep.h>
+ 
+@@ -64,6 +73,34 @@
+ # endif
+ #endif
+ 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
+ /* Avoid short distance rep movsb only with non-SSE vector.  */
+ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+ # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+@@ -103,6 +140,28 @@
+ # error Unsupported PREFETCH_SIZE!
+ #endif
+ 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -390,6 +449,15 @@ L(last_4x_vec):
+ 	VZEROUPPER_RETURN
+ 
+ L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry if rdx is greater than non-temporal threshold but there
+       is overlap.  */
+L(more_8x_vec_check):
+ 	cmpq	%rsi, %rdi
+ 	ja	L(more_8x_vec_backward)
+ 	/* Source == destination is less common.  */
+@@ -416,24 +484,21 @@ L(more_8x_vec):
+ 	subq	%r8, %rdi
+ 	/* Adjust length.  */
+ 	addq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_forward)
+-#endif
+
+	.p2align 4
+ L(loop_4x_vec_forward):
+ 	/* Copy 4 * VEC a time forward.  */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$(VEC_SIZE * 4), %rsi
+-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$-(VEC_SIZE * 4), %rsi
+	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%rdi)
+ 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+ 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+ 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_forward)
+ 	/* Store the last 4 * VEC.  */
+@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
+ 	subq	%r8, %r9
+ 	/* Adjust length.  */
+ 	subq	%r8, %rdx
+-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-	/* Check non-temporal store threshold.  */
+-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
+-	ja	L(large_backward)
+-#endif
+
+	.p2align 4
+ L(loop_4x_vec_backward):
+ 	/* Copy 4 * VEC a time backward.  */
+ 	VMOVU	(%rcx), %VEC(0)
+ 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+ 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+ 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$(VEC_SIZE * 4), %rcx
+-	subq	$(VEC_SIZE * 4), %rdx
+	addq	$-(VEC_SIZE * 4), %rcx
+	addq	$-(VEC_SIZE * 4), %rdx
+ 	VMOVA	%VEC(0), (%r9)
+ 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
+ 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
+ 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$(VEC_SIZE * 4), %r9
+	addq	$-(VEC_SIZE * 4), %r9
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+ 	ja	L(loop_4x_vec_backward)
+ 	/* Store the first 4 * VEC.  */
+@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
+ 	VZEROUPPER_RETURN
+ 
+ #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+-L(large_forward):
+	.p2align 4
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
+ 	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rdi, %rdx), %r10
+-	cmpq    %r10, %rsi
+-	jb	L(loop_4x_vec_forward)
+-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do
+	   the larger pipeline in large_memcpy_4x alleviated the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
+ 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+-	addq	$PREFETCHED_LOAD_SIZE, %rsi
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%rdi)
+-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
+-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
+-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
+-	addq	$PREFETCHED_LOAD_SIZE, %rdi
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_forward)
+-	sfence
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
+ 	/* Store the last 4 * VEC.  */
+-	VMOVU	%VEC(5), (%rcx)
+-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
+-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
+-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
+-	/* Store the first VEC.  */
+-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ 
+-L(large_backward):
+-	/* Don't use non-temporal store if there is overlap between
+-	   destination and source since destination may be in cache
+-	   when source is loaded.  */
+-	leaq    (%rcx, %rdx), %r10
+-	cmpq    %r10, %r9
+-	jb	L(loop_4x_vec_backward)
+-L(loop_large_backward):
+-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
+-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
+-	VMOVU	(%rcx), %VEC(0)
+-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
+-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
+-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
+-	subq	$PREFETCHED_LOAD_SIZE, %rcx
+-	subq	$PREFETCHED_LOAD_SIZE, %rdx
+-	VMOVNT	%VEC(0), (%r9)
+-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
+-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
+-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
+-	subq	$PREFETCHED_LOAD_SIZE, %r9
+-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
+-	ja	L(loop_large_backward)
+	.p2align 4
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
+ 	sfence
+-	/* Store the first 4 * VEC.  */
+-	VMOVU	%VEC(4), (%rdi)
+-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
+-	/* Store the last VEC.  */
+-	VMOVU	%VEC(8), (%r11)
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4  * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC a time forward with non-temporal stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
+ 	VZEROUPPER_RETURN
+ #endif
+ END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-4.patch
+++ b/SOURCES/glibc-RHEL-15696-4.patch
@ -0,0 +1,151 @@
+From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:29:58 -0800
+Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
+	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
+---
+ sysdeps/x86_64/memrchr.S                |  4 +-
+ sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
+ sysdeps/x86_64/x32/Makefile             |  3 +-
+ sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
+ 4 files changed, 63 insertions(+), 5 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
+index b8e3fa1d..dc82f8f7 100644
+--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
+@@ -24,13 +24,13 @@
+ ENTRY (__memrchr)
+ 	movd	%esi, %xmm1
+ 
+-	sub	$16, %rdx
+	sub	$16, %RDX_LP
+ 	jbe	L(length_less16)
+ 
+ 	punpcklbw	%xmm1, %xmm1
+ 	punpcklbw	%xmm1, %xmm1
+ 
+-	add	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
+ 	pshufd	$0, %xmm1, %xmm1
+ 
+ 	movdqu	(%rdi), %xmm0
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index b41a58bc..ce488dd9 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
+ 	vmovd	%esi, %xmm0
+ 	vpbroadcastb %xmm0, %ymm0
+ 
+-	subq	$VEC_SIZE, %rdx
+	sub	$VEC_SIZE, %RDX_LP
+ 	jbe	L(last_vec_or_less)
+ 
+-	addq	%rdx, %rdi
+	add	%RDX_LP, %RDI_LP
+ 
+ 	/* Check the last VEC_SIZE bytes.  */
+ 	vpcmpeqb (%rdi), %ymm0, %ymm1
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 2fe1e5ac..e99dbd7c 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
+ endif
+ 
+ ifeq ($(subdir),string)
+-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+	 tst-size_t-memrchr
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+new file mode 100644
+index 00000000..c83699c0
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
+@@ -0,0 +1,57 @@
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "memrchr"
+#include "test-size_t.h"
+
+IMPL (memrchr, 1)
+
+typedef void * (*proto_t) (const void *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memrchr (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t src = { { page_size }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      void * res = do_memrchr (src, c);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %p != NULL",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-40.patch
+++ b/SOURCES/glibc-RHEL-15696-40.patch
@ -0,0 +1,92 @@
+From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 19 Apr 2021 10:45:07 -0700
+Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+Since strchr-avx2.S updated by
+
+commit 1f745ecc2109890886b161d4791e1406fdfc29b8
+Author: noah <goldstein.w.n@gmail.com>
+Date:   Wed Feb 3 00:38:59 2021 -0500
+
+    x86-64: Refactor and improve performance of strchr-avx2.S
+
+uses sarx:
+
+c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
+
+for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
+ifunc-avx2.h.
+---
+ sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
+ 2 files changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+index e0f30e61..ef72b73f 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
+@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
+   const struct cpu_features* cpu_features = __get_cpu_features ();
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ 	return OPTIMIZE (evex);
+ 
+       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 695cdba6..85b8863a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
+   IFUNC_IMPL (i, name, strchr,
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchr,
+@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
+   IFUNC_IMPL (i, name, strchrnul,
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __strchrnul_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __strchrnul_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, strchrnul,
+@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
+   IFUNC_IMPL (i, name, wcschr,
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-			      CPU_FEATURE_USABLE (AVX2),
+			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wcschr_avx2)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wcschr_avx2_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wcschr,
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-41.patch
+++ b/SOURCES/glibc-RHEL-15696-41.patch
@ -0,0 +1,265 @@
+From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 19 Apr 2021 17:48:10 -0400
+Subject: [PATCH] x86: Optimize less_vec evex and avx512
+ memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit adds an optimized case for the less_vec memset case
+that uses the avx512vl/avx512bw mask store, avoiding excessive
+branches. test-memset and test-wmemset are passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
+ sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
+ 5 files changed, 74 insertions(+), 27 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index 85b8863a..d59d65f8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_chk_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_chk_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      __memset_avx2_unaligned_erms_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_evex_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned_erms)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+-			       && CPU_FEATURE_USABLE (AVX512BW)),
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memset_avx512_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, memset,
+ 			      CPU_FEATURE_USABLE (AVX512F),
+@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemset_avx2_unaligned_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, wmemset,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_avx512_unaligned))
+ 
+ #ifdef SHARED
+@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			      CPU_FEATURE_USABLE (AVX2),
+ 			      __wmemset_chk_avx2_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512VL),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_evex_unaligned)
+ 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+-			      CPU_FEATURE_USABLE (AVX512F),
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemset_chk_avx512_unaligned))
+ #endif
+ 
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
+index 19795938..100e3707 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
+@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
+       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (avx512_unaligned_erms);
+@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+-	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ 	{
+ 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ 	    return OPTIMIZE (evex_unaligned_erms);
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+index 22e7b187..8ad842fc 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex512
+ # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
+-
+# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+index ae0a4d6e..640f0929 100644
+--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+@@ -19,6 +19,6 @@
+ # define SECTION(p)		p##.evex
+ # define MEMSET_SYMBOL(p,s)	p##_evex_##s
+ # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
+-
+# define USE_LESS_VEC_MASK_STORE	1
+ # include "memset-vec-unaligned-erms.S"
+ #endif
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index bae5cba4..f877ac9d 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -63,6 +63,8 @@
+ # endif
+ #endif
+ 
+#define PAGE_SIZE 4096
+
+ #ifndef SECTION
+ # error SECTION is not defined!
+ #endif
+@@ -213,11 +215,38 @@ L(loop):
+ 	cmpq	%rcx, %rdx
+ 	jne	L(loop)
+ 	VZEROUPPER_SHORT_RETURN
+
+	.p2align 4
+ L(less_vec):
+ 	/* Less than 1 VEC.  */
+ # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+ #  error Unsupported VEC_SIZE!
+ # endif
+# ifdef USE_LESS_VEC_MASK_STORE
+	/* Clear high bits from edi. Only keeping bits relevant to page
+	   cross check. Note that we are using rax which is set in
+	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+	 */
+	andl	$(PAGE_SIZE - 1), %edi
+	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
+	   performance degradation when it has to fault supress.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
+	ja	L(cross_page)
+# if VEC_SIZE > 32
+	movq	$-1, %rcx
+	bzhiq	%rdx, %rcx, %rcx
+	kmovq	%rcx, %k1
+# else
+	movl	$-1, %ecx
+	bzhil	%edx, %ecx, %ecx
+	kmovd	%ecx, %k1
+# endif
+	vmovdqu8	%VEC(0), (%rax) {%k1}
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(cross_page):
+# endif
+ # if VEC_SIZE > 32
+ 	cmpb	$32, %dl
+ 	jae	L(between_32_63)
+@@ -234,36 +263,36 @@ L(less_vec):
+ 	cmpb	$1, %dl
+ 	ja	L(between_2_3)
+ 	jb	1f
+-	movb	%cl, (%rdi)
+	movb	%cl, (%rax)
+ 1:
+ 	VZEROUPPER_RETURN
+ # if VEC_SIZE > 32
+ 	/* From 32 to 63.  No branch when size == 32.  */
+ L(between_32_63):
+-	VMOVU	%YMM0, -32(%rdi,%rdx)
+-	VMOVU	%YMM0, (%rdi)
+	VMOVU	%YMM0, -32(%rax,%rdx)
+	VMOVU	%YMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ # if VEC_SIZE > 16
+ 	/* From 16 to 31.  No branch when size == 16.  */
+ L(between_16_31):
+-	VMOVU	%XMM0, -16(%rdi,%rdx)
+-	VMOVU	%XMM0, (%rdi)
+	VMOVU	%XMM0, -16(%rax,%rdx)
+	VMOVU	%XMM0, (%rax)
+ 	VZEROUPPER_RETURN
+ # endif
+ 	/* From 8 to 15.  No branch when size == 8.  */
+ L(between_8_15):
+-	movq	%rcx, -8(%rdi,%rdx)
+-	movq	%rcx, (%rdi)
+	movq	%rcx, -8(%rax,%rdx)
+	movq	%rcx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_4_7):
+ 	/* From 4 to 7.  No branch when size == 4.  */
+-	movl	%ecx, -4(%rdi,%rdx)
+-	movl	%ecx, (%rdi)
+	movl	%ecx, -4(%rax,%rdx)
+	movl	%ecx, (%rax)
+ 	VZEROUPPER_RETURN
+ L(between_2_3):
+ 	/* From 2 to 3.  No branch when size == 2.  */
+-	movw	%cx, -2(%rdi,%rdx)
+-	movw	%cx, (%rdi)
+	movw	%cx, -2(%rax,%rdx)
+	movw	%cx, (%rax)
+ 	VZEROUPPER_RETURN
+ END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-42.patch
+++ b/SOURCES/glibc-RHEL-15696-42.patch
@ -0,0 +1,396 @@
+From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:24 -0400
+Subject: [PATCH] x86: Optimize strchr-avx2.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-avx2.S. The optimizations are all
+small things such as saving an ALU in the alignment process, saving a
+few instructions in the loop return, saving some bytes in the main
+loop, and increasing the ILP in the return cases. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
+ 1 file changed, 170 insertions(+), 120 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/multiarch/strchr-avx2.S
+	(rearranged to account for branch changes)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
+index 919d256c..5884726b 100644
+--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
+@@ -49,133 +49,144 @@
+ 
+ 	.section SECTION(.text),"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+ 	vmovd	%esi, %xmm0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	VPBROADCAST	%xmm0, %ymm0
+ 	vpxor	%xmm9, %xmm9, %xmm9
+-	VPBROADCAST %xmm0, %ymm0
+ 
+ 	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
+ 	   null byte.  */
+ 	vmovdqu	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jz	L(more_vecs)
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+# endif
+ 	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x4):
+	tzcntl	%eax, %eax
+	addq	$(VEC_SIZE * 3 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+-L(return_vzeroupper):
+-	ZERO_UPPER_VEC_REGISTERS_RETURN
+-
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	addq	$VEC_SIZE, %rdi
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	vmovdqa	VEC_SIZE(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+-	vpor	%ymm1, %ymm2, %ymm1
+-	vpmovmskb %ymm1, %eax
+-	testl	%eax, %eax
+-	jz	L(prep_loop_4x)
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	tzcntl	%eax, %eax
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+L(zero):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
+
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	addq	%rdi, %rax
+	incq	%rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x1):
+L(first_vec_x2):
+ 	tzcntl	%eax, %eax
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+	addq	$(VEC_SIZE + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	addq	$(VEC_SIZE * 2 + 1), %rdi
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+	addq	%rdi, %rax
+ 	VZEROUPPER_RETURN
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+-	andq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE - 1. This is the same number of
+	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
+	   on x4 check.  */
+	orq	$(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
+	   since data is only aligned to VEC_SIZE.  */
+	vmovdqa	1(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x2)
+
+	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+ 
+	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+	vpor	%ymm1, %ymm2, %ymm1
+	vpmovmskb %ymm1, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x4)
+	/* Align data to VEC_SIZE * 4 - 1.	*/
+	addq	$(VEC_SIZE * 4 + 1), %rdi
+	andq	$-(VEC_SIZE * 4), %rdi
+ 	.p2align 4
+ L(loop_4x_vec):
+ 	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
+-	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
+-	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
+-	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
+	vmovdqa	(%rdi), %ymm5
+	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
+	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
+	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
+ 
+ 	/* Leaves only CHARS matching esi as 0.	 */
+ 	vpxor	%ymm5, %ymm0, %ymm1
+@@ -191,63 +202,102 @@ L(loop_4x_vec):
+ 	VPMINU	%ymm1, %ymm2, %ymm5
+ 	VPMINU	%ymm3, %ymm4, %ymm6
+ 
+-	VPMINU	%ymm5, %ymm6, %ymm5
+	VPMINU	%ymm5, %ymm6, %ymm6
+ 
+-	VPCMPEQ %ymm5, %ymm9, %ymm5
+-	vpmovmskb %ymm5, %eax
+	VPCMPEQ	%ymm6, %ymm9, %ymm6
+	vpmovmskb %ymm6, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+	jz	L(loop_4x_vec)
+ 
+-	addq	$(VEC_SIZE * 4), %rdi
+-	testl	%eax, %eax
+-	jz  L(loop_4x_vec)
+ 
+-	VPCMPEQ %ymm1, %ymm9, %ymm1
+	VPCMPEQ	%ymm1, %ymm9, %ymm1
+ 	vpmovmskb %ymm1, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x0)
+
+ 
+-	VPCMPEQ %ymm2, %ymm9, %ymm2
+	VPCMPEQ	%ymm5, %ymm9, %ymm2
+ 	vpmovmskb %ymm2, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+	jnz	L(last_vec_x1)
+
+	VPCMPEQ	%ymm3, %ymm9, %ymm3
+	vpmovmskb %ymm3, %eax
+	/* rcx has combined result from all 4 VEC. It will only be used
+	   if the first 3 other VEC all did not contain a match.  */
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	subq	$(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+
+	.p2align 4
+L(last_vec_x0):
+	tzcntl	%eax, %eax
+	addq	$-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+ 
+-	VPCMPEQ %ymm3, %ymm9, %ymm3
+-	VPCMPEQ %ymm4, %ymm9, %ymm4
+-	vpmovmskb %ymm3, %ecx
+-	vpmovmskb %ymm4, %eax
+-	salq	$32, %rax
+-	orq %rcx, %rax
+-	tzcntq  %rax, %rax
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+L(zero_end):
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ # endif
+-	VZEROUPPER
+-	ret
+
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+	subq	$(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdi, %rax), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	addq	%rdi, %rax
+	VZEROUPPER_RETURN
+
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+-	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+-	vmovdqa	(%rdi), %ymm8
+-	VPCMPEQ %ymm8, %ymm0, %ymm1
+-	VPCMPEQ %ymm8, %ymm9, %ymm2
+	movq	%rdi, %rdx
+	/* Align rdi to VEC_SIZE - 1.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
+	VPCMPEQ	%ymm8, %ymm0, %ymm1
+	VPCMPEQ	%ymm8, %ymm9, %ymm2
+ 	vpor	%ymm1, %ymm2, %ymm1
+ 	vpmovmskb %ymm1, %eax
+-	/* Remove the leading bits.	 */
+-	sarxl	%ecx, %eax, %eax
+	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+	   so no need to manually mod edx.  */
+	sarxl	%edx, %eax, %eax
+ 	testl	%eax, %eax
+-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+-	addq	%rdi, %rax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	xorl	%ecx, %ecx
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rdx, %rax), %CHAR_REG
+	leaq	(%rdx, %rax), %rax
+	cmovne	%rcx, %rax
+# else
+	addq	%rdx, %rax
+ # endif
+-	VZEROUPPER_RETURN
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ END (STRCHR)
+ # endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-43.patch
+++ b/SOURCES/glibc-RHEL-15696-43.patch
@ -0,0 +1,532 @@
+From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 23 Apr 2021 15:56:25 -0400
+Subject: [PATCH] x86: Optimize strchr-evex.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes strchr-evex.S. The optimizations are
+mostly small things such as saving an ALU in the alignment process and
+saving a few instructions in the loop return. The one significant
+change is saving 2 instructions in the 4x loop. test-strchr,
+test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
+ 1 file changed, 218 insertions(+), 174 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
+index ddc86a70..7f9d4ee4 100644
+--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
+@@ -32,13 +32,15 @@
+ #  define VPCMP		vpcmpd
+ #  define VPMINU	vpminud
+ #  define CHAR_REG	esi
+-#  define SHIFT_REG	r8d
+#  define SHIFT_REG	ecx
+#  define CHAR_SIZE	4
+ # else
+ #  define VPBROADCAST	vpbroadcastb
+ #  define VPCMP		vpcmpb
+ #  define VPMINU	vpminub
+ #  define CHAR_REG	sil
+-#  define SHIFT_REG	ecx
+#  define SHIFT_REG	edx
+#  define CHAR_SIZE	1
+ # endif
+ 
+ # define XMMZERO	xmm16
+@@ -56,23 +58,20 @@
+ 
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (STRCHR)
+-	movl	%edi, %ecx
+-# ifndef USE_AS_STRCHRNUL
+-	xorl	%edx, %edx
+-# endif
+-
+ 	/* Broadcast CHAR to YMM0.	*/
+-	VPBROADCAST %esi, %YMM0
+-
+	VPBROADCAST	%esi, %YMM0
+	movl	%edi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+ 
+-	/* Check if we cross page boundary with one vector load.  */
+-	andl	$(PAGE_SIZE - 1), %ecx
+-	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
+-	ja  L(cross_page_boundary)
+	/* Check if we cross page boundary with one vector load.
+	   Otherwise it is safe to use an unaligned load.  */
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(cross_page_boundary)
+ 
+ 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ 	   null bytes.  */
+@@ -83,251 +82,296 @@ ENTRY (STRCHR)
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(more_vecs)
+ 	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jz	L(aligned_more)
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
+	 */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+ 	addq	%rdi, %rax
+ # endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(%rax), %CHAR_REG
+	jne	L(zero)
+ # endif
+ 	ret
+ 
+-	.p2align 4
+-L(more_vecs):
+-	/* Align data for aligned loads in the loop.  */
+-	andq	$-VEC_SIZE, %rdi
+-L(aligned_more):
+-
+-	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
+-	   since data is only aligned to VEC_SIZE.	*/
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	addq	$VEC_SIZE, %rdi
+-
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+-
+-	VMOVA	VEC_SIZE(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-	jnz	L(first_vec_x2)
+-
+-	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+-	/* Leaves only CHARS matching esi as 0.  */
+-	vpxorq	%YMM1, %YMM0, %YMM2
+-	VPMINU	%YMM2, %YMM1, %YMM2
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM2, %k0
+-	ktestd	%k0, %k0
+-	jz	L(prep_loop_4x)
+-
+-	kmovd	%k0, %eax
+	/* .p2align 5 helps keep performance more consistent if ENTRY()
+	   alignment % 32 was either 16 or 0. As well this makes the
+	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
+	   easier.  */
+	.p2align 5
+L(first_vec_x3):
+ 	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ 	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
+	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+-# endif
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+# endif
+ 
+ 	.p2align 4
+-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x1):
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
+-# else
+-	leaq	VEC_SIZE(%rdi, %rax), %rax
+-# endif
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Found CHAR or the null byte.	 */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero)
+
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	.p2align 4
+ L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if first match was CHAR (k0) or null (k1).  */
+	kmovd	%k0, %eax
+ 	tzcntl	%eax, %eax
+-	/* Found CHAR or the null byte.	 */
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+	kmovd	%k1, %ecx
+	/* bzhil will not be 0 if first match was null.  */
+	bzhil	%eax, %ecx, %ecx
+	jne	L(zero)
+ # else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Combine CHAR and null matches.  */
+	kord	%k0, %k1, %k0
+	kmovd	%k0, %eax
+	tzcntl	%eax, %eax
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+-L(prep_loop_4x):
+-	/* Align data to 4 * VEC_SIZE.	*/
+	.p2align 4
+L(aligned_more):
+	/* Align data to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rdi
+L(cross_page_continue):
+	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+	   data is only aligned to VEC_SIZE. Use two alternating methods
+	   for checking VEC to balance latency and port contention.  */
+
+	/* This method has higher latency but has better port
+	   distribution.  */
+	VMOVA	(VEC_SIZE)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x1)
+
+	/* This method has lower latency but has worse port
+	   distribution.  */
+	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x2)
+
+	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
+	/* Leaves only CHARS matching esi as 0.  */
+	vpxorq	%YMM1, %YMM0, %YMM2
+	VPMINU	%YMM2, %YMM1, %YMM2
+	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(first_vec_x3)
+
+	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+	/* Each bit in K0 represents a CHAR in YMM1.  */
+	VPCMP	$0, %YMM1, %YMM0, %k0
+	/* Each bit in K1 represents a null byte in YMM1.  */
+	VPCMP	$0, %YMM1, %YMMZERO, %k1
+	kortestd	%k0, %k1
+	jnz	L(first_vec_x4)
+
+	/* Align data to VEC_SIZE * 4 for the loop.  */
+	addq	$VEC_SIZE, %rdi
+ 	andq	$-(VEC_SIZE * 4), %rdi
+ 
+ 	.p2align 4
+ L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
+	   encoding.  */
+ 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
+ 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
+ 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
+ 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
+ 
+-	/* Leaves only CHARS matching esi as 0.  */
+	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+	   zero.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM5
+-	vpxorq	%YMM2, %YMM0, %YMM6
+	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+	   k register. It's possible to save either 1 or 2 instructions
+	   using cmp not equals method for either YMM1 or YMM1 and YMM3
+	   respectively but bottleneck on p5 makes it not worth it.  */
+	VPCMP	$4, %YMM0, %YMM2, %k2
+ 	vpxorq	%YMM3, %YMM0, %YMM7
+-	vpxorq	%YMM4, %YMM0, %YMM8
+-
+-	VPMINU	%YMM5, %YMM1, %YMM5
+-	VPMINU	%YMM6, %YMM2, %YMM6
+-	VPMINU	%YMM7, %YMM3, %YMM7
+-	VPMINU	%YMM8, %YMM4, %YMM8
+-
+-	VPMINU	%YMM5, %YMM6, %YMM1
+-	VPMINU	%YMM7, %YMM8, %YMM2
+-
+-	VPMINU	%YMM1, %YMM2, %YMM1
+-
+-	/* Each bit in K0 represents a CHAR or a null byte.  */
+-	VPCMP	$0, %YMMZERO, %YMM1, %k0
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-
+-	ktestd	%k0, %k0
+	VPCMP	$4, %YMM0, %YMM4, %k4
+
+	/* Use min to select all zeros from either xor or end of string.
+	 */
+	VPMINU	%YMM1, %YMM5, %YMM1
+	VPMINU	%YMM3, %YMM7, %YMM3
+
+	/* Use min + zeromask to select for zeros. k2 and k4 have 0 at
+	   positions that matched CHAR, so zero masking sets zero in
+	   the corresponding destination bytes in YMM2 / YMM4.
+	 */
+	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
+	VPMINU	%YMM3, %YMM4, %YMM4
+	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
+
+	VPCMP	$0, %YMMZERO, %YMM4, %k1
+	kmovd	%k1, %ecx
+	subq	$-(VEC_SIZE * 4), %rdi
+	testl	%ecx, %ecx
+ 	jz	L(loop_4x_vec)
+ 
+-	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+-	VPCMP	$0, %YMMZERO, %YMM5, %k0
+	VPCMP	$0, %YMMZERO, %YMM1, %k0
+ 	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x0)
+	jnz	L(last_vec_x1)
+ 
+-	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
+-	VPCMP	$0, %YMMZERO, %YMM6, %k1
+-	kmovd	%k1, %eax
+	VPCMP	$0, %YMMZERO, %YMM2, %k0
+	kmovd	%k0, %eax
+ 	testl	%eax, %eax
+-	jnz	L(first_vec_x1)
+-
+-	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
+-	VPCMP	$0, %YMMZERO, %YMM7, %k2
+-	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
+-	VPCMP	$0, %YMMZERO, %YMM8, %k3
+	jnz	L(last_vec_x2)
+ 
+	VPCMP	$0, %YMMZERO, %YMM3, %k0
+	kmovd	%k0, %eax
+	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Each bit in K2/K3 represents 4-byte element.  */
+-	kshiftlw $8, %k3, %k1
+	sall	$8, %ecx
+	orl	%ecx, %eax
+	tzcntl	%eax, %eax
+ # else
+-	kshiftlq $32, %k3, %k1
+	salq	$32, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+ # endif
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was CHAR or null.  */
+	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+ 
+-	/* Each bit in K1 represents a NULL or a mismatch.  */
+-	korq	%k1, %k2, %k1
+-	kmovq	%k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+	xorl	%eax, %eax
+	ret
+# endif
+ 
+-	tzcntq  %rax, %rax
+-# ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+-# else
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
+	.p2align 4
+L(last_vec_x1):
+	tzcntl	%eax, %eax
+# ifndef USE_AS_STRCHRNUL
+	/* Check if match was null.  */
+	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+	ret
+
+	.p2align 4
+L(last_vec_x2):
+	tzcntl	%eax, %eax
+ # ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	/* Check if match was null.  */
+	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+ # endif
+	/* NB: Multiply sizeof char type (1 or 4) to get the number of
+	   bytes.  */
+	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+ 
+ 	/* Cold case for crossing page with first load.	 */
+ 	.p2align 4
+ L(cross_page_boundary):
+	movq	%rdi, %rdx
+	/* Align rdi.  */
+ 	andq	$-VEC_SIZE, %rdi
+-	andl	$(VEC_SIZE - 1), %ecx
+-
+ 	VMOVA	(%rdi), %YMM1
+-
+ 	/* Leaves only CHARS matching esi as 0.  */
+ 	vpxorq	%YMM1, %YMM0, %YMM2
+ 	VPMINU	%YMM2, %YMM1, %YMM2
+ 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
+ 	VPCMP	$0, %YMMZERO, %YMM2, %k0
+ 	kmovd	%k0, %eax
+-	testl	%eax, %eax
+-
+	/* Remove the leading bits.	 */
+ # ifdef USE_AS_WCSCHR
+	movl	%edx, %SHIFT_REG
+ 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
+ 	   bytes.  */
+-	movl	%ecx, %SHIFT_REG
+-	sarl    $2, %SHIFT_REG
+	sarl	$2, %SHIFT_REG
+	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
+ # endif
+-
+-	/* Remove the leading bits.	 */
+ 	sarxl	%SHIFT_REG, %eax, %eax
+	/* If eax is zero continue.  */
+ 	testl	%eax, %eax
+-
+-	jz	L(aligned_more)
+	jz	L(cross_page_continue)
+ 	tzcntl	%eax, %eax
+-	addq	%rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+	/* Check to see if match was CHAR or null.  */
+	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
+	jne	L(zero_end)
+# endif
+ # ifdef USE_AS_WCSCHR
+-	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
+-	leaq	(%rdi, %rax, 4), %rax
+	/* NB: Multiply wchar_t count by 4 to get the number of
+	   bytes.  */
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+ # else
+-	addq	%rdi, %rax
+-# endif
+-# ifndef USE_AS_STRCHRNUL
+-	cmp (%rax), %CHAR_REG
+-	cmovne	%rdx, %rax
+	addq	%rdx, %rax
+ # endif
+ 	ret
+ 
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-44.patch
+++ b/SOURCES/glibc-RHEL-15696-44.patch
@ -0,0 +1,536 @@
+From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 4 May 2021 19:02:40 -0400
+Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
+Content-type: text/plain; charset=UTF-8
+
+No bug.
+
+This commit adds a new implementation for EVEX memchr that is not safe
+for RTM because it uses vzeroupper. The benefit is that by using
+ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
+faster than the RTM safe version which cannot use vpcmpeq because
+there is no EVEX encoding for the instruction. All parts of the
+implementation aside from the 4x loop are the same for the two
+versions and the optimization is only relevant for large sizes.
+
+Tigerlake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
+512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
+2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
+2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
+2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
+2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
+
+Icelake:
+size  , algn  , Pos   , Cur T , New T , Win     , Dif
+512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
+512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
+2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
+2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
+2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
+2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
+
+test-memchr, test-wmemchr, and test-rawmemchr are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/Makefile             |   7 +-
+ sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
+ sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
+ sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
+ sysdeps/x86_64/multiarch/memchr.c             |   2 +-
+ sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
+ sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
+ sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
+ sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
+ 10 files changed, 217 insertions(+), 41 deletions(-)
+ create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
+ create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+ create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index 65fde4eb..26be4095 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
+ 		   strncmp-evex \
+ 		   strncpy-evex \
+ 		   strnlen-evex \
+-		   strrchr-evex
+		   strrchr-evex \
+		   memchr-evex-rtm \
+		   rawmemchr-evex-rtm
+ CFLAGS-varshift.c += -msse4
+ CFLAGS-strcspn-c.c += -msse4
+ CFLAGS-strpbrk-c.c += -msse4
+@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
+ 		   wcsnlen-evex \
+ 		   wcsrchr-evex \
+ 		   wmemchr-evex \
+-		   wmemcmp-evex-movbe
+		   wmemcmp-evex-movbe \
+		   wmemchr-evex-rtm
+ endif
+ 
+ ifeq ($(subdir),debug)
+diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
+new file mode 100644
+index 00000000..fc391edb
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
+@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+   All versions must be listed in ifunc-impl-list.c.
+   Copyright (C) 2017-2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features* cpu_features = __get_cpu_features ();
+
+  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+    {
+      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+	{
+	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	    return OPTIMIZE (evex_rtm);
+
+	  return OPTIMIZE (evex);
+	}
+
+      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+	return OPTIMIZE (avx2_rtm);
+
+      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+	return OPTIMIZE (avx2);
+    }
+
+  return OPTIMIZE (sse2);
+}
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index d59d65f8..ac097e8d 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __memchr_evex)
+	      IFUNC_IMPL_ADD (array, i, memchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __memchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
+@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __rawmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __rawmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
+@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+ 			       && CPU_FEATURE_USABLE (BMI2)),
+ 			      __wmemchr_evex)
+	      IFUNC_IMPL_ADD (array, i, wmemchr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wmemchr_evex_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
+ 
+   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+new file mode 100644
+index 00000000..19871882
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
+@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
+diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
+index f3fdad4f..4d0ed6d1 100644
+--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
+@@ -38,10 +38,32 @@
+ #  define CHAR_SIZE	1
+ # endif
+ 
+	/* In the 4x loop the RTM and non-RTM versions have data pointer
+	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+	   This is represented by BASE_OFFSET. As well because the RTM
+	   version uses vpcmp which stores a bit per element compared where
+	   the non-RTM version uses vpcmpeq which stores a bit per byte
+	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
+	   version.  */
+# ifdef USE_IN_RTM
+#  define VZEROUPPER
+#  define BASE_OFFSET	(VEC_SIZE * 4)
+#  define RET_SCALE	CHAR_SIZE
+# else
+#  define VZEROUPPER	vzeroupper
+#  define BASE_OFFSET	0
+#  define RET_SCALE	1
+# endif
+
+	/* In the return from 4x loop memchr and rawmemchr versions have
+	   data pointers off by VEC_SIZE * 4 with memchr version being
+	   VEC_SIZE * 4 greater.  */
+ # ifdef USE_AS_RAWMEMCHR
+#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
+ #  define RAW_PTR_REG	rcx
+ #  define ALGN_PTR_REG	rdi
+ # else
+#  define RET_OFFSET	BASE_OFFSET
+ #  define RAW_PTR_REG	rdi
+ #  define ALGN_PTR_REG	rcx
+ # endif
+@@ -57,11 +79,15 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+# ifndef SECTION
+#  define SECTION(p)	p##.evex
+# endif
+
+ # define VEC_SIZE 32
+ # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+ # define PAGE_SIZE 4096
+ 
+-	.section .text.evex,"ax",@progbits
+	.section SECTION(.text),"ax",@progbits
+ ENTRY (MEMCHR)
+ # ifndef USE_AS_RAWMEMCHR
+ 	/* Check for zero length.  */
+@@ -237,14 +263,15 @@ L(cross_page_continue):
+ 	/* Check if at last CHAR_PER_VEC * 4 length.  */
+ 	subq	$(CHAR_PER_VEC * 4), %rdx
+ 	jbe	L(last_4x_vec_or_less_cmpeq)
+-	addq	$VEC_SIZE, %rdi
+	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 
+ 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ 	 */
+ #  ifdef USE_AS_WMEMCHR
+ 	movl	%edi, %ecx
+ 	andq	$-(4 * VEC_SIZE), %rdi
+-	andl	$(VEC_SIZE * 4 - 1), %ecx
+	subl	%edi, %ecx
+ 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
+ 	sarl	$2, %ecx
+ 	addq	%rcx, %rdx
+@@ -254,15 +281,28 @@ L(cross_page_continue):
+ 	subq	%rdi, %rdx
+ #  endif
+ # else
+-	addq	$VEC_SIZE, %rdi
+	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
+ 	andq	$-(4 * VEC_SIZE), %rdi
+ # endif
+-
+# ifdef USE_IN_RTM
+ 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
+# else
+	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
+	   encodable with EVEX registers (ymm16-ymm31).  */
+	vmovdqa64 %YMMMATCH, %ymm0
+# endif
+ 
+ 	/* Compare 4 * VEC at a time forward.  */
+ 	.p2align 4
+ L(loop_4x_vec):
+	/* Two versions of the loop. One that does not require
+	   vzeroupper by not using ymm0-ymm15 and another that does require
+	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+	   is used at all is because there is no EVEX encoding of vpcmpeq
+	   that writes a vector register and with vpcmpeq this loop can be
+	   performed more efficiently. The non-vzeroupper version is safe for
+	   RTM while the vzeroupper version is preferred without RTM.  */
+# ifdef USE_IN_RTM
+ 	/* It would be possible to save some instructions using 4x VPCMP
+ 	   but bottleneck on port 5 makes it not woth it.  */
+ 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+@@ -273,12 +313,55 @@ L(loop_4x_vec):
+ 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
+ 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
+ 	VPCMP	$0, %YMM3, %YMMZERO, %k2
+# else
+	/* Since vptern can only take 3x vectors it is fastest to do 1
+	   vec separately with EVEX vpcmp.  */
+#  ifdef USE_AS_WMEMCHR
+	/* vptern can only accept masks for epi32/epi64 so can only save
+	   instruction using not equals mask on vptern with wmemchr.  */
+	VPCMP	$4, (%rdi), %YMMMATCH, %k1
+#  else
+	VPCMP	$0, (%rdi), %YMMMATCH, %k1
+#  endif
+	/* Compare 3x with vpcmpeq and or them all together with vptern.
+	 */
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+#  ifdef USE_AS_WMEMCHR
+	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
+	   combines result from VEC0 with zero mask.  */
+	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+	vpmovmskb %ymm4, %ecx
+#  else
+	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
+	vpternlogd $254, %ymm2, %ymm3, %ymm4
+	vpmovmskb %ymm4, %ecx
+	kmovd	%k1, %eax
+#  endif
+# endif
+
+ # ifdef USE_AS_RAWMEMCHR
+ 	subq	$-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
+ 	kortestd %k2, %k3
+# else
+#  ifdef USE_AS_WMEMCHR
+	/* ecx contains not of matches. All 1s means no matches. incl will
+	   overflow and set zeroflag if that is the case.  */
+	incl	%ecx
+#  else
+	/* Zero flag is clear if either VEC1 (eax) or VEC2-VEC4 (ecx)
+	   is not zero. Adding to ecx is not an issue because if eax is
+	   non-zero it will be used for returning the match. If it is
+	   zero the add does nothing.  */
+	addq	%rax, %rcx
+#  endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
+ 	jz	L(loop_4x_vec)
+ # else
+-	kortestd %k2, %k3
+ 	jnz	L(loop_4x_vec_end)
+ 
+ 	subq	$-(VEC_SIZE * 4), %rdi
+@@ -288,10 +371,11 @@ L(loop_4x_vec):
+ 
+ 	/* Fall through into less than 4 remaining vectors of length case.
+ 	 */
+-	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
+ 	kmovd	%k0, %eax
+-	addq	$(VEC_SIZE * 3), %rdi
+-	.p2align 4
+	VZEROUPPER
+
+ L(last_4x_vec_or_less):
+ 	/* Check if first VEC contained match.  */
+ 	testl	%eax, %eax
+@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
+ 	/* rawmemchr will fall through into this if match was found in
+ 	   loop.  */
+ 
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
+ 	/* k1 has not of matches with VEC1.  */
+ 	kmovd	%k1, %eax
+-# ifdef USE_AS_WMEMCHR
+#  ifdef USE_AS_WMEMCHR
+ 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
+-# else
+#  else
+ 	incl	%eax
+#  endif
+# else
+	/* eax already has matches for VEC1.  */
+	testl	%eax, %eax
+ # endif
+ 	jnz	L(last_vec_x1_return)
+ 
+# ifdef USE_IN_RTM
+ 	VPCMP	$0, %YMM2, %YMMZERO, %k0
+ 	kmovd	%k0, %eax
+# else
+	vpmovmskb %ymm2, %eax
+# endif
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x2_return)
+ 
+# ifdef USE_IN_RTM
+ 	kmovd	%k2, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(last_vec_x3_return)
+ 
+ 	kmovd	%k3, %eax
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+	vpmovmskb %ymm3, %eax
+	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
+	salq	$VEC_SIZE, %rcx
+	orq	%rcx, %rax
+	tzcntq	%rax, %rax
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+	VZEROUPPER
+ # endif
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x1_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-#  ifdef USE_AS_WMEMCHR
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(%rdi, %rax, CHAR_SIZE), %rax
+-#  else
+-	addq	%rdi, %rax
+-#  endif
+	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
+ # else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+	addq	%rdi, %rax
+ # endif
+	VZEROUPPER
+ 	ret
+ 
+ 	.p2align 4
+ L(last_vec_x2_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
+	/* NB: Multiply the index by RET_SCALE to get the byte offset
+	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHR and
+	   USE_IN_RTM are both defined). Otherwise RET_SCALE = 1.  */
+	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+	VZEROUPPER
+ 	ret
+ 
+# ifdef USE_IN_RTM
+ 	.p2align 4
+ L(last_vec_x3_return):
+ 	tzcntl	%eax, %eax
+-# ifdef USE_AS_RAWMEMCHR
+-	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+-# else
+ 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
+-	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
+-# endif
+	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
+ 	ret
+-
+# endif
+ 
+ # ifndef USE_AS_RAWMEMCHR
+ L(last_4x_vec_or_less_cmpeq):
+diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
+index 016f5784..f28aea77 100644
+--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
+@@ -24,7 +24,7 @@
+ # undef memchr
+ 
+ # define SYMBOL_NAME memchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
+ strong_alias (memchr, __memchr)
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..deda1ca3
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
+index 8a0bc313..1f764f35 100644
+--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __rawmemchr
+ 
+ # define SYMBOL_NAME rawmemchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
+ 		       IFUNC_SELECTOR ());
+diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+new file mode 100644
+index 00000000..a346cd35
+--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
+@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
+diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
+index 6d833702..f9c91915 100644
+--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
+@@ -26,7 +26,7 @@
+ # undef __wmemchr
+ 
+ # define SYMBOL_NAME wmemchr
+-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
+ 
+ libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
+ weak_alias (__wmemchr, wmemchr)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-45.patch
+++ b/SOURCES/glibc-RHEL-15696-45.patch
@ -0,0 +1,873 @@
+From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:56:52 -0400
+Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations
+include adding a new vec compare path for small sizes, reorganizing the
+entry control flow, and removing some unnecessary ALU instructions from
+the main loop. test-memcmp and test-wmemcmp are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
+ sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
+ sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
+ 3 files changed, 402 insertions(+), 281 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+index ac097e8d..8be0d78a 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, memcmp,
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __memcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __memcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
+@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   IFUNC_IMPL (i, name, wmemcmp,
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_avx2_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX2)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)
+ 			       && CPU_FEATURE_USABLE (RTM)),
+ 			      __wmemcmp_avx2_movbe_rtm)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
+ 			      (CPU_FEATURE_USABLE (AVX512VL)
+ 			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)
+ 			       && CPU_FEATURE_USABLE (MOVBE)),
+ 			      __wmemcmp_evex_movbe)
+ 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
+diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+index 8043c635..690dffe8 100644
+--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
+ 
+   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+     {
+       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+index 9d5c9c72..16fc673e 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+@@ -19,17 +19,23 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < VEC_SIZE
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * VEC_SIZE one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+
+ 
+ # include <sysdep.h>
+ 
+@@ -38,8 +44,10 @@
+ # endif
+ 
+ # ifdef USE_AS_WMEMCMP
+#  define CHAR_SIZE	4
+ #  define VPCMPEQ	vpcmpeqd
+ # else
+#  define CHAR_SIZE	1
+ #  define VPCMPEQ	vpcmpeqb
+ # endif
+ 
+@@ -52,7 +60,7 @@
+ # endif
+ 
+ # define VEC_SIZE 32
+-# define VEC_MASK ((1 << VEC_SIZE) - 1)
+# define PAGE_SIZE	4096
+ 
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	incl	%eax
+	jnz	L(return_vec_0)
+ 
+ 	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	VPCMPEQ	%ymm0, %ymm0, %ymm0
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+	jbe	L(last_1x_vec)
+ 
+	/* Check second VEC no matter what.  */
+ 	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	/* If all 4 VEC where equal eax will be all 1s so incl will
+	   overflow and set zero flag.  */
+	incl	%eax
+	jnz	L(return_vec_1)
+ 
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	/* Less than 4 * VEC.  */
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_2x_vec)
+ 
+	/* Check third and fourth VEC no matter what.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+	vpmovmskb %ymm4, %ecx
+	incl	%ecx
+	jnz	L(return_vec_3)
+ 
+-	vpand	%ymm1, %ymm2, %ymm5
+-	vpand	%ymm3, %ymm4, %ymm6
+-	vpand	%ymm5, %ymm6, %ymm5
+	/* Go to 4x VEC loop.  */
+	cmpq	$(VEC_SIZE * 8), %rdx
+	ja	L(more_8x_vec)
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
+ 
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
+	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
+ 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+ 
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
+ 
+ 	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ 
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-	xorl	%eax, %eax
+	/* Reduce VEC0 - VEC4.  */
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ L(return_vzeroupper):
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(return_vec_2):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
+	.p2align 5
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0)
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+ 	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	incl	%eax
+	jnz	L(return_vec_1)
+
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
+	 */
+	vmovdqu	(%rsi, %rdi), %ymm1
+	VPCMPEQ	(%rdi), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
+
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
+	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	/* Check if s1 pointer at end.  */
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	/* Check last 4 VEC.  */
+	vmovdqu	(%rsi, %rdx), %ymm1
+	VPCMPEQ	(%rdx), %ymm1, %ymm1
+
+	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
+	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
+
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+
+	vpand	%ymm1, %ymm2, %ymm5
+	vpand	%ymm3, %ymm4, %ymm6
+	vpand	%ymm5, %ymm6, %ymm7
+	vpmovmskb %ymm7, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	incl	%ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	VZEROUPPER_RETURN
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	/* Check second to last VEC. rdx store end pointer of s1 and
+	   ymm3 has already been loaded with second to last VEC from s2.
+	 */
+	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+	vpmovmskb %ymm3, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_2)
+	/* Check last VEC.  */
+	.p2align 4
+L(8x_last_1x_vec):
+	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+	vpmovmskb %ymm4, %eax
+	incl	%eax
+	jnz	L(8x_return_vec_3)
+ 	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_1_end)
+	/* Check last VEC.  */
+L(last_1x_vec):
+	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
+	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
+	vpmovmskb %ymm1, %eax
+	incl	%eax
+	jnz	L(return_vec_0_end)
+	VZEROUPPER_RETURN
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+	addq	%rdx, %rax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx), %edx
+-	cmpl	(%rsi, %rcx), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+ # endif
+ 	VZEROUPPER_RETURN
+ 
+-# ifdef USE_AS_WMEMCMP
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+-	ret
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+-	ret
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(exit):
+-	ret
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$CHAR_SIZE, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	vmovdqu	(%rsi), %ymm2
+	VPCMPEQ	(%rdi), %ymm2, %ymm2
+	vpmovmskb %ymm2, %eax
+	incl	%eax
+	/* Result will be zero if s1 and s2 match. Otherwise first set
+	   bit will be first mismatch.  */
+	bzhil	%edx, %eax, %edx
+	jnz	L(return_vec_0)
+	xorl	%eax, %eax
+	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(between_2_3):
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$16, %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
+
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+ 	movzwl	(%rsi), %ecx
+@@ -208,223 +439,106 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+-	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+	/* No ymm register was touched.  */
+ 	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
+L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+ 	leaq	-8(%rdi, %rdx), %rdi
+ 	leaq	-8(%rsi, %rdx), %rsi
+ 	vmovq	(%rdi), %xmm1
+ 	vmovq	(%rsi), %xmm2
+-	VPCMPEQ %xmm1, %xmm2, %xmm2
+	VPCMPEQ	%xmm1, %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
+	ret
+
+	.p2align 4
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+
+	vmovdqu	-16(%rsi, %rdx), %xmm2
+ 	leaq	-16(%rdi, %rdx), %rdi
+ 	leaq	-16(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %xmm2
+-	VPCMPEQ (%rdi), %xmm2, %xmm2
+	VPCMPEQ	(%rdi), %xmm2, %xmm2
+ 	vpmovmskb %xmm2, %eax
+-	subl    $0xffff, %eax
+-	jnz	L(first_vec)
+	subl	$0xffff, %eax
+	jnz	L(return_vec_0)
+	/* No ymm register was touched.  */
+ 	ret
+ 
+-	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	vmovdqu	(%rsi), %ymm1
+-	VPCMPEQ (%rdi), %ymm1, %ymm1
+-
+-	vmovdqu	VEC_SIZE(%rsi), %ymm2
+-	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+-	vpand	%ymm2, %ymm1, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+-	vpand	%ymm3, %ymm5, %ymm5
+-
+-	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+-	vpand	%ymm4, %ymm5, %ymm5
+-
+-	vptest	%ymm0, %ymm5
+-	jnc	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	vmovdqu	(%rsi), %ymm2
+-	VPCMPEQ (%rdi), %ymm2, %ymm2
+-	vpmovmskb %ymm2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	VZEROUPPER_RETURN
+-
+-	.p2align 4
+-L(4x_vec_end):
+-	vpmovmskb %ymm1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	vpmovmskb %ymm2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	vpmovmskb %ymm3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	vpmovmskb %ymm4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+-
+ 	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+	/* No ymm register was touched.  */
+	ret
+ # else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+-	VZEROUPPER_RETURN
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
+	/* No ymm register was touched.  */
+	ret
+ # endif
+-	VZEROUPPER_RETURN
+
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-46.patch
+++ b/SOURCES/glibc-RHEL-15696-46.patch
@ -0,0 +1,851 @@
+From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 May 2021 13:57:24 -0400
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit optimizes memcmp-evex-movbe.S. The optimizations
+include adding a new vec compare path for small sizes, reorganizing the
+entry control flow, removing some unnecessary ALU instructions from the
+main loop, and most importantly replacing the heavy use of vpcmp + kand
+logic with vpxor + vpternlog. test-memcmp and test-wmemcmp are both
+passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
+ 1 file changed, 408 insertions(+), 302 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 9c093972..654dc7ac 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -19,17 +19,22 @@
+ #if IS_IN (libc)
+ 
+ /* memcmp/wmemcmp is implemented as:
+-   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+-      to avoid branches.
+-   2. Use overlapping compare to avoid branch.
+-   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+-      bytes for wmemcmp.
+-   4. If size is 8 * VEC_SIZE or less, unroll the loop.
+-   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+   1. Use ymm vector compares when possible. The only case where
+      vector compares is not possible for when size < CHAR_PER_VEC
+      and loading from either s1 or s2 would cause a page cross.
+   2. For size from 2 to 7 bytes on page cross, load as big endian
+      with movbe and bswap to avoid branches.
+   3. Use xmm vector compare when size >= 4 bytes for memcmp or
+      size >= 8 bytes for wmemcmp.
+   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+      to check for early mismatches. Only do this if its guranteed the
+      work is not wasted.
+   5. If size is 8 * VEC_SIZE or less, unroll the loop.
+   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
+       area.
+-   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+-   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+-   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
+   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
+ 
+ # include <sysdep.h>
+ 
+@@ -40,11 +45,21 @@
+ # define VMOVU		vmovdqu64
+ 
+ # ifdef USE_AS_WMEMCMP
+-#  define VPCMPEQ	vpcmpeqd
+#  define CHAR_SIZE	4
+#  define VPCMP	vpcmpd
+ # else
+-#  define VPCMPEQ	vpcmpeqb
+#  define CHAR_SIZE	1
+#  define VPCMP	vpcmpub
+ # endif
+ 
+# define VEC_SIZE	32
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# define XMM0		xmm16
+# define XMM1		xmm17
+# define XMM2		xmm18
+# define YMM0		ymm16
+ # define XMM1		xmm17
+ # define XMM2		xmm18
+ # define YMM1		ymm17
+@@ -54,15 +69,6 @@
+ # define YMM5		ymm21
+ # define YMM6		ymm22
+ 
+-# define VEC_SIZE 32
+-# ifdef USE_AS_WMEMCMP
+-#  define VEC_MASK 0xff
+-#  define XMM_MASK 0xf
+-# else
+-#  define VEC_MASK 0xffffffff
+-#  define XMM_MASK 0xffff
+-# endif
+-
+ /* Warning!
+            wmemcmp has to use SIGNED comparison for elements.
+            memcmp has to use UNSIGNED comparison for elemnts.
+@@ -70,145 +76,370 @@
+ 
+ 	.section .text.evex,"ax",@progbits
+ ENTRY (MEMCMP)
+-# ifdef USE_AS_WMEMCMP
+-	shl	$2, %RDX_LP
+-# elif defined __ILP32__
+# ifdef __ILP32__
+ 	/* Clear the upper 32 bits.  */
+ 	movl	%edx, %edx
+ # endif
+-	cmp	$VEC_SIZE, %RDX_LP
+	cmp	$CHAR_PER_VEC, %RDX_LP
+ 	jb	L(less_vec)
+ 
+ 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k1
+	VMOVU	(%rsi), %YMM1
+	/* Use compare not equals to directly check for mismatch.  */
+	VPCMP	$4, (%rdi), %YMM1, %k1
+ 	kmovd	%k1, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_vec)
+-
+-	/* More than 2 * VEC.  */
+-	cmpq	$(VEC_SIZE * 8), %rdx
+-	ja	L(more_8x_vec)
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+	/* NB: eax must be destination register if going to
+	   L(return_vec_[0,2]). For L(return_vec_3 destination register
+	   must be ecx.  */
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+-	/* From 4 * VEC to 8 * VEC, inclusively. */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+	cmpq	$(CHAR_PER_VEC * 2), %rdx
+	jbe	L(last_1x_vec)
+ 
+	/* Check second VEC no matter what.  */
+ 	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	/* Less than 4 * VEC.  */
+	cmpq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(last_2x_vec)
+ 
+	/* Check third and fourth VEC no matter what.  */
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_3)
+ 
+-	kandd	%k1, %k2, %k5
+-	kandd	%k3, %k4, %k6
+-	kandd	%k5, %k6, %k6
+	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+	   compare with zero to get a mask is needed.  */
+	vpxorq	%XMM0, %XMM0, %XMM0
+ 
+-	kmovd	%k6, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+	/* Go to 4x VEC loop.  */
+	cmpq	$(CHAR_PER_VEC * 8), %rdx
+	ja	L(more_8x_vec)
+ 
+-	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+	   branches.  */
+ 
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k1, %k2, %k5
+	/* Load first two VEC from s2 before adjusting addresses.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+	/* Wait to load from s1 until addressed adjust due to
+	   unlamination of microfusion with complex address mode.  */
+
+	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+	   will have some 1s.  */
+	vpxorq	(%rdi), %YMM1, %YMM1
+	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
+ 
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+ 
+ 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+	   oring with YMM3. Result is stored in YMM4.  */
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+ 
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-	xorl	%eax, %eax
+	/* NB: aligning 32 here allows for the rest of the jump targets
+	   to be tuned for 32 byte alignment. Most important this ensures
+	   the L(more_8x_vec) loop is 32 byte aligned.  */
+	.p2align 5
+L(less_vec):
+	/* Check if one or less CHAR. This is necessary for size = 0 but
+	   is also faster for size = CHAR_SIZE.  */
+	cmpl	$1, %edx
+	jbe	L(one_or_less)
+
+	/* Check if loading one VEC from either s1 or s2 could cause a
+	   page cross. This can have false positives but is by far the
+	   fastest method.  */
+	movl	%edi, %eax
+	orl	%esi, %eax
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	jg	L(page_cross_less_vec)
+
+	/* No page cross possible.  */
+	VMOVU	(%rsi), %YMM2
+	VPCMP	$4, (%rdi), %YMM2, %k1
+	kmovd	%k1, %eax
+	/* Create mask in ecx for potentially in bound matches.  */
+	bzhil	%edx, %eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(last_2x_vec):
+-	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(return_vec_0):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
+	/* NB: no partial register stall here because xorl zero idiom
+	   above.  */
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(%rsi, %rax), %ecx
+	movzbl	(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+ 
+-L(last_vec):
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
+-	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
+	   which is good enough for a target not in a loop.  */
+L(return_vec_1):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec):
+-	/* A byte or int32 is different within 16 or 32 bytes.  */
+-	tzcntl	%eax, %ecx
+	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
+	   which is good enough for a target not in a loop.  */
+L(return_vec_2):
+	tzcntl	%eax, %eax
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(%rdi, %rcx, 4), %edx
+-	cmpl	(%rsi, %rcx, 4), %edx
+-L(wmemcmp_return):
+-	setl	%al
+-	negl	%eax
+-	orl	$1, %eax
+	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+-	movzbl	(%rdi, %rcx), %eax
+-	movzbl	(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+ # endif
+ 	ret
+ 
+	.p2align 4
+L(8x_return_vec_0_1_2_3):
+	/* Returning from L(more_8x_vec) requires restoring rsi.  */
+	addq	%rdi, %rsi
+L(return_vec_0_1_2_3):
+	VPCMP	$4, %YMM1, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+
+	VPCMP	$4, %YMM2, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1)
+
+	VPCMP	$4, %YMM3, %YMM0, %k0
+	kmovd	%k0, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_2)
+L(return_vec_3):
+	tzcntl	%ecx, %ecx
+ # ifdef USE_AS_WMEMCMP
+	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
+	subl	%ecx, %eax
+# endif
+	ret
+
+ 	.p2align 4
+-L(4):
+-	xorl	%eax, %eax
+-	movl	(%rdi), %edx
+-	cmpl	(%rsi), %edx
+-	jne	L(wmemcmp_return)
+L(more_8x_vec):
+	/* Set end of s1 in rdx.  */
+	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+	/* rsi stores s2 - s1. This allows loop to only update one
+	   pointer.  */
+	subq	%rdi, %rsi
+	/* Align s1 pointer.  */
+	andq	$-VEC_SIZE, %rdi
+	/* Adjust because first 4x vec where check already.  */
+	subq	$-(VEC_SIZE * 4), %rdi
+	.p2align 4
+L(loop_4x_vec):
+	VMOVU	(%rsi, %rdi), %YMM1
+	vpxorq	(%rdi), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
+	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
+
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rdx, %rdi
+	jb	L(loop_4x_vec)
+
+	subq	%rdx, %rdi
+	/* rdi has 4 * VEC_SIZE - remaining length.  */
+	cmpl	$(VEC_SIZE * 3), %edi
+	jae	L(8x_last_1x_vec)
+	/* Load regardless of branch.  */
+	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+	cmpl	$(VEC_SIZE * 2), %edi
+	jae	L(8x_last_2x_vec)
+
+	VMOVU	(%rsi, %rdx), %YMM1
+	vpxorq	(%rdx), %YMM1, %YMM1
+
+	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
+	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
+
+	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+	VPCMP	$4, %YMM4, %YMM0, %k1
+	kmovd	%k1, %ecx
+	/* Restore s1 pointer to rdi.  */
+	movq	%rdx, %rdi
+	testl	%ecx, %ecx
+	jnz	L(8x_return_vec_0_1_2_3)
+	/* NB: eax must be zero to reach here.  */
+	ret
+
+	/* Only entry is from L(more_8x_vec).  */
+	.p2align 4
+L(8x_last_2x_vec):
+	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_2)
+	/* Naturally aligned to 16 bytes.  */
+L(8x_last_1x_vec):
+	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(8x_return_vec_3)
+	ret
+
+	.p2align 4
+L(last_2x_vec):
+	/* Check second to last VEC.  */
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_1_end)
+
+	/* Check last VEC.  */
+	.p2align 4
+L(last_1x_vec):
+	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0_end)
+ 	ret
+
+	.p2align 4
+L(8x_return_vec_2):
+	subq	$VEC_SIZE, %rdx
+L(8x_return_vec_3):
+	tzcntl	%eax, %eax
+# ifdef USE_AS_WMEMCMP
+	leaq	(%rdx, %rax, CHAR_SIZE), %rax
+	movl	(VEC_SIZE * 3)(%rax), %ecx
+	xorl	%edx, %edx
+	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ # else
+	addq	%rdx, %rax
+	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
+	movzbl	(VEC_SIZE * 3)(%rax), %eax
+	subl	%ecx, %eax
+# endif
+	ret
+
+ 	.p2align 4
+-L(between_4_7):
+-	/* Load as big endian with overlapping movbe to avoid branches.  */
+-	movbe	(%rdi), %eax
+-	movbe	(%rsi), %ecx
+-	shlq	$32, %rax
+-	shlq	$32, %rcx
+-	movbe	-4(%rdi, %rdx), %edi
+-	movbe	-4(%rsi, %rdx), %esi
+-	orq	%rdi, %rax
+-	orq	%rsi, %rcx
+-	subq	%rcx, %rax
+-	je	L(exit)
+-	sbbl	%eax, %eax
+-	orl	$1, %eax
+L(return_vec_0_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
+	movzbl	-VEC_SIZE(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+ 	.p2align 4
+-L(exit):
+L(return_vec_1_end):
+	tzcntl	%eax, %eax
+	addl	%edx, %eax
+# ifdef USE_AS_WMEMCMP
+	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+	xorl	%edx, %edx
+	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+# else
+	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
+	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
+	subl	%ecx, %eax
+# endif
+ 	ret
+ 
+
+ 	.p2align 4
+L(page_cross_less_vec):
+	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+	   bytes.  */
+	cmpl	$(16 / CHAR_SIZE), %edx
+	jae	L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+	cmpl	$8, %edx
+	jae	L(between_8_15)
+	cmpl	$4, %edx
+	jae	L(between_4_7)
+ L(between_2_3):
+ 	/* Load as big endian to avoid branches.  */
+ 	movzwl	(%rdi), %eax
+@@ -217,224 +448,99 @@ L(between_2_3):
+ 	shll	$8, %ecx
+ 	bswap	%eax
+ 	bswap	%ecx
+-	movb	-1(%rdi, %rdx), %al
+-	movb	-1(%rsi, %rdx), %cl
+	movzbl	-1(%rdi, %rdx), %edi
+	movzbl	-1(%rsi, %rdx), %esi
+	orl	%edi, %eax
+	orl	%esi, %ecx
+ 	/* Subtraction is okay because the upper 8 bits are zero.  */
+ 	subl	%ecx, %eax
+ 	ret
+-
+ 	.p2align 4
+-L(1):
+-	movzbl	(%rdi), %eax
+L(one_or_less):
+	jb	L(zero)
+ 	movzbl	(%rsi), %ecx
+	movzbl	(%rdi), %eax
+ 	subl	%ecx, %eax
+ 	ret
+-# endif
+-
+-	.p2align 4
+-L(zero):
+-	xorl	%eax, %eax
+-	ret
+ 
+ 	.p2align 4
+-L(less_vec):
+-# ifdef USE_AS_WMEMCMP
+-	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
+-	cmpb	$4, %dl
+-	je	L(4)
+-	jb	L(zero)
+-# else
+-	cmpb	$1, %dl
+-	je	L(1)
+-	jb	L(zero)
+-	cmpb	$4, %dl
+-	jb	L(between_2_3)
+-	cmpb	$8, %dl
+-	jb	L(between_4_7)
+L(between_8_15):
+ # endif
+-	cmpb	$16, %dl
+-	jae	L(between_16_31)
+-	/* It is between 8 and 15 bytes.  */
+	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-8(%rdi, %rdx), %rdi
+-	leaq	-8(%rsi, %rdx), %rsi
+	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
+ 	vmovq	(%rdi), %XMM1
+ 	vmovq	(%rsi), %XMM2
+-	VPCMPEQ %XMM1, %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+	VPCMP	$4, %XMM1, %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+ 	.p2align 4
+-L(between_16_31):
+-	/* From 16 to 31 bytes.  No branch when size == 16.  */
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Use overlapping loads to avoid branches.  */
+-	leaq	-16(%rdi, %rdx), %rdi
+-	leaq	-16(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %XMM2
+-	VPCMPEQ (%rdi), %XMM2, %k2
+-	kmovw	%k2, %eax
+-	subl    $XMM_MASK, %eax
+-	jnz	L(first_vec)
+L(zero):
+	xorl	%eax, %eax
+ 	ret
+ 
+ 	.p2align 4
+-L(more_8x_vec):
+-	/* More than 8 * VEC.  Check the first VEC.  */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	/* Align the first memory area for aligned loads in the loop.
+-	   Compute how much the first memory area is misaligned.  */
+-	movq	%rdi, %rcx
+-	andl	$(VEC_SIZE - 1), %ecx
+-	/* Get the negative of offset for alignment.  */
+-	subq	$VEC_SIZE, %rcx
+-	/* Adjust the second memory area.  */
+-	subq	%rcx, %rsi
+-	/* Adjust the first memory area which should be aligned now.  */
+-	subq	%rcx, %rdi
+-	/* Adjust length.  */
+-	addq	%rcx, %rdx
+-
+-L(loop_4x_vec):
+-	/* Compare 4 * VEC at a time forward.  */
+-	VMOVU	(%rsi), %YMM1
+-	VPCMPEQ (%rdi), %YMM1, %k1
+-
+-	VMOVU	VEC_SIZE(%rsi), %YMM2
+-	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+-	kandd	%k2, %k1, %k5
+-
+-	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
+-	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+-	kandd	%k3, %k5, %k5
+-
+-	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
+-	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+-	kandd	%k4, %k5, %k5
+-
+-	kmovd	%k5, %eax
+-	cmpl	$VEC_MASK, %eax
+-	jne	L(4x_vec_end)
+-
+-	addq	$(VEC_SIZE * 4), %rdi
+-	addq	$(VEC_SIZE * 4), %rsi
+-
+-	subq	$(VEC_SIZE * 4), %rdx
+-	cmpq	$(VEC_SIZE * 4), %rdx
+-	jae	L(loop_4x_vec)
+-
+-	/* Less than 4 * VEC.  */
+-	cmpq	$VEC_SIZE, %rdx
+-	jbe	L(last_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+-	jbe	L(last_2x_vec)
+-
+-L(last_4x_vec):
+-	/* From 2 * VEC to 4 * VEC. */
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+L(between_16_31):
+	/* From 16 to 31 bytes.  No branch when size == 16.  */
+	VMOVU	(%rsi), %XMM2
+	VPCMP	$4, (%rdi), %XMM2, %k1
+	kmovd	%k1, %eax
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 
+ 	/* Use overlapping loads to avoid branches.  */
+-	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+-	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+ 
+-	addq	$VEC_SIZE, %rdi
+-	addq	$VEC_SIZE, %rsi
+-	VMOVU	(%rsi), %YMM2
+-	VPCMPEQ (%rdi), %YMM2, %k2
+-	kmovd	%k2, %eax
+-	subl    $VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	ret
+-
+-	.p2align 4
+-L(4x_vec_end):
+	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
+	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
+	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
+	VPCMP	$4, (%rdi), %XMM2, %k1
+ 	kmovd	%k1, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec)
+-	kmovd	%k2, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x1)
+-	kmovd	%k3, %eax
+-	subl	$VEC_MASK, %eax
+-	jnz	L(first_vec_x2)
+-	kmovd	%k4, %eax
+-	subl	$VEC_MASK, %eax
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+	testl	%eax, %eax
+	jnz	L(return_vec_0)
+ 	ret
+ 
+-	.p2align 4
+-L(first_vec_x1):
+-	tzcntl	%eax, %ecx
+ # ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
+-	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	VEC_SIZE(%rdi, %rcx), %eax
+-	movzbl	VEC_SIZE(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+	.p2align 4
+L(one_or_less):
+	jb	L(zero)
+	movl	(%rdi), %ecx
+	xorl	%edx, %edx
+	cmpl	(%rsi), %ecx
+	je	L(zero)
+	setg	%dl
+	leal	-1(%rdx, %rdx), %eax
+ 	ret
+# else
+ 
+ 	.p2align 4
+-L(first_vec_x2):
+-	tzcntl	%eax, %ecx
+-# ifdef USE_AS_WMEMCMP
+-	xorl	%eax, %eax
+-	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+-	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+-	jmp	L(wmemcmp_return)
+-# else
+-	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
+-	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
+-	sub	%edx, %eax
+-# endif
+L(between_4_7):
+	/* Load as big endian with overlapping movbe to avoid branches.
+	 */
+	movbe	(%rdi), %eax
+	movbe	(%rsi), %ecx
+	shlq	$32, %rax
+	shlq	$32, %rcx
+	movbe	-4(%rdi, %rdx), %edi
+	movbe	-4(%rsi, %rdx), %esi
+	orq	%rdi, %rax
+	orq	%rsi, %rcx
+	subq	%rcx, %rax
+	jz	L(zero_4_7)
+	sbbl	%eax, %eax
+	orl	$1, %eax
+L(zero_4_7):
+ 	ret
+# endif
+
+ END (MEMCMP)
+ #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-47.patch
+++ b/SOURCES/glibc-RHEL-15696-47.patch
@ -0,0 +1,104 @@
+From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Thu, 20 May 2021 13:13:51 -0400
+Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. This commit makes a few small improvements to
+memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
+instead of 128. Either alignment will perform equally well in a loop
+and 128 just increases the odds of having to do an extra iteration
+which can be significant overhead for small values. 2) Align some
+targets and the loop. 3) Remove an ALU from the alignment process. 4)
+Reorder the last 4x VEC so that they are stored after the loop. 5)
+Move the condition for leq 8x VEC to before the alignment
+process. test-memset and test-wmemset are both passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
+ 1 file changed, 28 insertions(+), 22 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index f877ac9d..909c33f6 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VZEROUPPER_RETURN
+ 
+	.p2align 4
+ L(stosb_more_2x_vec):
+ 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
+ 	ja	L(stosb)
+#else
+	.p2align 4
+ #endif
+ L(more_2x_vec):
+-	cmpq  $(VEC_SIZE * 4), %rdx
+-	ja	L(loop_start)
+	/* Stores to first 2x VEC before cmp as any path forward will
+	   require it.  */
+ 	VMOVU	%VEC(0), (%rdi)
+ 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+	cmpq	$(VEC_SIZE * 4), %rdx
+	ja	L(loop_start)
+ 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+ L(return):
+ #if VEC_SIZE > 16
+ 	ZERO_UPPER_VEC_REGISTERS_RETURN
+@@ -192,28 +197,29 @@ L(return):
+ #endif
+ 
+ L(loop_start):
+-	leaq	(VEC_SIZE * 4)(%rdi), %rcx
+-	VMOVU	%VEC(0), (%rdi)
+-	andq	$-(VEC_SIZE * 4), %rcx
+-	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+-	VMOVU	%VEC(0), VEC_SIZE(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
+ 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
+-	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
+-	addq	%rdi, %rdx
+-	andq	$-(VEC_SIZE * 4), %rdx
+-	cmpq	%rdx, %rcx
+-	je	L(return)
+	cmpq	$(VEC_SIZE * 8), %rdx
+	jbe	L(loop_end)
+	andq	$-(VEC_SIZE * 2), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
+	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
+	.p2align 4
+ L(loop):
+-	VMOVA	%VEC(0), (%rcx)
+-	VMOVA	%VEC(0), VEC_SIZE(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
+-	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
+-	addq	$(VEC_SIZE * 4), %rcx
+-	cmpq	%rcx, %rdx
+-	jne	L(loop)
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(0), VEC_SIZE(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpq	%rcx, %rdi
+	jb	L(loop)
+L(loop_end):
+	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+	       rdx as length is also unchanged.  */
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
+ 	VZEROUPPER_SHORT_RETURN
+ 
+ 	.p2align 4
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-48.patch
+++ b/SOURCES/glibc-RHEL-15696-48.patch
@ -0,0 +1,84 @@
+From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 23 May 2021 19:43:24 -0400
+Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8
+
+This patch changes the condition for copy 4x VEC so that if length is
+exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
+8x VEC case.
+
+Results For Skylake memcpy-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.137   , 6.873   , New , 75.22
+128 , 7   , 0   , 12.933  , 7.732   , New , 59.79
+128 , 0   , 7   , 11.852  , 6.76    , New , 57.04
+128 , 7   , 7   , 12.587  , 6.808   , New , 54.09
+
+Results For Icelake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 9.963   , 5.416   , New , 54.36
+128 , 7   , 0   , 16.467  , 8.061   , New , 48.95
+128 , 0   , 7   , 14.388  , 7.644   , New , 53.13
+128 , 7   , 7   , 14.546  , 7.642   , New , 52.54
+
+Results For Tigerlake memcpy-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 0   , 8.979   , 4.95    , New , 55.13
+128 , 7   , 0   , 14.245  , 7.122   , New , 50.0
+128 , 0   , 7   , 12.668  , 6.675   , New , 52.69
+128 , 7   , 7   , 13.042  , 6.802   , New , 52.15
+
+Results For Skylake memmove-avx2-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 6.181   , 5.691   , New , 92.07
+128 , 32  , 0   , 6.165   , 5.752   , New , 93.3
+128 , 0   , 7   , 13.923  , 9.37    , New , 67.3
+128 , 7   , 0   , 12.049  , 10.182  , New , 84.5
+
+Results For Icelake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.479   , 4.889   , New , 89.23
+128 , 32  , 0   , 5.127   , 4.911   , New , 95.79
+128 , 0   , 7   , 18.885  , 13.547  , New , 71.73
+128 , 7   , 0   , 15.565  , 14.436  , New , 92.75
+
+Results For Tigerlake memmove-evex-erms
+size, al1 , al2 , Cur T   , New T   , Win , New / Cur
+128 , 0   , 32  , 5.275   , 4.815   , New , 91.28
+128 , 32  , 0   , 5.376   , 4.565   , New , 84.91
+128 , 0   , 7   , 19.426  , 14.273  , New , 73.47
+128 , 7   , 0   , 15.924  , 14.951  , New , 93.89
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 3e2dd6bc..572cef04 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -417,8 +417,8 @@ L(more_2x_vec):
+ 	cmpq	$(VEC_SIZE * 8), %rdx
+ 	ja	L(more_8x_vec)
+ 	cmpq	$(VEC_SIZE * 4), %rdx
+-	jb	L(last_4x_vec)
+-	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
+	jbe	L(last_4x_vec)
+	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+@@ -437,7 +437,7 @@ L(more_2x_vec):
+ 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
+ 	VZEROUPPER_RETURN
+ L(last_4x_vec):
+-	/* Copy from 2 * VEC to 4 * VEC. */
+	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
+ 	VMOVU	(%rsi), %VEC(0)
+ 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+ 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-49.patch
+++ b/SOURCES/glibc-RHEL-15696-49.patch
@ -0,0 +1,55 @@
+From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 23 Jun 2021 19:19:34 -0400
+Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
+Content-type: text/plain; charset=UTF-8
+
+No bug. The way wcsnlen checks whether it is near the end of maxlen
+is the following macro:
+
+	mov	%r11, %rsi;	\
+	subq	%rax, %rsi;	\
+	andq	$-64, %rax;	\
+	testq	$-64, %rsi;	\
+	je	L(strnlen_ret)
+
+Which works independently of s + maxlen overflowing. So the
+second overflow check is unnecessary for correctness and is
+just extra overhead in the common no-overflow case.
+
+test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
+all passing
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
+ 1 file changed, 7 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 439e486a..b7657282 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -71,19 +71,12 @@ L(n_nonzero):
+    suffice.  */
+ 	mov	%RSI_LP, %R10_LP
+ 	sar	$62, %R10_LP
+-	test	%R10_LP, %R10_LP
+ 	jnz	__wcslen_sse4_1
+ 	sal	$2, %RSI_LP
+ # endif
+ 
+-
+ /* Initialize long lived registers.  */
+-
+ 	add	%RDI_LP, %RSI_LP
+-# ifdef AS_WCSLEN
+-/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
+-	jbe	__wcslen_sse4_1
+-# endif
+ 	mov	%RSI_LP, %R10_LP
+ 	and	$-64, %R10_LP
+ 	mov	%RSI_LP, %R11_LP
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-5.patch
+++ b/SOURCES/glibc-RHEL-15696-5.patch
@ -0,0 +1,290 @@
+From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:32:24 -0800
+Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On
+x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
+	RDX_LP for length.  Clear the upper 32 bits of RDX register.
+	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
+	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
+---
+ .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++----
+ sysdeps/x86_64/x32/Makefile                   |  4 +-
+ sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++
+ 5 files changed, 121 insertions(+), 16 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+index 689cc119..99e25519 100644
+--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+@@ -29,12 +29,16 @@
+ 	.section .text.avx512,"ax",@progbits
+ #if defined PIC
+ ENTRY (MEMSET_CHK)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (MEMSET_CHK)
+ #endif
+ 
+ ENTRY (MEMSET)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ 	vpxor	%xmm0, %xmm0, %xmm0
+ 	vmovd	%esi, %xmm1
+ 	lea	(%rdi, %rdx), %rsi
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+index 270a1d49..9a0fd818 100644
+--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+@@ -65,8 +65,8 @@
+ 	.section SECTION(.text),"ax",@progbits
+ #if VEC_SIZE == 16 && IS_IN (libc)
+ ENTRY (__bzero)
+-	movq	%rdi, %rax /* Set return value.  */
+-	movq	%rsi, %rdx /* Set n.  */
+	mov	%RDI_LP, %RAX_LP /* Set return value.  */
+	mov	%RSI_LP, %RDX_LP /* Set n.  */
+ 	pxor	%xmm0, %xmm0
+ 	jmp	L(entry_from_bzero)
+ END (__bzero)
+@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
+ #if IS_IN (libc)
+ # if defined SHARED
+ ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
+ # endif
+ 
+ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
+-	shlq	$2, %rdx
+	shl	$2, %RDX_LP
+ 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ 	jmp	L(entry_from_bzero)
+ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
+ 
+ #if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
+ #endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+ L(entry_from_bzero):
+ 	cmpq	$VEC_SIZE, %rdx
+ 	jb	L(less_vec)
+@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
+ 
+ # if VEC_SIZE == 16
+ ENTRY (__memset_chk_erms)
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END (__memset_chk_erms)
+ 
+ /* Only used to measure performance of REP STOSB.  */
+ ENTRY (__memset_erms)
+ 	/* Skip zero length.  */
+-	testq	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	jnz	 L(stosb)
+ 	movq	%rdi, %rax
+ 	ret
+@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
+ L(stosb):
+ 	/* Issue vzeroupper before rep stosb.  */
+ 	VZEROUPPER
+-	movq	%rdx, %rcx
+	mov	%RDX_LP, %RCX_LP
+ 	movzbl	%sil, %eax
+-	movq	%rdi, %rdx
+	mov	%RDI_LP, %RDX_LP
+ 	rep stosb
+-	movq	%rdx, %rax
+	mov	%RDX_LP, %RAX_LP
+ 	ret
+ # if VEC_SIZE == 16
+ END (__memset_erms)
+@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
+ 
+ # if defined SHARED && IS_IN (libc)
+ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+-	cmpq	%rdx, %rcx
+	cmp	%RDX_LP, %RCX_LP
+ 	jb	HIDDEN_JUMPTARGET (__chk_fail)
+ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
+ # endif
+ 
+ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+-	cmpq	$VEC_SIZE, %rdx
+# ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	mov	%edx, %edx
+# endif
+	cmp	$VEC_SIZE, %RDX_LP
+ 	jb	L(less_vec)
+-	cmpq	$(VEC_SIZE * 2), %rdx
+	cmp	$(VEC_SIZE * 2), %RDX_LP
+ 	ja	L(stosb_more_2x_vec)
+ 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
+ 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index e99dbd7c..98bd9ae9 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,9 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr
+	 tst-size_t-memrchr tst-size_t-memset
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
+new file mode 100644
+index 00000000..2c367af6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
+@@ -0,0 +1,73 @@
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wmemset"
+#else
+# define TEST_NAME "memset"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+#else
+# define MEMSET memset
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memset (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  CHAR ch = 0x23;
+  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      c.fn = impl->fn;
+      CHAR *p = (CHAR *) do_memset (src, c);
+      size_t i;
+      for (i = 0; i < src.len; i++)
+	if (p[i] != ch)
+	  {
+	    error (0, 0, "Wrong result in function %s", impl->name);
+	    ret = 1;
+	  }
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+new file mode 100644
+index 00000000..955eb488
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
+@@ -0,0 +1,20 @@
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-memset.c"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-50.patch
+++ b/SOURCES/glibc-RHEL-15696-50.patch
@ -0,0 +1,43 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>  2021-05-23 21:43:10
+Committer: H.J. Lu <hjl.tools@gmail.com>  2021-06-27 10:56:57
+Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
+Child:  1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
+Branches: master, remotes/origin/master and many more (41)
+Follows: glibc-2.33.9000
+Precedes: glibc-2.34
+
+    math: redirect roundeven function
+    
+    This patch redirects the roundeven function for further changes.
+    
+    Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+
+Conflicts:
+	*
+	(rewritten for older branch)
+
+diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+index 7bbbb2dc..8728d0f2 100644
+--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -67,5 +68,6 @@ __roundeven (double x)
+   INSERT_WORDS64 (x, ix);
+   return x;
+ }
+-hidden_def (__roundeven)
+#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
+#endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-51.patch
+++ b/SOURCES/glibc-RHEL-15696-51.patch
@ -0,0 +1,118 @@
+From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:10 +0800
+Subject: [PATCH] math: redirect roundeven function
+Content-type: text/plain; charset=UTF-8
+
+This patch redirects the roundeven function for further changes.
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ include/math.h                             | 3 ++-
+ sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++-
+ sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
+ sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++
+ sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 +
+ sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 +
+ 6 files changed, 11 insertions(+), 2 deletions(-)
+
+Conflicts:
+	include/math.h
+	(missing MATH_REDIRECT macros)
+
+diff --git a/include/math.h b/include/math.h
+index e21d34b8..1f9f9a54 100644
+--- a/include/math.h
+++ b/include/math.h
+@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
+ libm_hidden_proto (__issignalingf)
+ libm_hidden_proto (__exp)
+ libm_hidden_proto (__expf)
+-libm_hidden_proto (__roundeven)
+ 
+ # ifndef __NO_LONG_DOUBLE_MATH
+ libm_hidden_proto (__fpclassifyl)
+@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
+ 
+ # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
+ #  ifndef NO_MATH_REDIRECT
+float (roundevenf) (float) asm ("__roundevenf");
+double (roundeven) (double) asm ("__roundeven");
+ /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
+    single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
+ float (sqrtf) (float) asm ("__ieee754_sqrtf");
+diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
+index 1438e81d..61962184 100644
+--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-double.h>
+@@ -101,5 +102,6 @@ __roundeven (double x)
+   INSERT_WORDS (x, hx, lx);
+   return x;
+ }
+-hidden_def (__roundeven)
+#ifndef __roundeven
+ libm_alias_double (__roundeven, roundeven)
+#endif
+diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
+index 5a9b3f39..e0faf727 100644
+--- a/sysdeps/ieee754/float128/s_roundevenf128.c
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
+@@ -1,2 +1,3 @@
+#define NO_MATH_REDIRECT
+ #include <float128_private.h>
+ #include "../ldbl-128/s_roundevenl.c"
+diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
+index 90f991d5..a661875e 100644
+--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-float.h>
+@@ -67,4 +68,6 @@ __roundevenf (float x)
+   SET_FLOAT_WORD (x, ix);
+   return x;
+ }
+#ifndef __roundevenf
+ libm_alias_float (__roundeven, roundeven)
+#endif
+diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+index 5fc59af4..b9375b6c 100644
+--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+index be2e4fa4..65031ab7 100644
+--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+@@ -17,6 +17,7 @@
+    License along with the GNU C Library; if not, see
+    <http://www.gnu.org/licenses/>.  */
+ 
+#define NO_MATH_REDIRECT
+ #include <math.h>
+ #include <math_private.h>
+ #include <libm-alias-ldouble.h>
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-52.patch
+++ b/SOURCES/glibc-RHEL-15696-52.patch
@ -0,0 +1,242 @@
+From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
+From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Date: Mon, 24 May 2021 09:43:11 +0800
+Subject: [PATCH] x86_64: roundeven with sse4.1 support
+Content-type: text/plain; charset=UTF-8
+
+This patch adds support for the sse4.1 hardware floating point
+roundeven.
+
+Here are some benchmark results on my systems:
+
+=AMD Ryzen 9 3900X 12-Core Processor=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  3.75587e+09 |  3.75114e+09 |
+| iterations |  3.93053e+08 |  4.35402e+08 |
+| max        | 52.592       | 58.71        |
+| min        |  7.98        |  7.22        |
+| mean       |  9.55563     |  8.61535     |
+
+* benchmark result after this commit
+|            |     roundeven |   roundevenf |
+|------------|---------------|--------------|
+| duration   |   3.73815e+09 |  3.73738e+09 |
+| iterations |   5.82692e+08 |  5.91498e+08 |
+| max        |  56.468       | 51.642       |
+| min        |   6.27        |  6.156       |
+| mean       |   6.41532     |  6.3185      |
+
+=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
+
+* benchmark result before this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.18208e+09 |  2.18258e+09 |
+| iterations |  2.39932e+08 |  2.46924e+08 |
+| max        | 96.378       | 98.035       |
+| min        |  6.776       |  5.94        |
+| mean       |  9.09456     |  8.83907     |
+
+* benchmark result after this commit
+|            |    roundeven |   roundevenf |
+|------------|--------------|--------------|
+| duration   |  2.17415e+09 |  2.17005e+09 |
+| iterations |  3.56193e+08 |  4.09824e+08 |
+| max        | 51.693       | 97.192       |
+| min        |  5.926       |  5.093       |
+| mean       |  6.10385     |  5.29507     |
+
+Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +--
+ sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++
+ .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++
+ .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++
+ sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++
+ 7 files changed, 118 insertions(+), 2 deletions(-)
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+ create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+
+diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
+index 9f387248..6ddd1c01 100644
+--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
+@@ -1,11 +1,12 @@
+ ifeq ($(subdir),math)
+ libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
+ 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
+-			s_trunc-c s_truncf-c
+			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
+ 
+ libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
+ 			s_floorf-sse4_1 s_nearbyint-sse4_1 \
+-			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
+			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ 			s_trunc-sse4_1 s_truncf-sse4_1
+ 
+ libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+new file mode 100644
+index 00000000..c7be43cb
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
+@@ -0,0 +1,2 @@
+#define __roundeven __roundeven_c
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+new file mode 100644
+index 00000000..6ae8f6b1
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
+@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY(__roundeven_sse41)
+	roundsd	$8, %xmm0, %xmm0
+	ret
+END(__roundeven_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+new file mode 100644
+index 00000000..d92eda65
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
+@@ -0,0 +1,31 @@
+/* Multiple versions of __roundeven.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-double.h>
+
+#define roundeven __redirect_roundeven
+#define __roundeven __redirect___roundeven
+#include <math.h>
+#undef roundeven
+#undef __roundeven
+
+#define SYMBOL_NAME roundeven
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
+libm_alias_double (__roundeven, roundeven)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+new file mode 100644
+index 00000000..72a6e7d1
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
+@@ -0,0 +1,3 @@
+#undef __roundevenf
+#define __roundevenf __roundevenf_c
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+new file mode 100644
+index 00000000..a76e1080
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
+@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+	.section .text.sse4.1,"ax",@progbits
+ENTRY(__roundevenf_sse41)
+	roundss	$8, %xmm0, %xmm0
+	ret
+END(__roundevenf_sse41)
+diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+new file mode 100644
+index 00000000..2ee196e6
+--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
+@@ -0,0 +1,31 @@
+/* Multiple versions of __roundevenf.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <libm-alias-float.h>
+
+#define roundevenf __redirect_roundevenf
+#define __roundevenf __redirect___roundevenf
+#include <math.h>
+#undef roundevenf
+#undef __roundevenf
+
+#define SYMBOL_NAME roundevenf
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
+libm_alias_float (__roundeven, roundeven)
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-53.patch
+++ b/SOURCES/glibc-RHEL-15696-53.patch
@ -0,0 +1,41 @@
+From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sun, 9 Jan 2022 16:02:28 -0600
+Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
+Content-type: text/plain; charset=UTF-8
+
+Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
+__wcscmp_evex. For x86_64 this covers the entire address range so any
+length larger could not possibly be used to bound `s1` or `s2`.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
+ 1 file changed, 10 insertions(+)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 459eeed0..d5aa6daa 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -97,6 +97,16 @@ ENTRY (STRCMP)
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+#  ifndef __ILP32__
+	movq	%rdx, %rcx
+	/* Check if length could overflow when multiplied by
+	   sizeof(wchar_t). Checking top 8 bits will cover all potential
+	   overflow cases as well as redirect cases where it is impossible
+	   for length to bound a valid memory region. In these cases just use
+	   'wcscmp'.  */
+	shrq	$56, %rcx
+	jnz	__wcscmp_evex
+#  endif
+ 	/* Convert units: from wide to byte char.  */
+ 	shl	$2, %RDX_LP
+ #  endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-173.patch
+++ b/SOURCES/glibc-upstream-2.34-173.patch
@ -1,27 +1,38 @@
-commit 16245986fb9bfe396113fc7dfd1929f69a9e748e
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Fri Aug 20 06:42:24 2021 -0700
+From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 20 Aug 2021 06:42:24 -0700
+Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
+ #28252]
+Content-type: text/plain; charset=UTF-8

-    x86-64: Optimize load of all bits set into ZMM register [BZ #28252]
-    
-    Optimize loads of all bits set into ZMM register in AVX512 SVML codes
-    by replacing
-    
-            vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
-    
-    and
-    
-            vmovups   .L_2il0floatpacket.13(%rip), %zmmX
-    
-    with
-            vpternlogd $0xff, %zmmX, %zmmX, %zmmX
-    
-    This fixes BZ #28252.
-    
-    (cherry picked from commit 78c9ec9000f873abe7a15a91b87080a2e4308260)
+Optimize loads of all bits set into ZMM register in AVX512 SVML codes
+by replacing
+
+	vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
+
+and
+
+	vmovups   .L_2il0floatpacket.13(%rip), %zmmX
+
+with
+	vpternlogd $0xff, %zmmX, %zmmX, %zmmX
+
+This fixes BZ #28252.
+---
+ .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S   |  7 +------
+ .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S   |  7 +------
+ .../fpu/multiarch/svml_d_sincos8_core_avx512.S       |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
+ .../fpu/multiarch/svml_s_sincosf16_core_avx512.S     |  7 +------
+ .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S |  7 +------
+ 10 files changed, 11 insertions(+), 64 deletions(-)

 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
-index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
+index 24e3b363..07dfed85 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
@ -43,7 +54,7 @@ index e68fcdbb16a79f36..58e588a3d42a8bc9 100644
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.16,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
-index dfa2acafc486b56b..f5f117d474f66176 100644
+index ae8af8d8..ddb60e5b 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
@ -65,7 +76,7 @@ index dfa2acafc486b56b..f5f117d474f66176 100644
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.12,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
-index be8ab7c6e0e33819..48d251db16ccab9d 100644
+index 2d4b14fd..529c454a 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
@ -87,7 +98,7 @@ index be8ab7c6e0e33819..48d251db16ccab9d 100644
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.14,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
-index 611887082a545854..a4944a4feef6aa98 100644
+index 2df626c0..e501a53a 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
@ -109,7 +120,7 @@ index 611887082a545854..a4944a4feef6aa98 100644
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.15,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
-index f671d60d5dab5a0e..fe8474fed943e8ad 100644
+index 6ea1137b..377af394 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
@ -131,7 +142,7 @@ index f671d60d5dab5a0e..fe8474fed943e8ad 100644
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
-index 637bfe3c06ab9ad4..229b7828cde04db2 100644
+index 89ba0df2..46f33d46 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
@ -153,7 +164,7 @@ index 637bfe3c06ab9ad4..229b7828cde04db2 100644
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
-index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
+index 4cf0a96f..9e254956 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
@ -175,7 +186,7 @@ index 9d790fbf0ad6c8ec..fa2aae986f543582 100644
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.7,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
-index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
+index bdcd50af..e8331ba1 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
@ -209,7 +220,7 @@ index c5c43c46ff7af5a3..6aea2a4f11d1f85f 100644
 -	.long	0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.24,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
-index 9cf359c86ff9bd70..a446c504f63c9399 100644
+index 5fa4bc41..1f46f334 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
@ -231,7 +242,7 @@ index 9cf359c86ff9bd70..a446c504f63c9399 100644
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.13,@object
 diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
-index bd05109a62181f22..c1b352d0ad1992cd 100644
+index 141f747e..1fc9308a 100644
 --- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
 +++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
@ -252,3 +263,6 @@ index bd05109a62181f22..c1b352d0ad1992cd 100644
 -.L_2il0floatpacket.11:
 -	.long	0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
 -	.type	.L_2il0floatpacket.11,@object
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-174.patch
+++ b/SOURCES/glibc-upstream-2.34-174.patch
@ -1,22 +1,25 @@
-commit b5a44a6a471aafd3677659a610f32468c40a666b
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Tue Sep 21 18:31:49 2021 -0500
+From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:31:49 -0500
+Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
+ specified
+Content-type: text/plain; charset=UTF-8

-    x86: Modify ENTRY in sysdep.h so that p2align can be specified
-    
-    No bug.
-    
-    This change adds a new macro ENTRY_P2ALIGN which takes a second
-    argument, log2 of the desired function alignment.
-    
-    The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
-    doesn't affect any existing functionality.
-    
-    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-    (cherry picked from commit fc5bd179ef3a953dff8d1655bd530d0e230ffe71)
+No bug.
+
+This change adds a new macro ENTRY_P2ALIGN which takes a second
+argument, log2 of the desired function alignment.
+
+The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
+doesn't affect any existing functionality.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86/sysdep.h | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)

 diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
-index cac1d762fb3f99d0..937180c1bd791570 100644
+index 01bac0f6..a70bb3a2 100644
 --- a/sysdeps/x86/sysdep.h
 +++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
@ -40,3 +43,6 @@ index cac1d762fb3f99d0..937180c1bd791570 100644
 #undef	END
 #define END(name)							      \
   cfi_endproc;								      \
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-175.patch
+++ b/SOURCES/glibc-upstream-2.34-175.patch
@ -1,37 +1,39 @@
-commit 5ec3416853c4150c4d13312e05f93a053586d528
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Tue Sep 21 18:45:03 2021 -0500
+From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 21 Sep 2021 18:45:03 -0500
+Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
+ size
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize memcmp-evex-movbe.S for frontend behavior and size
-    
-    No bug.
-    
-    The frontend optimizations are to:
-    1. Reorganize logically connected basic blocks so they are either in
-       the same cache line or adjacent cache lines.
-    2. Avoid cases when basic blocks unnecissarily cross cache lines.
-    3. Try and 32 byte align any basic blocks possible without sacrificing
-       code size. Smaller / Less hot basic blocks are used for this.
-    
-    Overall code size shrunk by 168 bytes. This should make up for any
-    extra costs due to aligning to 64 bytes.
-    
-    In general performance before deviated a great deal dependending on
-    whether entry alignment % 64 was 0, 16, 32, or 48. These changes
-    essentially make it so that the current implementation is at least
-    equal to the best alignment of the original for any arguments.
-    
-    The only additional optimization is in the page cross case. Branch on
-    equals case was removed from the size == [4, 7] case. As well the [4,
-    7] and [2, 3] case where swapped as [4, 7] is likely a more hot
-    argument size.
-    
-    test-memcmp and test-wmemcmp are both passing.
-    
-    (cherry picked from commit 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa)
+No bug.
+
+The frontend optimizations are to:
+1. Reorganize logically connected basic blocks so they are either in
+   the same cache line or adjacent cache lines.
+2. Avoid cases when basic blocks unnecessarily cross cache lines.
+3. Try and 32 byte align any basic blocks possible without sacrificing
+   code size. Smaller / Less hot basic blocks are used for this.
+
+Overall code size shrunk by 168 bytes. This should make up for any
+extra costs due to aligning to 64 bytes.
+
+In general performance before deviated a great deal depending on
+whether entry alignment % 64 was 0, 16, 32, or 48. These changes
+essentially make it so that the current implementation is at least
+equal to the best alignment of the original for any arguments.
+
+The only additional optimization is in the page cross case. Branch on
+equals case was removed from the size == [4, 7] case. As well the [4,
+7] and [2, 3] cases were swapped as [4, 7] is likely a more hot
+argument size.
+
+test-memcmp and test-wmemcmp are both passing.
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
+ 1 file changed, 242 insertions(+), 192 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
-index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
+index 654dc7ac..2761b54f 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
@ -651,3 +653,6 @@ index 654dc7ac8ccb9445..2761b54f2e7dea9f 100644
 -
 END (MEMCMP)
 #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-176.patch
+++ b/SOURCES/glibc-upstream-2.34-176.patch
@ -1,48 +1,58 @@
-commit 6d18a93dbbde2958001d65dff3080beed7ae675a
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Mon Sep 20 16:20:15 2021 -0500
+From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 20 Sep 2021 16:20:15 -0500
+Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize memset-vec-unaligned-erms.S
-    
-    No bug.
-    
-    Optimization are
-    
-    1. change control flow for L(more_2x_vec) to fall through to loop and
-       jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
-       size and saves jumps for length > 4x VEC_SIZE.
-    
-    2. For EVEX/AVX512 move L(less_vec) closer to entry.
-    
-    3. Avoid complex address mode for length > 2x VEC_SIZE
-    
-    4. Slightly better aligning code for the loop from the perspective of
-       code size and uops.
-    
-    5. Align targets so they make full use of their fetch block and if
-       possible cache line.
-    
-    6. Try and reduce total number of icache lines that will need to be
-       pulled in for a given length.
-    
-    7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
-       jumping to the stosb target in the sse2 code section will almost
-       certainly be to a new page. The new version does increase code size
-       marginally by duplicating the target but should get better iTLB
-       behavior as a result.
-    
-    test-memset, test-wmemset, and test-bzero are all passing.
-    
-    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    (cherry picked from commit e59ced238482fd71f3e493717f14f6507346741e)
+No bug.
+
+Optimizations are
+
+1. change control flow for L(more_2x_vec) to fall through to loop and
+   jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
+   size and saves jumps for length > 4x VEC_SIZE.
+
+2. For EVEX/AVX512 move L(less_vec) closer to entry.
+
+3. Avoid complex address mode for length > 2x VEC_SIZE
+
+4. Slightly better aligning code for the loop from the perspective of
+   code size and uops.
+
+5. Align targets so they make full use of their fetch block and if
+   possible cache line.
+
+6. Try and reduce total number of icache lines that will need to be
+   pulled in for a given length.
+
+7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
+   jumping to the stosb target in the sse2 code section will almost
+   certainly be to a new page. The new version does increase code size
+   marginally by duplicating the target but should get better iTLB
+   behavior as a result.
+
+test-memset, test-wmemset, and test-bzero are all passing.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/memset.S                       |  10 +-
+ .../multiarch/memset-avx2-unaligned-erms.S    |  10 +-
+ .../multiarch/memset-avx512-unaligned-erms.S  |  11 +-
+ .../multiarch/memset-evex-unaligned-erms.S    |  11 +-
+ .../multiarch/memset-vec-unaligned-erms.S     | 285 ++++++++++++------
+ 5 files changed, 232 insertions(+), 95 deletions(-)
+
+Conflicts:
+	sysdeps/x86_64/memset.S
+	(GNU URL)

 diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
-index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
+index b3426795..8672b030 100644
 --- a/sysdeps/x86_64/memset.S
 +++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
-    <https://www.gnu.org/licenses/>.  */
+    <http://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
 +#define USE_WITH_SSE2	1
@ -62,7 +72,7 @@ index 7d4a327eba29ecb4..0137eba4cdd9f830 100644
 #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
 diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
-index ae0860f36a47d594..1af668af0aeda59e 100644
+index ae0860f3..1af668af 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
@ -83,7 +93,7 @@ index ae0860f36a47d594..1af668af0aeda59e 100644
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
   vmovd d, %xmm0; \
 diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
-index 8ad842fc2f140527..f14d6f8493c21a36 100644
+index 8ad842fc..f14d6f84 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
@ -108,7 +118,7 @@ index 8ad842fc2f140527..f14d6f8493c21a36 100644
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
-index 640f092903302ad0..64b09e77cc20cc42 100644
+index 640f0929..64b09e77 100644
 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
@ -133,7 +143,7 @@ index 640f092903302ad0..64b09e77cc20cc42 100644
 
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
-index ff196844a093dc3b..e723413a664c088f 100644
+index 909c33f6..f08b7323 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
@ -495,3 +505,6 @@ index ff196844a093dc3b..e723413a664c088f 100644
 +	movb	%dil, -1(%rax, %rdx)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-58.patch
+++ b/SOURCES/glibc-RHEL-15696-58.patch
@ -0,0 +1,45 @@
+From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Sat, 23 Oct 2021 01:26:47 -0400
+Subject: [PATCH] x86: Replace sse2 instructions with avx in
+ memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8
+
+This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
+
+It could potentially be dangerous to use SSE2 if this function is ever
+called without using 'vzeroupper' beforehand. While compilers appear
+to use 'vzeroupper' before function calls if AVX2 has been used, using
+SSE2 here is more brittle. Since it is not absolutely necessary it
+should be avoided.
+
+It costs 2 extra bytes but the extra bytes should only eat into
+alignment padding.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+index 2761b54f..640f6757 100644
+--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+@@ -561,13 +561,13 @@ L(between_16_31):
+ 	/* From 16 to 31 bytes.  No branch when size == 16.  */
+ 
+ 	/* Use movups to save code size.  */
+-	movups	(%rsi), %xmm2
+	vmovdqu	(%rsi), %xmm2
+ 	VPCMP	$4, (%rdi), %xmm2, %k1
+ 	kmovd	%k1, %eax
+ 	testl	%eax, %eax
+ 	jnz	L(return_vec_0_lv)
+ 	/* Use overlapping loads to avoid branches.  */
+-	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
+ 	kmovd	%k1, %eax
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-178.patch
+++ b/SOURCES/glibc-upstream-2.34-178.patch
@ -1,44 +1,46 @@
-commit f35ad30da4880a1574996df0674986ecf82fa7ae
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Fri Oct 29 12:40:20 2021 -0700
+From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 29 Oct 2021 12:40:20 -0700
+Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
+Content-type: text/plain; charset=UTF-8

-    x86-64: Improve EVEX strcmp with masked load
-    
-    In strcmp-evex.S, to compare 2 32-byte strings, replace
-    
-            VMOVU   (%rdi, %rdx), %YMM0
-            VMOVU   (%rsi, %rdx), %YMM1
-            /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
-            VPCMP   $4, %YMM0, %YMM1, %k0
-            VPCMP   $0, %YMMZERO, %YMM0, %k1
-            VPCMP   $0, %YMMZERO, %YMM1, %k2
-            /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
-            kord    %k1, %k2, %k1
-            /* Each bit in K1 represents a NULL or a mismatch.  */
-            kord    %k0, %k1, %k1
-            kmovd   %k1, %ecx
-            testl   %ecx, %ecx
-            jne     L(last_vector)
-    
-    with
-    
-            VMOVU   (%rdi, %rdx), %YMM0
-            VPTESTM %YMM0, %YMM0, %k2
-            /* Each bit cleared in K1 represents a mismatch or a null CHAR
-               in YMM0 and 32 bytes at (%rsi, %rdx).  */
-            VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
-            kmovd   %k1, %ecx
-            incl    %ecx
-            jne     L(last_vector)
-    
-    It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
-    and Ice Lake.
-    
-    Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
-    (cherry picked from commit c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55)
+In strcmp-evex.S, to compare 2 32-byte strings, replace
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VMOVU   (%rsi, %rdx), %YMM1
+        /* Each bit in K0 represents a mismatch in YMM0 and YMM1.  */
+        VPCMP   $4, %YMM0, %YMM1, %k0
+        VPCMP   $0, %YMMZERO, %YMM0, %k1
+        VPCMP   $0, %YMMZERO, %YMM1, %k2
+        /* Each bit in K1 represents a NULL in YMM0 or YMM1.  */
+        kord    %k1, %k2, %k1
+        /* Each bit in K1 represents a NULL or a mismatch.  */
+        kord    %k0, %k1, %k1
+        kmovd   %k1, %ecx
+        testl   %ecx, %ecx
+        jne     L(last_vector)
+
+with
+
+        VMOVU   (%rdi, %rdx), %YMM0
+        VPTESTM %YMM0, %YMM0, %k2
+        /* Each bit cleared in K1 represents a mismatch or a null CHAR
+           in YMM0 and 32 bytes at (%rsi, %rdx).  */
+        VPCMP   $0, (%rsi, %rdx), %YMM0, %k1{%k2}
+        kmovd   %k1, %ecx
+        incl    %ecx
+        jne     L(last_vector)
+
+It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
+and Ice Lake.
+
+Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
+ 1 file changed, 243 insertions(+), 218 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
-index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
+index d5aa6daa..82f12ac8 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -41,6 +41,8 @@
@ -688,3 +690,6 @@ index d5aa6daa46c7ed25..82f12ac89bcae20b 100644
 	jne	L(last_vector)
 
 	addl	$4, %edx
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-6.patch
+++ b/SOURCES/glibc-RHEL-15696-6.patch
@ -0,0 +1,300 @@
+From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:33:52 -0800
+Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
+ [BZ# 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes the strncmp family for x32.  Tested on x86-64 and x32.
+On x86-64, libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
+	* sysdeps/x86_64/strcmp.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
+	tst-size_t-strncmp and tst-size_t-wcsncmp.
+	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
+	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
+	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
+---
+ sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +-
+ sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +-
+ sysdeps/x86_64/strcmp.S                     |  6 +-
+ sysdeps/x86_64/x32/Makefile                 |  6 +-
+ sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++
+ sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++
+ 7 files changed, 170 insertions(+), 11 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+index 327e3d87..156c1949 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
+@@ -79,15 +79,15 @@
+ ENTRY (STRCMP)
+ # ifdef USE_AS_STRNCMP
+ 	/* Check for simple cases (0 or 1) in offset.  */
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	L(char0)
+ 	jb	L(zero)
+ #  ifdef USE_AS_WCSCMP
+ 	/* Convert units: from wide to byte char.  */
+-	shl	$2, %rdx
+	shl	$2, %RDX_LP
+ #  endif
+ 	/* Register %r11 tracks the maximum offset.  */
+-	movq	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ # endif
+ 	movl	%edi, %eax
+ 	xorl	%edx, %edx
+diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+index d3c07bd2..a1ebea46 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
+@@ -156,11 +156,11 @@ STRCMP_SSE42:
+ #endif
+ 
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
+index e16945b9..f47c8ad4 100644
+--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
+@@ -135,11 +135,11 @@ ENTRY (STRCMP)
+  * This implementation uses SSE to compare up to 16 bytes at a time.
+  */
+ #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
+-	test	%rdx, %rdx
+	test	%RDX_LP, %RDX_LP
+ 	je	LABEL(strcmp_exitz)
+-	cmp	$1, %rdx
+	cmp	$1, %RDX_LP
+ 	je	LABEL(Byte0)
+-	mov	%rdx, %r11
+	mov	%RDX_LP, %R11_LP
+ #endif
+ 	mov	%esi, %ecx
+ 	mov	%edi, %eax
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index 98bd9ae9..db302839 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -7,9 +7,11 @@ endif
+ 
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+-	 tst-size_t-memrchr tst-size_t-memset
+	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+	 tst-size_t-strncmp
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+	 tst-size_t-wcsncmp
+ endif
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+new file mode 100644
+index 00000000..86233593
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
+@@ -0,0 +1,59 @@
+/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncasecmp"
+#include "test-size_t.h"
+
+IMPL (strncasecmp, 1)
+
+typedef int (*proto_t) (const char *, const char *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncasecmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  strncpy ((char *) buf1, (const char *) buf2, page_size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncasecmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+new file mode 100644
+index 00000000..54e6bd83
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
+@@ -0,0 +1,78 @@
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#ifdef WIDE
+# define TEST_NAME "wcsncmp"
+#else
+# define TEST_NAME "strncmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+
+# define STRNCMP wcsncmp
+# define STRNCPY wcsncpy
+# define CHAR wchar_t
+#else
+# define STRNCMP strncmp
+# define STRNCPY strncpy
+# define CHAR char
+#endif
+
+IMPL (STRNCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncmp (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  size_t size = page_size / sizeof (CHAR);
+  parameter_t dest = { { size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      int res = do_strncmp (dest, src);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+new file mode 100644
+index 00000000..4829647c
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
+@@ -0,0 +1,20 @@
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define WIDE 1
+#include "tst-size_t-strncmp.c"
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-180.patch
+++ b/SOURCES/glibc-upstream-2.34-180.patch
@ -1,23 +1,26 @@
-commit 2e64237a8744dd50f9222293275fa52e7248ff76
-Author: Fangrui Song <maskray@google.com>
-Date:   Tue Nov 2 20:59:52 2021 -0700
+From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001
+From: Fangrui Song <maskray@google.com>
+Date: Tue, 2 Nov 2021 20:59:52 -0700
+Subject: [PATCH] x86-64: Replace movzx with movzbl
+Content-type: text/plain; charset=UTF-8

-    x86-64: Replace movzx with movzbl
-    
-    Clang cannot assemble movzx in the AT&T dialect mode.
-    
-    ../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
-     movzx (%rsi), %ecx
-                   ^~~~
-    
-    Change movzx to movzbl, which follows the AT&T dialect and is used
-    elsewhere in the file.
-    
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    (cherry picked from commit 6720d36b6623c5e48c070d86acf61198b33e144e)
+Clang cannot assemble movzx in the AT&T dialect mode.
+
+../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
+ movzx (%rsi), %ecx
+               ^~~~
+
+Change movzx to movzbl, which follows the AT&T dialect and is used
+elsewhere in the file.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
+ sysdeps/x86_64/strcmp.S                 | 4 ++--
+ 2 files changed, 4 insertions(+), 4 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
-index bc19547b09639071..6197a723b9e0606e 100644
+index a1ebea46..d8fdeb3a 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
@ -32,7 +35,7 @@ index bc19547b09639071..6197a723b9e0606e 100644
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
 diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
-index 824e648230a15739..7f8a1bc756f86aee 100644
+index f47c8ad4..aa6df898 100644
 --- a/sysdeps/x86_64/strcmp.S
 +++ b/sysdeps/x86_64/strcmp.S
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
@ -46,3 +49,6 @@ index 824e648230a15739..7f8a1bc756f86aee 100644
 
 #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
 	leaq	_nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-61.patch
+++ b/SOURCES/glibc-RHEL-15696-61.patch
@ -0,0 +1,56 @@
+From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 30 Apr 2021 05:58:59 -0700
+Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
+Content-type: text/plain; charset=UTF-8
+
+The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
+that REP MOVSB became faster after 2112 bytes:
+
+                                      Vector Move       REP MOVSB
+length=2112, align1=0, align2=0:        24.20             24.40
+length=2112, align1=1, align2=0:        26.07             23.13
+length=2112, align1=0, align2=1:        27.18             28.13
+length=2112, align1=1, align2=1:        26.23             25.16
+length=2176, align1=0, align2=0:        23.18             22.52
+length=2176, align1=2, align2=0:        25.45             22.52
+length=2176, align1=0, align2=2:        27.14             27.82
+length=2176, align1=2, align2=2:        22.73             25.56
+length=2240, align1=0, align2=0:        24.62             24.25
+length=2240, align1=3, align2=0:        29.77             27.15
+length=2240, align1=0, align2=3:        35.55             29.93
+length=2240, align1=3, align2=3:        34.49             25.15
+length=2304, align1=0, align2=0:        34.75             26.64
+length=2304, align1=4, align2=0:        32.09             22.63
+length=2304, align1=0, align2=4:        28.43             31.24
+
+Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
+fast short REP MOVSB (FSRM).
+
+	* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
+	rep_movsb_threshold to 2112 on processors with fast short REP
+	MOVSB (FSRM).
+---
+ sysdeps/x86/cacheinfo.h | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index f72f634a..cc3941d3 100644
+--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
+@@ -430,6 +430,12 @@ init_cacheinfo (void)
+       rep_movsb_threshold = 2048 * (16 / 16);
+       minimum_rep_movsb_threshold = 16 * 8;
+     }
+
+  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
+     short REP MOVSB (FSRM).  */
+  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+    rep_movsb_threshold = 2112;
+
+   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
+     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
+   else
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-182.patch
+++ b/SOURCES/glibc-upstream-2.34-182.patch
@ -1,75 +1,79 @@
-commit cecbac52123456e2fbcff062a4165bf7b9174797
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Mon Nov 1 00:49:52 2021 -0500
+From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 1 Nov 2021 00:49:52 -0500
+Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
+ dl-cacheinfo.h
+Content-type: text/plain; charset=UTF-8

-    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
-    
-    No bug.
-    
-    This patch doubles the rep_movsb_threshold when using ERMS. Based on
-    benchmarks the vector copy loop, especially now that it handles 4k
-    aliasing, is better for these medium ranged.
-    
-    On Skylake with ERMS:
-    
-    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
-    4096,   0,      0,      0,      0.975
-    4096,   0,      0,      1,      0.953
-    4096,   12,     0,      0,      0.969
-    4096,   12,     0,      1,      0.872
-    4096,   44,     0,      0,      0.979
-    4096,   44,     0,      1,      0.83
-    4096,   0,      12,     0,      1.006
-    4096,   0,      12,     1,      0.989
-    4096,   0,      44,     0,      0.739
-    4096,   0,      44,     1,      0.942
-    4096,   12,     12,     0,      1.009
-    4096,   12,     12,     1,      0.973
-    4096,   44,     44,     0,      0.791
-    4096,   44,     44,     1,      0.961
-    4096,   2048,   0,      0,      0.978
-    4096,   2048,   0,      1,      0.951
-    4096,   2060,   0,      0,      0.986
-    4096,   2060,   0,      1,      0.963
-    4096,   2048,   12,     0,      0.971
-    4096,   2048,   12,     1,      0.941
-    4096,   2060,   12,     0,      0.977
-    4096,   2060,   12,     1,      0.949
-    8192,   0,      0,      0,      0.85
-    8192,   0,      0,      1,      0.845
-    8192,   13,     0,      0,      0.937
-    8192,   13,     0,      1,      0.939
-    8192,   45,     0,      0,      0.932
-    8192,   45,     0,      1,      0.927
-    8192,   0,      13,     0,      0.621
-    8192,   0,      13,     1,      0.62
-    8192,   0,      45,     0,      0.53
-    8192,   0,      45,     1,      0.516
-    8192,   13,     13,     0,      0.664
-    8192,   13,     13,     1,      0.659
-    8192,   45,     45,     0,      0.593
-    8192,   45,     45,     1,      0.575
-    8192,   2048,   0,      0,      0.854
-    8192,   2048,   0,      1,      0.834
-    8192,   2061,   0,      0,      0.863
-    8192,   2061,   0,      1,      0.857
-    8192,   2048,   13,     0,      0.63
-    8192,   2048,   13,     1,      0.629
-    8192,   2061,   13,     0,      0.627
-    8192,   2061,   13,     1,      0.62
-    
-    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
+No bug.

-diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
-index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
-+++ b/sysdeps/x86/dl-cacheinfo.h
-@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+This patch doubles the rep_movsb_threshold when using ERMS. Based on
+benchmarks the vector copy loop, especially now that it handles 4k
+aliasing, is better for these medium-range sizes.
+
+On Skylake with ERMS:
+
+Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
+4096,   0,      0,      0,      0.975
+4096,   0,      0,      1,      0.953
+4096,   12,     0,      0,      0.969
+4096,   12,     0,      1,      0.872
+4096,   44,     0,      0,      0.979
+4096,   44,     0,      1,      0.83
+4096,   0,      12,     0,      1.006
+4096,   0,      12,     1,      0.989
+4096,   0,      44,     0,      0.739
+4096,   0,      44,     1,      0.942
+4096,   12,     12,     0,      1.009
+4096,   12,     12,     1,      0.973
+4096,   44,     44,     0,      0.791
+4096,   44,     44,     1,      0.961
+4096,   2048,   0,      0,      0.978
+4096,   2048,   0,      1,      0.951
+4096,   2060,   0,      0,      0.986
+4096,   2060,   0,      1,      0.963
+4096,   2048,   12,     0,      0.971
+4096,   2048,   12,     1,      0.941
+4096,   2060,   12,     0,      0.977
+4096,   2060,   12,     1,      0.949
+8192,   0,      0,      0,      0.85
+8192,   0,      0,      1,      0.845
+8192,   13,     0,      0,      0.937
+8192,   13,     0,      1,      0.939
+8192,   45,     0,      0,      0.932
+8192,   45,     0,      1,      0.927
+8192,   0,      13,     0,      0.621
+8192,   0,      13,     1,      0.62
+8192,   0,      45,     0,      0.53
+8192,   0,      45,     1,      0.516
+8192,   13,     13,     0,      0.664
+8192,   13,     13,     1,      0.659
+8192,   45,     45,     0,      0.593
+8192,   45,     45,     1,      0.575
+8192,   2048,   0,      0,      0.854
+8192,   2048,   0,      1,      0.834
+8192,   2061,   0,      0,      0.863
+8192,   2061,   0,      1,      0.857
+8192,   2048,   13,     0,      0.63
+8192,   2048,   13,     1,      0.629
+8192,   2061,   13,     0,      0.627
+8192,   2061,   13,     1,      0.62
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/cacheinfo.h      |  8 +++++---
+ sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
+ 2 files changed, 20 insertions(+), 14 deletions(-)
+
+diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+index cc3941d3..ac025e08 100644
+--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
+@@ -411,18 +411,20 @@ init_cacheinfo (void)
+ 
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
- #endif
 -  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
 +  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
 +     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
@ -80,20 +84,18 @@ index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
     {
 -      rep_movsb_threshold = 2048 * (64 / 16);
 +      rep_movsb_threshold = 4096 * (64 / 16);
- #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
- #endif
-@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
+     }
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
 -      rep_movsb_threshold = 2048 * (32 / 16);
 +      rep_movsb_threshold = 4096 * (32 / 16);
- #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
- #endif
+     }
+   else
 diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
-index dd6e1d65c9490d4f..419313804d49cf65 100644
+index 89bf2966..56c6834a 100644
 --- a/sysdeps/x86/dl-tunables.list
 +++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
@ -129,3 +131,6 @@ index dd6e1d65c9490d4f..419313804d49cf65 100644
       minval: 1
     }
     x86_rep_stosb_threshold {
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-183.patch
+++ b/SOURCES/glibc-upstream-2.34-183.patch
@ -1,34 +1,36 @@
-commit 7cb126e7e7febf9dc3e369cc3e4885e34fb9433b
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Wed Nov 10 16:18:56 2021 -0600
+From 2f9062d7171850451e6044ef78d91ff8c017b9c0 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 10 Nov 2021 16:18:56 -0600
+Subject: [PATCH] x86: Shrink memcmp-sse4.S code size
+Content-type: text/plain; charset=UTF-8

-    x86: Shrink memcmp-sse4.S code size
-    
-    No bug.
-    
-    This implementation refactors memcmp-sse4.S primarily with minimizing
-    code size in mind. It does this by removing the lookup table logic and
-    removing the unrolled check from (256, 512] bytes.
-    
-    memcmp-sse4 code size reduction : -3487 bytes
-    wmemcmp-sse4 code size reduction: -1472 bytes
-    
-    The current memcmp-sse4.S implementation has a large code size
-    cost. This has serious adverse affects on the ICache / ITLB. While
-    in micro-benchmarks the implementations appears fast, traces of
-    real-world code have shown that the speed in micro benchmarks does not
-    translate when the ICache/ITLB are not primed, and that the cost
-    of the code size has measurable negative affects on overall
-    application performance.
-    
-    See https://research.google/pubs/pub48320/ for more details.
-    
-    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    (cherry picked from commit 2f9062d7171850451e6044ef78d91ff8c017b9c0)
+No bug.
+
+This implementation refactors memcmp-sse4.S primarily with minimizing
+code size in mind. It does this by removing the lookup table logic and
+removing the unrolled check from (256, 512] bytes.
+
+memcmp-sse4 code size reduction : -3487 bytes
+wmemcmp-sse4 code size reduction: -1472 bytes
+
+The current memcmp-sse4.S implementation has a large code size
+cost. This has serious adverse effects on the ICache / ITLB. While
+in micro-benchmarks the implementation appears fast, traces of
+real-world code have shown that the speed in micro benchmarks does not
+translate when the ICache/ITLB are not primed, and that the cost
+of the code size has measurable negative effects on overall
+application performance.
+
+See https://research.google/pubs/pub48320/ for more details.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++-----------------
+ 1 file changed, 646 insertions(+), 1621 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
-index b7ac034569ec6178..97c102a9c5ab2b91 100644
+index 302900f5..50060006 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -25,14 +25,14 @@
@ -2421,3 +2423,6 @@ index b7ac034569ec6178..97c102a9c5ab2b91 100644
 -	.int	JMPTBL (L(unreal_case), L(table_64bytes))
 -# endif
 #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-330.patch
+++ b/SOURCES/glibc-upstream-2.34-330.patch
@ -1,19 +1,22 @@
-commit a2e259014f8a0e5f3ff938314f3087b74255804d
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Thu Nov 11 06:31:51 2021 -0800
+From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:31:51 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
+ #28537]
+Content-type: text/plain; charset=UTF-8

-    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
-    
-    Replace boolean CAS with value CAS to avoid the extra load.
-    
-    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
-    (cherry picked from commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f)
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)

 diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
-index da624f322d06d0ee..a04e0158451c8fff 100644
+index 29cc143e..60ada70d 100644
 --- a/nptl/pthread_mutex_lock.c
 +++ b/nptl/pthread_mutex_lock.c
-@@ -298,12 +298,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 	     meantime.  */
 	  if ((oldval & FUTEX_WAITERS) == 0)
 	    {
@ -31,3 +34,6 @@ index da624f322d06d0ee..a04e0158451c8fff 100644
 		  continue;
 		}
 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-331.patch
+++ b/SOURCES/glibc-upstream-2.34-331.patch
@ -1,19 +1,22 @@
-commit ed8300c054cae4aeb0bbfa043f5fccc91a4adbf5
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Thu Nov 11 06:54:01 2021 -0800
+From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Thu, 11 Nov 2021 06:54:01 -0800
+Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
+ [BZ #28537]
+Content-type: text/plain; charset=UTF-8

-    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
-    
-    Replace boolean CAS with value CAS to avoid the extra load.
-    
-    Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
-    (cherry picked from commit 49302b8fdf9103b6fc0a398678668a22fa19574c)
+Replace boolean CAS with value CAS to avoid the extra load.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_timedlock.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)

 diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
-index 11ad7005d07afc6e..90cede9446e33fcf 100644
+index 888c12fe..c4627ef6 100644
 --- a/nptl/pthread_mutex_timedlock.c
 +++ b/nptl/pthread_mutex_timedlock.c
-@@ -234,12 +234,12 @@ __pthread_mutex_clocklock_common (pthread_mutex_t *mutex,
+@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
 	     meantime.  */
 	  if ((oldval & FUTEX_WAITERS) == 0)
 	    {
@ -31,3 +34,6 @@ index 11ad7005d07afc6e..90cede9446e33fcf 100644
 		  continue;
 		}
 	      oldval |= FUTEX_WAITERS;
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-66.patch
+++ b/SOURCES/glibc-RHEL-15696-66.patch
@ -0,0 +1,51 @@
+From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Tue, 2 Nov 2021 18:33:07 -0700
+Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
+Content-type: text/plain; charset=UTF-8
+
+CAS instruction is expensive.  From the x86 CPU's point of view, getting
+a cache line for writing is more expensive than reading.  See Appendix
+A.2 Spinlock in:
+
+https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
+
+The full compare and swap will grab the cache line exclusive and cause
+excessive cache line bouncing.
+
+Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
+loop if compare may fail to reduce cache line bouncing on contended locks.
+
+Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
+---
+ nptl/pthread_mutex_lock.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index 60ada70d..eb4d8baa 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -56,6 +56,11 @@
+ #define FORCE_ELISION(m, s)
+ #endif
+ 
+#ifndef LLL_MUTEX_READ_LOCK
+# define LLL_MUTEX_READ_LOCK(mutex) \
+  atomic_load_relaxed (&(mutex)->__data.__lock)
+#endif
+
+ static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+      __attribute_noinline__;
+ 
+@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+		continue;
+ 	    }
+ 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-67.patch
+++ b/SOURCES/glibc-RHEL-15696-67.patch
@ -0,0 +1,71 @@
+From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Fri, 12 Nov 2021 11:47:42 -0800
+Subject: [PATCH] Move assignment out of the CAS condition
+Content-type: text/plain; charset=UTF-8
+
+Update
+
+commit 49302b8fdf9103b6fc0a398678668a22fa19574c
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:54:01 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+and
+
+commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
+Author: H.J. Lu <hjl.tools@gmail.com>
+Date:   Thu Nov 11 06:31:51 2021 -0800
+
+    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
+
+    Replace boolean CAS with value CAS to avoid the extra load.
+
+by moving assignment out of the CAS condition.
+---
+ nptl/pthread_mutex_lock.c      | 7 +++----
+ nptl/pthread_mutex_timedlock.c | 7 +++----
+ 2 files changed, 6 insertions(+), 8 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index eb4d8baa..a633d95e 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
+	      int val = atomic_compare_and_exchange_val_acq
+		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
+index c4627ef6..a76c30b7 100644
+--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
+@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
+ 	     meantime.  */
+ 	  if ((oldval & FUTEX_WAITERS) == 0)
+ 	    {
+-	      int val;
+-	      if ((val = atomic_compare_and_exchange_val_acq
+-		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+-		    oldval)) != oldval)
+	      int val = atomic_compare_and_exchange_val_acq
+		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+	      if (val != oldval)
+ 		{
+ 		  oldval = val;
+ 		  continue;
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-68.patch
+++ b/SOURCES/glibc-RHEL-15696-68.patch
@ -0,0 +1,60 @@
+From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 3 Dec 2021 15:29:25 -0800
+Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
+Content-type: text/plain; charset=UTF-8
+
+Must use notl %edi here as lower bits are for CHAR comparisons
+potentially out of range thus can be 0 without indicating mismatch.
+This fixes BZ #28646.
+
+Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
+ 1 file changed, 8 insertions(+), 6 deletions(-)
+
+Conflicts:
+	string/test-strcmp.c
+	(new check omitted)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
+index 82f12ac8..6f5c4bf9 100644
+--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
+@@ -656,12 +656,13 @@ L(loop_cross_page):
+ 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
+ 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ 	kmovd	%k3, %edi
+    /* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
+ 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
+ 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ 	kmovd	%k3, %edi
+	/* Must use notl %edi here as lower bits are for CHAR
+	   comparisons potentially out of range thus can be 0 without
+	   indicating mismatch.  */
+	notl	%edi
+ # ifdef USE_AS_WCSCMP
+ 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
+-	notl	%edi
+ 	andl	$0xff, %edi
+-# else
+-	incl	%edi
+ # endif
+ 
+ # ifdef USE_AS_WCSCMP
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-185.patch
+++ b/SOURCES/glibc-upstream-2.34-185.patch
@ -1,20 +1,22 @@
-commit f3a99b2216114f89b20329ae7664b764248b4bbd
-Author: H.J. Lu <hjl.tools@gmail.com>
-Date:   Mon Dec 6 07:14:12 2021 -0800
+From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 6 Dec 2021 07:14:12 -0800
+Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
+ and AVX-VNNI
+Content-type: text/plain; charset=UTF-8

-    x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI
-    
-    Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
-    they won't lower CPU frequency when ZMM load and store instructions are
-    used.
-    
-    (cherry picked from commit ceeffe968c01b1202e482f4855cb6baf5c6cb713)
+Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
+they won't lower CPU frequency when ZMM load and store instructions are
+used.
+---
+ sysdeps/x86/cpu-features.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)

 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
-index f4d4049e391cbabd..09590d8794b1c6fb 100644
+index 956bfb4f..5ff2baa0 100644
 --- a/sysdeps/x86/cpu-features.c
 +++ b/sysdeps/x86/cpu-features.c
-@@ -566,8 +566,11 @@ disable_tsx:
+@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
 	  |= bit_arch_Prefer_No_VZEROUPPER;
       else
 	{
@ -28,3 +30,6 @@ index f4d4049e391cbabd..09590d8794b1c6fb 100644
 
 	  /* Avoid RTM abort triggered by VZEROUPPER inside a
 	     transactionally executing RTM region.  */
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-7.patch
+++ b/SOURCES/glibc-RHEL-15696-7.patch
@ -0,0 +1,153 @@
+From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
+From: "H.J. Lu" <hjl.tools@gmail.com>
+Date: Mon, 21 Jan 2019 11:35:18 -0800
+Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
+ 24097]
+Content-type: text/plain; charset=UTF-8
+
+On x32, the size_t parameter may be passed in the lower 32 bits of a
+64-bit register with the non-zero upper 32 bits.  The string/memory
+functions written in assembly can only use the lower 32 bits of a
+64-bit register as length or must clear the upper 32 bits before using
+the full 64-bit register for length.
+
+This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
+libc.so is the same with and without the fix.
+
+	[BZ# 24097]
+	CVE-2019-6488
+	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
+	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
+	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
+	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
+	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
+---
+ .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +-
+ sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +-
+ sysdeps/x86_64/x32/Makefile                   |  2 +-
+ sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++
+ 4 files changed, 64 insertions(+), 6 deletions(-)
+ create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
+
+Conflicts:
+	ChangeLog
+	(removed)
+	sysdeps/x86_64/multiarch/strcpy-avx2.S
+	(skipped, only needed for x32 arch)
+
+diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+index 72bf7e85..50aca22d 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+@@ -40,8 +40,8 @@
+ .text
+ ENTRY (STRCPY)
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+-	test	%r8, %r8
+	mov	%RDX_LP, %R8_LP
+	test	%R8_LP, %R8_LP
+ 	jz	L(ExitZero)
+ #  endif
+ 	mov	%rsi, %rcx
+diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+index 9858d0c4..0a62814a 100644
+--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+@@ -31,13 +31,13 @@ ENTRY (STRCPY)
+ 
+ 	mov	%rsi, %rcx
+ #  ifdef USE_AS_STRNCPY
+-	mov	%rdx, %r8
+	mov	%RDX_LP, %R8_LP
+ #  endif
+ 	mov	%rdi, %rdx
+ #  ifdef USE_AS_STRNCPY
+-	test	%r8, %r8
+	test	%R8_LP, %R8_LP
+ 	jz	L(Exit0)
+-	cmp	$8, %r8
+	cmp	$8, %R8_LP
+ 	jbe	L(StrncpyExit8Bytes)
+ # endif
+ 	cmpb	$0, (%rcx)
+diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
+index db302839..2a9e20a9 100644
+--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
+@@ -8,7 +8,7 @@ endif
+ ifeq ($(subdir),string)
+ tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+-	 tst-size_t-strncmp
+	 tst-size_t-strncmp tst-size_t-strncpy
+ endif
+ 
+ ifeq ($(subdir),wcsmbs)
+diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+new file mode 100644
+index 00000000..4dec71e6
+--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
+@@ -0,0 +1,58 @@
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
+   Copyright (C) 2019 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <http://www.gnu.org/licenses/>.  */
+
+#define TEST_NAME "strncpy"
+#include "test-size_t.h"
+
+IMPL (strncpy, 1)
+
+typedef char *(*proto_t) (char *, const char*, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_strncpy (parameter_t a, parameter_t b)
+{
+  return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+  test_init ();
+
+  parameter_t dest = { { page_size }, buf1 };
+  parameter_t src = { { 0 }, buf2 };
+
+  int ret = 0;
+  FOR_EACH_IMPL (impl, 0)
+    {
+      src.fn = impl->fn;
+      do_strncpy (dest, src);
+      int res = strncmp (dest.p, src.p, dest.len);
+      if (res)
+	{
+	  error (0, 0, "Wrong result in function %s: %i != 0",
+		 impl->name, res);
+	  ret = 1;
+	}
+    }
+
+  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-186.patch
+++ b/SOURCES/glibc-upstream-2.34-186.patch
@ -1,28 +1,30 @@
-commit c796418d00f65c8c5fbed477f3ba6da2bee64ece
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Fri Dec 24 18:54:41 2021 -0600
+From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 24 Dec 2021 18:54:41 -0600
+Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
+Content-type: text/plain; charset=UTF-8

-    x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
-    
-    No bug.
-    Optimizations are twofold.
-    
-    1) Replace page cross and 0/1 checks with masked load instructions in
-       L(less_vec). In applications this reduces branch-misses in the
-       hot [0, 32] case.
-    2) Change controlflow so that L(less_vec) case gets the fall through.
-    
-    Change 2) helps copies in the [0, 32] size range but comes at the cost
-    of copies in the [33, 64] size range.  From profiles of GCC and
-    Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
-    appears to the the right tradeoff.
-    
-    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    (cherry picked from commit abddd61de090ae84e380aff68a98bd94ef704667)
+No bug.
+Optimizations are twofold.
+
+1) Replace page cross and 0/1 checks with masked load instructions in
+   L(less_vec). In applications this reduces branch-misses in the
+   hot [0, 32] case.
+2) Change controlflow so that L(less_vec) case gets the fall through.
+
+Change 2) helps copies in the [0, 32] size range but comes at the cost
+of copies in the [33, 64] size range.  From profiles of GCC and
+Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
+appears to be the right tradeoff.
+
+Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
+ 1 file changed, 56 insertions(+), 193 deletions(-)

 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
-index 640f6757fac8a356..d2899e7c7078cd41 100644
+index 640f6757..d2899e7c 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,15 +62,18 @@ Latency:
@ -382,3 +384,6 @@ index 640f6757fac8a356..d2899e7c7078cd41 100644
 -# endif
 END (MEMCMP)
 #endif
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-71.patch
+++ b/SOURCES/glibc-RHEL-15696-71.patch
@ -0,0 +1,43 @@
+From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
+From: Jangwoong Kim <6812skiii@gmail.com>
+Date: Tue, 14 Dec 2021 21:30:51 +0900
+Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
+Content-type: text/plain; charset=UTF-8
+
+The commit:
+"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
+SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
+
+introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
+if atomic load fails. But, "continue" inside of do-while loop
+does not skip the evaluation of escape expression, thus CAS
+is not skipped.
+
+Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
+LLL_MUTEX_READ_LOCK fails.
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ nptl/pthread_mutex_lock.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
+index a633d95e..d96a9933 100644
+--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
+@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
+ 		  break;
+ 		}
+ 	      atomic_spin_nop ();
+-	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+-		continue;
+ 	    }
+-	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
+		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
+ 
+ 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
+ 	}
+-- 
+GitLab
+
--- a/SOURCES/glibc-upstream-2.34-113.patch
+++ b/SOURCES/glibc-upstream-2.34-113.patch
@ -1,25 +1,29 @@
-commit d093b677c36ef4b360bf30483b68b95d9f0ad1d2
-Author: Noah Goldstein <goldstein.w.n@gmail.com>
-Date:   Fri Feb 18 14:19:15 2022 -0600
+From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 14:19:15 -0600
+Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
+Content-type: text/plain; charset=UTF-8

-    x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
-    
-    In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
-    call strcmp-avx2 and wcscmp-avx2 respectively. This would have
-    not checks around vzeroupper and would trigger spurious
-    aborts. This commit fixes that.
-    
-    test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
-    AVX2 machines with and without RTM.
-    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
-    
-    (cherry picked from commit 7835d611af0854e69a0c71e3806f8fe379282d6f)
+In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
+call strcmp-avx2 and wcscmp-avx2 respectively. This would have
+no checks around vzeroupper and would trigger spurious
+aborts. This commit fixes that.
+
+test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
+AVX2 machines with and without RTM.
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/Makefile          |  5 ++++-
+ sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
+ sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
+ 3 files changed, 48 insertions(+), 10 deletions(-)
+ create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c

 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
-index af934d6ccf1fa337..cd94e683afd5b4a4 100644
+index 2d814915..c2111f49 100644
 --- a/sysdeps/x86/Makefile
 +++ b/sysdeps/x86/Makefile
-@@ -95,7 +95,9 @@ tests += \
+@@ -28,7 +28,9 @@ tests += \
   tst-strcpy-rtm \
   tst-strlen-rtm \
   tst-strncmp-rtm \
@ -30,7 +34,7 @@ index af934d6ccf1fa337..cd94e683afd5b4a4 100644
 
 CFLAGS-tst-memchr-rtm.c += -mrtm
 CFLAGS-tst-memcmp-rtm.c += -mrtm
-@@ -107,6 +109,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
+@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
 CFLAGS-tst-strlen-rtm.c += -mrtm
 CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
 CFLAGS-tst-strrchr-rtm.c += -mrtm
@ -39,7 +43,7 @@ index af934d6ccf1fa337..cd94e683afd5b4a4 100644
 
 ifneq ($(enable-cet),no)
 diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
-index 4d0004b58aae428d..4e9f094f39c72f67 100644
+index 4d0004b5..4e9f094f 100644
 --- a/sysdeps/x86/tst-strncmp-rtm.c
 +++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -19,18 +19,32 @@
@ -112,7 +116,7 @@ index 4d0004b58aae428d..4e9f094f39c72f67 100644
 }
 diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
 new file mode 100644
-index 0000000000000000..bad3b863782c5e56
+index 00000000..bad3b863
 --- /dev/null
 +++ b/sysdeps/x86/tst-wcsncmp-rtm.c
@@ -0,0 +1,21 @@
@ -137,3 +141,6 @@ index 0000000000000000..bad3b863782c5e56
 +#define WIDE 1
 +#include <wchar.h>
 +#include "tst-strncmp-rtm.c"
+-- 
+GitLab
+
--- a/SOURCES/glibc-RHEL-15696-73.patch
+++ b/SOURCES/glibc-RHEL-15696-73.patch
@ -0,0 +1,37 @@
+From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Fri, 18 Feb 2022 17:00:25 -0600
+Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
+Content-type: text/plain; charset=UTF-8
+
+Previously TEST_NAME was passing a function pointer. This didn't fail
+because of the -Wno-error flag (to allow for overflow sizes passed
+to strncmp/wcsncmp)
+
+Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
+---
+ sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
+index 4e9f094f..aef9866c 100644
+--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
+@@ -23,12 +23,12 @@
+ # define CHAR wchar_t
+ # define MEMSET wmemset
+ # define STRNCMP wcsncmp
+-# define TEST_NAME wcsncmp
+# define TEST_NAME "wcsncmp"
+ #else /* !WIDE */
+ # define CHAR char
+ # define MEMSET memset
+ # define STRNCMP strncmp
+-# define TEST_NAME strncmp
+# define TEST_NAME "strncmp"
+ #endif /* !WIDE */
+ 
+ 
+-- 
+GitLab
+
--- a/Show More
+++ b/Show More