1444 changed files with 83508 additions and 181363 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1 +1,2 @@
-SOURCES/glibc-2.28.tar.xz
+SOURCES/glibc-2.34.tar.xz
 SOURCES/glibc-upstream-2.34-373.patch
--- a/.glibc.metadata
+++ b/.glibc.metadata
@ -1 +1,2 @@
-ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz
+7c3b8890a6346793b6334cc5f2fea5d437d307b8 SOURCES/glibc-2.34.tar.xz
 6022f103e5596ad229f22bc966327d71208f7016 SOURCES/glibc-upstream-2.34-373.patch
--- a/SOURCES/ChangeLog.old
+++ b/SOURCES/ChangeLog.old
--- a/SOURCES/SUPPORTED
+++ b/SOURCES/SUPPORTED
@ -1,496 +0,0 @@
 # This file names the currently supported and somewhat tested locales.
 # If you have any additions please file a glibc bug report.
 SUPPORTED-LOCALES=\
 C.UTF-8/UTF-8 \
 aa_DJ.UTF-8/UTF-8 \
 aa_DJ/ISO-8859-1 \
 aa_ER/UTF-8 \
 aa_ER@saaho/UTF-8 \
 aa_ET/UTF-8 \
 af_ZA.UTF-8/UTF-8 \
 af_ZA/ISO-8859-1 \
 agr_PE/UTF-8 \
 ak_GH/UTF-8 \
 am_ET/UTF-8 \
 an_ES.UTF-8/UTF-8 \
 an_ES/ISO-8859-15 \
 anp_IN/UTF-8 \
 ar_AE.UTF-8/UTF-8 \
 ar_AE/ISO-8859-6 \
 ar_BH.UTF-8/UTF-8 \
 ar_BH/ISO-8859-6 \
 ar_DZ.UTF-8/UTF-8 \
 ar_DZ/ISO-8859-6 \
 ar_EG.UTF-8/UTF-8 \
 ar_EG/ISO-8859-6 \
 ar_IN/UTF-8 \
 ar_IQ.UTF-8/UTF-8 \
 ar_IQ/ISO-8859-6 \
 ar_JO.UTF-8/UTF-8 \
 ar_JO/ISO-8859-6 \
 ar_KW.UTF-8/UTF-8 \
 ar_KW/ISO-8859-6 \
 ar_LB.UTF-8/UTF-8 \
 ar_LB/ISO-8859-6 \
 ar_LY.UTF-8/UTF-8 \
 ar_LY/ISO-8859-6 \
 ar_MA.UTF-8/UTF-8 \
 ar_MA/ISO-8859-6 \
 ar_OM.UTF-8/UTF-8 \
 ar_OM/ISO-8859-6 \
 ar_QA.UTF-8/UTF-8 \
 ar_QA/ISO-8859-6 \
 ar_SA.UTF-8/UTF-8 \
 ar_SA/ISO-8859-6 \
 ar_SD.UTF-8/UTF-8 \
 ar_SD/ISO-8859-6 \
 ar_SS/UTF-8 \
 ar_SY.UTF-8/UTF-8 \
 ar_SY/ISO-8859-6 \
 ar_TN.UTF-8/UTF-8 \
 ar_TN/ISO-8859-6 \
 ar_YE.UTF-8/UTF-8 \
 ar_YE/ISO-8859-6 \
 ayc_PE/UTF-8 \
 az_AZ/UTF-8 \
 az_IR/UTF-8 \
 as_IN/UTF-8 \
 ast_ES.UTF-8/UTF-8 \
 ast_ES/ISO-8859-15 \
 be_BY.UTF-8/UTF-8 \
 be_BY/CP1251 \
 be_BY@latin/UTF-8 \
 bem_ZM/UTF-8 \
 ber_DZ/UTF-8 \
 ber_MA/UTF-8 \
 bg_BG.UTF-8/UTF-8 \
 bg_BG/CP1251 \
 bhb_IN.UTF-8/UTF-8 \
 bho_IN/UTF-8 \
 bho_NP/UTF-8 \
 bi_VU/UTF-8 \
 bn_BD/UTF-8 \
 bn_IN/UTF-8 \
 bo_CN/UTF-8 \
 bo_IN/UTF-8 \
 br_FR.UTF-8/UTF-8 \
 br_FR/ISO-8859-1 \
 br_FR@euro/ISO-8859-15 \
 brx_IN/UTF-8 \
 bs_BA.UTF-8/UTF-8 \
 bs_BA/ISO-8859-2 \
 byn_ER/UTF-8 \
 ca_AD.UTF-8/UTF-8 \
 ca_AD/ISO-8859-15 \
 ca_ES.UTF-8/UTF-8 \
 ca_ES/ISO-8859-1 \
 ca_ES@euro/ISO-8859-15 \
 ca_ES@valencia/UTF-8 \
 ca_FR.UTF-8/UTF-8 \
 ca_FR/ISO-8859-15 \
 ca_IT.UTF-8/UTF-8 \
 ca_IT/ISO-8859-15 \
 ce_RU/UTF-8 \
 chr_US/UTF-8 \
 cmn_TW/UTF-8 \
 crh_UA/UTF-8 \
 cs_CZ.UTF-8/UTF-8 \
 cs_CZ/ISO-8859-2 \
 csb_PL/UTF-8 \
 cv_RU/UTF-8 \
 cy_GB.UTF-8/UTF-8 \
 cy_GB/ISO-8859-14 \
 da_DK.UTF-8/UTF-8 \
 da_DK/ISO-8859-1 \
 da_DK.ISO-8859-15/ISO-8859-15 \
 de_AT.UTF-8/UTF-8 \
 de_AT/ISO-8859-1 \
 de_AT@euro/ISO-8859-15 \
 de_BE.UTF-8/UTF-8 \
 de_BE/ISO-8859-1 \
 de_BE@euro/ISO-8859-15 \
 de_CH.UTF-8/UTF-8 \
 de_CH/ISO-8859-1 \
 de_DE.UTF-8/UTF-8 \
 de_DE/ISO-8859-1 \
 de_DE@euro/ISO-8859-15 \
 de_IT.UTF-8/UTF-8 \
 de_IT/ISO-8859-1 \
 de_LI.UTF-8/UTF-8 \
 de_LU.UTF-8/UTF-8 \
 de_LU/ISO-8859-1 \
 de_LU@euro/ISO-8859-15 \
 doi_IN/UTF-8 \
 dsb_DE/UTF-8 \
 dv_MV/UTF-8 \
 dz_BT/UTF-8 \
 el_GR.UTF-8/UTF-8 \
 el_GR/ISO-8859-7 \
 el_GR@euro/ISO-8859-7 \
 el_CY.UTF-8/UTF-8 \
 el_CY/ISO-8859-7 \
 en_AG/UTF-8 \
 en_AU.UTF-8/UTF-8 \
 en_AU/ISO-8859-1 \
 en_BW.UTF-8/UTF-8 \
 en_BW/ISO-8859-1 \
 en_CA.UTF-8/UTF-8 \
 en_CA/ISO-8859-1 \
 en_DK.UTF-8/UTF-8 \
 en_DK/ISO-8859-1 \
 en_GB.UTF-8/UTF-8 \
 en_GB/ISO-8859-1 \
 en_GB.ISO-8859-15/ISO-8859-15 \
 en_HK.UTF-8/UTF-8 \
 en_HK/ISO-8859-1 \
 en_IE.UTF-8/UTF-8 \
 en_IE/ISO-8859-1 \
 en_IE@euro/ISO-8859-15 \
 en_IL/UTF-8 \
 en_IN/UTF-8 \
 en_NG/UTF-8 \
 en_NZ.UTF-8/UTF-8 \
 en_NZ/ISO-8859-1 \
 en_PH.UTF-8/UTF-8 \
 en_PH/ISO-8859-1 \
 en_SC.UTF-8/UTF-8 \
 en_SG.UTF-8/UTF-8 \
 en_SG/ISO-8859-1 \
 en_US.UTF-8/UTF-8 \
 en_US/ISO-8859-1 \
 en_US.ISO-8859-15/ISO-8859-15 \
 en_US@ampm/UTF-8 \
 en_US.UTF-8@ampm/UTF-8 \
 en_ZA.UTF-8/UTF-8 \
 en_ZA/ISO-8859-1 \
 en_ZM/UTF-8 \
 en_ZW.UTF-8/UTF-8 \
 en_ZW/ISO-8859-1 \
 eo/UTF-8 \
 es_AR.UTF-8/UTF-8 \
 es_AR/ISO-8859-1 \
 es_BO.UTF-8/UTF-8 \
 es_BO/ISO-8859-1 \
 es_CL.UTF-8/UTF-8 \
 es_CL/ISO-8859-1 \
 es_CO.UTF-8/UTF-8 \
 es_CO/ISO-8859-1 \
 es_CR.UTF-8/UTF-8 \
 es_CR/ISO-8859-1 \
 es_CU/UTF-8 \
 es_DO.UTF-8/UTF-8 \
 es_DO/ISO-8859-1 \
 es_EC.UTF-8/UTF-8 \
 es_EC/ISO-8859-1 \
 es_ES.UTF-8/UTF-8 \
 es_ES/ISO-8859-1 \
 es_ES@euro/ISO-8859-15 \
 es_GT.UTF-8/UTF-8 \
 es_GT/ISO-8859-1 \
 es_HN.UTF-8/UTF-8 \
 es_HN/ISO-8859-1 \
 es_MX.UTF-8/UTF-8 \
 es_MX/ISO-8859-1 \
 es_NI.UTF-8/UTF-8 \
 es_NI/ISO-8859-1 \
 es_PA.UTF-8/UTF-8 \
 es_PA/ISO-8859-1 \
 es_PE.UTF-8/UTF-8 \
 es_PE/ISO-8859-1 \
 es_PR.UTF-8/UTF-8 \
 es_PR/ISO-8859-1 \
 es_PY.UTF-8/UTF-8 \
 es_PY/ISO-8859-1 \
 es_SV.UTF-8/UTF-8 \
 es_SV/ISO-8859-1 \
 es_US.UTF-8/UTF-8 \
 es_US/ISO-8859-1 \
 es_UY.UTF-8/UTF-8 \
 es_UY/ISO-8859-1 \
 es_VE.UTF-8/UTF-8 \
 es_VE/ISO-8859-1 \
 et_EE.UTF-8/UTF-8 \
 et_EE/ISO-8859-1 \
 et_EE.ISO-8859-15/ISO-8859-15 \
 eu_ES.UTF-8/UTF-8 \
 eu_ES/ISO-8859-1 \
 eu_ES@euro/ISO-8859-15 \
 fa_IR/UTF-8 \
 ff_SN/UTF-8 \
 fi_FI.UTF-8/UTF-8 \
 fi_FI/ISO-8859-1 \
 fi_FI@euro/ISO-8859-15 \
 fil_PH/UTF-8 \
 fo_FO.UTF-8/UTF-8 \
 fo_FO/ISO-8859-1 \
 fr_BE.UTF-8/UTF-8 \
 fr_BE/ISO-8859-1 \
 fr_BE@euro/ISO-8859-15 \
 fr_CA.UTF-8/UTF-8 \
 fr_CA/ISO-8859-1 \
 fr_CH.UTF-8/UTF-8 \
 fr_CH/ISO-8859-1 \
 fr_FR.UTF-8/UTF-8 \
 fr_FR/ISO-8859-1 \
 fr_FR@euro/ISO-8859-15 \
 fr_LU.UTF-8/UTF-8 \
 fr_LU/ISO-8859-1 \
 fr_LU@euro/ISO-8859-15 \
 fur_IT/UTF-8 \
 fy_NL/UTF-8 \
 fy_DE/UTF-8 \
 ga_IE.UTF-8/UTF-8 \
 ga_IE/ISO-8859-1 \
 ga_IE@euro/ISO-8859-15 \
 gd_GB.UTF-8/UTF-8 \
 gd_GB/ISO-8859-15 \
 gez_ER/UTF-8 \
 gez_ER@abegede/UTF-8 \
 gez_ET/UTF-8 \
 gez_ET@abegede/UTF-8 \
 gl_ES.UTF-8/UTF-8 \
 gl_ES/ISO-8859-1 \
 gl_ES@euro/ISO-8859-15 \
 gu_IN/UTF-8 \
 gv_GB.UTF-8/UTF-8 \
 gv_GB/ISO-8859-1 \
 ha_NG/UTF-8 \
 hak_TW/UTF-8 \
 he_IL.UTF-8/UTF-8 \
 he_IL/ISO-8859-8 \
 hi_IN/UTF-8 \
 hif_FJ/UTF-8 \
 hne_IN/UTF-8 \
 hr_HR.UTF-8/UTF-8 \
 hr_HR/ISO-8859-2 \
 hsb_DE/ISO-8859-2 \
 hsb_DE.UTF-8/UTF-8 \
 ht_HT/UTF-8 \
 hu_HU.UTF-8/UTF-8 \
 hu_HU/ISO-8859-2 \
 hy_AM/UTF-8 \
 hy_AM.ARMSCII-8/ARMSCII-8 \
 ia_FR/UTF-8 \
 id_ID.UTF-8/UTF-8 \
 id_ID/ISO-8859-1 \
 ig_NG/UTF-8 \
 ik_CA/UTF-8 \
 is_IS.UTF-8/UTF-8 \
 is_IS/ISO-8859-1 \
 it_CH.UTF-8/UTF-8 \
 it_CH/ISO-8859-1 \
 it_IT.UTF-8/UTF-8 \
 it_IT/ISO-8859-1 \
 it_IT@euro/ISO-8859-15 \
 iu_CA/UTF-8 \
 ja_JP.EUC-JP/EUC-JP \
 ja_JP.UTF-8/UTF-8 \
 ka_GE.UTF-8/UTF-8 \
 ka_GE/GEORGIAN-PS \
 kab_DZ/UTF-8 \
 kk_KZ.UTF-8/UTF-8 \
 kk_KZ/PT154 \
 kl_GL.UTF-8/UTF-8 \
 kl_GL/ISO-8859-1 \
 km_KH/UTF-8 \
 kn_IN/UTF-8 \
 ko_KR.EUC-KR/EUC-KR \
 ko_KR.UTF-8/UTF-8 \
 kok_IN/UTF-8 \
 ks_IN/UTF-8 \
 ks_IN@devanagari/UTF-8 \
 ku_TR.UTF-8/UTF-8 \
 ku_TR/ISO-8859-9 \
 kw_GB.UTF-8/UTF-8 \
 kw_GB/ISO-8859-1 \
 ky_KG/UTF-8 \
 lb_LU/UTF-8 \
 lg_UG.UTF-8/UTF-8 \
 lg_UG/ISO-8859-10 \
 li_BE/UTF-8 \
 li_NL/UTF-8 \
 lij_IT/UTF-8 \
 ln_CD/UTF-8 \
 lo_LA/UTF-8 \
 lt_LT.UTF-8/UTF-8 \
 lt_LT/ISO-8859-13 \
 lv_LV.UTF-8/UTF-8 \
 lv_LV/ISO-8859-13 \
 lzh_TW/UTF-8 \
 mag_IN/UTF-8 \
 mai_IN/UTF-8 \
 mai_NP/UTF-8 \
 mfe_MU/UTF-8 \
 mg_MG.UTF-8/UTF-8 \
 mg_MG/ISO-8859-15 \
 mhr_RU/UTF-8 \
 mi_NZ.UTF-8/UTF-8 \
 mi_NZ/ISO-8859-13 \
 miq_NI/UTF-8 \
 mjw_IN/UTF-8 \
 mk_MK.UTF-8/UTF-8 \
 mk_MK/ISO-8859-5 \
 ml_IN/UTF-8 \
 mn_MN/UTF-8 \
 mni_IN/UTF-8 \
 mr_IN/UTF-8 \
 ms_MY.UTF-8/UTF-8 \
 ms_MY/ISO-8859-1 \
 mt_MT.UTF-8/UTF-8 \
 mt_MT/ISO-8859-3 \
 my_MM/UTF-8 \
 nan_TW/UTF-8 \
 nan_TW@latin/UTF-8 \
 nb_NO.UTF-8/UTF-8 \
 nb_NO/ISO-8859-1 \
 nds_DE/UTF-8 \
 nds_NL/UTF-8 \
 ne_NP/UTF-8 \
 nhn_MX/UTF-8 \
 niu_NU/UTF-8 \
 niu_NZ/UTF-8 \
 nl_AW/UTF-8 \
 nl_BE.UTF-8/UTF-8 \
 nl_BE/ISO-8859-1 \
 nl_BE@euro/ISO-8859-15 \
 nl_NL.UTF-8/UTF-8 \
 nl_NL/ISO-8859-1 \
 nl_NL@euro/ISO-8859-15 \
 nn_NO.UTF-8/UTF-8 \
 nn_NO/ISO-8859-1 \
 nr_ZA/UTF-8 \
 nso_ZA/UTF-8 \
 oc_FR.UTF-8/UTF-8 \
 oc_FR/ISO-8859-1 \
 om_ET/UTF-8 \
 om_KE.UTF-8/UTF-8 \
 om_KE/ISO-8859-1 \
 or_IN/UTF-8 \
 os_RU/UTF-8 \
 pa_IN/UTF-8 \
 pa_PK/UTF-8 \
 pap_AW/UTF-8 \
 pap_CW/UTF-8 \
 pl_PL.UTF-8/UTF-8 \
 pl_PL/ISO-8859-2 \
 ps_AF/UTF-8 \
 pt_BR.UTF-8/UTF-8 \
 pt_BR/ISO-8859-1 \
 pt_PT.UTF-8/UTF-8 \
 pt_PT/ISO-8859-1 \
 pt_PT@euro/ISO-8859-15 \
 quz_PE/UTF-8 \
 raj_IN/UTF-8 \
 ro_RO.UTF-8/UTF-8 \
 ro_RO/ISO-8859-2 \
 ru_RU.KOI8-R/KOI8-R \
 ru_RU.UTF-8/UTF-8 \
 ru_RU/ISO-8859-5 \
 ru_UA.UTF-8/UTF-8 \
 ru_UA/KOI8-U \
 rw_RW/UTF-8 \
 sa_IN/UTF-8 \
 sah_RU/UTF-8 \
 sat_IN/UTF-8 \
 sc_IT/UTF-8 \
 sd_IN/UTF-8 \
 sd_IN@devanagari/UTF-8 \
 se_NO/UTF-8 \
 sgs_LT/UTF-8 \
 shn_MM/UTF-8 \
 shs_CA/UTF-8 \
 si_LK/UTF-8 \
 sid_ET/UTF-8 \
 sk_SK.UTF-8/UTF-8 \
 sk_SK/ISO-8859-2 \
 sl_SI.UTF-8/UTF-8 \
 sl_SI/ISO-8859-2 \
 sm_WS/UTF-8 \
 so_DJ.UTF-8/UTF-8 \
 so_DJ/ISO-8859-1 \
 so_ET/UTF-8 \
 so_KE.UTF-8/UTF-8 \
 so_KE/ISO-8859-1 \
 so_SO.UTF-8/UTF-8 \
 so_SO/ISO-8859-1 \
 sq_AL.UTF-8/UTF-8 \
 sq_AL/ISO-8859-1 \
 sq_MK/UTF-8 \
 sr_ME/UTF-8 \
 sr_RS/UTF-8 \
 sr_RS@latin/UTF-8 \
 ss_ZA/UTF-8 \
 st_ZA.UTF-8/UTF-8 \
 st_ZA/ISO-8859-1 \
 sv_FI.UTF-8/UTF-8 \
 sv_FI/ISO-8859-1 \
 sv_FI@euro/ISO-8859-15 \
 sv_SE.UTF-8/UTF-8 \
 sv_SE/ISO-8859-1 \
 sv_SE.ISO-8859-15/ISO-8859-15 \
 sw_KE/UTF-8 \
 sw_TZ/UTF-8 \
 szl_PL/UTF-8 \
 ta_IN/UTF-8 \
 ta_LK/UTF-8 \
 tcy_IN.UTF-8/UTF-8 \
 te_IN/UTF-8 \
 tg_TJ.UTF-8/UTF-8 \
 tg_TJ/KOI8-T \
 th_TH.UTF-8/UTF-8 \
 th_TH/TIS-620 \
 the_NP/UTF-8 \
 ti_ER/UTF-8 \
 ti_ET/UTF-8 \
 tig_ER/UTF-8 \
 tk_TM/UTF-8 \
 tl_PH.UTF-8/UTF-8 \
 tl_PH/ISO-8859-1 \
 tn_ZA/UTF-8 \
 to_TO/UTF-8 \
 tpi_PG/UTF-8 \
 tr_CY.UTF-8/UTF-8 \
 tr_CY/ISO-8859-9 \
 tr_TR.UTF-8/UTF-8 \
 tr_TR/ISO-8859-9 \
 ts_ZA/UTF-8 \
 tt_RU/UTF-8 \
 tt_RU@iqtelif/UTF-8 \
 ug_CN/UTF-8 \
 uk_UA.UTF-8/UTF-8 \
 uk_UA/KOI8-U \
 unm_US/UTF-8 \
 ur_IN/UTF-8 \
 ur_PK/UTF-8 \
 uz_UZ.UTF-8/UTF-8 \
 uz_UZ/ISO-8859-1 \
 uz_UZ@cyrillic/UTF-8 \
 ve_ZA/UTF-8 \
 vi_VN/UTF-8 \
 wa_BE/ISO-8859-1 \
 wa_BE@euro/ISO-8859-15 \
 wa_BE.UTF-8/UTF-8 \
 wae_CH/UTF-8 \
 wal_ET/UTF-8 \
 wo_SN/UTF-8 \
 xh_ZA.UTF-8/UTF-8 \
 xh_ZA/ISO-8859-1 \
 yi_US.UTF-8/UTF-8 \
 yi_US/CP1255 \
 yo_NG/UTF-8 \
 yue_HK/UTF-8 \
 yuw_PG/UTF-8 \
 zh_CN.GB18030/GB18030 \
 zh_CN.GBK/GBK \
 zh_CN.UTF-8/UTF-8 \
 zh_CN/GB2312 \
 zh_HK.UTF-8/UTF-8 \
 zh_HK/BIG5-HKSCS \
 zh_SG.UTF-8/UTF-8 \
 zh_SG.GBK/GBK \
 zh_SG/GB2312 \
 zh_TW.EUC-TW/EUC-TW \
 zh_TW.UTF-8/UTF-8 \
 zh_TW/BIG5 \
 zu_ZA.UTF-8/UTF-8 \
 zu_ZA/ISO-8859-1 \
--- a/SOURCES/build-locale-archive.c
+++ b/SOURCES/build-locale-archive.c
@ -1,862 +0,0 @@
 #define _GNU_SOURCE
 #include <assert.h>
 #include <dirent.h>
 #include <errno.h>
 #include <fcntl.h>
 #include <locale.h>
 #include <stdarg.h>
 #include <stdbool.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <getopt.h>
 #include <string.h>
 #include <sys/mman.h>
 #include <sys/stat.h>
 #include <unistd.h>
 #include "../locale/hashval.h"
 #define __LC_LAST 13
 #include "../locale/locarchive.h"
 #include "../crypt/md5.h"
 /* Default file-system locations.  DATADIR and PREFIX are not defined in
    this file; they are expected to come from the build command line.
    - alias_file: not referenced in the visible part of this file
      (NOTE(review): presumably consumed by locarchive.c -- confirm).
    - locar_file: default output archive; main unlinks it when no
      ARCHIVE-FILE argument is given.
    - tmpl_file: default template archive read by open_tmpl_archive and
      truncated by main once the archive has been built.
    - loc_path: directory main scans for per-locale subdirectories.  */
 const char *alias_file = DATADIR "/locale/locale.alias";
 const char *locar_file = PREFIX "/lib/locale/locale-archive";
 const char *tmpl_file = PREFIX "/lib/locale/locale-archive.tmpl";
 const char *loc_path = PREFIX "/lib/locale/";
 /* Flags set by `--verbose` option.  */
 int be_quiet = 1;
 int verbose = 0;
 /* NOTE(review): the next two variables are never read in the visible part
    of this file; presumably they are exported for locarchive.c -- confirm.  */
 int max_locarchive_open_retry = 10;
 const char *output_prefix;
 /* Endianness should have been taken care of by localedef.  We don't need to do
   additional swapping.  We need this variable exported however, since
   locarchive.c uses it to determine if it needs to swap endianness of a value
   before writing to or reading from the archive.  */
 bool swap_endianness_p = false;
 /* Table of category file names indexed by category number.  Each
    DEFINE_CATEGORY entry in categories.def expands to one designated
    initializer `[category] = category_name'; the remaining macro arguments
    are ignored here.  fill_archive compares directory entries against
    these names.  */
 static const char *locnames[] =
  {
 #define DEFINE_CATEGORY(category, category_name, items, a) \
  [category] = category_name,
 #include "../locale/categories.def"
 #undef  DEFINE_CATEGORY
  };
 /* Trial-division primality test.  Returns non-zero when CANDIDATE has no
    odd divisor D >= 3 with D * D <= CANDIDATE.  Callers are expected to
    pass only odd numbers of at least 10, so even divisors are never
    tried.  */
 static int
 is_prime (unsigned long candidate)
 {
  unsigned long divisor = 3;
  unsigned long square = 9;	/* Always divisor * divisor.  */

  for (;;)
    {
      if (square >= candidate)
        break;
      if (candidate % divisor == 0)
        break;
      /* Step to the next odd divisor: (d + 2)^2 = d^2 + 4 * (d + 1).  */
      square += 4 * (divisor + 1);
      divisor += 2;
    }

  return (candidate % divisor) != 0;
 }
 /* Return the first value accepted by is_prime at or above SEED, after
    forcing SEED to be odd.  */
 unsigned long
 next_prime (unsigned long seed)
 {
  unsigned long candidate = seed | 1;

  /* Only odd numbers are candidates, so advance in steps of two.  */
  for (; !is_prime (candidate); candidate += 2)
    continue;

  return candidate;
 }
 /* Minimal replacement for error(3): flush stdout, then print
    "PROGRAM: MESSAGE[: strerror (ERRNUM)]\n" to stderr.  When STATUS is
    non-zero the process exits; a read-only file system (EROFS) is
    reported but turned into exit status 0, any other error exits with
    STATUS.  */
 void
 error (int status, int errnum, const char *message, ...)
 {
  va_list ap;

  fflush (stdout);
  fprintf (stderr, "%s: ", program_invocation_name);

  va_start (ap, message);
  vfprintf (stderr, message, ap);
  va_end (ap);

  if (errnum != 0)
    fprintf (stderr, ": %s", strerror (errnum));
  fputc ('\n', stderr);
  fflush (stderr);

  if (status == 0)
    return;
  if (errnum == EROFS)
    exit (0);
  exit (status);
 }
 /* Allocate SIZE bytes with malloc.  On failure the error is reported
    through error () with EXIT_FAILURE, which terminates the process, so
    callers do not need to check the result.  */
 void *
 xmalloc (size_t size)
 {
  void *p = malloc (size);
  if (p == NULL)
    /* SIZE is a size_t, so the matching conversion is %zu (not %zd).  */
    error (EXIT_FAILURE, errno, "could not allocate %zu bytes of memory", size);
  return p;
 }
 /* Open the template archive named by AH->fname (or the default tmpl_file
    when that is NULL), check that the file is at least as large as its
    header says the sum hash table needs, and map the whole file
    read-only.  On return AH->fd, AH->mmaped, AH->reserved and AH->addr
    are set.  Every failure is fatal and reported through error ().  */
 static void
 open_tmpl_archive (struct locarhandle *ah)
 {
  struct locarhead head;
  struct stat64 st;
  const char *archivefname = tmpl_file;
  int fd;

  if (ah->fname != NULL)
    archivefname = ah->fname;

  /* The template is only read, so it is opened read-only.  */
  fd = open64 (archivefname, O_RDONLY);
  if (fd == -1)
    error (EXIT_FAILURE, errno, "cannot open locale archive template file \"%s\"",
           archivefname);

  if (fstat64 (fd, &st) < 0)
    error (EXIT_FAILURE, errno, "cannot stat locale archive template file \"%s\"",
           archivefname);

  /* Fetch the fixed-size header from the start of the file.  */
  if (TEMP_FAILURE_RETRY (read (fd, &head, sizeof (head))) != sizeof (head))
    error (EXIT_FAILURE, errno, "cannot read archive header");

  ah->fd = fd;

  /* The sum hash table must fit inside the file; otherwise the template
     has been cut short.  */
  ah->mmaped = (head.sumhash_offset
                + head.sumhash_size * sizeof (struct sumhashent));
  if (ah->mmaped > (unsigned long) st.st_size)
    error (EXIT_FAILURE, 0, "locale archive template file truncated");

  /* Map the complete file, not only the administrative part.  */
  ah->mmaped = st.st_size;
  ah->reserved = st.st_size;
  ah->addr = mmap64 (NULL, ah->mmaped, PROT_READ, MAP_SHARED, fd, 0);
  if (ah->addr == MAP_FAILED)
    error (EXIT_FAILURE, errno, "cannot map archive header");
 }
 /* Open the locale archive.  */
 extern void open_archive (struct locarhandle *ah, bool readonly);
 /* Close the locale archive.  */
 extern void close_archive (struct locarhandle *ah);
 /* Add given locale data to the archive.  */
 extern int add_locale_to_archive (struct locarhandle *ah, const char *name,
 				  locale_data_t data, bool replace);
 extern void add_alias (struct locarhandle *ah, const char *alias,
 		       bool replace, const char *oldname,
 		       uint32_t *locrec_offset_p);
 extern struct namehashent *
 insert_name (struct locarhandle *ah,
 	     const char *name, size_t name_len, bool replace);
 /* One locale name collected from the template archive's name hash table
    (filled in by fill_archive).  */
 struct nameent
 {
  /* Locale name; points into the mapped template archive.  */
  char *name;
  /* Locale record for that name; also points into the mapped template.  */
  struct locrecent *locrec;
 };
 /* One entry collected from the template archive's sum hash table
    (filled in by fill_archive, sorted by file_offset).  */
 struct dataent
 {
  /* Checksum bytes; points at the `sum' member of the template's
     sumhashent.  compute_data copies it into a locale_data_t `sum'
     field.  */
  const unsigned char *sum;
  /* Copy of the sumhashent's file_offset; used as the sort/search key.  */
  uint32_t file_offset;
 };
 static int
 nameentcmp (const void *a, const void *b)
 {
  struct locrecent *la = ((const struct nameent *) a)->locrec;
  struct locrecent *lb = ((const struct nameent *) b)->locrec;
  uint32_t start_a = -1, end_a = 0;
  uint32_t start_b = -1, end_b = 0;
  int cnt;
  for (cnt = 0; cnt < __LC_LAST; ++cnt)
    if (cnt != LC_ALL)
      {
 	if (la->record[cnt].offset < start_a)
 	  start_a = la->record[cnt].offset;
 	if (la->record[cnt].offset + la->record[cnt].len > end_a)
 	  end_a = la->record[cnt].offset + la->record[cnt].len;
      }
  assert (start_a != (uint32_t)-1);
  assert (end_a != 0);
  for (cnt = 0; cnt < __LC_LAST; ++cnt)
    if (cnt != LC_ALL)
      {
 	if (lb->record[cnt].offset < start_b)
 	  start_b = lb->record[cnt].offset;
 	if (lb->record[cnt].offset + lb->record[cnt].len > end_b)
 	  end_b = lb->record[cnt].offset + lb->record[cnt].len;
      }
  assert (start_b != (uint32_t)-1);
  assert (end_b != 0);
  if (start_a != start_b)
    return (int)start_a - (int)start_b;
  return (int)end_a - (int)end_b;
 }
 /* qsort comparator for struct dataent: ascending by file_offset.
    Returns -1, 0 or 1.  */
 static int
 dataentcmp (const void *a, const void *b)
 {
  const struct dataent *lhs = a;
  const struct dataent *rhs = b;

  return (lhs->file_offset > rhs->file_offset)
         - (lhs->file_offset < rhs->file_offset);
 }
 static int
 sumsearchfn (const void *key, const void *ent)
 {
  uint32_t keyn = *(uint32_t *)key;
  uint32_t entn = ((struct dataent *)ent)->file_offset;
  if (keyn < entn)
    return -1;
  if (keyn > entn)
    return 1;
  return 0;
 }
 /* Fill DATA with the address, size and checksum of every category of the
    locale NAME, whose record lives in the archive mapped at AH->addr.

    A category that lies completely inside the LC_ALL region gets its
    checksum computed with __md5_buffer.  Any other category must be
    listed in FILES (SUMUSED entries, sorted by file offset), and its
    stored checksum is copied; a missing entry is a fatal error.  */
 static void
 compute_data (struct locarhandle *ah, struct nameent *name, size_t sumused,
               struct dataent *files, locale_data_t data)
 {
  struct locrecent *rec = name->locrec;
  char *base = (char *) ah->addr;
  int cat;

  data[LC_ALL].addr = base + rec->record[LC_ALL].offset;
  data[LC_ALL].size = rec->record[LC_ALL].len;

  for (cat = 0; cat < __LC_LAST; ++cat)
    {
      struct dataent *hit;

      if (cat == LC_ALL)
        continue;

      data[cat].addr = base + rec->record[cat].offset;
      data[cat].size = rec->record[cat].len;

      if (data[cat].addr >= data[LC_ALL].addr
          && data[cat].addr + data[cat].size
             <= data[LC_ALL].addr + data[LC_ALL].size)
        {
          /* Part of the LC_ALL block: compute the checksum directly.  */
          __md5_buffer (data[cat].addr, data[cat].size, data[cat].sum);
          continue;
        }

      /* Stored separately: look the checksum up by file offset.  */
      hit = bsearch (&rec->record[cat].offset, files, sumused,
                     sizeof (*files), sumsearchfn);
      if (hit == NULL)
        error (EXIT_FAILURE, 0, "inconsistent template file");
      memcpy (data[cat].sum, hit->sum, sizeof (data[cat].sum));
    }
 }
 static int
 fill_archive (struct locarhandle *tmpl_ah,
 	      const char *fname,
 	      size_t install_langs_count, char *install_langs_list[],
 	      size_t nlist, char *list[],
 	      const char *primary)
 {
  struct locarhandle ah;
  struct locarhead *head;
  int result = 0;
  struct nameent *names;
  struct namehashent *namehashtab;
  size_t cnt, used;
  struct dataent *files;
  struct sumhashent *sumhashtab;
  size_t sumused;
  struct locrecent *primary_locrec = NULL;
  struct nameent *primary_nameent = NULL;
  head = tmpl_ah->addr;
  names = (struct nameent *) malloc (head->namehash_used
 				     * sizeof (struct nameent));
  files = (struct dataent *) malloc (head->sumhash_used
 				     * sizeof (struct dataent));
  if (names == NULL || files == NULL)
    error (EXIT_FAILURE, errno, "could not allocate tables");
  namehashtab = (struct namehashent *) ((char *) tmpl_ah->addr
 					+ head->namehash_offset);
  sumhashtab = (struct sumhashent *) ((char *) tmpl_ah->addr
 				      + head->sumhash_offset);
  for (cnt = used = 0; cnt < head->namehash_size; ++cnt)
    if (namehashtab[cnt].locrec_offset != 0)
      {
 	char * name;
 	int i;
 	assert (used < head->namehash_used);
        name = tmpl_ah->addr + namehashtab[cnt].name_offset;
        if (install_langs_count == 0)
          {
 	    /* Always intstall the entry.  */
            names[used].name = name;
            names[used++].locrec
                = (struct locrecent *) ((char *) tmpl_ah->addr +
                                        namehashtab[cnt].locrec_offset);
          }
        else
          {
 	    /* Only install the entry if the user asked for it via
 	       --install-langs.  */
            for (i = 0; i < install_langs_count; i++)
              {
 		/* Add one for "_" and one for the null terminator.  */
 		size_t len = strlen (install_langs_list[i]) + 2;
 		char *install_lang = (char *)xmalloc (len);
                strcpy (install_lang, install_langs_list[i]);
                if (strchr (install_lang, '_') == NULL)
                  strcat (install_lang, "_");
                if (strncmp (name, install_lang, strlen (install_lang)) == 0)
                  {
                    names[used].name = name;
                    names[used++].locrec
 		      = (struct locrecent *) ((char *)tmpl_ah->addr
 					      + namehashtab[cnt].locrec_offset);
                  }
 		free (install_lang);
              }
          }
      }
  /* Sort the names.  */
  qsort (names, used, sizeof (struct nameent), nameentcmp);
  for (cnt = sumused = 0; cnt < head->sumhash_size; ++cnt)
    if (sumhashtab[cnt].file_offset != 0)
      {
 	assert (sumused < head->sumhash_used);
 	files[sumused].sum = (const unsigned char *) sumhashtab[cnt].sum;
 	files[sumused++].file_offset = sumhashtab[cnt].file_offset;
      }
  /* Sort by file locations.  */
  qsort (files, sumused, sizeof (struct dataent), dataentcmp);
  /* Open the archive.  This call never returns if we cannot
     successfully open the archive.  */
  ah.fname = NULL;
  if (fname != NULL)
    ah.fname = fname;
  open_archive (&ah, false);
  if (primary != NULL)
    {
      for (cnt = 0; cnt < used; ++cnt)
 	if (strcmp (names[cnt].name, primary) == 0)
 	  break;
      if (cnt < used)
 	{
 	  locale_data_t data;
 	  compute_data (tmpl_ah, &names[cnt], sumused, files, data);
 	  result |= add_locale_to_archive (&ah, primary, data, 0);
 	  primary_locrec = names[cnt].locrec;
 	  primary_nameent = &names[cnt];
 	}
    }
  for (cnt = 0; cnt < used; ++cnt)
    if (&names[cnt] == primary_nameent)
      continue;
    else if ((cnt > 0 && names[cnt - 1].locrec == names[cnt].locrec)
 	     || names[cnt].locrec == primary_locrec)
      {
 	const char *oldname;
 	struct namehashent *namehashent;
 	uint32_t locrec_offset;
 	if (names[cnt].locrec == primary_locrec)
 	  oldname = primary;
 	else
 	  oldname = names[cnt - 1].name;
 	namehashent = insert_name (&ah, oldname, strlen (oldname), true);
 	assert (namehashent->name_offset != 0);
 	assert (namehashent->locrec_offset != 0);
 	locrec_offset = namehashent->locrec_offset;
 	add_alias (&ah, names[cnt].name, 0, oldname, &locrec_offset);
      }
    else
      {
 	locale_data_t data;
 	compute_data (tmpl_ah, &names[cnt], sumused, files, data);
 	result |= add_locale_to_archive (&ah, names[cnt].name, data, 0);
      }
  while (nlist-- > 0)
    {
      const char *fname = *list++;
      size_t fnamelen = strlen (fname);
      struct stat64 st;
      DIR *dirp;
      struct dirent64 *d;
      int seen;
      locale_data_t data;
      int cnt;
      /* First see whether this really is a directory and whether it
 	 contains all the require locale category files.  */
      if (stat64 (fname, &st) < 0)
 	{
 	  error (0, 0, "stat of \"%s\" failed: %s: ignored", fname,
 		 strerror (errno));
 	  continue;
 	}
      if (!S_ISDIR (st.st_mode))
 	{
 	  error (0, 0, "\"%s\" is no directory; ignored", fname);
 	  continue;
 	}
      dirp = opendir (fname);
      if (dirp == NULL)
 	{
 	  error (0, 0, "cannot open directory \"%s\": %s: ignored",
 		 fname, strerror (errno));
 	  continue;
 	}
      seen = 0;
      while ((d = readdir64 (dirp)) != NULL)
 	{
 	  for (cnt = 0; cnt < __LC_LAST; ++cnt)
 	    if (cnt != LC_ALL)
 	      if (strcmp (d->d_name, locnames[cnt]) == 0)
 		{
 		  unsigned char d_type;
 		  /* We have an object of the required name.  If it's
 		     a directory we have to look at a file with the
 		     prefix "SYS_".  Otherwise we have found what we
 		     are looking for.  */
 #ifdef _DIRENT_HAVE_D_TYPE
 		  d_type = d->d_type;
 		  if (d_type != DT_REG)
 #endif
 		    {
 		      char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
 #ifdef _DIRENT_HAVE_D_TYPE
 		      if (d_type == DT_UNKNOWN || d_type == DT_LNK)
 #endif
 			{
 			  strcpy (stpcpy (stpcpy (fullname, fname), "/"),
 				  d->d_name);
 			  if (stat64 (fullname, &st) == -1)
 			    /* We cannot stat the file, ignore it.  */
 			    break;
 			  d_type = IFTODT (st.st_mode);
 			}
 		      if (d_type == DT_DIR)
 			{
 			  /* We have to do more tests.  The file is a
 			     directory and it therefore must contain a
 			     regular file with the same name except a
 			     "SYS_" prefix.  */
 			  char *t = stpcpy (stpcpy (fullname, fname), "/");
 			  strcpy (stpcpy (stpcpy (t, d->d_name), "/SYS_"),
 				  d->d_name);
 			  if (stat64 (fullname, &st) == -1)
 			    /* There is no SYS_* file or we cannot
 			       access it.  */
 			    break;
 			  d_type = IFTODT (st.st_mode);
 			}
 		    }
 		  /* If we found a regular file (eventually after
 		     following a symlink) we are successful.  */
 		  if (d_type == DT_REG)
 		    ++seen;
 		  break;
 		}
 	}
      closedir (dirp);
      if (seen != __LC_LAST - 1)
 	{
 	  /* We don't have all locale category files.  Ignore the name.  */
 	  error (0, 0, "incomplete set of locale files in \"%s\"",
 		 fname);
 	  continue;
 	}
      /* Add the files to the archive.  To do this we first compute
 	 sizes and the MD5 sums of all the files.  */
      for (cnt = 0; cnt < __LC_LAST; ++cnt)
 	if (cnt != LC_ALL)
 	  {
 	    char fullname[fnamelen + 2 * strlen (locnames[cnt]) + 7];
 	    int fd;
 	    strcpy (stpcpy (stpcpy (fullname, fname), "/"), locnames[cnt]);
 	    fd = open64 (fullname, O_RDONLY);
 	    if (fd == -1 || fstat64 (fd, &st) == -1)
 	      {
 		/* Cannot read the file.  */
 		if (fd != -1)
 		  close (fd);
 		break;
 	      }
 	    if (S_ISDIR (st.st_mode))
 	      {
 		char *t;
 		close (fd);
 		t = stpcpy (stpcpy (fullname, fname), "/");
 		strcpy (stpcpy (stpcpy (t, locnames[cnt]), "/SYS_"),
 			locnames[cnt]);
 		fd = open64 (fullname, O_RDONLY);
 		if (fd == -1 || fstat64 (fd, &st) == -1
 		    || !S_ISREG (st.st_mode))
 		  {
 		    if (fd != -1)
 		      close (fd);
 		    break;
 		  }
 	      }
 	    /* Map the file.  */
 	    data[cnt].addr = mmap64 (NULL, st.st_size, PROT_READ, MAP_SHARED,
 				     fd, 0);
 	    if (data[cnt].addr == MAP_FAILED)
 	      {
 		/* Cannot map it.  */
 		close (fd);
 		break;
 	      }
 	    data[cnt].size = st.st_size;
 	    __md5_buffer (data[cnt].addr, st.st_size, data[cnt].sum);
 	    /* We don't need the file descriptor anymore.  */
 	    close (fd);
 	  }
      if (cnt != __LC_LAST)
 	{
 	  while (cnt-- > 0)
 	    if (cnt != LC_ALL)
 	      munmap (data[cnt].addr, data[cnt].size);
 	  error (0, 0, "cannot read all files in \"%s\": ignored", fname);
 	  continue;
 	}
      result |= add_locale_to_archive (&ah, basename (fname), data, 0);
      for (cnt = 0; cnt < __LC_LAST; ++cnt)
 	if (cnt != LC_ALL)
 	  munmap (data[cnt].addr, data[cnt].size);
    }
  /* We are done.  */
  close_archive (&ah);
  return result;
 }
 /* Print the command-line help text to stdout.  */
 void usage (void)
 {
  printf ("\
 Usage: build-locale-archive [OPTION]... [TEMPLATE-FILE] [ARCHIVE-FILE]\n\
 Builds a locale archive from a template file.\n\
 Options:\n\
  -h, --help                 Print this usage message.\n\
  -v, --verbose              Verbose execution.\n\
  -l, --install-langs=LIST   Only include locales given in LIST into the \n\
                             locale archive.  LIST is a colon separated list\n\
                             of locale prefixes, for example \"de:en:ja\".\n\
                             The special argument \"all\" means to install\n\
                             all languages and it must be present by itself.\n\
                             If \"all\" is present with any other language it\n\
                             will be treated as the name of a locale.\n\
                             If the --install-langs option is missing, all\n\
                             locales are installed. The colon separated list\n\
                             can contain any strings matching the beginning of\n\
                             locale names.\n\
                             If a string does not contain a \"_\", it is added.\n\
                             Examples:\n\
                               --install-langs=\"en\"\n\
                                 installs en_US, en_US.iso88591,\n\
                                 en_US.iso885915, en_US.utf8,\n\
                                 en_GB ...\n\
                               --install-langs=\"en_US.utf8\"\n\
                                 installs only en_US.utf8.\n\
                               --install-langs=\"ko\"\n\
                                 installs ko_KR, ko_KR.euckr,\n\
                                 ko_KR.utf8 but *not* kok_IN\n\
                                 because \"ko\" does not contain\n\
                                 \"_\" and it is silently added\n\
                               --install-langs=\"ko:kok\"\n\
                                 installs ko_KR, ko_KR.euckr,\n\
                                 ko_KR.utf8, kok_IN, and\n\
                                 kok_IN.utf8.\n\
                               --install-langs=\"POSIX\" will\n\
                                 install *no* locales at all\n\
                                 because POSIX matches none of\n\
                                 the locales. Actually, any string\n\
                                 matching nothing will do that.\n\
                                 POSIX and C will always be\n\
                                 available because they are\n\
                                 builtin.\n\
                             Aliases are installed as well,\n\
                             i.e. --install-langs=\"de\"\n\
                             will install not only every locale starting with\n\
                             \"de\" but also the aliases \"deutsch\"\n\
                             and \"german\" although the latter does not\n\
                             start with \"de\".\n\
 \n\
  If the arguments TEMPLATE-FILE and ARCHIVE-FILE are not given the locations\n\
  where the glibc used expects these files are used by default.\n\
 ");
 }
 /* Entry point.  Parses the options, opens the template archive, removes
    the old output archive, determines the primary locale from LC_ALL or
    LANG, collects the locale directories below loc_path, builds the new
    archive with fill_archive, truncates the default template file and
    finally executes /usr/sbin/tzdata-update.  Exits with status 0 if
    that exec fails.  */
 int main (int argc, char *argv[])
 {
  static struct option long_options[] =
    {
      {"help",            no_argument,       0, 'h'},
      {"verbose",         no_argument,       0, 'v'},
      {"install-langs",   required_argument, 0, 'l'},
      {0, 0, 0, 0}
    };
  char path[4096];
  char *list[16384];
  char *primary;
  char *lang;
  char *install_langs_arg, *ila_start;
  char **install_langs_list = NULL;
  int install_langs_count = 0;
  unsigned int cnt = 0;
  DIR *dirp;
  struct dirent64 *d;
  struct stat64 st;
  struct locarhandle tmpl_ah;
  char *new_locar_fname = NULL;
  size_t loc_path_len = strlen (loc_path);

  for (;;)
    {
      /* getopt_long stores the option index here. */
      int option_index = 0;
      int c = getopt_long (argc, argv, "vhl:", long_options, &option_index);

      /* Detect the end of the options. */
      if (c == -1)
        break;

      switch (c)
        {
        case 0:
          printf ("unknown option %s", long_options[option_index].name);
          if (optarg)
            printf (" with arg %s", optarg);
          printf ("\n");
          usage ();
          exit (1);

        case 'v':
          verbose = 1;
          be_quiet = 0;
          break;

        case 'h':
          usage ();
          exit (0);

        case 'l':
          install_langs_arg = ila_start = strdup (optarg);
          /* An argument of exactly "all" means: do not restrict the set
             of languages.  A single locale called "all" is therefore not
             supported.  An empty argument is treated the same way.  */
 #define MAGIC_INSTALL_ALL "all"
          if (install_langs_arg == NULL
              || install_langs_arg[0] == '\0'
              || strcmp (install_langs_arg, MAGIC_INSTALL_ALL) == 0)
            break;

          /* First pass: count the languages.  strtok modifies the copy,
             so it is thrown away afterwards.  */
          for (lang = strtok (install_langs_arg, ":;,");
               lang != NULL;
               lang = strtok (NULL, ":;,"))
            install_langs_count++;
          free (ila_start);

          /* Reject an entire string made up of delimiters.  */
          if (install_langs_count == 0)
            break;

          /* Second pass: tokenize a fresh copy and remember the tokens.
             The list entries point into that copy (ila_start).  */
          install_langs_list
            = (char **) xmalloc (sizeof (char *) * install_langs_count);
          install_langs_arg = ila_start = strdup (optarg);
          install_langs_count = 0;
          for (lang = strtok (install_langs_arg, ":;,");
               lang != NULL;
               lang = strtok (NULL, ":;,"))
            {
              install_langs_list[install_langs_count] = lang;
              install_langs_count++;
            }
          break;

        case '?':
          /* getopt_long already printed an error message. */
          usage ();
          exit (0);

        default:
          abort ();
        }
    }

  /* Optional positional arguments: template file, then output archive.  */
  tmpl_ah.fname = optind < argc ? argv[optind] : NULL;
  if (optind + 1 < argc)
    new_locar_fname = argv[optind + 1];

  if (verbose)
    {
      if (tmpl_ah.fname)
        printf("input archive file specified on command line: %s\n",
               tmpl_ah.fname);
      else
        printf("using default input archive file.\n");

      if (new_locar_fname)
        printf("output archive file specified on command line: %s\n",
               new_locar_fname);
      else
        printf("using default output archive file.\n");
    }

  dirp = opendir (loc_path);
  if (dirp == NULL)
    error (EXIT_FAILURE, errno, "cannot open directory \"%s\"", loc_path);

  open_tmpl_archive (&tmpl_ah);

  /* Remove any previous output archive before building the new one.  */
  unlink (new_locar_fname ? new_locar_fname : locar_file);

  primary = getenv ("LC_ALL");
  if (primary == NULL)
    primary = getenv ("LANG");

  /* Except for ja*, ko* and zh*, replace the codeset of the primary
     locale with ".utf8", keeping a trailing "@modifier".  */
  if (primary != NULL
      && strncmp (primary, "ja", 2) != 0
      && strncmp (primary, "ko", 2) != 0
      && strncmp (primary, "zh", 2) != 0)
    {
      char *utf8_name = malloc (strlen (primary) + strlen (".utf8") + 1);

      /* This leads to invalid locales sometimes:
         de_DE.iso885915@euro -> de_DE.utf8@euro */
      if (utf8_name == NULL)
        primary = NULL;
      else
        {
          /* Part before the first '.' or '@'.  */
          size_t base_len = strcspn (primary, ".@");
          const char *modifier = primary + base_len;

          /* Skip an existing codeset up to the modifier, if any.  */
          if (*modifier == '.')
            modifier += strcspn (modifier, "@");

          memcpy (utf8_name, primary, base_len);
          strcpy (stpcpy (utf8_name + base_len, ".utf8"), modifier);
          primary = utf8_name;
        }
    }

  /* Collect every subdirectory of loc_path whose name contains '_'.  */
  memcpy (path, loc_path, loc_path_len);

  while ((d = readdir64 (dirp)) != NULL)
    {
      size_t d_name_len;

      if (strcmp (d->d_name, ".") == 0 || strcmp (d->d_name, "..") == 0)
        continue;
      if (strchr (d->d_name, '_') == NULL)
        continue;

      d_name_len = strlen (d->d_name);
      if (loc_path_len + d_name_len + 1 > sizeof (path))
        {
          error (0, 0, "too long filename \"%s\"", d->d_name);
          continue;
        }

      memcpy (path + loc_path_len, d->d_name, d_name_len + 1);
      if (stat64 (path, &st) < 0)
        {
          error (0, errno, "cannot stat \"%s\"", path);
          continue;
        }
      if (! S_ISDIR (st.st_mode))
        continue;

      if (cnt == 16384)
        {
          error (0, 0, "too many directories in \"%s\"", loc_path);
          break;
        }

      list[cnt] = strdup (path);
      if (list[cnt] == NULL)
        {
          error (0, errno, "cannot add file to list \"%s\"", path);
          continue;
        }

      /* Keep the primary locale's directory at the front of the list.  */
      if (primary != NULL && cnt > 0 && strcmp (primary, d->d_name) == 0)
        {
          char *front = list[0];
          list[0] = list[cnt];
          list[cnt] = front;
        }
      cnt++;
    }
  closedir (dirp);

  /* Store the archive to the file specified as the second argument on the
     command line or the default locale archive.  */
  fill_archive (&tmpl_ah, new_locar_fname,
                install_langs_count, install_langs_list,
                cnt, list, primary);
  close_archive (&tmpl_ah);
  truncate (tmpl_file, 0);

  if (install_langs_count > 0)
    {
      free (ila_start);
      free (install_langs_list);
    }

  /* Hand over to tzdata-update with an empty environment; if the exec
     fails we still exit successfully.  */
  char *tz_argv[] = { "/usr/sbin/tzdata-update", NULL };
  execve (tz_argv[0], (char *const *)tz_argv, (char *const *)&tz_argv[1]);
  exit (0);
 }
--- a/SOURCES/glibc-RHEL-1017-1.patch
+++ b/SOURCES/glibc-RHEL-1017-1.patch
@ -0,0 +1,432 @@
 From e4ca6de1bc5e4ba3f94cf0c501a293c5bc827b10 Mon Sep 17 00:00:00 2001
 From: Anton Blanchard <anton@ozlabs.org>
 Date: Tue, 27 Jul 2021 15:47:49 +1000
 Subject: powerpc64: Replace some PPC_FEATURE_HAS_VSX with
 PPC_FEATURE_ARCH_2_06
 We use PPC_FEATURE_HAS_VSX to select a number of POWER7 optimised
 functions. These functions don't use any VSX instructions, so
 PPC_FEATURE_ARCH_2_06 seems like a better fit.
 Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 0acdf22ba3..32564c8f1f 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -95,7 +95,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memset_power8)
 -	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memset_power7)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
 			      __memset_power6)
@@ -139,7 +139,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strlen_power8)
 -	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strlen_power7)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1,
 			      __strlen_ppc))
@@ -152,7 +152,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncmp_power8)
 -	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strncmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap & PPC_FEATURE_POWER4,
 			      __strncmp_power4)
@@ -165,7 +165,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strchr_power8)
 	      IFUNC_IMPL_ADD (array, i, strchr,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strchr_power7)
 	      IFUNC_IMPL_ADD (array, i, strchr, 1,
 			      __strchr_ppc))
@@ -176,7 +176,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strchrnul_power8)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strchrnul_power7)
 	      IFUNC_IMPL_ADD (array, i, strchrnul, 1,
 			      __strchrnul_ppc))
@@ -192,7 +192,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #endif
 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memcmp_power8)
 -	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memcmp_power7)
 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_POWER4,
 			      __memcmp_power4)
@@ -244,7 +244,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memchr_power8)
 	      IFUNC_IMPL_ADD (array, i, memchr,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memchr_power7)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1,
 			      __memchr_ppc))
@@ -255,7 +255,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memrchr_power8)
 	      IFUNC_IMPL_ADD (array, i, memrchr,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memrchr_power7)
 	      IFUNC_IMPL_ADD (array, i, memrchr, 1,
 			      __memrchr_ppc))
@@ -272,7 +272,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __rawmemchr_power9)
 #endif
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __rawmemchr_power7)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1,
 			      __rawmemchr_ppc))
@@ -282,7 +282,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strnlen_power8)
 -	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strnlen_power7)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1,
 			      __strnlen_ppc))
@@ -293,14 +293,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strcasecmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strcasecmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp, 1, __strcasecmp_ppc))
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c.  */
   IFUNC_IMPL (i, name, strcasecmp_l,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strcasecmp_l_power7)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp_l, 1,
 			      __strcasecmp_l_ppc))
@@ -311,14 +311,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncasecmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strncasecmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp, 1, __strncasecmp_ppc))
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase_l.c.  */
   IFUNC_IMPL (i, name, strncasecmp_l,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strncasecmp_l_power7)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp_l, 1,
 			      __strncasecmp_l_ppc))
@@ -329,7 +329,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strrchr_power8)
 	      IFUNC_IMPL_ADD (array, i, strrchr,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strrchr_power7)
 	      IFUNC_IMPL_ADD (array, i, strrchr, 1,
 			      __strrchr_ppc))
@@ -357,7 +357,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strncpy_power8)
 	      IFUNC_IMPL_ADD (array, i, strncpy,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strncpy, 1,
 			     __strncpy_ppc))
@@ -374,7 +374,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __stpncpy_power8)
 	      IFUNC_IMPL_ADD (array, i, stpncpy,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __stpncpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpncpy, 1,
 			     __stpncpy_ppc))
@@ -390,7 +390,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __strcmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strcmp,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strcmp_power7)
 	      IFUNC_IMPL_ADD (array, i, strcmp, 1,
 			     __strcmp_ppc))
@@ -425,7 +425,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
              IFUNC_IMPL_ADD (array, i, strstr,
 -                             hwcap & PPC_FEATURE_HAS_VSX,
 +                             hwcap & PPC_FEATURE_ARCH_2_06,
                              __strstr_power7)
              IFUNC_IMPL_ADD (array, i, strstr, 1,
                              __strstr_ppc))
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
 index 0c718d4f15..c24186689e 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -30,7 +30,7 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
 libc_ifunc (__memchr,
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __memchr_power8 :
 -	    (hwcap & PPC_FEATURE_HAS_VSX)
 +	    (hwcap & PPC_FEATURE_ARCH_2_06)
             ? __memchr_power7
             : __memchr_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
 index 4fd089aba7..99559bce26 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
 #endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __memcmp_power8 :
 -		       (hwcap & PPC_FEATURE_HAS_VSX)
 +		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __memcmp_power7
 		       : (hwcap & PPC_FEATURE_POWER4)
 			 ? __memcmp_power4
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
 index e06d6468b8..16bb6f0042 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
@@ -30,7 +30,7 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
 libc_ifunc (__memrchr,
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __memrchr_power8 :
 -	      (hwcap & PPC_FEATURE_HAS_VSX)
 +	      (hwcap & PPC_FEATURE_ARCH_2_06)
 	      ? __memrchr_power7
 	    : __memrchr_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
 index 5994bf02e6..c1aa143f60 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -48,7 +48,7 @@ libc_ifunc (__libc_memset,
 # endif
             (hwcap2 & PPC_FEATURE2_ARCH_2_07)
             ? __memset_power8 :
 -	      (hwcap & PPC_FEATURE_HAS_VSX)
 +	      (hwcap & PPC_FEATURE_ARCH_2_06)
 	      ? __memset_power7 :
 		(hwcap & PPC_FEATURE_ARCH_2_05)
 		? __memset_power6 :
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 index c0ffea2b93..b5d2d3a635 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -41,7 +41,7 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 		       ? __rawmemchr_power9 :
 # endif
 -		         (hwcap & PPC_FEATURE_HAS_VSX)
 +		         (hwcap & PPC_FEATURE_ARCH_2_06)
 		         ? __rawmemchr_power7
 		       : __rawmemchr_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
 index bebd377fd9..e7035761a7 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/stpncpy.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect___stpncpy, __stpncpy,
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __stpncpy_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __stpncpy_power7
 			 : __stpncpy_ppc);
 weak_alias (__stpncpy, stpncpy)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
 index dcd7774403..55ca6c85c4 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
@@ -29,7 +29,7 @@ extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
 libc_ifunc (__libc_strcasecmp,
 	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
              ? __strcasecmp_power8:
 -	     (hwcap & PPC_FEATURE_HAS_VSX)
 +	     (hwcap & PPC_FEATURE_ARCH_2_06)
              ? __strcasecmp_power7
              : __strcasecmp_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
 index 96a70b8b11..1afee5d7fd 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp_l.c
@@ -32,7 +32,7 @@ extern __typeof (__strcasecmp_l) __strcasecmp_l_power7 attribute_hidden;
 extern __typeof (__strcasecmp_l) __libc_strcasecmp_l;
 libc_ifunc (__libc_strcasecmp_l,
 -	    (hwcap & PPC_FEATURE_HAS_VSX)
 +	    (hwcap & PPC_FEATURE_ARCH_2_06)
             ? __strcasecmp_l_power7
             : __strcasecmp_l_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
 index ea9ac1134f..27c794c6b7 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
@@ -35,7 +35,7 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
 libc_ifunc_redirected (__redirect_strchr, strchr,
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strchr_power8 :
 -		       (hwcap & PPC_FEATURE_HAS_VSX)
 +		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __strchr_power7
 		       : __strchr_ppc);
 weak_alias (strchr, index)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
 index 4688e7c3f0..4a07b4a242 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
@@ -30,7 +30,7 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
 libc_ifunc (__strchrnul,
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __strchrnul_power8 :
 -	    (hwcap & PPC_FEATURE_HAS_VSX)
 +	    (hwcap & PPC_FEATURE_ARCH_2_06)
             ? __strchrnul_power7
             : __strchrnul_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
 index 72f9a639bf..4b0b25fff6 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -40,7 +40,7 @@ libc_ifunc_redirected (__redirect_strcmp, strcmp,
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strcmp_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __strcmp_power7
 			 : __strcmp_ppc);
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 index 109c8a90bd..0cd1c6faff 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -42,7 +42,7 @@ libc_ifunc (__libc_strlen,
 # endif
 	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 	    ? __strlen_power8 :
 -	      (hwcap & PPC_FEATURE_HAS_VSX)
 +	      (hwcap & PPC_FEATURE_ARCH_2_06)
 	      ? __strlen_power7
 	      : __strlen_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
 index 2013a5d75a..644046bd74 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
@@ -29,7 +29,7 @@ extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
 libc_ifunc (__libc_strncasecmp,
 	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
              ? __strncasecmp_power8:
 -	     (hwcap & PPC_FEATURE_HAS_VSX)
 +	     (hwcap & PPC_FEATURE_ARCH_2_06)
              ? __strncasecmp_power7
              : __strncasecmp_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
 index cad6da302d..d2d761af72 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncase_l.c
@@ -34,7 +34,7 @@ extern __typeof (__strncasecmp_l) __strncasecmp_l_power7 attribute_hidden;
    ifunc symbol properly.  */
 extern __typeof (__strncasecmp_l) __libc_strncasecmp_l;
 libc_ifunc (__libc_strncasecmp_l,
 -	     (hwcap & PPC_FEATURE_HAS_VSX)
 +	     (hwcap & PPC_FEATURE_ARCH_2_06)
              ? __strncasecmp_l_power7
              : __strncasecmp_l_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
 index eef524ddfb..1f689e5c05 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncmp, strncmp,
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncmp_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __strncmp_power7
 			 : (hwcap & PPC_FEATURE_POWER4)
 			   ? __strncmp_power4
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
 index 7da9def358..d4d3463bd1 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncpy.c
@@ -43,7 +43,7 @@ libc_ifunc_redirected (__redirect_strncpy, strncpy,
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strncpy_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __strncpy_power7
 			 : __strncpy_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
 index 264b7a752d..baf375a75a 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
@@ -31,7 +31,7 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
 libc_ifunc_redirected (__redirect___strnlen, __strnlen,
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strnlen_power8 :
 -			 (hwcap & PPC_FEATURE_HAS_VSX)
 +			 (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __strnlen_power7
 			 : __strnlen_ppc);
 weak_alias (__strnlen, strnlen)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
 index bb06b93d19..1c9eea1817 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -33,7 +33,7 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
 libc_ifunc_redirected (__redirect_strrchr, strrchr,
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 		       ? __strrchr_power8 :
 -		       (hwcap & PPC_FEATURE_HAS_VSX)
 +		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __strrchr_power7
 		       : __strrchr_ppc);
 weak_alias (strrchr, rindex)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strstr.c b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
 index bb0588844e..6582798dda 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strstr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strstr.c
@@ -30,7 +30,7 @@ extern __typeof (strstr) __strstr_power7 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strstr, strstr,
 -		       (hwcap & PPC_FEATURE_HAS_VSX)
 +		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __strstr_power7
 		       : __strstr_ppc);
 #endif
--- a/SOURCES/glibc-RHEL-1017-2.patch
+++ b/SOURCES/glibc-RHEL-1017-2.patch
@ -0,0 +1,83 @@
 From f2a15dd668913c5a1388ba7e1131b25162b2ea75 Mon Sep 17 00:00:00 2001
 From: Anton Blanchard <anton@ozlabs.org>
 Date: Tue, 27 Jul 2021 15:47:50 +1000
 Subject: powerpc64: Check cacheline size before using optimised memset
 routines
 A number of optimised memset routines assume the cacheline size is 128B,
 so we had better check it before using them.
 Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 32564c8f1f..a3fdcd43bd 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -35,6 +35,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   unsigned long int hwcap = GLRO(dl_hwcap);
   unsigned long int hwcap2 = GLRO(dl_hwcap2);
 +#ifdef SHARED
 +  int cacheline_size = GLRO(dl_cache_line_size);
 +#endif
   /* hwcap contains only the latest supported ISA, the code checks which is
      and fills the previous supported ones.  */
@@ -90,16 +93,21 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
 -			      && hwcap & PPC_FEATURE_HAS_VSX,
 +			      && hwcap & PPC_FEATURE_HAS_VSX
 +			      && cacheline_size == 128,
 			      __memset_power10)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && cacheline_size == 128,
 			      __memset_power8)
 -	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06,
 +	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
 +			      && cacheline_size == 128,
 			      __memset_power7)
 -	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05,
 +	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_05
 +			      && cacheline_size == 128,
 			      __memset_power6)
 -	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4,
 +	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_POWER4
 +			      && cacheline_size == 128,
 			      __memset_power4)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_ppc))
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
 index c1aa143f60..056e911699 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -43,16 +43,21 @@ libc_ifunc (__libc_memset,
 # ifdef __LITTLE_ENDIAN__
 	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
 	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
 -	     && hwcap & PPC_FEATURE_HAS_VSX)
 +	     && hwcap & PPC_FEATURE_HAS_VSX
 +	     && GLRO(dl_cache_line_size) == 128)
 	    ? __memset_power10 :
 # endif
 -            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +            (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && GLRO(dl_cache_line_size) == 128)
             ? __memset_power8 :
 -	      (hwcap & PPC_FEATURE_ARCH_2_06)
 +	      (hwcap & PPC_FEATURE_ARCH_2_06
 +	       && GLRO(dl_cache_line_size) == 128)
 	      ? __memset_power7 :
 -		(hwcap & PPC_FEATURE_ARCH_2_05)
 +		(hwcap & PPC_FEATURE_ARCH_2_05
 +	         && GLRO(dl_cache_line_size) == 128)
 		? __memset_power6 :
 -		  (hwcap & PPC_FEATURE_POWER4)
 +		  (hwcap & PPC_FEATURE_POWER4
 +	           && GLRO(dl_cache_line_size) == 128)
 		  ? __memset_power4
             : __memset_ppc);
--- a/SOURCES/glibc-RHEL-1017-3.patch
+++ b/SOURCES/glibc-RHEL-1017-3.patch
@ -0,0 +1,703 @@
 From 60b4dd25790342b40e8942e3a4115f511a6b6911 Mon Sep 17 00:00:00 2001
 From: Anton Blanchard <anton@ozlabs.org>
 Date: Tue, 27 Jul 2021 15:47:51 +1000
 Subject: powerpc64: Add checks for Altivec and VSX in ifunc selection
 We'd like to support processors without Altivec or VSX, so check the
 relevant hwcap bits before selecting routines that rely on them.
 Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 index 660d7dc686..c8ffbea01c 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -38,11 +38,13 @@ libc_ifunc (__bzero,
 	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __bzero_power10 :
 # endif
 -            (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
             ? __bzero_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)
 	      ? __bzero_power7 :
 -		(hwcap & PPC_FEATURE_ARCH_2_05)
 +		(hwcap & PPC_FEATURE_ARCH_2_05
 +		 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		? __bzero_power6 :
 		  (hwcap & PPC_FEATURE_POWER4)
 		  ? __bzero_power4
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index a3fdcd43bd..c3e25c5981 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -60,9 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __memcpy_power10)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memcpy_power8_cached)
 -	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memcpy_a2)
@@ -83,7 +85,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __memmove_power10)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memmove_power7)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -98,6 +101,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_power10)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC
 			      && cacheline_size == 128,
 			      __memset_power8)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_ARCH_2_06
@@ -114,12 +118,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcpy.c.  */
   IFUNC_IMPL (i, name, strcpy,
 #ifdef __LITTLE_ENDIAN__
 -	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcpy_power9)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strcpy_power8)
 -	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, strcpy, hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, strcpy, 1,
 			      __strcpy_ppc))
@@ -127,12 +134,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/stpcpy.c.  */
   IFUNC_IMPL (i, name, stpcpy,
 #ifdef __LITTLE_ENDIAN__
 -	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpcpy_power9)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __stpcpy_power8)
 -	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_HAS_VSX,
 +	      IFUNC_IMPL_ADD (array, i, stpcpy, hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __stpcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, stpcpy, 1,
 			      __stpcpy_ppc))
@@ -140,12 +150,15 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 #ifdef __LITTLE_ENDIAN__
 -	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
 +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strlen_power10)
 -	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strlen_power9)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strlen_power7)
@@ -155,7 +168,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
   IFUNC_IMPL (i, name, strncmp,
 #ifdef __LITTLE_ENDIAN__
 -	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strncmp_power9)
 #endif
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
@@ -170,7 +184,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
 	      IFUNC_IMPL_ADD (array, i, strchr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strchr_power8)
 	      IFUNC_IMPL_ADD (array, i, strchr,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -181,7 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strchrnul_power8)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -198,7 +214,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
             && hwcap & PPC_FEATURE_HAS_VSX,
 			      __memcmp_power10)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memcmp_power8)
 	      IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __memcmp_power7)
@@ -215,11 +232,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __bzero_power10)
 #endif
 -	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __bzero_power8)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bzero_power7)
 -	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05,
 +	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_ARCH_2_05
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __bzero_power6)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_POWER4,
 			      __bzero_power4)
@@ -241,7 +260,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/mempcpy.c.  */
   IFUNC_IMPL (i, name, mempcpy,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __mempcpy_power7)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, 1,
 			      __mempcpy_ppc))
@@ -249,7 +269,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c.  */
   IFUNC_IMPL (i, name, memchr,
 	      IFUNC_IMPL_ADD (array, i, memchr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memchr_power8)
 	      IFUNC_IMPL_ADD (array, i, memchr,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -260,7 +281,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/memrchr.c.  */
   IFUNC_IMPL (i, name, memrchr,
 	      IFUNC_IMPL_ADD (array, i, memrchr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __memrchr_power8)
 	      IFUNC_IMPL_ADD (array, i, memrchr,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -276,7 +298,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
                               && (hwcap & PPC_FEATURE_HAS_VSX),
                               __rawmemchr_power10)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __rawmemchr_power9)
 #endif
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
@@ -288,7 +311,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strnlen_power8)
 	      IFUNC_IMPL_ADD (array, i, strnlen, hwcap & PPC_FEATURE_ARCH_2_06,
 			      __strnlen_power7)
@@ -298,7 +322,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c.  */
   IFUNC_IMPL (i, name, strcasecmp,
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strcasecmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strcasecmp,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -316,7 +341,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncase.c.  */
   IFUNC_IMPL (i, name, strncasecmp,
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			       && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strncasecmp_power8)
 	      IFUNC_IMPL_ADD (array, i, strncasecmp,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -334,7 +360,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strrchr.c.  */
   IFUNC_IMPL (i, name, strrchr,
 	      IFUNC_IMPL_ADD (array, i, strrchr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strrchr_power8)
 	      IFUNC_IMPL_ADD (array, i, strrchr,
 			      hwcap & PPC_FEATURE_ARCH_2_06,
@@ -345,10 +372,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncat.c.  */
   IFUNC_IMPL (i, name, strncat,
 	      IFUNC_IMPL_ADD (array, i, strncat,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncat_power8)
 	      IFUNC_IMPL_ADD (array, i, strncat,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strncat_power7)
 	      IFUNC_IMPL_ADD (array, i, strncat, 1,
 			      __strncat_ppc))
@@ -391,7 +420,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, strcmp,
 #ifdef __LITTLE_ENDIAN__
 	      IFUNC_IMPL_ADD (array, i, strcmp,
 -			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strcmp_power9)
 #endif
 	      IFUNC_IMPL_ADD (array, i, strcmp,
@@ -406,10 +436,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcat.c.  */
   IFUNC_IMPL (i, name, strcat,
 	      IFUNC_IMPL_ADD (array, i, strcat,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power8)
 	      IFUNC_IMPL_ADD (array, i, strcat,
 -			      hwcap & PPC_FEATURE_HAS_VSX,
 +			      hwcap & PPC_FEATURE_ARCH_2_06
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __strcat_power7)
 	      IFUNC_IMPL_ADD (array, i, strcat, 1,
 			     __strcat_ppc))
@@ -417,7 +449,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strspn.c.  */
   IFUNC_IMPL (i, name, strspn,
              IFUNC_IMPL_ADD (array, i, strspn,
 -                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +                             hwcap2 & PPC_FEATURE2_ARCH_2_07
 +                             && hwcap & PPC_FEATURE_HAS_VSX,
                              __strspn_power8)
              IFUNC_IMPL_ADD (array, i, strspn, 1,
                              __strspn_ppc))
@@ -425,7 +458,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcspn.c.  */
   IFUNC_IMPL (i, name, strcspn,
              IFUNC_IMPL_ADD (array, i, strcspn,
 -                             hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +                             hwcap2 & PPC_FEATURE2_ARCH_2_07
 +                             && hwcap & PPC_FEATURE_HAS_VSX,
                              __strcspn_power8)
              IFUNC_IMPL_ADD (array, i, strcspn, 1,
                              __strcspn_ppc))
@@ -442,7 +476,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcasestr.c.  */
   IFUNC_IMPL (i, name, strcasestr,
 	      IFUNC_IMPL_ADD (array, i, strcasestr,
 -			      hwcap2 & PPC_FEATURE2_ARCH_2_07,
 +			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strcasestr_power8)
              IFUNC_IMPL_ADD (array, i, strcasestr, 1,
                              __strcasestr_ppc))
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
 index c24186689e..f40013e061 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -28,7 +28,8 @@ extern __typeof (__memchr) __memchr_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__memchr,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 	    ? __memchr_power8 :
 	    (hwcap & PPC_FEATURE_ARCH_2_06)
             ? __memchr_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
 index 99559bce26..89b56c103b 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect_memcmp, memcmp,
 				 && hwcap & PPC_FEATURE_HAS_VSX)
 				 ? __memcmp_power10 :
 #endif
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __memcmp_power8 :
 		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __memcmp_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
 index 53ab32ef26..684ee064f2 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -45,9 +45,12 @@ libc_ifunc (__libc_memcpy,
 	    (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __memcpy_power10 :
 # endif
 -	    ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC
 +	     && use_cached_memopt)
 	    ? __memcpy_power8_cached :
 -	      (hwcap & PPC_FEATURE_HAS_VSX)
 +	      (hwcap & PPC_FEATURE_ARCH_2_06
 +	       && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 	      ? __memcpy_power7 :
 		(hwcap & PPC_FEATURE_ARCH_2_06)
 		? __memcpy_a2 :
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 index 637b2cbf7f..50253b4554 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -41,7 +41,8 @@ libc_ifunc (__libc_memmove,
 	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __memmove_power10 :
 #endif
 -		     (hwcap & PPC_FEATURE_HAS_VSX)
 +		     (hwcap & PPC_FEATURE_ARCH_2_06
 +		      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		     ? __memmove_power7
 		     : __memmove_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
 index b37e0f35b5..563095a5ec 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/mempcpy.c
@@ -33,7 +33,8 @@ extern __typeof (__mempcpy) __mempcpy_power7 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect___mempcpy, __mempcpy,
 -		       (hwcap & PPC_FEATURE_HAS_VSX)
 +		       (hwcap & PPC_FEATURE_ARCH_2_06
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __mempcpy_power7
 		       : __mempcpy_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
 index 16bb6f0042..a8b985b06a 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memrchr.c
@@ -28,7 +28,8 @@ extern __typeof (__memrchr) __memrchr_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__memrchr,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 	    ? __memrchr_power8 :
 	      (hwcap & PPC_FEATURE_ARCH_2_06)
 	      ? __memrchr_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
 index 056e911699..a2bc223bcc 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -48,6 +48,7 @@ libc_ifunc (__libc_memset,
 	    ? __memset_power10 :
 # endif
             (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC
 	     && GLRO(dl_cache_line_size) == 128)
             ? __memset_power8 :
 	      (hwcap & PPC_FEATURE_ARCH_2_06
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 index b5d2d3a635..43eb459e02 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -38,7 +38,8 @@ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
 		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 		     && (hwcap & PPC_FEATURE_HAS_VSX)
 		     ? __rawmemchr_power10 :
 -		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			&& hwcap & PPC_FEATURE_HAS_VSX)
 		       ? __rawmemchr_power9 :
 # endif
 		         (hwcap & PPC_FEATURE_ARCH_2_06)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
 index d4eb4285fc..5be413405e 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/stpcpy.c
@@ -32,12 +32,15 @@ extern __typeof (__stpcpy) __stpcpy_power9 attribute_hidden;
 libc_ifunc_hidden (__stpcpy, __stpcpy,
 # ifdef __LITTLE_ENDIAN__
 -		   (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +		   (hwcap2 & PPC_FEATURE2_ARCH_3_00
 +		    && hwcap & PPC_FEATURE_HAS_VSX)
 		   ? __stpcpy_power9 :
 # endif
 -		     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		     (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +		      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		     ? __stpcpy_power8
 -		     : (hwcap & PPC_FEATURE_HAS_VSX)
 +		     : (hwcap & PPC_FEATURE_ARCH_2_06
 +		        && hwcap & PPC_FEATURE_HAS_VSX)
 		       ? __stpcpy_power7
 		       : __stpcpy_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
 index 55ca6c85c4..21ce2d279b 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcasecmp.c
@@ -27,7 +27,8 @@ extern __typeof (__strcasecmp) __strcasecmp_power7 attribute_hidden;
 extern __typeof (__strcasecmp) __strcasecmp_power8 attribute_hidden;
 libc_ifunc (__libc_strcasecmp,
 -	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	     (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
              ? __strcasecmp_power8:
 	     (hwcap & PPC_FEATURE_ARCH_2_06)
              ? __strcasecmp_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
 index 7e4bd3b5ac..5bb3016022 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcasestr.c
@@ -27,7 +27,8 @@ extern __typeof (__strcasestr) __strcasestr_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__strcasestr,
 -		(hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		(hwcap2 & PPC_FEATURE2_ARCH_2_07
 +		 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		? __strcasestr_power8
 		: __strcasestr_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcat.c b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
 index 6d342324c4..d8d9870824 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcat.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcat.c
@@ -28,9 +28,11 @@ extern __typeof (strcat) __strcat_power8 attribute_hidden;
 # undef strcat
 libc_ifunc_redirected (__redirect_strcat, strcat,
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_VSX)
 		       ? __strcat_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06
 +			  && hwcap & PPC_FEATURE_HAS_VSX)
 			 ? __strcat_power7
 			 : __strcat_ppc);
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchr.c b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
 index 27c794c6b7..62b202baf9 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strchr.c
@@ -33,7 +33,8 @@ extern __typeof (strchr) __strchr_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strchr, strchr,
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __strchr_power8 :
 		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __strchr_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
 index 4a07b4a242..40e529b9d9 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strchrnul.c
@@ -28,7 +28,8 @@ extern __typeof (__strchrnul) __strchrnul_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__strchrnul,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 	    ? __strchrnul_power8 :
 	    (hwcap & PPC_FEATURE_ARCH_2_06)
             ? __strchrnul_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
 index 4b0b25fff6..8132682a99 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -35,7 +35,8 @@ extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
 libc_ifunc_redirected (__redirect_strcmp, strcmp,
 # ifdef __LITTLE_ENDIAN__
 -			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +			(hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 			? __strcmp_power9 :
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
 index b733fa5a23..5af1d45cc1 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcpy.c
@@ -32,12 +32,15 @@ extern __typeof (strcpy) __strcpy_power9 attribute_hidden;
 libc_ifunc_redirected (__redirect_strcpy, strcpy,
 # ifdef __LITTLE_ENDIAN__
 -			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +			(hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			 && hwcap & PPC_FEATURE_HAS_VSX)
 			? __strcpy_power9 :
 # endif
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __strcpy_power8
 -		       : (hwcap & PPC_FEATURE_HAS_VSX)
 +		       : (hwcap & PPC_FEATURE_ARCH_2_06
 +		          && hwcap & PPC_FEATURE_HAS_VSX)
 			 ? __strcpy_power7
 			 : __strcpy_ppc);
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
 index 683aa104d7..8ba01c13b1 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strcspn.c
@@ -27,7 +27,8 @@ extern __typeof (strcspn) __strcspn_ppc attribute_hidden;
 extern __typeof (strcspn) __strcspn_power8 attribute_hidden;
 libc_ifunc (__libc_strcspn,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __strcspn_power8
 	    : __strcspn_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 index 0cd1c6faff..f1e28414e0 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -35,12 +35,15 @@ extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;
 libc_ifunc (__libc_strlen,
 # ifdef __LITTLE_ENDIAN__
 -	(hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +	(hwcap2 & PPC_FEATURE2_ARCH_3_1
 +	 && hwcap & PPC_FEATURE_HAS_VSX)
 	? __strlen_power10 :
 -	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +	  (hwcap2 & PPC_FEATURE2_ARCH_3_00
 +	   && hwcap & PPC_FEATURE_HAS_VSX)
 	  ? __strlen_power9 :
 # endif
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 	    ? __strlen_power8 :
 	      (hwcap & PPC_FEATURE_ARCH_2_06)
 	      ? __strlen_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncase.c b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
 index 644046bd74..2802cf2c3f 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncase.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncase.c
@@ -27,7 +27,8 @@ extern __typeof (__strncasecmp) __strncasecmp_power7 attribute_hidden;
 extern __typeof (__strncasecmp) __strncasecmp_power8 attribute_hidden;
 libc_ifunc (__libc_strncasecmp,
 -	     (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	     (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
              ? __strncasecmp_power8:
 	     (hwcap & PPC_FEATURE_ARCH_2_06)
              ? __strncasecmp_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncat.c b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
 index 0036fca91a..9ea294a72d 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncat.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncat.c
@@ -26,9 +26,11 @@ extern __typeof (strncat) __strncat_power7 attribute_hidden;
 extern __typeof (strncat) __strncat_power8 attribute_hidden;
 libc_ifunc (strncat,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __strncat_power8
 -	    : (hwcap & PPC_FEATURE_HAS_VSX)
 +	    : (hwcap & PPC_FEATURE_ARCH_2_06
 +	       && hwcap & PPC_FEATURE_HAS_VSX)
             ? __strncat_power7
             : __strncat_ppc);
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
 index 1f689e5c05..2d21122854 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -38,7 +38,8 @@ extern __typeof (strncmp) __strncmp_power9 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strncmp, strncmp,
 # ifdef __LITTLE_ENDIAN__
 -			(hwcap2 & PPC_FEATURE2_ARCH_3_00)
 +			(hwcap2 & PPC_FEATURE2_ARCH_3_00
 +			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 			? __strncmp_power9 :
 # endif
 		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
 index baf375a75a..e68e9d9f88 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strnlen.c
@@ -29,7 +29,8 @@ extern __typeof (__strnlen) __strnlen_power8 attribute_hidden;
 # undef strnlen
 # undef __strnlen
 libc_ifunc_redirected (__redirect___strnlen, __strnlen,
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __strnlen_power8 :
 			 (hwcap & PPC_FEATURE_ARCH_2_06)
 			 ? __strnlen_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
 index 1c9eea1817..7f0cf2a1b7 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strrchr.c
@@ -31,7 +31,8 @@ extern __typeof (strrchr) __strrchr_power8 attribute_hidden;
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strrchr, strrchr,
 -		       (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +		       (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +			&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
 		       ? __strrchr_power8 :
 		       (hwcap & PPC_FEATURE_ARCH_2_06)
 		       ? __strrchr_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strspn.c b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
 index 70167a176b..7613ab3d55 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strspn.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strspn.c
@@ -27,7 +27,8 @@ extern __typeof (strspn) __strspn_ppc attribute_hidden;
 extern __typeof (strspn) __strspn_power8 attribute_hidden;
 libc_ifunc (__libc_strspn,
 -	    (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __strspn_power8
 	    : __strspn_ppc);
--- a/SOURCES/glibc-RHEL-1017-4.patch
+++ b/SOURCES/glibc-RHEL-1017-4.patch
@ -0,0 +1,652 @@
 From 21841f0d562f0e944c4d267a28cc3ebd19c847e9 Mon Sep 17 00:00:00 2001
 From: Mahesh Bodapati <bmahi496@linux.ibm.com>
 Date: Tue, 1 Aug 2023 07:41:17 -0500
 Subject: PowerPC: Influence cpu/arch hwcap features via GLIBC_TUNABLES
 This patch enables the option to influence hwcaps used by PowerPC.
 The environment variable, GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,-zzz....,
 can be used to enable CPU/ARCH feature yyy, disable CPU/ARCH feature xxx
 and zzz, where the feature name is case-sensitive and has to match the ones
 mentioned in the file{sysdeps/powerpc/dl-procinfo.c}.
 Note that the hwcap tunables are only used in the IFUNC selection.
 Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 [rebased to c9s by DJ]
 diff -rupN a/manual/tunables.texi b/manual/tunables.texi
 --- a/manual/tunables.texi	2023-09-13 01:16:19.979884270 -0400
 +++ b/manual/tunables.texi	2023-09-13 01:17:19.217179994 -0400
@@ -476,7 +476,10 @@ On s390x, the supported HWCAP and STFLE
 @code{sysdeps/s390/cpu-features.c}.  In addition the user can also set
 a CPU arch-level like @code{z13} instead of single HWCAP and STFLE features.
 -This tunable is specific to i386, x86-64 and s390x.
 +On powerpc, the supported HWCAP and HWCAP2 features can be found in
 +@code{sysdeps/powerpc/dl-procinfo.c}.
 +
 +This tunable is specific to i386, x86-64, s390x and powerpc.
 @end deftp
 @deftp Tunable glibc.cpu.cached_memopt
 diff -rupN a/sysdeps/powerpc/cpu-features.c b/sysdeps/powerpc/cpu-features.c
 --- a/sysdeps/powerpc/cpu-features.c	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/cpu-features.c	1969-12-31 19:00:00.000000000 -0500
@@ -1,39 +0,0 @@
 -/* Initialize cpu feature data.  PowerPC version.
 -   Copyright (C) 2017-2021 Free Software Foundation, Inc.
 -   This file is part of the GNU C Library.
 -
 -   The GNU C Library is free software; you can redistribute it and/or
 -   modify it under the terms of the GNU Lesser General Public
 -   License as published by the Free Software Foundation; either
 -   version 2.1 of the License, or (at your option) any later version.
 -
 -   The GNU C Library is distributed in the hope that it will be useful,
 -   but WITHOUT ANY WARRANTY; without even the implied warranty of
 -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -   Lesser General Public License for more details.
 -
 -   You should have received a copy of the GNU Lesser General Public
 -   License along with the GNU C Library; if not, see
 -   <https://www.gnu.org/licenses/>.  */
 -
 -#include <stdint.h>
 -#include <cpu-features.h>
 -
 -#if HAVE_TUNABLES
 -# include <elf/dl-tunables.h>
 -#endif
 -
 -static inline void
 -init_cpu_features (struct cpu_features *cpu_features)
 -{
 -  /* Default is to use aligned memory access on optimized function unless
 -     tunables is enable, since for this case user can explicit disable
 -     unaligned optimizations.  */
 -#if HAVE_TUNABLES
 -  int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
 -					NULL);
 -  cpu_features->use_cached_memopt = (cached_memfunc > 0);
 -#else
 -  cpu_features->use_cached_memopt = false;
 -#endif
 -}
 diff -rupN a/sysdeps/powerpc/cpu-features.h b/sysdeps/powerpc/cpu-features.h
 --- a/sysdeps/powerpc/cpu-features.h	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/cpu-features.h	1969-12-31 19:00:00.000000000 -0500
@@ -1,28 +0,0 @@
 -/* Initialize cpu feature data.  PowerPC version.
 -   Copyright (C) 2017-2021 Free Software Foundation, Inc.
 -
 -   The GNU C Library is free software; you can redistribute it and/or
 -   modify it under the terms of the GNU Lesser General Public
 -   License as published by the Free Software Foundation; either
 -   version 2.1 of the License, or (at your option) any later version.
 -
 -   The GNU C Library is distributed in the hope that it will be useful,
 -   but WITHOUT ANY WARRANTY; without even the implied warranty of
 -   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 -   Lesser General Public License for more details.
 -
 -   You should have received a copy of the GNU Lesser General Public
 -   License along with the GNU C Library; if not, see
 -   <https://www.gnu.org/licenses/>.  */
 -
 -#ifndef __CPU_FEATURES_POWERPC_H
 -# define __CPU_FEATURES_POWERPC_H
 -
 -#include <stdbool.h>
 -
 -struct cpu_features
 -{
 -  bool use_cached_memopt;
 -};
 -
 -#endif /* __CPU_FEATURES_H  */
 diff -rupN a/sysdeps/powerpc/dl-tunables.list b/sysdeps/powerpc/dl-tunables.list
 --- a/sysdeps/powerpc/dl-tunables.list	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/dl-tunables.list	2023-09-13 01:17:19.226180343 -0400
@@ -24,5 +24,8 @@ glibc {
       maxval: 1
       default: 0
     }
 +    hwcaps {
 +      type: STRING
 +    }
   }
 }
 diff -rupN a/sysdeps/powerpc/hwcapinfo.c b/sysdeps/powerpc/hwcapinfo.c
 --- a/sysdeps/powerpc/hwcapinfo.c	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/hwcapinfo.c	2023-09-13 01:17:19.229180459 -0400
@@ -19,6 +19,7 @@
 #include <unistd.h>
 #include <shlib-compat.h>
 #include <dl-procinfo.h>
 +#include <cpu-features.c>
 uint64_t __tcb_hwcap __attribute__ ((visibility ("hidden")));
 uint32_t __tcb_platform __attribute__ ((visibility ("hidden")));
@@ -64,6 +65,9 @@ __tcb_parse_hwcap_and_convert_at_platfor
   else if (h1 & PPC_FEATURE_POWER5)
     h1 |= PPC_FEATURE_POWER4;
 +  uint64_t array_hwcaps[] = { h1, h2 };
 +  init_cpu_features (&GLRO(dl_powerpc_cpu_features), array_hwcaps);
 +
   /* Consolidate both HWCAP and HWCAP2 into a single doubleword so that
      we can read both in a single load later.  */
   __tcb_hwcap = h2;
 diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c
 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/ifunc-impl-list.c	2023-09-13 01:17:19.232180575 -0400
@@ -21,6 +21,7 @@
 #include <wchar.h>
 #include <ldsodefs.h>
 #include <ifunc-impl-list.h>
 +#include <cpu-features.h>
 /* Maximum number of IFUNC implementations.  */
 #define MAX_IFUNC	6
@@ -33,7 +34,8 @@ __libc_ifunc_impl_list (const char *name
   size_t i = 0;
 -  unsigned long int hwcap = GLRO(dl_hwcap);
 +  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
 +  unsigned long int hwcap = features->hwcap;
   /* hwcap contains only the latest supported ISA, the code checks which is
      and fills the previous supported ones.  */
   if (hwcap & PPC_FEATURE_ARCH_2_06)
 diff -rupN a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h
 --- a/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/powerpc/powerpc32/power4/multiarch/init-arch.h	2023-09-13 01:17:19.232180575 -0400
@@ -16,6 +16,7 @@
    <https://www.gnu.org/licenses/>.  */
 #include <ldsodefs.h>
 +#include <cpu-features.h>
 /* The code checks if _rtld_global_ro was realocated before trying to access
    the dl_hwcap field. The assembly is to make the compiler not optimize the
@@ -32,11 +33,12 @@
 # define __GLRO(value)  GLRO(value)
 #endif
 -/* dl_hwcap contains only the latest supported ISA, the macro checks which is
 -   and fills the previous ones.  */
 +/* Get the hardware information post the tunables set, the macro checks
 +   it and fills the previous ones.  */
 #define INIT_ARCH() \
 -  unsigned long int hwcap = __GLRO(dl_hwcap); 			\
 -  unsigned long int __attribute__((unused)) hwcap2 = __GLRO(dl_hwcap2); \
 +  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);	\
 +  unsigned long int hwcap = features->hwcap;				\
 +  unsigned long int __attribute__((unused)) hwcap2 = features->hwcap2; \
   bool __attribute__((unused)) use_cached_memopt =		\
     __GLRO(dl_powerpc_cpu_features.use_cached_memopt);		\
   if (hwcap & PPC_FEATURE_ARCH_2_06)				\
 diff -rupN a/sysdeps/powerpc/powerpc64/dl-machine.h b/sysdeps/powerpc/powerpc64/dl-machine.h
 --- a/sysdeps/powerpc/powerpc64/dl-machine.h	2023-09-13 01:16:17.582791395 -0400
 +++ b/sysdeps/powerpc/powerpc64/dl-machine.h	2023-09-13 01:17:19.236180730 -0400
@@ -27,7 +27,6 @@
 #include <dl-tls.h>
 #include <sysdep.h>
 #include <hwcapinfo.h>
 -#include <cpu-features.c>
 #include <dl-static-tls.h>
 #include <dl-funcdesc.h>
 #include <dl-machine-rel.h>
@@ -293,7 +292,6 @@ static inline void __attribute__ ((unuse
 dl_platform_init (void)
 {
   __tcb_parse_hwcap_and_convert_at_platform ();
 -  init_cpu_features (&GLRO(dl_powerpc_cpu_features));
 }
 #endif
 diff -rupN a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2023-09-13 01:16:20.219893569 -0400
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c	2023-09-13 01:19:17.169756083 -0400
@@ -17,6 +17,7 @@
    <https://www.gnu.org/licenses/>.  */
 #include <assert.h>
 +#include <cpu-features.h>
 #include <string.h>
 #include <wchar.h>
 #include <ldsodefs.h>
@@ -32,9 +33,9 @@ __libc_ifunc_impl_list (const char *name
   assert (max >= MAX_IFUNC);
   size_t i = 0;
 -
 -  unsigned long int hwcap = GLRO(dl_hwcap);
 -  unsigned long int hwcap2 = GLRO(dl_hwcap2);
 +  const struct cpu_features *features = &GLRO(dl_powerpc_cpu_features);
 +  unsigned long int hwcap = features->hwcap;
 +  unsigned long int hwcap2 = features->hwcap2;
 #ifdef SHARED
   int cacheline_size = GLRO(dl_cache_line_size);
 #endif
 diff -rupN a/sysdeps/unix/sysv/linux/powerpc/Makefile b/sysdeps/unix/sysv/linux/powerpc/Makefile
 --- a/sysdeps/unix/sysv/linux/powerpc/Makefile	2021-08-01 21:33:43.000000000 -0400
 +++ b/sysdeps/unix/sysv/linux/powerpc/Makefile	2023-09-13 01:17:19.243181002 -0400
@@ -21,7 +21,12 @@ ifeq ($(subdir),misc)
 sysdep_headers += bits/ppc.h
 sysdep_routines += get_timebase_freq
 tests-static += test-gettimebasefreq-static
 -tests += $(tests-static)
 -tests += test-gettimebasefreq
 -tests += test-powerpc-linux-sysconf
 +tests += \
 +  $(tests-static) \
 +  test-gettimebasefreq \
 +  test-powerpc-linux-sysconf \
 +  tst-hwcap-tunables \
 +  # tests
 +
 +tst-hwcap-tunables-ARGS = -- $(host-test-program-cmd)
 endif
 diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c
 --- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.c	1969-12-31 19:00:00.000000000 -0500
 +++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.c	2023-09-13 01:17:19.247181157 -0400
@@ -0,0 +1,124 @@
 +/* Initialize cpu feature data.  PowerPC version.
 +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <array_length.h>
 +#include <stdint.h>
 +#include <cpu-features.h>
 +#include <elf/dl-tunables.h>
 +#include <unistd.h>
 +#include <string.h>
 +
 +static void
 +TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 +{
 +  /* The current IFUNC selection is always using the most recent
 +     features which are available via AT_HWCAP or AT_HWCAP2.  But in
 +     some scenarios it is useful to adjust this selection.
 +
 +     The environment variable:
 +
 +     GLIBC_TUNABLES=glibc.cpu.hwcaps=-xxx,yyy,....
 +
 +     can be used to enable HWCAP/HWCAP2 feature yyy and disable HWCAP/HWCAP2
 +     feature xxx, where the feature name is case-sensitive and has to match
 +     one of the names listed in sysdeps/powerpc/dl-procinfo.c.  */
 +
 +  /* Copy the features from dl_powerpc_cpu_features, which contains the
 +     features provided by AT_HWCAP and AT_HWCAP2.  */
 +  struct cpu_features *cpu_features = &GLRO(dl_powerpc_cpu_features);
 +  unsigned long int tcbv_hwcap = cpu_features->hwcap;
 +  unsigned long int tcbv_hwcap2 = cpu_features->hwcap2;
 +  const char *token = valp->strval;
 +  do
 +    {
 +      const char *token_end, *feature;
 +      bool disable;
 +      size_t token_len, i, feature_len, offset = 0;
 +      /* Find token separator or end of string.  */
 +      for (token_end = token; *token_end != ','; token_end++)
 +	if (*token_end == '\0')
 +	  break;
 +
 +      /* Determine feature.  */
 +      token_len = token_end - token;
 +      if (*token == '-')
 +	{
 +	  disable = true;
 +	  feature = token + 1;
 +	  feature_len = token_len - 1;
 +	}
 +      else
 +	{
 +	  disable = false;
 +	  feature = token;
 +	  feature_len = token_len;
 +	}
 +      for (i = 0; i < array_length (hwcap_tunables); ++i)
 +	{
 +	  const char *hwcap_name = hwcap_names + offset;
 +	  size_t hwcap_name_len = strlen (hwcap_name);
 +	  /* Check the tunable name on the supported list.  */
 +	  if (hwcap_name_len == feature_len
 +	      && memcmp (feature, hwcap_name, feature_len) == 0)
 +	    {
 +	      /* Update the hwcap and hwcap2 bits.  */
 +	      if (disable)
 +		{
 +		  /* Id is 1 for hwcap2 tunable.  */
 +		  if (hwcap_tunables[i].id)
 +		    cpu_features->hwcap2 &= ~(hwcap_tunables[i].mask);
 +		  else
 +		    cpu_features->hwcap &= ~(hwcap_tunables[i].mask);
 +		}
 +	      else
 +		{
 +		  /* Enable the features and also check that no unsupported
 +		     features were enabled by user.  */
 +		  if (hwcap_tunables[i].id)
 +		    cpu_features->hwcap2 |= (tcbv_hwcap2 & hwcap_tunables[i].mask);
 +		  else
 +		    cpu_features->hwcap |= (tcbv_hwcap & hwcap_tunables[i].mask);
 +		}
 +	      break;
 +	    }
 +	  offset += hwcap_name_len + 1;
 +	}
 +	token += token_len;
 +	/* ... and skip token separator for next round.  */
 +	if (*token == ',')
 +	  token++;
 +    }
 +  while (*token != '\0');
 +}
 +
 +static inline void
 +init_cpu_features (struct cpu_features *cpu_features, uint64_t hwcaps[])
 +{
 +  /* Fill the cpu_features with the supported hwcaps
 +     which are set by __tcb_parse_hwcap_and_convert_at_platform.  */
 +  cpu_features->hwcap = hwcaps[0];
 +  cpu_features->hwcap2 = hwcaps[1];
 +  /* The default is to use aligned memory accesses in optimized functions
 +     unless tunables are enabled, since in that case the user can
 +     explicitly disable unaligned optimizations.  */
 +  int32_t cached_memfunc = TUNABLE_GET (glibc, cpu, cached_memopt, int32_t,
 +					NULL);
 +  cpu_features->use_cached_memopt = (cached_memfunc > 0);
 +  TUNABLE_GET (glibc, cpu, hwcaps, tunable_val_t *,
 +	       TUNABLE_CALLBACK (set_hwcaps));
 +}
 diff -rupN a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h
 --- a/sysdeps/unix/sysv/linux/powerpc/cpu-features.h	1969-12-31 19:00:00.000000000 -0500
 +++ b/sysdeps/unix/sysv/linux/powerpc/cpu-features.h	2023-09-13 01:17:19.251181312 -0400
@@ -0,0 +1,130 @@
 +/* Initialize cpu feature data.  PowerPC version.
 +   Copyright (C) 2017-2023 Free Software Foundation, Inc.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#ifndef __CPU_FEATURES_POWERPC_H
 +# define __CPU_FEATURES_POWERPC_H
 +
 +#include <stdbool.h>
 +#include <sys/auxv.h>
 +
 +struct cpu_features
 +{
 +  bool use_cached_memopt;
 +  unsigned long int hwcap;
 +  unsigned long int hwcap2;
 +};
 +
 +static const char hwcap_names[] = {
 +  "4xxmac\0"
 +  "altivec\0"
 +  "arch_2_05\0"
 +  "arch_2_06\0"
 +  "archpmu\0"
 +  "booke\0"
 +  "cellbe\0"
 +  "dfp\0"
 +  "efpdouble\0"
 +  "efpsingle\0"
 +  "fpu\0"
 +  "ic_snoop\0"
 +  "mmu\0"
 +  "notb\0"
 +  "pa6t\0"
 +  "power4\0"
 +  "power5\0"
 +  "power5+\0"
 +  "power6x\0"
 +  "ppc32\0"
 +  "ppc601\0"
 +  "ppc64\0"
 +  "ppcle\0"
 +  "smt\0"
 +  "spe\0"
 +  "true_le\0"
 +  "ucache\0"
 +  "vsx\0"
 +  "arch_2_07\0"
 +  "dscr\0"
 +  "ebb\0"
 +  "htm\0"
 +  "htm-nosc\0"
 +  "htm-no-suspend\0"
 +  "isel\0"
 +  "tar\0"
 +  "vcrypto\0"
 +  "arch_3_00\0"
 +  "ieee128\0"
 +  "darn\0"
 +  "scv\0"
 +  "arch_3_1\0"
 +  "mma\0"
 +};
 +
 +static const struct
 +{
 +  unsigned long int mask;  /* As wide as hwcap, so ~mask keeps high bits.  */
 +  bool id;                 /* 0: AT_HWCAP bit, 1: AT_HWCAP2 bit.  */
 +} hwcap_tunables[] = {
 +   /* AT_HWCAP tunable masks, in hwcap_names order.  */
 +   { PPC_FEATURE_HAS_4xxMAC,                 0 },
 +   { PPC_FEATURE_HAS_ALTIVEC,                0 },
 +   { PPC_FEATURE_ARCH_2_05,                  0 },
 +   { PPC_FEATURE_ARCH_2_06,                  0 },
 +   { PPC_FEATURE_PSERIES_PERFMON_COMPAT,     0 },
 +   { PPC_FEATURE_BOOKE,                      0 },
 +   { PPC_FEATURE_CELL_BE,                    0 },
 +   { PPC_FEATURE_HAS_DFP,                    0 },
 +   { PPC_FEATURE_HAS_EFP_DOUBLE,             0 },
 +   { PPC_FEATURE_HAS_EFP_SINGLE,             0 },
 +   { PPC_FEATURE_HAS_FPU,                    0 },
 +   { PPC_FEATURE_ICACHE_SNOOP,               0 },
 +   { PPC_FEATURE_HAS_MMU,                    0 },
 +   { PPC_FEATURE_NO_TB,                      0 },
 +   { PPC_FEATURE_PA6T,                       0 },
 +   { PPC_FEATURE_POWER4,                     0 },
 +   { PPC_FEATURE_POWER5,                     0 },
 +   { PPC_FEATURE_POWER5_PLUS,                0 },
 +   { PPC_FEATURE_POWER6_EXT,                 0 },
 +   { PPC_FEATURE_32,                         0 },
 +   { PPC_FEATURE_601_INSTR,                  0 },
 +   { PPC_FEATURE_64,                         0 },
 +   { PPC_FEATURE_PPC_LE,                     0 },
 +   { PPC_FEATURE_SMT,                        0 },
 +   { PPC_FEATURE_HAS_SPE,                    0 },
 +   { PPC_FEATURE_TRUE_LE,                    0 },
 +   { PPC_FEATURE_UNIFIED_CACHE,              0 },
 +   { PPC_FEATURE_HAS_VSX,                    0 },
 +
 +   /* AT_HWCAP2 tunable masks.  */
 +   { PPC_FEATURE2_ARCH_2_07,                 1 },
 +   { PPC_FEATURE2_HAS_DSCR,                  1 },
 +   { PPC_FEATURE2_HAS_EBB,                   1 },
 +   { PPC_FEATURE2_HAS_HTM,                   1 },
 +   { PPC_FEATURE2_HTM_NOSC,                  1 },
 +   { PPC_FEATURE2_HTM_NO_SUSPEND,            1 },
 +   { PPC_FEATURE2_HAS_ISEL,                  1 },
 +   { PPC_FEATURE2_HAS_TAR,                   1 },
 +   { PPC_FEATURE2_HAS_VEC_CRYPTO,            1 },
 +   { PPC_FEATURE2_ARCH_3_00,                 1 },
 +   { PPC_FEATURE2_HAS_IEEE128,               1 },
 +   { PPC_FEATURE2_DARN,                      1 },
 +   { PPC_FEATURE2_SCV,                       1 },
 +   { PPC_FEATURE2_ARCH_3_1,                  1 },
 +   { PPC_FEATURE2_MMA,                       1 },
 +};
 +
 +#endif /* __CPU_FEATURES_POWERPC_H  */
 diff -rupN a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list
 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list	2023-09-13 01:16:19.989884657 -0400
 +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/le/dl-tunables.list	2023-09-13 01:17:19.254181428 -0400
@@ -28,3 +28,4 @@
 @order glibc.malloc.check
 @order glibc.gmon.minarcs
 @order glibc.gmon.maxarcs
 +@order glibc.cpu.hwcaps
 diff -rupN a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c
 --- a/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c	1969-12-31 19:00:00.000000000 -0500
 +++ b/sysdeps/unix/sysv/linux/powerpc/tst-hwcap-tunables.c	2023-09-13 01:17:19.258181583 -0400
@@ -0,0 +1,128 @@
 +/* Tests for powerpc GLIBC_TUNABLES=glibc.cpu.hwcaps filter.
 +   Copyright (C) 2023 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#include <array_length.h>
 +#include <getopt.h>
 +#include <ifunc-impl-list.h>
 +#include <spawn.h>
 +#include <stdio.h>
 +#include <stdlib.h>
 +#include <string.h>
 +#include <support/check.h>
 +#include <support/support.h>
 +#include <support/xunistd.h>
 +#include <sys/auxv.h>
 +#include <sys/wait.h>
 +
 +/* Nonzero if the program gets called via `exec'.  */
 +#define CMDLINE_OPTIONS \
 +  { "restart", no_argument, &restart, 1 },
 +static int restart;
 +
 +/* Holds up to four initial arguments used to respawn the process, plus the
 +   extra '--direct', '--restart', the function to check, and a final NULL.  */
 +static char *spargs[8];
 +static int fc;
 +
 +/* Called on process re-execution.  */
 +_Noreturn static void
 +handle_restart (int argc, char *argv[])
 +{
 +  TEST_VERIFY_EXIT (argc == 1);
 +  const char *funcname = argv[0];
 +
 +  struct libc_ifunc_impl impls[32];
 +  int cnt = __libc_ifunc_impl_list ("memcpy", impls, array_length (impls));
 +  if (cnt == 0)
 +    _exit (EXIT_SUCCESS);
 +  TEST_VERIFY_EXIT (cnt >= 1);
 +  for (int i = 0; i < cnt; i++) {
 +    if (strcmp (impls[i].name, funcname) == 0)
 +      {
 +	TEST_COMPARE (impls[i].usable, false);
 +	break;
 +      }
 +  }
 +
 +  _exit (EXIT_SUCCESS);
 +}
 +
 +static void
 +run_test (const char *filter, const char *funcname)
 +{
 +  printf ("info: checking filter %s (expect %s ifunc selection to be removed)\n",
 +	  filter, funcname);
 +  char *tunable = xasprintf ("GLIBC_TUNABLES=glibc.cpu.hwcaps=%s", filter);
 +  char *const newenvs[] = { (char*) tunable, NULL };
 +  spargs[fc] = (char *) funcname;
 +
 +  pid_t pid;
 +  TEST_COMPARE (posix_spawn (&pid, spargs[0], NULL, NULL, spargs, newenvs), 0);
 +  int status;
 +  TEST_COMPARE (xwaitpid (pid, &status, 0), pid);
 +  TEST_VERIFY (WIFEXITED (status));
 +  TEST_VERIFY (!WIFSIGNALED (status));
 +  TEST_COMPARE (WEXITSTATUS (status), 0);
 +
 +  free (tunable);
 +}
 +
 +static int
 +do_test (int argc, char *argv[])
 +{
 +  if (restart)
 +    handle_restart (argc - 1, &argv[1]);
 +
 +  TEST_VERIFY_EXIT (argc == 2 || argc == 5);
 +
 +  int i;
 +  for (i = 0; i < argc - 1; i++)
 +    spargs[i] = argv[i + 1];
 +  spargs[i++] = (char *) "--direct";
 +  spargs[i++] = (char *) "--restart";
 +  fc = i++;
 +  spargs[i] = NULL;
 +
 +  unsigned long int hwcap = getauxval (AT_HWCAP);
 +  unsigned long int hwcap2 = getauxval (AT_HWCAP2);
 +  if (__WORDSIZE == 64)
 +    {
 +      if (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +	run_test ("-arch_3_1", "__memcpy_power10");
 +      if (hwcap2 & PPC_FEATURE2_ARCH_2_07)
 +	run_test ("-arch_2_07", "__memcpy_power8_cached");
 +      if (hwcap & PPC_FEATURE_ARCH_2_06)
 +	run_test ("-arch_2_06", "__memcpy_power7");
 +      if (hwcap & PPC_FEATURE_ARCH_2_05)
 +	run_test ("-arch_2_06,-arch_2_05","__memcpy_power6");
 +      run_test ("-arch_2_06,-arch_2_05,-power5+,-power5,-power4", "__memcpy_power4");
 +    }
 +  else
 +    {
 +      if (hwcap & PPC_FEATURE_HAS_VSX)
 +	run_test ("-vsx", "__memcpy_power7");
 +      if (hwcap & PPC_FEATURE_ARCH_2_06)
 +	run_test ("-arch_2_06", "__memcpy_a2");
 +      if (hwcap & PPC_FEATURE_ARCH_2_05)
 +	run_test ("-arch_2_05", "__memcpy_power6");
 +    }
 +  return 0;
 +}
 +
 +#define TEST_FUNCTION_ARGV do_test
 +#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-1191.patch
+++ b/SOURCES/glibc-RHEL-1191.patch
@ -0,0 +1,69 @@
 commit 1493622f4f9048ffede3fbedb64695efa49d662a
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Mon Aug 28 12:08:14 2023 -0700
    x86: Check the lower byte of EAX of CPUID leaf 2 [BZ #30643]
    The old Intel software developer manual specified that the low byte of
    EAX of CPUID leaf 2 returned 1, which indicated the number of rounds of
    CPUID leaf 2 needed to retrieve the complete cache information.  The
    newer Intel manual has been changed so that it should always return 1
    and be ignored.  If the lower byte isn't 1, CPUID leaf 2 can't be used.
    In this case, we ignore CPUID leaf 2 and use CPUID leaf 4 instead.  If
    CPUID leaf 4 doesn't contain the cache information, cache information
    isn't available at all.  This addresses BZ #30643.
 diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
 index f950e488cfbe42dd..bd2f2b65f78056ca 100644
 --- a/sysdeps/x86/dl-cacheinfo.h
 +++ b/sysdeps/x86/dl-cacheinfo.h
@@ -187,7 +187,7 @@ intel_check_word (int name, unsigned int value, bool *has_level_2,
 	      ++round;
 	    }
 	  /* There is no other cache information anywhere else.  */
 -	  break;
 +	  return -1;
 	}
       else
 	{
@@ -257,28 +257,23 @@ handle_intel (int name, const struct cpu_features *cpu_features)
   /* OK, we can use the CPUID instruction to get all info about the
      caches.  */
 -  unsigned int cnt = 0;
 -  unsigned int max = 1;
   long int result = 0;
   bool no_level_2_or_3 = false;
   bool has_level_2 = false;
 +  unsigned int eax;
 +  unsigned int ebx;
 +  unsigned int ecx;
 +  unsigned int edx;
 +  __cpuid (2, eax, ebx, ecx, edx);
 -  while (cnt++ < max)
 +  /* The low byte of EAX of CPUID leaf 2 should always return 1 and it
 +     should be ignored.  If it isn't 1, use CPUID leaf 4 instead.  */
 +  if ((eax & 0xff) != 1)
 +    return intel_check_word (name, 0xff, &has_level_2, &no_level_2_or_3,
 +			     cpu_features);
 +  else
     {
 -      unsigned int eax;
 -      unsigned int ebx;
 -      unsigned int ecx;
 -      unsigned int edx;
 -      __cpuid (2, eax, ebx, ecx, edx);
 -
 -      /* The low byte of EAX in the first round contain the number of
 -	 rounds we have to make.  At least one, the one we are already
 -	 doing.  */
 -      if (cnt == 1)
 -	{
 -	  max = eax & 0xff;
 -	  eax &= 0xffffff00;
 -	}
 +      eax &= 0xffffff00;
       /* Process the individual registers' value.  */
       result = intel_check_word (name, eax, &has_level_2,
--- a/SOURCES/glibc-RHEL-14383-1.patch
+++ b/SOURCES/glibc-RHEL-14383-1.patch
@ -14,10 +14,10 @@ Conflicts:
 	  (missing alloca removal downstream)
 diff --git a/elf/ldconfig.c b/elf/ldconfig.c
-index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
+index be47ad8c2d7f89f3..f0c811001965cc46 100644
 --- a/elf/ldconfig.c
 +++ b/elf/ldconfig.c
-@@ -771,6 +771,31 @@ struct dlib_entry
+@@ -778,6 +778,31 @@ struct dlib_entry
   struct dlib_entry *next;
 };
@ -49,7 +49,7 @@ index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
 static void
 search_dir (const struct dir_entry *entry)
-@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
+@@ -854,18 +879,8 @@ search_dir (const struct dir_entry *entry)
 	continue;
       size_t len = strlen (direntry->d_name);
--- a/SOURCES/glibc-RHEL-14383-2.patch
+++ b/SOURCES/glibc-RHEL-14383-2.patch
@ -14,10 +14,10 @@ Date:   Mon Oct 23 12:53:16 2023 +0200
    temporary files created by package managers").
 diff --git a/elf/ldconfig.c b/elf/ldconfig.c
-index 51de08f91fbaf093..fb19dd68d41c07a4 100644
+index f0c811001965cc46..4a96c409994d96c8 100644
 --- a/elf/ldconfig.c
 +++ b/elf/ldconfig.c
-@@ -771,6 +771,17 @@ struct dlib_entry
+@@ -778,6 +778,17 @@ struct dlib_entry
   struct dlib_entry *next;
 };
@ -35,7 +35,7 @@ index 51de08f91fbaf093..fb19dd68d41c07a4 100644
 /* Skip some temporary DSO files.  These files may be partially written
    and lead to ldconfig crashes when examined.  */
 static bool
-@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
+@@ -787,8 +798,7 @@ skip_dso_based_on_name (const char *name, size_t len)
      names like these are never really DSOs we want to look at.  */
   if (len >= sizeof (".#prelink#") - 1)
     {
@ -45,7 +45,7 @@ index 51de08f91fbaf093..fb19dd68d41c07a4 100644
 	return true;
       if (len >= sizeof (".#prelink#.XXXXXX") - 1
 	  && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
-@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
+@@ -796,10 +806,11 @@ skip_dso_based_on_name (const char *name, size_t len)
 	return true;
     }
   /* Skip temporary files created by RPM.  */
--- a/SOURCES/glibc-RHEL-15343-1.patch
+++ b/SOURCES/glibc-RHEL-15343-1.patch
@ -0,0 +1,26 @@
 commit 1626d8a521c7c771d4118b1328421fea113cab64
 Author: Joe Simmons-Talbott <josimmon@redhat.com>
 Date:   Fri Apr 21 09:24:22 2023 -0400
    string: Allow use of test-string.h for non-ifunc implementations.
    Mark two variables as unused to silence warning when using
    test-string.h for non-ifunc implementations.
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
 diff --git a/string/test-string.h b/string/test-string.h
 index 41de973479..8bcb8afd0a 100644
 --- a/string/test-string.h
 +++ b/string/test-string.h
@@ -130,8 +130,8 @@ cmdline_process_function (int c)
 /* Increase size of FUNC_LIST if assert is triggered at run-time.  */
 static struct libc_ifunc_impl func_list[32];
 static int func_count;
 -static int impl_count = -1;
 -static impl_t *impl_array;
 +static int impl_count __attribute__ ((unused)) = -1;
 +static impl_t *impl_array __attribute__ ((unused));
 # define FOR_EACH_IMPL(impl, notall) \
   impl_t *impl;								\
--- a/SOURCES/glibc-RHEL-15343-2.patch
+++ b/SOURCES/glibc-RHEL-15343-2.patch
@ -0,0 +1,233 @@
 commit eaaad78db41724e5a18a42becb238bfc4e683998
 Author: Joe Simmons-Talbott <josimmon@redhat.com>
 Date:   Fri Apr 21 09:24:23 2023 -0400
    string: Add tests for strdup (BZ #30266)
    Copy strcpy tests for strdup.  Covers some basic testcases with random
    strings.  Add a zero-length string testcase.
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
    Conflicts:
        string/Makefile
          (different test backport order)
 diff -Nrup a/string/Makefile b/string/Makefile
 --- a/string/Makefile	2023-11-30 10:59:16.400251685 -0500
 +++ b/string/Makefile	2023-11-30 11:16:42.829613344 -0500
@@ -63,7 +63,8 @@ tests		:= tester inl-tester noinl-tester
 		   tst-strtok_r bug-strcoll2 tst-cmp tst-xbzero-opt	\
 		   test-endian-types test-endian-file-scope		\
 		   test-endian-sign-conversion tst-memmove-overflow	\
 -		   test-sig_np tst-strerror-fail
 +		   test-sig_np tst-strerror-fail			\
 +		   test-strdup
 # Both tests require the .mo translation files generated by msgfmt.
 tests-translation := tst-strsignal					\
 diff -Nrup a/string/test-strdup.c b/string/test-strdup.c
 --- a/string/test-strdup.c	1969-12-31 19:00:00.000000000 -0500
 +++ b/string/test-strdup.c	2023-11-30 11:11:32.850447614 -0500
@@ -0,0 +1,201 @@
 +/* Test and measure strdup functions.
 +   Copyright (C) 2023 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <support/check.h>
 +
 +#ifdef WIDE
 +# include <wchar.h>
 +# define CHAR wchar_t
 +# define sfmt "ls"
 +# define BIG_CHAR WCHAR_MAX
 +# define SMALL_CHAR 1273
 +# define STRCMP wcscmp
 +# define MEMCMP wmemcmp
 +# define MEMSET wmemset
 +# define TCS TEST_COMPARE_STRING_WIDE
 +#else
 +# define CHAR char
 +# define sfmt "s"
 +# define BIG_CHAR CHAR_MAX
 +# define SMALL_CHAR 127
 +# define STRCMP strcmp
 +# define MEMCMP memcmp
 +# define MEMSET memset
 +# define TCS TEST_COMPARE_STRING
 +#endif
 +
 +#ifndef STRDUP_RESULT
 +# define STRDUP_RESULT(dst, len) dst
 +# define TEST_MAIN
 +# ifndef WIDE
 +#  define TEST_NAME "strdup"
 +# else
 +#  define TEST_NAME "wcsdup"
 +# endif
 +# include "test-string.h"
 +# ifndef WIDE
 +#  define STRDUP strdup
 +# else
 +#  define STRDUP wcsdup
 +# endif
 +#endif
 +
 +typedef CHAR *(*proto_t) (const CHAR *);
 +
 +static void
 +do_zero_len_test (void)
 +{
 +  CHAR src[1] = { '\0' };
 +  CHAR *dst = STRDUP (src);
 +
 +  TCS (dst, src);
 +  free (dst);
 +}
 +
 +static void
 +do_one_test (const CHAR *src,
 +	     size_t len __attribute__((unused)))
 +{
 +  CHAR *dst = STRDUP (src);
 +
 +  if (STRCMP (dst, src) != 0)
 +    {
 +      error (0, 0,
 +	     "Wrong result in function %s dst \"%" sfmt "\" src \"%" sfmt "\"",
 +	     TEST_NAME, dst, src);
 +      ret = 1;
 +      free (dst);
 +      return;
 +    }
 +  free (dst);
 +}
 +
 +static void
 +do_test (size_t align1, size_t align2, size_t len, int max_char)
 +{
 +  size_t i;
 +  CHAR *s1;
 +/* For wcsdup: align1 and align2 here mean alignment not in bytes but in
 +   wchar_ts; in bytes it equals align * (sizeof (wchar_t)).  Likewise, len
 +   for wcsdup isn't in bytes but is the number of wchar_t symbols.  */
 +  align1 &= 7;
 +  if ((align1 + len) * sizeof (CHAR) >= page_size)
 +    return;
 +
 +  align2 &= 7;
 +  if ((align2 + len) * sizeof (CHAR) >= page_size)
 +    return;
 +
 +  s1 = (CHAR *) (buf1) + align1;
 +
 +  for (i = 0; i < len; i++)
 +    s1[i] = 32 + 23 * i % (max_char - 32);
 +  s1[len] = 0;
 +
 +  do_one_test (s1, len);
 +}
 +
 +static void
 +do_random_tests (void)
 +{
 +  size_t i, j, n, align1, align2, len;
 +  CHAR *p1 = (CHAR *)(buf1 + page_size) - 512;
 +  CHAR *res;
 +
 +  for (n = 0; n < ITERATIONS; n++)
 +    {
 +      /* align1 and align2 are expressed in wchar_t units and not in bytes
 +     for the wide char test, so in bytes they equal align times wchar_t size.
 +
 +     For the non-wide version we need to check all alignments from 0 to 63,
 +     since some assembly implementations have a separate prolog for alignments
 +     greater than 48.  */
 +
 +      align1 = random () & (63 / sizeof (CHAR));
 +      if (random () & 1)
 +	align2 = random () & (63 / sizeof (CHAR));
 +      else
 +	align2 = align1 + (random () & 24);
 +      len = random () & 511;
 +      j = align1;
 +      if (align2 > j)
 +	j = align2;
 +      if (len + j >= 511)
 +	len = 510 - j - (random () & 7);
 +      j = len + align1 + 64;
 +      if (j > 512)
 +	j = 512;
 +      for (i = 0; i < j; i++)
 +	{
 +	  if (i == len + align1)
 +	    p1[i] = 0;
 +	  else
 +	    {
 +	      p1[i] = random () & BIG_CHAR;
 +	      if (i >= align1 && i < len + align1 && !p1[i])
 +		p1[i] = (random () & SMALL_CHAR) + 3;
 +	    }
 +	}
 +
 +      res =  STRDUP(p1 + align1);
 +      TCS (res, (p1 + align1));
 +      free (res);
 +    }
 +}
 +
 +
 +int
 +test_main (void)
 +{
 +  size_t i;
 +
 +  test_init ();
 +
 +  printf ("%23s", "");
 +  printf ("\t%s", TEST_NAME);
 +  putchar ('\n');
 +
 +  for (i = 0; i < 16; ++i)
 +    {
 +      do_test (0, 0, i, SMALL_CHAR);
 +      do_test (0, 0, i, BIG_CHAR);
 +      do_test (0, i, i, SMALL_CHAR);
 +      do_test (i, 0, i, BIG_CHAR);
 +    }
 +
 +  for (i = 1; i < 8; ++i)
 +    {
 +      do_test (0, 0, 8 << i, SMALL_CHAR);
 +      do_test (8 - i, 2 * i, 8 << i, SMALL_CHAR);
 +    }
 +
 +  for (i = 1; i < 8; ++i)
 +    {
 +      do_test (i, 2 * i, 8 << i, SMALL_CHAR);
 +      do_test (2 * i, i, 8 << i, BIG_CHAR);
 +      do_test (i, i, 8 << i, SMALL_CHAR);
 +      do_test (i, i, 8 << i, BIG_CHAR);
 +    }
 +
 +  do_zero_len_test ();
 +  do_random_tests ();
 +
 +  return ret;
 +}
 +
 +#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-15343-3.patch
+++ b/SOURCES/glibc-RHEL-15343-3.patch
@ -0,0 +1,232 @@
 commit 0c48aa0551151ea201f7f528492e89a0b08a6890
 Author: Joe Simmons-Talbott <josimmon@redhat.com>
 Date:   Fri Apr 21 09:24:24 2023 -0400
    string: Add tests for strndup (BZ #30266)
    Copy strncpy tests for strndup.  Covers some basic testcases with random
    strings.  Remove tests that set the destination's bytes and checked the
    resulting buffer's bytes.  Remove wide character test support since
    wcsndup() doesn't exist.
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
    Conflicts:
 	string/Makefile
 	  (different test backport order)
 diff -Nrup a/string/Makefile b/string/Makefile
 --- a/string/Makefile	2023-11-30 11:55:02.263010916 -0500
 +++ b/string/Makefile	2023-11-30 11:58:29.238954539 -0500
@@ -64,7 +64,7 @@ tests		:= tester inl-tester noinl-tester
 		   test-endian-types test-endian-file-scope		\
 		   test-endian-sign-conversion tst-memmove-overflow	\
 		   test-sig_np tst-strerror-fail			\
 -		   test-strdup
 +		   test-strdup test-strndup
 # Both tests require the .mo translation files generated by msgfmt.
 tests-translation := tst-strsignal					\
 diff -Nrup a/string/test-strndup.c b/string/test-strndup.c
 --- a/string/test-strndup.c	1969-12-31 19:00:00.000000000 -0500
 +++ b/string/test-strndup.c	2023-11-30 11:56:24.986388053 -0500
@@ -0,0 +1,200 @@
 +/* Test strndup functions.
 +   Copyright (C) 2023 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <support/check.h>
 +
 +#define TEST_MAIN
 +#include "test-string.h"
 +
 +static void
 +do_one_test (const char *src, size_t len, size_t n)
 +{
 +  char *dst = strndup (src, n);
 +  size_t s = (len > n ? n: len) * sizeof (char);
 +
 +  TEST_COMPARE_BLOB (dst, s, src, s);
 +
 +  free (dst);
 +}
 +
 +static void
 +do_test (size_t align1, size_t align2, size_t len, size_t n, int max_char)
 +{
 +  size_t i;
 +  char *s1;
 +
 +  align1 &= 7;
 +  if ((align1 + len) * sizeof (char) >= page_size)
 +    return;
 +
 +  align2 &= 7;
 +  if ((align2 + len) * sizeof (char) >= page_size)
 +    return;
 +
 +  s1 = (char *) (buf1) + align1;
 +
 +  for (i = 0; i < len; ++i)
 +    s1[i] = 32 + 23 * i % (max_char - 32);
 +  s1[len] = 0;
 +  for (i = len + 1; (i + align1) * sizeof (char) < page_size && i < len + 64;
 +       ++i)
 +    s1[i] = 32 + 32 * i % (max_char - 32);
 +
 +  do_one_test (s1, len, n);
 +}
 +
 +static void
 +do_page_tests (void)
 +{
 +  char *s1;
 +  const size_t maxoffset = 64;
 +
 +  /* Put s1 at the maxoffset from the edge of buf1's last page.  */
 +  s1 = (char *) buf1 + BUF1PAGES * page_size / sizeof (char) - maxoffset;
 +
 +  memset (s1, 'a', maxoffset - 1);
 +  s1[maxoffset - 1] = '\0';
 +
 +  /* Both strings are bounded to a page with read/write access and the next
 +     page is protected with PROT_NONE (meaning that any access outside of the
 +     page regions will trigger an invalid memory access).
 +
 +     The loop copies the string s1 for all possible offsets up to maxoffset
 +     for both inputs with a size larger than s1 (so memory access outside the
 +     expected memory regions might trigger invalid access).  */
 +
 +  for (size_t off1 = 0; off1 < maxoffset; off1++)
 +    for (size_t off2 = 0; off2 < maxoffset; off2++)
 +      do_one_test (s1 + off1, maxoffset - off1 - 1,
 +		   maxoffset + (maxoffset - off2));
 +}
 +
 +static void
 +do_random_tests (void)
 +{
 +  size_t i, j, n, align1, align2, len, size, mode;
 +  char *p1 = (char *) (buf1 + page_size) - 512;
 +  char *res;
 +
 +  for (n = 0; n < ITERATIONS; n++)
 +    {
 +      mode = random ();
 +      if (mode & 1)
 +	{
 +	  size = random () & 255;
 +	  align1 = 512 - size - (random () & 15);
 +	  if (mode & 2)
 +	    align2 = align1 - (random () & 24);
 +	  else
 +	    align2 = align1 - (random () & 31);
 +	  if (mode & 4)
 +	    {
 +	      j = align1;
 +	      align1 = align2;
 +	      align2 = j;
 +	    }
 +	  if (mode & 8)
 +	    len = size - (random () & 31);
 +	  else
 +	    len = 512;
 +	  if (len >= 512)
 +	    len = random () & 511;
 +	}
 +      else
 +	{
 +	  align1 = random () & 31;
 +	  if (mode & 2)
 +	    align2 = random () & 31;
 +	  else
 +	    align2 = align1 + (random () & 24);
 +	  len = random () & 511;
 +	  j = align1;
 +	  if (align2 > j)
 +	    j = align2;
 +	  if (mode & 4)
 +	    {
 +	      size = random () & 511;
 +	      if (size + j > 512)
 +		size = 512 - j - (random () & 31);
 +	    }
 +	  else
 +	    size = 512 - j;
 +	  if ((mode & 8) && len + j >= 512)
 +	    len = 512 - j - (random () & 7);
 +	}
 +      j = len + align1 + 64;
 +      if (j > 512)
 +	j = 512;
 +      for (i = 0; i < j; i++)
 +	{
 +	  if (i == len + align1)
 +	    p1[i] = 0;
 +	  else
 +	    {
 +	      p1[i] = random () & CHAR_MAX;
 +	      if (i >= align1 && i < len + align1 && !p1[i])
 +		p1[i] = (random () & 127) + 3;
 +	    }
 +	}
 +
 +	res = (char *) strndup ((char *) (p1 + align1), size);
 +	j = len + 1;
 +	if (size < j)
 +	  j = size;
 +	TEST_COMPARE_BLOB (res, j, (char *) (p1 + align1), j);
 +	free (res);
 +    }
 +}
 +
 +int
 +test_main (void)
 +{
 +  size_t i;
 +
 +  test_init ();
 +
 +  printf ("%28s", "");
 +  printf ("\t%s", "strndup");
 +  putchar ('\n');
 +
 +  for (i = 1; i < 8; ++i)
 +    {
 +      do_test (i, i, 16, 16, 127);
 +      do_test (i, i, 16, 16, CHAR_MAX);
 +      do_test (i, 2 * i, 16, 16, 127);
 +      do_test (2 * i, i, 16, 16, CHAR_MAX);
 +      do_test (8 - i, 2 * i, 1 << i, 2 << i, 127);
 +      do_test (2 * i, 8 - i, 2 << i, 1 << i, 127);
 +      do_test (8 - i, 2 * i, 1 << i, 2 << i, CHAR_MAX);
 +      do_test (2 * i, 8 - i, 2 << i, 1 << i, CHAR_MAX);
 +    }
 +
 +  for (i = 1; i < 8; ++i)
 +    {
 +      do_test (0, 0, 4 << i, 8 << i, 127);
 +      do_test (0, 0, 16 << i, 8 << i, 127);
 +      do_test (8 - i, 2 * i, 4 << i, 8 << i, 127);
 +      do_test (8 - i, 2 * i, 16 << i, 8 << i, 127);
 +    }
 +
 +  do_random_tests ();
 +  do_page_tests ();
 +  return ret;
 +}
 +
 +#include <support/test-driver.c>
--- a/SOURCES/glibc-RHEL-15343-4.patch
+++ b/SOURCES/glibc-RHEL-15343-4.patch
@ -0,0 +1,33 @@
 commit 0aa5b28a504c6f1f17b387d8147715d1496fff62
 Author: Joe Simmons-Talbott <josimmon@redhat.com>
 Date:   Fri Apr 21 09:24:25 2023 -0400
    wcsmbs: Add wcsdup() tests. (BZ #30266)
    Enable wide character testcases for wcsdup().
    Reviewed-by: Adhemerval Zanella  <adhemerval.zanella@linaro.org>
    Conflicts:
 	wcsmbs/Makefile
          (different test backport order)
 diff -Nrup a/wcsmbs/Makefile b/wcsmbs/Makefile
 --- a/wcsmbs/Makefile	2023-11-30 14:14:18.755010508 -0500
 +++ b/wcsmbs/Makefile	2023-11-30 14:38:18.511131851 -0500
@@ -53,7 +53,8 @@ tests := tst-wcstof wcsmbs-tst1 tst-wcsn
 	 tst-c16c32-1 wcsatcliff tst-wcstol-locale tst-wcstod-nan-locale \
 	 tst-wcstod-round test-char-types tst-fgetwc-after-eof \
 	 tst-wcstod-nan-sign tst-c16-surrogate tst-c32-state \
 -	 $(addprefix test-,$(strop-tests)) tst-mbstowcs
 +	 $(addprefix test-,$(strop-tests)) tst-mbstowcs \
 +	 test-wcsdup
 include ../Rules
 diff -Nrup a/wcsmbs/test-wcsdup.c b/wcsmbs/test-wcsdup.c
 --- a/wcsmbs/test-wcsdup.c	1969-12-31 19:00:00.000000000 -0500
 +++ b/wcsmbs/test-wcsdup.c	2023-11-30 14:14:48.869138712 -0500
@@ -0,0 +1,2 @@
 +#define WIDE 1
 +#include "../string/test-strdup.c"
--- a/SOURCES/glibc-RHEL-15696-1.patch
+++ b/SOURCES/glibc-RHEL-15696-1.patch
@ -1,259 +0,0 @@
 From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:23:59 -0800
 Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
 [BZ# 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes memchr/wmemchr for x32.  Tested on x86-64 and x32.  On
 x86-64, libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/memchr.S: Use RDX_LP for length.  Clear the
 	upper 32 bits of RDX register.
 	* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
 	tst-size_t-wmemchr.
 	* sysdeps/x86_64/x32/test-size_t.h: New file.
 	* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
 	* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
 ---
 sysdeps/x86_64/memchr.S                 | 10 ++--
 sysdeps/x86_64/multiarch/memchr-avx2.S  |  8 ++-
 sysdeps/x86_64/x32/Makefile             |  8 +++
 sysdeps/x86_64/x32/test-size_t.h        | 35 ++++++++++++
 sysdeps/x86_64/x32/tst-size_t-memchr.c  | 72 +++++++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
 6 files changed, 148 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/test-size_t.h
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
 Conflicts:
 	ChangeLog
 	(removed)
 	NEWS
 	(removed)
 diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
 index feef5d4f..cb320257 100644
 --- a/sysdeps/x86_64/memchr.S
 +++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
 	mov	%edi, %ecx
 #ifdef USE_AS_WMEMCHR
 -	test	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
 -	shl	$2, %rdx
 +	shl	$2, %RDX_LP
 #else
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +# endif
 	punpcklbw %xmm1, %xmm1
 -	test	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
 	punpcklbw %xmm1, %xmm1
 #endif
 diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
 index 5f5e7725..c81da19b 100644
 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 -	testq	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
 	movl	%edi, %ecx
 	/* Broadcast CHAR to YMM0.  */
 	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 -	shl	$2, %rdx
 +	shl	$2, %RDX_LP
 	vpbroadcastd %xmm0, %ymm0
 # else
 +#  ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +#  endif
 	vpbroadcastb %xmm0, %ymm0
 # endif
 	/* Check if we may cross page boundary with one vector load.  */
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index f2ebc24f..7d528889 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
 # 64-bit llround.  Add -fno-builtin-lround to silence the compiler.
 CFLAGS-s_llround.c += -fno-builtin-lround
 endif
 +
 +ifeq ($(subdir),string)
 +tests += tst-size_t-memchr
 +endif
 +
 +ifeq ($(subdir),wcsmbs)
 +tests += tst-size_t-wmemchr
 +endif
 diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
 new file mode 100644
 index 00000000..78a94086
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
 +/* Test string/memory functions with size_t in the lower 32 bits of
 +   64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_MAIN
 +#include <string/test-string.h>
 +
 +/* On x32, parameter_t may be passed in a 64-bit register with the LEN
 +   field in the lower 32 bits.  When the LEN field of 64-bit register
 +   is passed to string/memory function as the size_t parameter, only
 +   the lower 32 bits can be used.  */
 +typedef struct
 +{
 +  union
 +    {
 +      size_t len;
 +      void (*fn) (void);
 +    };
 +  void *p;
 +} parameter_t;
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
 new file mode 100644
 index 00000000..29a3daf1
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
 +/* Test memchr with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#ifndef WIDE
 +# define TEST_NAME "memchr"
 +#else
 +# define TEST_NAME "wmemchr"
 +#endif /* WIDE */
 +#include "test-size_t.h"
 +
 +#ifndef WIDE
 +# define MEMCHR memchr
 +# define CHAR char
 +# define UCHAR unsigned char
 +#else
 +# include <wchar.h>
 +# define MEMCHR wmemchr
 +# define CHAR wchar_t
 +# define UCHAR wchar_t
 +#endif /* WIDE */
 +
 +IMPL (MEMCHR, 1)
 +
 +typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
 +
 +static CHAR *
 +__attribute__ ((noinline, noclone))
 +do_memchr (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
 +  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      c.fn = impl->fn;
 +      CHAR *res = do_memchr (src, c);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %p != NULL",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
 new file mode 100644
 index 00000000..877801d6
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
 +/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define WIDE 1
 +#include "tst-size_t-memchr.c"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-10.patch
+++ b/SOURCES/glibc-RHEL-15696-10.patch
@ -1,41 +0,0 @@
 From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Sun, 9 Jan 2022 16:02:21 -0600
 Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
 Content-type: text/plain; charset=UTF-8
 Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
 __wcscmp_avx2. For x86_64 this covers the entire address range so any
 length larger could not possibly be used to bound `s1` or `s2`.
 test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
 1 file changed, 10 insertions(+)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 index 156c1949..8fb8eedc 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
 	je	L(char0)
 	jb	L(zero)
 #  ifdef USE_AS_WCSCMP
 +#  ifndef __ILP32__
 +	movq	%rdx, %rcx
 +	/* Check if length could overflow when multiplied by
 +	   sizeof(wchar_t). Checking top 8 bits will cover all potential
 +	   overflow cases as well as redirect cases where it is impossible for
 +	   length to bound a valid memory region. In these cases just use
 +	   'wcscmp'.  */
 +	shrq	$56, %rcx
 +	jnz	__wcscmp_avx2
 +#  endif
 	/* Convert units: from wide to byte char.  */
 	shl	$2, %RDX_LP
 #  endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-102.patch
+++ b/SOURCES/glibc-RHEL-15696-102.patch
@ -1,263 +0,0 @@
 From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Fri, 15 Apr 2022 12:28:01 -0500
 Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
 Content-type: text/plain; charset=UTF-8
 Old code was both inefficient and wasted code size. New code is smaller
 (-62 bytes) with comparable or better performance in the page cross case.
 geometric_mean(N=20) of page cross cases New / Original: 0.960
 size, align0, align1, ret, New Time/Old Time
   1,   4095,      0,   0,             1.001
   1,   4095,      0,   1,             0.999
   1,   4095,      0,  -1,               1.0
   2,   4094,      0,   0,               1.0
   2,   4094,      0,   1,               1.0
   2,   4094,      0,  -1,               1.0
   3,   4093,      0,   0,               1.0
   3,   4093,      0,   1,               1.0
   3,   4093,      0,  -1,               1.0
   4,   4092,      0,   0,             0.987
   4,   4092,      0,   1,               1.0
   4,   4092,      0,  -1,               1.0
   5,   4091,      0,   0,             0.984
   5,   4091,      0,   1,             1.002
   5,   4091,      0,  -1,             1.005
   6,   4090,      0,   0,             0.993
   6,   4090,      0,   1,             1.001
   6,   4090,      0,  -1,             1.003
   7,   4089,      0,   0,             0.991
   7,   4089,      0,   1,               1.0
   7,   4089,      0,  -1,             1.001
   8,   4088,      0,   0,             0.875
   8,   4088,      0,   1,             0.881
   8,   4088,      0,  -1,             0.888
   9,   4087,      0,   0,             0.872
   9,   4087,      0,   1,             0.879
   9,   4087,      0,  -1,             0.883
  10,   4086,      0,   0,             0.878
  10,   4086,      0,   1,             0.886
  10,   4086,      0,  -1,             0.873
  11,   4085,      0,   0,             0.878
  11,   4085,      0,   1,             0.881
  11,   4085,      0,  -1,             0.879
  12,   4084,      0,   0,             0.873
  12,   4084,      0,   1,             0.889
  12,   4084,      0,  -1,             0.875
  13,   4083,      0,   0,             0.873
  13,   4083,      0,   1,             0.863
  13,   4083,      0,  -1,             0.863
  14,   4082,      0,   0,             0.838
  14,   4082,      0,   1,             0.869
  14,   4082,      0,  -1,             0.877
  15,   4081,      0,   0,             0.841
  15,   4081,      0,   1,             0.869
  15,   4081,      0,  -1,             0.876
  16,   4080,      0,   0,             0.988
  16,   4080,      0,   1,              0.99
  16,   4080,      0,  -1,             0.989
  17,   4079,      0,   0,             0.978
  17,   4079,      0,   1,             0.981
  17,   4079,      0,  -1,              0.98
  18,   4078,      0,   0,             0.981
  18,   4078,      0,   1,              0.98
  18,   4078,      0,  -1,             0.985
  19,   4077,      0,   0,             0.977
  19,   4077,      0,   1,             0.979
  19,   4077,      0,  -1,             0.986
  20,   4076,      0,   0,             0.977
  20,   4076,      0,   1,             0.986
  20,   4076,      0,  -1,             0.984
  21,   4075,      0,   0,             0.977
  21,   4075,      0,   1,             0.983
  21,   4075,      0,  -1,             0.988
  22,   4074,      0,   0,             0.983
  22,   4074,      0,   1,             0.994
  22,   4074,      0,  -1,             0.993
  23,   4073,      0,   0,              0.98
  23,   4073,      0,   1,             0.992
  23,   4073,      0,  -1,             0.995
  24,   4072,      0,   0,             0.989
  24,   4072,      0,   1,             0.989
  24,   4072,      0,  -1,             0.991
  25,   4071,      0,   0,              0.99
  25,   4071,      0,   1,             0.999
  25,   4071,      0,  -1,             0.996
  26,   4070,      0,   0,             0.993
  26,   4070,      0,   1,             0.995
  26,   4070,      0,  -1,             0.998
  27,   4069,      0,   0,             0.993
  27,   4069,      0,   1,             0.999
  27,   4069,      0,  -1,               1.0
  28,   4068,      0,   0,             0.997
  28,   4068,      0,   1,               1.0
  28,   4068,      0,  -1,             0.999
  29,   4067,      0,   0,             0.996
  29,   4067,      0,   1,             0.999
  29,   4067,      0,  -1,             0.999
  30,   4066,      0,   0,             0.991
  30,   4066,      0,   1,             1.001
  30,   4066,      0,  -1,             0.999
  31,   4065,      0,   0,             0.988
  31,   4065,      0,   1,             0.998
  31,   4065,      0,  -1,             0.998
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
 1 file changed, 61 insertions(+), 37 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 index 16fc673e..99258cf5 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
 # ifndef USE_AS_WMEMCMP
 	cmpl	$8, %edx
 	jae	L(between_8_15)
 +	/* Fall through for [4, 7].  */
 	cmpl	$4, %edx
 -	jae	L(between_4_7)
 +	jb	L(between_2_3)
 -	/* Load as big endian to avoid branches.  */
 -	movzwl	(%rdi), %eax
 -	movzwl	(%rsi), %ecx
 -	shll	$8, %eax
 -	shll	$8, %ecx
 -	bswap	%eax
 -	bswap	%ecx
 -	movzbl	-1(%rdi, %rdx), %edi
 -	movzbl	-1(%rsi, %rdx), %esi
 -	orl	%edi, %eax
 -	orl	%esi, %ecx
 -	/* Subtraction is okay because the upper 8 bits are zero.  */
 -	subl	%ecx, %eax
 +	movbe	(%rdi), %eax
 +	movbe	(%rsi), %ecx
 +	shlq	$32, %rax
 +	shlq	$32, %rcx
 +	movbe	-4(%rdi, %rdx), %edi
 +	movbe	-4(%rsi, %rdx), %esi
 +	orq	%rdi, %rax
 +	orq	%rsi, %rcx
 +	subq	%rcx, %rax
 +	/* Fast path for return zero.  */
 +	jnz	L(ret_nonzero)
 	/* No ymm register was touched.  */
 	ret
@@ -457,9 +456,33 @@ L(one_or_less):
 	/* No ymm register was touched.  */
 	ret
 +	.p2align 4,, 5
 +L(ret_nonzero):
 +	sbbl	%eax, %eax
 +	orl	$1, %eax
 +	/* No ymm register was touched.  */
 +	ret
 +
 +	.p2align 4,, 2
 +L(zero):
 +	xorl	%eax, %eax
 +	/* No ymm register was touched.  */
 +	ret
 +
 	.p2align 4
 L(between_8_15):
 -# endif
 +	movbe	(%rdi), %rax
 +	movbe	(%rsi), %rcx
 +	subq	%rcx, %rax
 +	jnz	L(ret_nonzero)
 +	movbe	-8(%rdi, %rdx), %rax
 +	movbe	-8(%rsi, %rdx), %rcx
 +	subq	%rcx, %rax
 +	/* Fast path for return zero.  */
 +	jnz	L(ret_nonzero)
 +	/* No ymm register was touched.  */
 +	ret
 +# else
 	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
 	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
 +	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
 +# endif
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 -	ret
 -
 -	.p2align 4
 +	.p2align 4,, 10
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
 	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 	subl	$0xffff, %eax
 +	/* Fast path for return zero.  */
 	jnz	L(return_vec_0)
 	/* No ymm register was touched.  */
 	ret
 # ifdef USE_AS_WMEMCMP
 +	.p2align 4,, 2
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 +
 	.p2align 4
 L(one_or_less):
 	jb	L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
 # else
 	.p2align 4
 -L(between_4_7):
 -	/* Load as big endian with overlapping movbe to avoid branches.
 -	 */
 -	movbe	(%rdi), %eax
 -	movbe	(%rsi), %ecx
 -	shlq	$32, %rax
 -	shlq	$32, %rcx
 -	movbe	-4(%rdi, %rdx), %edi
 -	movbe	-4(%rsi, %rdx), %esi
 -	orq	%rdi, %rax
 -	orq	%rsi, %rcx
 -	subq	%rcx, %rax
 -	jz	L(zero_4_7)
 -	sbbl	%eax, %eax
 -	orl	$1, %eax
 -L(zero_4_7):
 +L(between_2_3):
 +	/* Load as big endian to avoid branches.  */
 +	movzwl	(%rdi), %eax
 +	movzwl	(%rsi), %ecx
 +	bswap	%eax
 +	bswap	%ecx
 +	shrl	%eax
 +	shrl	%ecx
 +	movzbl	-1(%rdi, %rdx), %edi
 +	movzbl	-1(%rsi, %rdx), %esi
 +	orl	%edi, %eax
 +	orl	%esi, %ecx
 +	/* Subtraction is okay because the upper bit is zero.  */
 +	subl	%ecx, %eax
 	/* No ymm register was touched.  */
 	ret
 # endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-107.patch
+++ b/SOURCES/glibc-RHEL-15696-107.patch
@ -1,226 +0,0 @@
 From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
 From: Wangyang Guo <wangyang.guo@intel.com>
 Date: Fri, 6 May 2022 01:50:10 +0000
 Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
 Content-type: text/plain; charset=UTF-8
 When multiple threads are waiting for a lock at the same time, once the
 lock owner releases it, the waiters all see the lock as available and try
 to take it, which may cause an expensive CAS storm.
 Binary exponential backoff with random jitter is introduced. As the number
 of try-lock attempts increases, it is more likely that a larger number of
 threads are competing for the adaptive mutex lock, so the wait time is
 increased exponentially.  A random jitter is also added to avoid
 synchronized try-lock attempts from other threads.
 v2: Remove read-check before try-lock for performance.
 v3:
 1. Restore read-check since it works well on some platforms.
 2. Make backoff arch dependent, and enable it for x86_64.
 3. Limit max backoff to reduce latency in large critical sections.
 v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
 v5: Commit log updated for regression in large critical section.
 Result of pthread-mutex-locks bench
 Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
 First Row: thread number
 First Col: critical section length
 Values: backoff vs upstream, time based, low is better
 non-critical-length: 1
 	1	2	4	8	16	32	64	112	140
 0	0.99	0.58	0.52	0.49	0.43	0.44	0.46	0.52	0.54
 1	0.98	0.43	0.56	0.50	0.44	0.45	0.50	0.56	0.57
 2	0.99	0.41	0.57	0.51	0.45	0.47	0.48	0.60	0.61
 4	0.99	0.45	0.59	0.53	0.48	0.49	0.52	0.64	0.65
 8	1.00	0.66	0.71	0.63	0.56	0.59	0.66	0.72	0.71
 16	0.97	0.78	0.91	0.73	0.67	0.70	0.79	0.80	0.80
 32	0.95	1.17	0.98	0.87	0.82	0.86	0.89	0.90	0.90
 64	0.96	0.95	1.01	1.01	0.98	1.00	1.03	0.99	0.99
 128	0.99	1.01	1.01	1.17	1.08	1.12	1.02	0.97	1.02
 non-critical-length: 32
 	1	2	4	8	16	32	64	112	140
 0	1.03	0.97	0.75	0.65	0.58	0.58	0.56	0.70	0.70
 1	0.94	0.95	0.76	0.65	0.58	0.58	0.61	0.71	0.72
 2	0.97	0.96	0.77	0.66	0.58	0.59	0.62	0.74	0.74
 4	0.99	0.96	0.78	0.66	0.60	0.61	0.66	0.76	0.77
 8	0.99	0.99	0.84	0.70	0.64	0.66	0.71	0.80	0.80
 16	0.98	0.97	0.95	0.76	0.70	0.73	0.81	0.85	0.84
 32	1.04	1.12	1.04	0.89	0.82	0.86	0.93	0.91	0.91
 64	0.99	1.15	1.07	1.00	0.99	1.01	1.05	0.99	0.99
 128	1.00	1.21	1.20	1.22	1.25	1.31	1.12	1.10	0.99
 non-critical-length: 128
 	1	2	4	8	16	32	64	112	140
 0	1.02	1.00	0.99	0.67	0.61	0.61	0.61	0.74	0.73
 1	0.95	0.99	1.00	0.68	0.61	0.60	0.60	0.74	0.74
 2	1.00	1.04	1.00	0.68	0.59	0.61	0.65	0.76	0.76
 4	1.00	0.96	0.98	0.70	0.63	0.63	0.67	0.78	0.77
 8	1.01	1.02	0.89	0.73	0.65	0.67	0.71	0.81	0.80
 16	0.99	0.96	0.96	0.79	0.71	0.73	0.80	0.84	0.84
 32	0.99	0.95	1.05	0.89	0.84	0.85	0.94	0.92	0.91
 64	1.00	0.99	1.16	1.04	1.00	1.02	1.06	0.99	0.99
 128	1.00	1.06	0.98	1.14	1.39	1.26	1.08	1.02	0.98
 There is a regression for large critical sections, but the adaptive mutex
 is aimed at "quick" locks. Small critical sections are more common when
 users choose to use an adaptive pthread_mutex.
 Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 Conflicts:
 	pthreadP.h
 	(had been moved)
 	nptl/pthread_mutex_lock.c
 	(max_adaptive_count renamed)
 ---
 nptl/pthreadP.h                             |  1 +
 nptl/pthread_mutex_lock.c                   | 16 +++++++--
 sysdeps/nptl/pthread_mutex_backoff.h        | 35 ++++++++++++++++++
 sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
 4 files changed, 89 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
 create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
 diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
 index 7ddc166c..1550e3b6 100644
 --- a/nptl/pthreadP.h
 +++ b/nptl/pthreadP.h
@@ -33,6 +33,7 @@
 #include <kernel-features.h>
 #include <errno.h>
 #include <internal-signals.h>
 +#include <pthread_mutex_backoff.h>
 /* Atomic operations on TLS memory.  */
 diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 index d96a9933..c7770fc9 100644
 --- a/nptl/pthread_mutex_lock.c
 +++ b/nptl/pthread_mutex_lock.c
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 	  int cnt = 0;
 	  int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
 			     mutex->__data.__spins * 2 + 10);
 +	  int spin_count, exp_backoff = 1;
 +	  unsigned int jitter = get_jitter ();
 	  do
 	    {
 -	      if (cnt++ >= max_cnt)
 +	      /* In each loop, spin count is exponential backoff plus
 +		 random jitter, random range is [0, exp_backoff-1].  */
 +	      spin_count = exp_backoff + (jitter & (exp_backoff - 1));
 +	      cnt += spin_count;
 +	      if (cnt >= max_cnt)
 		{
 +		  /* If cnt exceeds max spin count, just go to wait
 +		     queue.  */
 		  LLL_MUTEX_LOCK (mutex);
 		  break;
 		}
 -	      atomic_spin_nop ();
 +	      do
 +		atomic_spin_nop ();
 +	      while (--spin_count > 0);
 +	      /* Prepare for next loop.  */
 +	      exp_backoff = get_next_backoff (exp_backoff);
 	    }
 	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
 		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
 diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
 new file mode 100644
 index 00000000..5b26c22a
 --- /dev/null
 +++ b/sysdeps/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,35 @@
 +/* Pthread mutex backoff configuration.
 +   Copyright (C) 2022 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +#ifndef _PTHREAD_MUTEX_BACKOFF_H
 +#define _PTHREAD_MUTEX_BACKOFF_H 1
 +
 +static inline unsigned int
 +get_jitter (void)
 +{
 +  /* Arch dependent random jitter, return 0 disables random.  */
 +  return 0;
 +}
 +
 +static inline int
 +get_next_backoff (int backoff)
 +{
 +  /* Next backoff, return 1 disables mutex backoff.  */
 +  return 1;
 +}
 +
 +#endif
 diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
 new file mode 100644
 index 00000000..ec74c3d9
 --- /dev/null
 +++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,39 @@
 +/* Pthread mutex backoff configuration.
 +   Copyright (C) 2022 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +#ifndef _PTHREAD_MUTEX_BACKOFF_H
 +#define _PTHREAD_MUTEX_BACKOFF_H 1
 +
 +#include <fast-jitter.h>
 +
 +static inline unsigned int
 +get_jitter (void)
 +{
 +  return get_fast_jitter ();
 +}
 +
 +#define MAX_BACKOFF 16
 +
 +static inline int
 +get_next_backoff (int backoff)
 +{
 +  /* Binary exponential backoff. Limiting max backoff
 +     can reduce latency in large critical sections.  */
 +  return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
 +}
 +
 +#endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-108.patch
+++ b/SOURCES/glibc-RHEL-15696-108.patch
@ -1,55 +0,0 @@
 From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Tue, 15 Feb 2022 08:18:15 -0600
 Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
 #28896]
 Content-type: text/plain; charset=UTF-8
 In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
 call strcmp-avx2 and wcscmp-avx2 respectively. Those would have
 no checks around vzeroupper and would trigger spurious
 aborts. This commit fixes that.
 test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
 AVX2 machines with and without RTM.
 Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
 1 file changed, 2 insertions(+), 6 deletions(-)
 Conflicts:
 	sysdeps/x86_64/multiarch/strcmp-avx2.S
 	(split into two patches due to upstream bug differences)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 index 28cc98b6..e267c6cb 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -345,10 +345,10 @@ L(one_or_less):
 	movq	%LOCALE_REG, %rdx
 #  endif
 	jb	L(ret_zero)
 -#  ifdef USE_AS_WCSCMP
 	/* 'nbe' covers the case where length is negative (large
 	   unsigned).  */
 -	jnbe	__wcscmp_avx2
 +	jnbe	OVERFLOW_STRCMP
 +#  ifdef USE_AS_WCSCMP
 	movl	(%rdi), %edx
 	xorl	%eax, %eax
 	cmpl	(%rsi), %edx
@@ -357,10 +357,6 @@ L(one_or_less):
 	negl	%eax
 	orl	$1, %eax
 #  else
 -	/* 'nbe' covers the case where length is negative (large
 -	   unsigned).  */
 -
 -	jnbe	__strcmp_avx2
 	movzbl	(%rdi), %eax
 	movzbl	(%rsi), %ecx
 	TOLOWER_gpr (%rax, %eax)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-109.patch
+++ b/SOURCES/glibc-RHEL-15696-109.patch
@ -1,60 +0,0 @@
 From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
 From: Stefan Liebler <stli@linux.ibm.com>
 Date: Mon, 28 Jun 2021 13:01:07 +0200
 Subject: s390x: Update math: redirect roundeven function
 After recent commit
 447954a206837b5f153869cfeeeab44631c3fac9
 "math: redirect roundeven function", building on
 s390x fails with:
 Error: symbol `__roundevenl' is already defined
 Similar to aarch64/riscv fix, this patch redirects target
 specific functions for s390x:
 commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
 "Update math: redirect roundeven function"
 diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
 index 40b07e054b..0773adfed0 100644
 --- a/sysdeps/s390/fpu/s_roundeven.c
 +++ b/sysdeps/s390/fpu/s_roundeven.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */
 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
 +# define NO_MATH_REDIRECT
 # include <math.h>
 # include <libm-alias-double.h>
@@ -31,7 +32,6 @@ __roundeven (double x)
   __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
   return y;
 }
 -hidden_def (__roundeven)
 libm_alias_double (__roundeven, roundeven)
 #else
 diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
 index d2fbf3d2b6..289785bc4a 100644
 --- a/sysdeps/s390/fpu/s_roundevenf.c
 +++ b/sysdeps/s390/fpu/s_roundevenf.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */
 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
 +# define NO_MATH_REDIRECT
 # include <math.h>
 # include <libm-alias-float.h>
 diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
 index 29ab7a8616..94b6459ab4 100644
 --- a/sysdeps/s390/fpu/s_roundevenl.c
 +++ b/sysdeps/s390/fpu/s_roundevenl.c
@@ -18,6 +18,7 @@
    <https://www.gnu.org/licenses/>.  */
 #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
 +# define NO_MATH_REDIRECT
 # include <math.h>
 # include <math_private.h>
 # include <libm-alias-ldouble.h>
--- a/SOURCES/glibc-RHEL-15696-11.patch
+++ b/SOURCES/glibc-RHEL-15696-11.patch
@ -1,74 +0,0 @@
 From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 26 Feb 2021 05:36:59 -0800
 Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
 Content-type: text/plain; charset=UTF-8
 1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
 by VZEROUPPER inside a transactionally executing RTM region.
 2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
 1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp.  Add
 Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
 ---
 sysdeps/x86/cpu-features.c                    | 20 +++++++++++++++++--
 sysdeps/x86/cpu-tunables.c                    |  2 ++
 ...cpu-features-preferred_feature_index_1.def |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)
 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 index 91042505..3610ee5c 100644
 --- a/sysdeps/x86/cpu-features.c
 +++ b/sysdeps/x86/cpu-features.c
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
 	cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 	  |= bit_arch_Prefer_No_VZEROUPPER;
       else
 -	cpu_features->preferred[index_arch_Prefer_No_AVX512]
 -	  |= bit_arch_Prefer_No_AVX512;
 +	{
 +	  cpu_features->preferred[index_arch_Prefer_No_AVX512]
 +	    |= bit_arch_Prefer_No_AVX512;
 +
 +	  /* Avoid RTM abort triggered by VZEROUPPER inside a
 +	     transactionally executing RTM region.  */
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 +	    cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
 +	      |= bit_arch_Prefer_No_VZEROUPPER;
 +
 +	  /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
 +	     requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
 +	     requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
 +	     AVX2 strcmp is faster than EVEX strcmp.  */
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
 +	    cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
 +	      |= bit_arch_Prefer_AVX2_STRCMP;
 +	}
     }
   /* This spells out "AuthenticAMD".  */
   else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
 diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
 index 3173b2b9..73adbaba 100644
 --- a/sysdeps/x86/cpu-tunables.c
 +++ b/sysdeps/x86/cpu-tunables.c
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 	      CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
 						Fast_Copy_Backward,
 						disable, 18);
 +	      CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
 +		(n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
 	    }
 	  break;
 	case 19:
 diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 index 17a5cc42..4ca70b40 100644
 --- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
 +++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
 BIT (Prefer_FSRM)
 BIT (Prefer_No_AVX512)
 BIT (MathVec_Prefer_No_AVX512)
 +BIT (Prefer_AVX2_STRCMP)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-110.patch
+++ b/SOURCES/glibc-RHEL-15696-110.patch
@ -1,26 +0,0 @@
 From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Wed, 23 Jun 2021 13:29:41 -0700
 Subject: Update math: redirect roundeven function
 Redirect target specific roundeven functions for aarch64, ldbl-128ibm
 and riscv.
 Conflicts:
 	sysdeps/aarch64/*
 	(not needed)
 	sysdeps/riscv/*
 	(not supported)
 diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
 index 6701970f4a..90eecf496b 100644
 --- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
 +++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
--- a/SOURCES/glibc-RHEL-15696-12.patch
+++ b/SOURCES/glibc-RHEL-15696-12.patch
--- a/SOURCES/glibc-RHEL-15696-13.patch
+++ b/SOURCES/glibc-RHEL-15696-13.patch
--- a/SOURCES/glibc-RHEL-15696-14.patch
+++ b/SOURCES/glibc-RHEL-15696-14.patch
@ -1,242 +0,0 @@
 From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 5 Mar 2021 06:46:08 -0800
 Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
 Content-type: text/plain; charset=UTF-8
 Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
 instructions using YMM16-YMM31 registers to avoid RTM abort with usable
 AVX512VL since VZEROUPPER isn't needed at function exit.
 ---
 sysdeps/x86_64/multiarch/Makefile             |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 36 +++++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memmove.h      | 21 +++++++++--
 .../multiarch/memmove-evex-unaligned-erms.S   | 33 +++++++++++++++++
 .../multiarch/memmove-vec-unaligned-erms.S    | 24 ++++++++-----
 5 files changed, 104 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 index 46783cd1..4563fc56 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
 		   memchr-evex \
 +		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
 		   rawmemchr-evex \
 		   stpcpy-evex \
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 082e4da3..6bd3abfc 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_chk_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memmove_chk_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memmove_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memmove_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, memmove,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memmove_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, memmove,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memmove_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_avx512_no_vzeroupper)
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_chk_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memcpy_chk_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_chk_ssse3_back)
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __memcpy_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, memcpy,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memcpy_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, memcpy,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __memcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __memcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_chk_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __mempcpy_chk_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __mempcpy_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_chk_ssse3_back)
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX),
 			      __mempcpy_avx_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, mempcpy,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __mempcpy_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, mempcpy,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __mempcpy_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
 			      __mempcpy_ssse3_back)
 	      IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 index 5e5f0299..6f8bce5f 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
   attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
 +  attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
 +  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
 -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 -	return OPTIMIZE (avx_unaligned_erms);
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (evex_unaligned_erms);
 +
 +	  return OPTIMIZE (evex_unaligned);
 +	}
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (avx_unaligned_erms);
 -      return OPTIMIZE (avx_unaligned);
 +	  return OPTIMIZE (avx_unaligned);
 +	}
     }
   if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
 diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
 new file mode 100644
 index 00000000..0cbce8f9
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
 +#if IS_IN (libc)
 +# define VEC_SIZE	32
 +# define XMM0		xmm16
 +# define XMM1		xmm17
 +# define YMM0		ymm16
 +# define YMM1		ymm17
 +# define VEC0		ymm16
 +# define VEC1		ymm17
 +# define VEC2		ymm18
 +# define VEC3		ymm19
 +# define VEC4		ymm20
 +# define VEC5		ymm21
 +# define VEC6		ymm22
 +# define VEC7		ymm23
 +# define VEC8		ymm24
 +# define VEC9		ymm25
 +# define VEC10		ymm26
 +# define VEC11		ymm27
 +# define VEC12		ymm28
 +# define VEC13		ymm29
 +# define VEC14		ymm30
 +# define VEC15		ymm31
 +# define VEC(i)		VEC##i
 +# define VMOVNT		vmovntdq
 +# define VMOVU		vmovdqu64
 +# define VMOVA		vmovdqa64
 +# define VZEROUPPER
 +
 +# define SECTION(p)		p##.evex
 +# define MEMMOVE_SYMBOL(p,s)	p##_evex_##s
 +
 +# include "memmove-vec-unaligned-erms.S"
 +#endif
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index 274aa1c7..08e21692 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
 # define MEMMOVE_CHK_SYMBOL(p,s)	MEMMOVE_SYMBOL(p, s)
 #endif
 +#ifndef XMM0
 +# define XMM0				xmm0
 +#endif
 +
 +#ifndef YMM0
 +# define YMM0				ymm0
 +#endif
 +
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER vzeroupper
@@ -277,20 +285,20 @@ L(less_vec):
 #if VEC_SIZE > 32
 L(between_32_63):
 	/* From 32 to 63.  No branch when size == 32.  */
 -	vmovdqu	(%rsi), %ymm0
 -	vmovdqu	-32(%rsi,%rdx), %ymm1
 -	vmovdqu	%ymm0, (%rdi)
 -	vmovdqu	%ymm1, -32(%rdi,%rdx)
 +	VMOVU	(%rsi), %YMM0
 +	VMOVU	-32(%rsi,%rdx), %YMM1
 +	VMOVU	%YMM0, (%rdi)
 +	VMOVU	%YMM1, -32(%rdi,%rdx)
 	VZEROUPPER
 	ret
 #endif
 #if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 -	vmovdqu	(%rsi), %xmm0
 -	vmovdqu	-16(%rsi,%rdx), %xmm1
 -	vmovdqu	%xmm0, (%rdi)
 -	vmovdqu	%xmm1, -16(%rdi,%rdx)
 +	VMOVU	(%rsi), %XMM0
 +	VMOVU	-16(%rsi,%rdx), %XMM1
 +	VMOVU	%XMM0, (%rdi)
 +	VMOVU	%XMM1, -16(%rdi,%rdx)
 	ret
 #endif
 L(between_8_15):
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-15.patch
+++ b/SOURCES/glibc-RHEL-15696-15.patch
@ -1,254 +0,0 @@
 From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 5 Mar 2021 07:15:03 -0800
 Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
 Content-type: text/plain; charset=UTF-8
 Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
 with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
 abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
 function exit.
 ---
 sysdeps/x86_64/multiarch/Makefile             |  1 +
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 22 +++++++++++++++++
 sysdeps/x86_64/multiarch/ifunc-memset.h       | 24 +++++++++++++++----
 sysdeps/x86_64/multiarch/ifunc-wmemset.h      | 13 ++++++----
 .../multiarch/memset-evex-unaligned-erms.S    | 24 +++++++++++++++++++
 .../multiarch/memset-vec-unaligned-erms.S     | 20 +++++++++++-----
 6 files changed, 90 insertions(+), 14 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 index 4563fc56..1cc0a10e 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memchr-evex \
 		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
 +		   memset-evex-unaligned-erms \
 		   rawmemchr-evex \
 		   stpcpy-evex \
 		   stpncpy-evex \
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 6bd3abfc..7cf83485 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_chk_avx2_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			      __memset_chk_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			      __memset_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __memset_avx2_unaligned_erms)
 +	      IFUNC_IMPL_ADD (array, i, memset,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			      __memset_evex_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, memset,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			      __memset_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_avx2_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, wmemset,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
 +	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 +			      CPU_FEATURE_USABLE (AVX512VL),
 +			      __wmemset_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __wmemset_chk_avx512_unaligned))
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
 index 708bd72e..6f31f4dc 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
   attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
 +  attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
 +  attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
   attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
 -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 -	return OPTIMIZE (avx2_unaligned_erms);
 -      else
 -	return OPTIMIZE (avx2_unaligned);
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (evex_unaligned_erms);
 +
 +	  return OPTIMIZE (evex_unaligned);
 +	}
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (avx2_unaligned_erms);
 +
 +	  return OPTIMIZE (avx2_unaligned);
 +	}
     }
   if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 index eb242210..9290c4bf 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
 static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
 -      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
 -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
 +	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
 +	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx512_unaligned);
 -      else
 +
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 +	return OPTIMIZE (evex_unaligned);
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 	return OPTIMIZE (avx2_unaligned);
     }
 diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 new file mode 100644
 index 00000000..ae0a4d6e
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
 +#if IS_IN (libc)
 +# define VEC_SIZE	32
 +# define XMM0		xmm16
 +# define YMM0		ymm16
 +# define VEC0		ymm16
 +# define VEC(i)		VEC##i
 +# define VMOVU		vmovdqu64
 +# define VMOVA		vmovdqa64
 +# define VZEROUPPER
 +
 +# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 +  movq r, %rax; \
 +  vpbroadcastb d, %VEC0
 +
 +# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 +  movq r, %rax; \
 +  vpbroadcastd d, %VEC0
 +
 +# define SECTION(p)		p##.evex
 +# define MEMSET_SYMBOL(p,s)	p##_evex_##s
 +# define WMEMSET_SYMBOL(p,s)	p##_evex_##s
 +
 +# include "memset-vec-unaligned-erms.S"
 +#endif
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index 9a0fd818..71e91a8f 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
 # define WMEMSET_CHK_SYMBOL(p,s)	WMEMSET_SYMBOL(p, s)
 #endif
 +#ifndef XMM0
 +# define XMM0				xmm0
 +#endif
 +
 +#ifndef YMM0
 +# define YMM0				ymm0
 +#endif
 +
 #ifndef VZEROUPPER
 # if VEC_SIZE > 16
 #  define VZEROUPPER			vzeroupper
@@ -67,7 +75,7 @@
 ENTRY (__bzero)
 	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 	mov	%RSI_LP, %RDX_LP /* Set n.  */
 -	pxor	%xmm0, %xmm0
 +	pxor	%XMM0, %XMM0
 	jmp	L(entry_from_bzero)
 END (__bzero)
 weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
 	cmpb	$16, %dl
 	jae	L(between_16_31)
 # endif
 -	MOVQ	%xmm0, %rcx
 +	MOVQ	%XMM0, %rcx
 	cmpb	$8, %dl
 	jae	L(between_8_15)
 	cmpb	$4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
 -	vmovdqu	%ymm0, -32(%rdi,%rdx)
 -	vmovdqu	%ymm0, (%rdi)
 +	VMOVU	%YMM0, -32(%rdi,%rdx)
 +	VMOVU	%YMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 -	vmovdqu	%xmm0, -16(%rdi,%rdx)
 -	vmovdqu	%xmm0, (%rdi)
 +	VMOVU	%XMM0, -16(%rdi,%rdx)
 +	VMOVU	%XMM0, (%rdi)
 	VZEROUPPER
 	ret
 # endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-16.patch
+++ b/SOURCES/glibc-RHEL-15696-16.patch
@ -1,561 +0,0 @@
 From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 5 Mar 2021 07:20:28 -0800
 Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
 Content-type: text/plain; charset=UTF-8
 Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
 instructions using YMM16-YMM31 registers to avoid RTM abort with usable
 AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
 exit.
 ---
 sysdeps/x86_64/multiarch/Makefile             |   4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  10 +
 sysdeps/x86_64/multiarch/ifunc-memcmp.h       |  13 +-
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S  | 440 ++++++++++++++++++
 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S |   4 +
 5 files changed, 467 insertions(+), 4 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 index 1cc0a10e..9d79b138 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   memset-avx2-unaligned-erms \
 		   memset-avx512-unaligned-erms \
 		   memchr-evex \
 +		   memcmp-evex-movbe \
 		   memmove-evex-unaligned-erms \
 		   memrchr-evex \
 		   memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcsncmp-evex \
 		   wcsnlen-evex \
 		   wcsrchr-evex \
 -		   wmemchr-evex
 +		   wmemchr-evex \
 +		   wmemcmp-evex-movbe
 endif
 ifeq ($(subdir),debug)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 7cf83485..c8da910e 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_avx2_movbe)
 +	      IFUNC_IMPL_ADD (array, i, memcmp,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (MOVBE)),
 +			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __memcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      (CPU_FEATURE_USABLE (AVX2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
 +	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (MOVBE)),
 +			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 			      __wmemcmp_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 index 6c1f3153..3ca1f0a6 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
 static inline void *
 IFUNC_SELECTOR (void)
 {
   const struct cpu_features* cpu_features = __get_cpu_features ();
 -  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
 -      && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 -    return OPTIMIZE (avx2_movbe);
 +    {
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +	return OPTIMIZE (evex_movbe);
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 +	return OPTIMIZE (avx2_movbe);
 +    }
   if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
     return OPTIMIZE (sse4_1);
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 new file mode 100644
 index 00000000..9c093972
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
 +/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#if IS_IN (libc)
 +
 +/* memcmp/wmemcmp is implemented as:
 +   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
 +      to avoid branches.
 +   2. Use overlapping compare to avoid branch.
 +   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
 +      bytes for wmemcmp.
 +   4. If size is 8 * VEC_SIZE or less, unroll the loop.
 +   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
 +      area.
 +   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 +   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 +   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 +
 +# include <sysdep.h>
 +
 +# ifndef MEMCMP
 +#  define MEMCMP	__memcmp_evex_movbe
 +# endif
 +
 +# define VMOVU		vmovdqu64
 +
 +# ifdef USE_AS_WMEMCMP
 +#  define VPCMPEQ	vpcmpeqd
 +# else
 +#  define VPCMPEQ	vpcmpeqb
 +# endif
 +
 +# define XMM1		xmm17
 +# define XMM2		xmm18
 +# define YMM1		ymm17
 +# define YMM2		ymm18
 +# define YMM3		ymm19
 +# define YMM4		ymm20
 +# define YMM5		ymm21
 +# define YMM6		ymm22
 +
 +# define VEC_SIZE 32
 +# ifdef USE_AS_WMEMCMP
 +#  define VEC_MASK 0xff
 +#  define XMM_MASK 0xf
 +# else
 +#  define VEC_MASK 0xffffffff
 +#  define XMM_MASK 0xffff
 +# endif
 +
 +/* Warning!
 +           wmemcmp has to use SIGNED comparison for elements.
 +           memcmp has to use UNSIGNED comparison for elements.
 +*/
 +
 +	.section .text.evex,"ax",@progbits
 +ENTRY (MEMCMP)
 +# ifdef USE_AS_WMEMCMP
 +	shl	$2, %RDX_LP
 +# elif defined __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +# endif
 +	cmp	$VEC_SIZE, %RDX_LP
 +	jb	L(less_vec)
 +
 +	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k1
 +	kmovd	%k1, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	cmpq	$(VEC_SIZE * 2), %rdx
 +	jbe	L(last_vec)
 +
 +	/* More than 2 * VEC.  */
 +	cmpq	$(VEC_SIZE * 8), %rdx
 +	ja	L(more_8x_vec)
 +	cmpq	$(VEC_SIZE * 4), %rdx
 +	jb	L(last_4x_vec)
 +
 +	/* From 4 * VEC to 8 * VEC, inclusively. */
 +	VMOVU	(%rsi), %YMM1
 +	VPCMPEQ (%rdi), %YMM1, %k1
 +
 +	VMOVU	VEC_SIZE(%rsi), %YMM2
 +	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 +
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 +	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 +
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 +	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 +
 +	kandd	%k1, %k2, %k5
 +	kandd	%k3, %k4, %k6
 +	kandd	%k5, %k6, %k6
 +
 +	kmovd	%k6, %eax
 +	cmpl	$VEC_MASK, %eax
 +	jne	L(4x_vec_end)
 +
 +	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
 +	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 +	VMOVU	(%rsi), %YMM1
 +	VPCMPEQ (%rdi), %YMM1, %k1
 +
 +	VMOVU	VEC_SIZE(%rsi), %YMM2
 +	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 +	kandd	%k1, %k2, %k5
 +
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 +	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 +	kandd	%k3, %k5, %k5
 +
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 +	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 +	kandd	%k4, %k5, %k5
 +
 +	kmovd	%k5, %eax
 +	cmpl	$VEC_MASK, %eax
 +	jne	L(4x_vec_end)
 +	xorl	%eax, %eax
 +	ret
 +
 +	.p2align 4
 +L(last_2x_vec):
 +	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +L(last_vec):
 +	/* Use overlapping loads to avoid branches.  */
 +	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
 +	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +	ret
 +
 +	.p2align 4
 +L(first_vec):
 +	/* A byte or int32 is different within 16 or 32 bytes.  */
 +	tzcntl	%eax, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	xorl	%eax, %eax
 +	movl	(%rdi, %rcx, 4), %edx
 +	cmpl	(%rsi, %rcx, 4), %edx
 +L(wmemcmp_return):
 +	setl	%al
 +	negl	%eax
 +	orl	$1, %eax
 +# else
 +	movzbl	(%rdi, %rcx), %eax
 +	movzbl	(%rsi, %rcx), %edx
 +	sub	%edx, %eax
 +# endif
 +	ret
 +
 +# ifdef USE_AS_WMEMCMP
 +	.p2align 4
 +L(4):
 +	xorl	%eax, %eax
 +	movl	(%rdi), %edx
 +	cmpl	(%rsi), %edx
 +	jne	L(wmemcmp_return)
 +	ret
 +# else
 +	.p2align 4
 +L(between_4_7):
 +	/* Load as big endian with overlapping movbe to avoid branches.  */
 +	movbe	(%rdi), %eax
 +	movbe	(%rsi), %ecx
 +	shlq	$32, %rax
 +	shlq	$32, %rcx
 +	movbe	-4(%rdi, %rdx), %edi
 +	movbe	-4(%rsi, %rdx), %esi
 +	orq	%rdi, %rax
 +	orq	%rsi, %rcx
 +	subq	%rcx, %rax
 +	je	L(exit)
 +	sbbl	%eax, %eax
 +	orl	$1, %eax
 +	ret
 +
 +	.p2align 4
 +L(exit):
 +	ret
 +
 +	.p2align 4
 +L(between_2_3):
 +	/* Load as big endian to avoid branches.  */
 +	movzwl	(%rdi), %eax
 +	movzwl	(%rsi), %ecx
 +	shll	$8, %eax
 +	shll	$8, %ecx
 +	bswap	%eax
 +	bswap	%ecx
 +	movb	-1(%rdi, %rdx), %al
 +	movb	-1(%rsi, %rdx), %cl
 +	/* Subtraction is okay because the upper 8 bits are zero.  */
 +	subl	%ecx, %eax
 +	ret
 +
 +	.p2align 4
 +L(1):
 +	movzbl	(%rdi), %eax
 +	movzbl	(%rsi), %ecx
 +	subl	%ecx, %eax
 +	ret
 +# endif
 +
 +	.p2align 4
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 +
 +	.p2align 4
 +L(less_vec):
 +# ifdef USE_AS_WMEMCMP
 +	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
 +	cmpb	$4, %dl
 +	je	L(4)
 +	jb	L(zero)
 +# else
 +	cmpb	$1, %dl
 +	je	L(1)
 +	jb	L(zero)
 +	cmpb	$4, %dl
 +	jb	L(between_2_3)
 +	cmpb	$8, %dl
 +	jb	L(between_4_7)
 +# endif
 +	cmpb	$16, %dl
 +	jae	L(between_16_31)
 +	/* It is between 8 and 15 bytes.  */
 +	vmovq	(%rdi), %XMM1
 +	vmovq	(%rsi), %XMM2
 +	VPCMPEQ %XMM1, %XMM2, %k2
 +	kmovw	%k2, %eax
 +	subl    $XMM_MASK, %eax
 +	jnz	L(first_vec)
 +	/* Use overlapping loads to avoid branches.  */
 +	leaq	-8(%rdi, %rdx), %rdi
 +	leaq	-8(%rsi, %rdx), %rsi
 +	vmovq	(%rdi), %XMM1
 +	vmovq	(%rsi), %XMM2
 +	VPCMPEQ %XMM1, %XMM2, %k2
 +	kmovw	%k2, %eax
 +	subl    $XMM_MASK, %eax
 +	jnz	L(first_vec)
 +	ret
 +
 +	.p2align 4
 +L(between_16_31):
 +	/* From 16 to 31 bytes.  No branch when size == 16.  */
 +	VMOVU	(%rsi), %XMM2
 +	VPCMPEQ (%rdi), %XMM2, %k2
 +	kmovw	%k2, %eax
 +	subl    $XMM_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	/* Use overlapping loads to avoid branches.  */
 +	leaq	-16(%rdi, %rdx), %rdi
 +	leaq	-16(%rsi, %rdx), %rsi
 +	VMOVU	(%rsi), %XMM2
 +	VPCMPEQ (%rdi), %XMM2, %k2
 +	kmovw	%k2, %eax
 +	subl    $XMM_MASK, %eax
 +	jnz	L(first_vec)
 +	ret
 +
 +	.p2align 4
 +L(more_8x_vec):
 +	/* More than 8 * VEC.  Check the first VEC.  */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	/* Align the first memory area for aligned loads in the loop.
 +	   Compute how much the first memory area is misaligned.  */
 +	movq	%rdi, %rcx
 +	andl	$(VEC_SIZE - 1), %ecx
 +	/* Get the negative of offset for alignment.  */
 +	subq	$VEC_SIZE, %rcx
 +	/* Adjust the second memory area.  */
 +	subq	%rcx, %rsi
 +	/* Adjust the first memory area which should be aligned now.  */
 +	subq	%rcx, %rdi
 +	/* Adjust length.  */
 +	addq	%rcx, %rdx
 +
 +L(loop_4x_vec):
 +	/* Compare 4 * VEC at a time forward.  */
 +	VMOVU	(%rsi), %YMM1
 +	VPCMPEQ (%rdi), %YMM1, %k1
 +
 +	VMOVU	VEC_SIZE(%rsi), %YMM2
 +	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 +	kandd	%k2, %k1, %k5
 +
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 +	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 +	kandd	%k3, %k5, %k5
 +
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 +	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 +	kandd	%k4, %k5, %k5
 +
 +	kmovd	%k5, %eax
 +	cmpl	$VEC_MASK, %eax
 +	jne	L(4x_vec_end)
 +
 +	addq	$(VEC_SIZE * 4), %rdi
 +	addq	$(VEC_SIZE * 4), %rsi
 +
 +	subq	$(VEC_SIZE * 4), %rdx
 +	cmpq	$(VEC_SIZE * 4), %rdx
 +	jae	L(loop_4x_vec)
 +
 +	/* Less than 4 * VEC.  */
 +	cmpq	$VEC_SIZE, %rdx
 +	jbe	L(last_vec)
 +	cmpq	$(VEC_SIZE * 2), %rdx
 +	jbe	L(last_2x_vec)
 +
 +L(last_4x_vec):
 +	/* From 2 * VEC to 4 * VEC. */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	addq	$VEC_SIZE, %rdi
 +	addq	$VEC_SIZE, %rsi
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	/* Use overlapping loads to avoid branches.  */
 +	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
 +	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +
 +	addq	$VEC_SIZE, %rdi
 +	addq	$VEC_SIZE, %rsi
 +	VMOVU	(%rsi), %YMM2
 +	VPCMPEQ (%rdi), %YMM2, %k2
 +	kmovd	%k2, %eax
 +	subl    $VEC_MASK, %eax
 +	jnz	L(first_vec)
 +	ret
 +
 +	.p2align 4
 +L(4x_vec_end):
 +	kmovd	%k1, %eax
 +	subl	$VEC_MASK, %eax
 +	jnz	L(first_vec)
 +	kmovd	%k2, %eax
 +	subl	$VEC_MASK, %eax
 +	jnz	L(first_vec_x1)
 +	kmovd	%k3, %eax
 +	subl	$VEC_MASK, %eax
 +	jnz	L(first_vec_x2)
 +	kmovd	%k4, %eax
 +	subl	$VEC_MASK, %eax
 +	tzcntl	%eax, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	xorl	%eax, %eax
 +	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
 +	jmp	L(wmemcmp_return)
 +# else
 +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 +	sub	%edx, %eax
 +# endif
 +	ret
 +
 +	.p2align 4
 +L(first_vec_x1):
 +	tzcntl	%eax, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	xorl	%eax, %eax
 +	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
 +	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
 +	jmp	L(wmemcmp_return)
 +# else
 +	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 +	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 +	sub	%edx, %eax
 +# endif
 +	ret
 +
 +	.p2align 4
 +L(first_vec_x2):
 +	tzcntl	%eax, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	xorl	%eax, %eax
 +	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
 +	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
 +	jmp	L(wmemcmp_return)
 +# else
 +	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 +	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 +	sub	%edx, %eax
 +# endif
 +	ret
 +END (MEMCMP)
 +#endif
 diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
 new file mode 100644
 index 00000000..4726d74a
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
 +#define MEMCMP __wmemcmp_evex_movbe
 +#define USE_AS_WMEMCMP 1
 +
 +#include "memcmp-evex-movbe.S"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-17.patch
+++ b/SOURCES/glibc-RHEL-15696-17.patch
--- a/SOURCES/glibc-RHEL-15696-18.patch
+++ b/SOURCES/glibc-RHEL-15696-18.patch
@ -1,735 +0,0 @@
 From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Tue, 23 Feb 2021 06:33:10 -0800
 Subject: [PATCH] x86: Add string/memory function tests in RTM region
 Content-type: text/plain; charset=UTF-8
 At function exit, AVX optimized string/memory functions have VZEROUPPER
 which triggers RTM abort.   When such functions are called inside a
 transactionally executing RTM region, RTM abort causes severe performance
 degradation.  Add tests to verify that string/memory functions won't
 cause RTM abort in RTM region.
 ---
 sysdeps/x86/Makefile          | 23 +++++++++++
 sysdeps/x86/tst-memchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memcmp-rtm.c  | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-memset-rtm.c  | 45 ++++++++++++++++++++++
 sysdeps/x86/tst-strchr-rtm.c  | 54 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strcpy-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-string-rtm.h  | 72 +++++++++++++++++++++++++++++++++++
 sysdeps/x86/tst-strlen-rtm.c  | 53 ++++++++++++++++++++++++++
 sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
 sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
 12 files changed, 618 insertions(+)
 create mode 100644 sysdeps/x86/tst-memchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
 create mode 100644 sysdeps/x86/tst-memmove-rtm.c
 create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
 create mode 100644 sysdeps/x86/tst-memset-rtm.c
 create mode 100644 sysdeps/x86/tst-strchr-rtm.c
 create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
 create mode 100644 sysdeps/x86/tst-string-rtm.h
 create mode 100644 sysdeps/x86/tst-strlen-rtm.c
 create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
 create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
 diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
 index 59e928e9..5be71ada 100644
 --- a/sysdeps/x86/Makefile
 +++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif
 ifeq ($(subdir),string)
 sysdep_routines += cacheinfo
 +
 +tests += \
 +  tst-memchr-rtm \
 +  tst-memcmp-rtm \
 +  tst-memmove-rtm \
 +  tst-memrchr-rtm \
 +  tst-memset-rtm \
 +  tst-strchr-rtm \
 +  tst-strcpy-rtm \
 +  tst-strlen-rtm \
 +  tst-strncmp-rtm \
 +  tst-strrchr-rtm
 +
 +CFLAGS-tst-memchr-rtm.c += -mrtm
 +CFLAGS-tst-memcmp-rtm.c += -mrtm
 +CFLAGS-tst-memmove-rtm.c += -mrtm
 +CFLAGS-tst-memrchr-rtm.c += -mrtm
 +CFLAGS-tst-memset-rtm.c += -mrtm
 +CFLAGS-tst-strchr-rtm.c += -mrtm
 +CFLAGS-tst-strcpy-rtm.c += -mrtm
 +CFLAGS-tst-strlen-rtm.c += -mrtm
 +CFLAGS-tst-strncmp-rtm.c += -mrtm
 +CFLAGS-tst-strrchr-rtm.c += -mrtm
 endif
 ifneq ($(enable-cet),no)
 diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
 new file mode 100644
 index 00000000..e4749401
 --- /dev/null
 +++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
 +/* Test case for memchr inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  string1[100] = 'c';
 +  string1[STRING_SIZE - 100] = 'c';
 +  char *p = memchr (string1, 'c', STRING_SIZE);
 +  if (p == &string1[100])
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  char *p = memchr (string1, 'c', STRING_SIZE);
 +  if (p == &string1[100])
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("memchr", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
 new file mode 100644
 index 00000000..e4c8a623
 --- /dev/null
 +++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
 +/* Test case for memcmp inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +char string2[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  memset (string2, 'a', STRING_SIZE);
 +  if (memcmp (string1, string2, STRING_SIZE) == 0)
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  if (memcmp (string1, string2, STRING_SIZE) == 0)
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("memcmp", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
 new file mode 100644
 index 00000000..4bf97ef1
 --- /dev/null
 +++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
 +/* Test case for memmove inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +char string2[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  if (memmove (string2, string1, STRING_SIZE) == string2
 +      && memcmp (string2, string1, STRING_SIZE) == 0)
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  if (memmove (string2, string1, STRING_SIZE) == string2
 +      && memcmp (string2, string1, STRING_SIZE) == 0)
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("memmove", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
 new file mode 100644
 index 00000000..a57a5a8e
 --- /dev/null
 +++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
 +/* Test case for memrchr inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  string1[100] = 'c';
 +  string1[STRING_SIZE - 100] = 'c';
 +  char *p = memrchr (string1, 'c', STRING_SIZE);
 +  if (p == &string1[STRING_SIZE - 100])
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  char *p = memrchr (string1, 'c', STRING_SIZE);
 +  if (p == &string1[STRING_SIZE - 100])
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("memrchr", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
 new file mode 100644
 index 00000000..bf343a4d
 --- /dev/null
 +++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
 +/* Test case for memset inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  return EXIT_SUCCESS;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  memset (string1, 'a', STRING_SIZE);
 +  return 0;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("memset", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
 new file mode 100644
 index 00000000..a82e29c0
 --- /dev/null
 +++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
 +/* Test case for strchr inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE - 1);
 +  string1[100] = 'c';
 +  string1[STRING_SIZE - 100] = 'c';
 +  char *p = strchr (string1, 'c');
 +  if (p == &string1[100])
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  char *p = strchr (string1, 'c');
 +  if (p == &string1[100])
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("strchr", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
 new file mode 100644
 index 00000000..2b2a583f
 --- /dev/null
 +++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
 +/* Test case for strcpy inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +char string2[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE - 1);
 +  if (strcpy (string2, string1) == string2
 +      && strcmp (string2, string1) == 0)
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  if (strcpy (string2, string1) == string2
 +      && strcmp (string2, string1) == 0)
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("strcpy", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
 new file mode 100644
 index 00000000..d2470afa
 --- /dev/null
 +++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
 +/* Test string function in a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <string.h>
 +#include <x86intrin.h>
 +#include <sys/platform/x86.h>
 +#include <support/check.h>
 +#include <support/test-driver.h>
 +
 +/* Check FUNCTION both outside and inside RTM transactions, LOOP times.  */
 +static int
 +do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
 +	   int (*function) (void))
 +{
 +  if (!CPU_FEATURE_USABLE (RTM))
 +    return EXIT_UNSUPPORTED;
 +
 +  int status = prepare ();
 +  if (status != EXIT_SUCCESS)
 +    return status;
 +
 +  unsigned int naborts = 0;
 +  unsigned int failed = 0;
 +  for (unsigned int i = 0; i < loop; i++)
 +    {
 +      failed |= function ();  /* Baseline call, not in a transaction.  */
 +      if (_xbegin () == _XBEGIN_STARTED)
 +	{
 +	  failed |= function ();
 +	  _xend ();
 +	}
 +      else
 +	{
 +	  failed |= function ();  /* Retry after a failed transaction.  */
 +	  ++naborts;
 +	}
 +    }
 +
 +  if (failed)
 +    FAIL_EXIT1 ("%s() failed", name);
 +
 +  if (naborts)
 +    {
 +      /* NB: Low single digit (<= 5%) noise-level aborts are normal for
 +	 TSX.  */
 +      double rate = 100 * ((double) naborts) / ((double) loop);
 +      if (rate > 5)
 +	FAIL_EXIT1 ("TSX abort rate: %.2f%% (%u out of %u)",
 +		    rate, naborts, loop);
 +    }
 +
 +  return EXIT_SUCCESS;
 +}
 +
 +static int do_test (void);
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
 new file mode 100644
 index 00000000..0dcf14db
 --- /dev/null
 +++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
 +/* Test case for strlen inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE - 1);
 +  string1[STRING_SIZE - 100] = '\0';
 +  size_t len = strlen (string1);
 +  if (len == STRING_SIZE - 100)
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  size_t len = strlen (string1);
 +  if (len == STRING_SIZE - 100)
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("strlen", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
 new file mode 100644
 index 00000000..236ad951
 --- /dev/null
 +++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
 +/* Test case for strncmp inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +char string2[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE - 1);
 +  memset (string2, 'a', STRING_SIZE - 1);
 +  if (strncmp (string1, string2, STRING_SIZE) == 0)
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  if (strncmp (string1, string2, STRING_SIZE) == 0)
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("strncmp", LOOP, prepare, function);
 +}
 diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
 new file mode 100644
 index 00000000..e32bfaf5
 --- /dev/null
 +++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
 +/* Test case for strrchr inside a transactionally executing RTM region.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <tst-string-rtm.h>
 +
 +#define LOOP 3000
 +#define STRING_SIZE 1024
 +char string1[STRING_SIZE];
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +prepare (void)
 +{
 +  memset (string1, 'a', STRING_SIZE - 1);
 +  string1[STRING_SIZE - 100] = 'c';
 +  char *p = strrchr (string1, 'c');
 +  if (p == &string1[STRING_SIZE - 100])
 +    return EXIT_SUCCESS;
 +  else
 +    return EXIT_FAILURE;
 +}
 +
 +__attribute__ ((noinline, noclone))
 +static int
 +function (void)
 +{
 +  char *p = strrchr (string1, 'c');
 +  if (p == &string1[STRING_SIZE - 100])
 +    return 0;
 +  else
 +    return 1;
 +}
 +
 +static int
 +do_test (void)
 +{
 +  return do_test_1 ("strrchr", LOOP, prepare, function);
 +}
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-19.patch
+++ b/SOURCES/glibc-RHEL-15696-19.patch
@ -1,148 +0,0 @@
 From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Sun, 7 Mar 2021 09:44:18 -0800
 Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
 Content-type: text/plain; charset=UTF-8
 Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
 with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
 with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
 function exit.
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c       | 14 +++++++++-----
 sysdeps/x86_64/multiarch/ifunc-memset.h          | 13 ++++++++-----
 sysdeps/x86_64/multiarch/ifunc-wmemset.h         | 12 ++++++------
 .../multiarch/memset-avx512-unaligned-erms.S     | 16 ++++++++--------
 4 files changed, 31 insertions(+), 24 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index c1efeec0..d969a156 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)),
 			      __memset_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512VL),
 			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __wmemset_avx512_unaligned))
 #ifdef SHARED
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
 index 6f3375cc..19795938 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
 -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 -	return OPTIMIZE (avx512_no_vzeroupper);
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (avx512_unaligned_erms);
 -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 -	return OPTIMIZE (avx512_unaligned_erms);
 +	  return OPTIMIZE (avx512_unaligned);
 +	}
 -      return OPTIMIZE (avx512_unaligned);
 +      return OPTIMIZE (avx512_no_vzeroupper);
     }
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
 diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 index bdc94c6c..98c5d406 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
 -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
 -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
 -	  && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 -	return OPTIMIZE (avx512_unaligned);
 -
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 -	return OPTIMIZE (evex_unaligned);
 +	{
 +	  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
 +	    return OPTIMIZE (avx512_unaligned);
 +
 +	  return OPTIMIZE (evex_unaligned);
 +	}
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 	return OPTIMIZE (avx2_unaligned_rtm);
 diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 index 0783979c..22e7b187 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
 #if IS_IN (libc)
 # define VEC_SIZE	64
 -# define VEC(i)		zmm##i
 +# define XMM0		xmm16
 +# define YMM0		ymm16
 +# define VEC0		zmm16
 +# define VEC(i)		VEC##i
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 +# define VZEROUPPER
 # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 -  vmovd d, %xmm0; \
   movq r, %rax; \
 -  vpbroadcastb %xmm0, %xmm0; \
 -  vpbroadcastq %xmm0, %zmm0
 +  vpbroadcastb d, %VEC0
 # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
 -  vmovd d, %xmm0; \
   movq r, %rax; \
 -  vpbroadcastd %xmm0, %xmm0; \
 -  vpbroadcastq %xmm0, %zmm0
 +  vpbroadcastd d, %VEC0
 -# define SECTION(p)		p##.avx512
 +# define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-2.patch
+++ b/SOURCES/glibc-RHEL-15696-2.patch
@ -1,230 +0,0 @@
 From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:25:56 -0800
 Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
 [BZ# 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes memcmp/wmemcmp for x32.  Tested on x86-64 and x32.  On
 x86-64, libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
 	length.  Clear the upper 32 bits of RDX register.
 	* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
 	* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
 	tst-size_t-wmemcmp.
 	* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
 	* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
 ---
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S |  7 +-
 sysdeps/x86_64/multiarch/memcmp-sse4.S       |  9 ++-
 sysdeps/x86_64/multiarch/memcmp-ssse3.S      |  7 +-
 sysdeps/x86_64/x32/Makefile                  |  4 +-
 sysdeps/x86_64/x32/tst-size_t-memcmp.c       | 76 ++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c      | 20 ++++++
 6 files changed, 114 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 index 30f764c3..e3a35b89 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
 	.section .text.avx,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 -	shl	$2, %rdx
 +	shl	$2, %RDX_LP
 +# elif defined __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 # endif
 -	cmpq	$VEC_SIZE, %rdx
 +	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
 index 8e164f2c..302900f5 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
 	.section .text.sse4.1,"ax",@progbits
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 -	shl	$2, %rdx
 +	shl	$2, %RDX_LP
 +# elif defined __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 # endif
 	pxor	%xmm0, %xmm0
 -	cmp	$79, %rdx
 +	cmp	$79, %RDX_LP
 	ja	L(79bytesormore)
 # ifndef USE_AS_WMEMCMP
 -	cmp	$1, %rdx
 +	cmp	$1, %RDX_LP
 	je	L(firstbyte)
 # endif
 	add	%rdx, %rsi
 diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
 index 6f76c641..69d030fc 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
 	atom_text_section
 ENTRY (MEMCMP)
 # ifdef USE_AS_WMEMCMP
 -	shl	$2, %rdx
 -	test	%rdx, %rdx
 +	shl	$2, %RDX_LP
 +	test	%RDX_LP, %RDX_LP
 	jz	L(equal)
 +# elif defined __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 # endif
 	mov	%rdx, %rcx
 	mov	%rdi, %rdx
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index 7d528889..ddec7f04 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif
 ifeq ($(subdir),string)
 -tests += tst-size_t-memchr
 +tests += tst-size_t-memchr tst-size_t-memcmp
 endif
 ifeq ($(subdir),wcsmbs)
 -tests += tst-size_t-wmemchr
 +tests += tst-size_t-wmemchr tst-size_t-wmemcmp
 endif
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
 new file mode 100644
 index 00000000..9bd6fdb4
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
 +/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_MAIN
 +#ifdef WIDE
 +# define TEST_NAME "wmemcmp"
 +#else
 +# define TEST_NAME "memcmp"
 +#endif
 +
 +#include "test-size_t.h"
 +
 +#ifdef WIDE
 +# include <inttypes.h>
 +# include <wchar.h>
 +
 +# define MEMCMP wmemcmp
 +# define CHAR wchar_t
 +#else
 +# define MEMCMP memcmp
 +# define CHAR char
 +#endif
 +
 +IMPL (MEMCMP, 1)
 +
 +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 +
 +static int
 +__attribute__ ((noinline, noclone))
 +do_memcmp (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  memcpy (buf1, buf2, page_size);
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      int res = do_memcmp (dest, src);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i != 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
 new file mode 100644
 index 00000000..e8b5ffd0
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
@@ -0,0 +1,20 @@
 +/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define WIDE 1
 +#include "tst-size_t-memcmp.c"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-20.patch
+++ b/SOURCES/glibc-RHEL-15696-20.patch
@ -1,164 +0,0 @@
 From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Sun, 7 Mar 2021 09:45:23 -0800
 Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
 Content-type: text/plain; charset=UTF-8
 Update ifunc-memmove.h to select the function optimized with AVX512
 instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
 AVX512VL since VZEROUPPER isn't needed at function exit.
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 24 +++++++++---------
 sysdeps/x86_64/multiarch/ifunc-memmove.h      | 12 +++++----
 .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
 3 files changed, 42 insertions(+), 19 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index d969a156..fec384f6 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memmove_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memmove_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memmove,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memmove_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
 			      __memmove_ssse3_back)
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __memcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __memcpy_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __mempcpy_chk_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
 			      CPU_FEATURE_USABLE (AVX),
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX512F),
 			      __mempcpy_avx512_no_vzeroupper)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      CPU_FEATURE_USABLE (AVX512VL),
 			      __mempcpy_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, mempcpy,
 			      CPU_FEATURE_USABLE (AVX),
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
 index fa09b9fb..014e95c7 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
 -      if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 -	return OPTIMIZE (avx512_no_vzeroupper);
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 +	    return OPTIMIZE (avx512_unaligned_erms);
 -      if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 -	return OPTIMIZE (avx512_unaligned_erms);
 +	  return OPTIMIZE (avx512_unaligned);
 +	}
 -      return OPTIMIZE (avx512_unaligned);
 +      return OPTIMIZE (avx512_no_vzeroupper);
     }
   if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 index aac1515c..848848ab 100644
 --- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
 #if IS_IN (libc)
 # define VEC_SIZE	64
 -# define VEC(i)		zmm##i
 +# define XMM0		xmm16
 +# define XMM1		xmm17
 +# define YMM0		ymm16
 +# define YMM1		ymm17
 +# define VEC0		zmm16
 +# define VEC1		zmm17
 +# define VEC2		zmm18
 +# define VEC3		zmm19
 +# define VEC4		zmm20
 +# define VEC5		zmm21
 +# define VEC6		zmm22
 +# define VEC7		zmm23
 +# define VEC8		zmm24
 +# define VEC9		zmm25
 +# define VEC10		zmm26
 +# define VEC11		zmm27
 +# define VEC12		zmm28
 +# define VEC13		zmm29
 +# define VEC14		zmm30
 +# define VEC15		zmm31
 +# define VEC(i)		VEC##i
 # define VMOVNT		vmovntdq
 # define VMOVU		vmovdqu64
 # define VMOVA		vmovdqa64
 +# define VZEROUPPER
 -# define SECTION(p)		p##.avx512
 +# define SECTION(p)		p##.evex512
 # define MEMMOVE_SYMBOL(p,s)	p##_avx512_##s
 # include "memmove-vec-unaligned-erms.S"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-21.patch
+++ b/SOURCES/glibc-RHEL-15696-21.patch
@ -1,71 +0,0 @@
 From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
 From: Sunil K Pandey <skpgkp2@gmail.com>
 Date: Thu, 1 Apr 2021 15:47:04 -0700
 Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
 Content-type: text/plain; charset=UTF-8
 Fix some indentations of ifdef in file strlen-evex.S which are off by 1
 and confusing to read.
 ---
 sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
 index cd022509..05838190 100644
 --- a/sysdeps/x86_64/multiarch/strlen-evex.S
 +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -276,10 +276,10 @@ L(last_2x_vec):
 	.p2align 4
 L(first_vec_x0_check):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %eax
 -# endif
 +#  endif
 	/* Check the end of data.  */
 	cmpq	%rax, %rsi
 	jbe	L(max)
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %eax
 -# endif
 +#  endif
 	/* Check the end of data.  */
 	cmpq	%rax, %rsi
 	jbe	L(max)
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
 	.p2align 4
 L(first_vec_x2_check):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %eax
 -# endif
 +#  endif
 	/* Check the end of data.  */
 	cmpq	%rax, %rsi
 	jbe	L(max)
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
 	.p2align 4
 L(first_vec_x3_check):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 	sall	$2, %eax
 -# endif
 +#  endif
 	/* Check the end of data.  */
 	cmpq	%rax, %rsi
 	jbe	L(max)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-22.patch
+++ b/SOURCES/glibc-RHEL-15696-22.patch
@ -1,51 +0,0 @@
 From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 19 Apr 2021 07:07:21 -0700
 Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
 Content-type: text/plain; charset=UTF-8
 Since __strlen_evex and __strnlen_evex added by
 commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Fri Mar 5 06:24:52 2021 -0800
    x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
 use sarx:
 c4 e2 6a f7 c0       	sarx   %edx,%eax,%eax
 require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
 ifunc-avx2.h already requires BMI2 for EVEX implementation.
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index fec384f6..cbfc1a5d 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_evex)
 	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __strnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_evex)
 	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-23.patch
+++ b/SOURCES/glibc-RHEL-15696-23.patch
@ -1,584 +0,0 @@
 From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 3 May 2021 03:01:58 -0400
 Subject: [PATCH] x86: Optimize memchr-avx2.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes memchr-avx2.S. The optimizations include
 replacing some branches with cmovcc, avoiding some branches entirely
 in the less_4x_vec case, making the page cross logic less strict,
 and saving a few instructions in the loop return path. test-memchr,
 test-rawmemchr, and test-wmemchr are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
 1 file changed, 247 insertions(+), 178 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
 index cf893e77..b377f22e 100644
 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
 # ifdef USE_AS_WMEMCHR
 #  define VPCMPEQ	vpcmpeqd
 +#  define VPBROADCAST	vpbroadcastd
 +#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 +#  define VPBROADCAST	vpbroadcastb
 +#  define CHAR_SIZE	1
 +# endif
 +
 +# ifdef USE_AS_RAWMEMCHR
 +#  define ERAW_PTR_REG	ecx
 +#  define RRAW_PTR_REG	rcx
 +#  define ALGN_PTR_REG	rdi
 +# else
 +#  define ERAW_PTR_REG	edi
 +#  define RRAW_PTR_REG	rdi
 +#  define ALGN_PTR_REG	rcx
 # endif
 # ifndef VZEROUPPER
@@ -39,6 +53,7 @@
 # endif
 # define VEC_SIZE 32
 +# define PAGE_SIZE 4096
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
 	test	%RDX_LP, %RDX_LP
 	jz	L(null)
 # endif
 -	movl	%edi, %ecx
 -	/* Broadcast CHAR to YMM0.  */
 -	vmovd	%esi, %xmm0
 # ifdef USE_AS_WMEMCHR
 	shl	$2, %RDX_LP
 -	vpbroadcastd %xmm0, %ymm0
 # else
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 #  endif
 -	vpbroadcastb %xmm0, %ymm0
 # endif
 +	/* Broadcast CHAR to YMMMATCH.  */
 +	vmovd	%esi, %xmm0
 +	VPBROADCAST %xmm0, %ymm0
 	/* Check if we may cross page boundary with one vector load.  */
 -	andl	$(2 * VEC_SIZE - 1), %ecx
 -	cmpl	$VEC_SIZE, %ecx
 -	ja	L(cros_page_boundary)
 +	movl	%edi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.  */
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 +	VPCMPEQ	(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -
 # ifndef USE_AS_RAWMEMCHR
 -	jnz	L(first_vec_x0_check)
 -	/* Adjust length and check the end of data.  */
 -	subq	$VEC_SIZE, %rdx
 -	jbe	L(zero)
 -# else
 -	jnz	L(first_vec_x0)
 +	/* If length < CHAR_PER_VEC handle special.  */
 +	cmpq	$VEC_SIZE, %rdx
 +	jbe	L(first_vec_x0)
 # endif
 -
 -	/* Align data for aligned loads in the loop.  */
 -	addq	$VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 +	testl	%eax, %eax
 +	jz	L(aligned_more)
 +	tzcntl	%eax, %eax
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 # ifndef USE_AS_RAWMEMCHR
 -	/* Adjust length.  */
 -	addq	%rcx, %rdx
 +	.p2align 5
 +L(first_vec_x0):
 +	/* Check if first match was before length.  */
 +	tzcntl	%eax, %eax
 +	xorl	%ecx, %ecx
 +	cmpl	%eax, %edx
 +	leaq	(%rdi, %rax), %rax
 +	cmovle	%rcx, %rax
 +	VZEROUPPER_RETURN
 -	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 +L(null):
 +	xorl	%eax, %eax
 +	ret
 # endif
 -	jmp	L(more_4x_vec)
 -
 	.p2align 4
 -L(cros_page_boundary):
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 +L(cross_page_boundary):
 +	/* Save pointer before aligning as its original value is necessary
 +	   for computer return address if byte is found or adjusting length
 +	   if it is not and this is memchr.  */
 +	movq	%rdi, %rcx
 +	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
 +	   rdi for rawmemchr.  */
 +	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 +	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 +# ifndef USE_AS_RAWMEMCHR
 +	/* Calculate length until end of page (length checked for a
 +	   match).  */
 +	leaq	1(%ALGN_PTR_REG), %rsi
 +	subq	%RRAW_PTR_REG, %rsi
 +# endif
 	/* Remove the leading bytes.  */
 -	sarl	%cl, %eax
 -	testl	%eax, %eax
 -	jz	L(aligned_more)
 -	tzcntl	%eax, %eax
 +	sarxl	%ERAW_PTR_REG, %eax, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 +	cmpq	%rsi, %rdx
 +	jbe	L(first_vec_x0)
 # endif
 -	addq	%rdi, %rax
 -	addq	%rcx, %rax
 +	testl	%eax, %eax
 +	jz	L(cross_page_continue)
 +	tzcntl	%eax, %eax
 +	addq	%RRAW_PTR_REG, %rax
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 	.p2align 4
 -L(aligned_more):
 -# ifndef USE_AS_RAWMEMCHR
 -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
 -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
 -	   overflow.  */
 -	negq	%rcx
 -	addq	$VEC_SIZE, %rcx
 +L(first_vec_x1):
 +	tzcntl	%eax, %eax
 +	incq	%rdi
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 -	/* Check the end of data.  */
 -	subq	%rcx, %rdx
 -	jbe	L(zero)
 -# endif
 +	.p2align 4
 +L(first_vec_x2):
 +	tzcntl	%eax, %eax
 +	addq	$(VEC_SIZE + 1), %rdi
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(first_vec_x3):
 +	tzcntl	%eax, %eax
 +	addq	$(VEC_SIZE * 2 + 1), %rdi
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 -	addq	$VEC_SIZE, %rdi
 -# ifndef USE_AS_RAWMEMCHR
 -	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 +	.p2align 4
 +L(first_vec_x4):
 +	tzcntl	%eax, %eax
 +	addq	$(VEC_SIZE * 3 + 1), %rdi
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 -L(more_4x_vec):
 +	.p2align 4
 +L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 +# ifndef USE_AS_RAWMEMCHR
 +L(cross_page_continue):
 +	/* Align data to VEC_SIZE - 1.  */
 +	xorl	%ecx, %ecx
 +	subl	%edi, %ecx
 +	orq	$(VEC_SIZE - 1), %rdi
 +	/* esi is for adjusting length to see if near the end.  */
 +	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
 +# else
 +	orq	$(VEC_SIZE - 1), %rdi
 +L(cross_page_continue):
 +# endif
 +	/* Load first VEC regardless.  */
 +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 +# ifndef USE_AS_RAWMEMCHR
 +	/* Adjust length. If near end handle specially.  */
 +	subq	%rsi, %rdx
 +	jbe	L(last_4x_vec_or_less)
 +# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
 +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 -	addq	$(VEC_SIZE * 4), %rdi
 +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x4)
 # ifndef USE_AS_RAWMEMCHR
 +	/* Check if at last VEC_SIZE * 4 length.  */
 	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 -
 -	/* Align data to 4 * VEC_SIZE.  */
 -	movq	%rdi, %rcx
 -	andl	$(4 * VEC_SIZE - 1), %ecx
 -	andq	$-(4 * VEC_SIZE), %rdi
 -
 -# ifndef USE_AS_RAWMEMCHR
 -	/* Adjust length.  */
 +	jbe	L(last_4x_vec_or_less_cmpeq)
 +	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 +	   length.  */
 +	incq	%rdi
 +	movl	%edi, %ecx
 +	orq	$(VEC_SIZE * 4 - 1), %rdi
 +	andl	$(VEC_SIZE * 4 - 1), %ecx
 	addq	%rcx, %rdx
 +# else
 +	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
 +	incq	%rdi
 +	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 +	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
 -
 +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
 +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
 +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
 	vpor	%ymm1, %ymm2, %ymm5
 	vpor	%ymm3, %ymm4, %ymm6
 	vpor	%ymm5, %ymm6, %ymm5
 -	vpmovmskb %ymm5, %eax
 -	testl	%eax, %eax
 -	jnz	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 +	vpmovmskb %ymm5, %ecx
 # ifdef USE_AS_RAWMEMCHR
 -	jmp	L(loop_4x_vec)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	testl	%ecx, %ecx
 +	jz	L(loop_4x_vec)
 # else
 -	subq	$(VEC_SIZE * 4), %rdx
 -	ja	L(loop_4x_vec)
 +	testl	%ecx, %ecx
 +	jnz	L(loop_4x_vec_end)
 -L(last_4x_vec_or_less):
 -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 -	addl	$(VEC_SIZE * 2), %edx
 -	jle	L(last_2x_vec)
 +	subq	$-(VEC_SIZE * 4), %rdi
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	subq	$(VEC_SIZE * 4), %rdx
 +	ja	L(loop_4x_vec)
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 +	/* Fall through into less than 4 remaining vectors of length case.
 +	 */
 +	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 +	.p2align 4
 +L(last_4x_vec_or_less):
 +	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 +	jnz	L(first_vec_x1_check)
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 +	/* If remaining length > VEC_SIZE * 2.  */
 +	addl	$(VEC_SIZE * 2), %edx
 +	jg	L(last_4x_vec)
 -	jnz	L(first_vec_x2_check)
 -	subl	$VEC_SIZE, %edx
 -	jle	L(zero)
 +L(last_2x_vec):
 +	/* If remaining length < VEC_SIZE.  */
 +	addl	$VEC_SIZE, %edx
 +	jle	L(zero_end)
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
 +	/* Check VEC2 and compare any match with remaining length.  */
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -
 -	jnz	L(first_vec_x3_check)
 -	xorl	%eax, %eax
 +	tzcntl	%eax, %eax
 +	cmpl	%eax, %edx
 +	jbe	L(set_zero_end)
 +	addq	$(VEC_SIZE + 1), %rdi
 +	addq	%rdi, %rax
 +L(zero_end):
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(last_2x_vec):
 -	addl	$(VEC_SIZE * 2), %edx
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 +L(loop_4x_vec_end):
 +# endif
 +	/* rawmemchr will fall through into this if match was found in
 +	   loop.  */
 +
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 +	jnz	L(last_vec_x1_return)
 -	jnz	L(first_vec_x0_check)
 -	subl	$VEC_SIZE, %edx
 -	jle	L(zero)
 -
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1_check)
 -	xorl	%eax, %eax
 -	VZEROUPPER_RETURN
 +	jnz	L(last_vec_x2_return)
 -	.p2align 4
 -L(first_vec_x0_check):
 -	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 +	vpmovmskb %ymm3, %eax
 +	/* Combine VEC3 matches (eax) with VEC4 matches (ecx).  */
 +	salq	$32, %rcx
 +	orq	%rcx, %rax
 +	tzcntq	%rax, %rax
 +# ifdef USE_AS_RAWMEMCHR
 +	subq	$(VEC_SIZE * 2 - 1), %rdi
 +# else
 +	subq	$-(VEC_SIZE * 2 + 1), %rdi
 +# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 +# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
 L(first_vec_x1_check):
 	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$VEC_SIZE, %rax
 +	/* Adjust length.  */
 +	subl	$-(VEC_SIZE * 4), %edx
 +	/* Check if match within remaining length.  */
 +	cmpl	%eax, %edx
 +	jbe	L(set_zero_end)
 +	incq	%rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 +	.p2align 4
 +L(set_zero_end):
 +	xorl	%eax, %eax
 +	VZEROUPPER_RETURN
 +# endif
 	.p2align 4
 -L(first_vec_x2_check):
 +L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$(VEC_SIZE * 2), %rax
 +# ifdef USE_AS_RAWMEMCHR
 +	subq	$(VEC_SIZE * 4 - 1), %rdi
 +# else
 +	incq	%rdi
 +# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x3_check):
 +L(last_vec_x2_return):
 	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$(VEC_SIZE * 3), %rax
 +# ifdef USE_AS_RAWMEMCHR
 +	subq	$(VEC_SIZE * 3 - 1), %rdi
 +# else
 +	subq	$-(VEC_SIZE + 1), %rdi
 +# endif
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 +# ifndef USE_AS_RAWMEMCHR
 	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 -	jmp     L(return_vzeroupper)
 +L(last_4x_vec_or_less_cmpeq):
 +	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	/* Check first VEC regardless.  */
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x1_check)
 +	/* If remaining length <= CHAR_PER_VEC * 2.  */
 +	addl	$(VEC_SIZE * 2), %edx
 +	jle	L(last_2x_vec)
 	.p2align 4
 -L(null):
 -	xorl	%eax, %eax
 -	ret
 -# endif
 +L(last_4x_vec):
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x2_return)
 -	.p2align 4
 -L(first_vec_x0):
 -	tzcntl	%eax, %eax
 -	addq	%rdi, %rax
 -	VZEROUPPER_RETURN
 +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb %ymm1, %eax
 -	.p2align 4
 -L(first_vec_x1):
 -	tzcntl	%eax, %eax
 -	addq	$VEC_SIZE, %rax
 -	addq	%rdi, %rax
 -	VZEROUPPER_RETURN
 +	/* Create mask for possible matches within remaining length.  */
 +	movq	$-1, %rcx
 +	bzhiq	%rdx, %rcx, %rcx
 -	.p2align 4
 -L(first_vec_x2):
 +	/* Test matches in data against length match.  */
 +	andl	%ecx, %eax
 +	jnz	L(last_vec_x3)
 +
 +	/* if remaining length <= VEC_SIZE * 3 (Note this is after
 +	   remaining length was found to be > VEC_SIZE * 2.  */
 +	subl	$VEC_SIZE, %edx
 +	jbe	L(zero_end2)
 +
 +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	/* Shift remaining length mask for last VEC.  */
 +	shrq	$32, %rcx
 +	andl	%ecx, %eax
 +	jz	L(zero_end2)
 	tzcntl	%eax, %eax
 -	addq	$(VEC_SIZE * 2), %rax
 +	addq	$(VEC_SIZE * 3 + 1), %rdi
 	addq	%rdi, %rax
 +L(zero_end2):
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(4x_vec_end):
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	vpmovmskb %ymm2, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -	vpmovmskb %ymm3, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -	vpmovmskb %ymm4, %eax
 -	testl	%eax, %eax
 -L(first_vec_x3):
 +L(last_vec_x3):
 	tzcntl	%eax, %eax
 -	addq	$(VEC_SIZE * 3), %rax
 +	subq	$-(VEC_SIZE * 2 + 1), %rdi
 	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 +# endif
 END (MEMCHR)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-24.patch
+++ b/SOURCES/glibc-RHEL-15696-24.patch
@ -1,388 +0,0 @@
 From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Wed, 9 Jun 2021 16:25:32 -0400
 Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
 #27974]
 Content-type: text/plain; charset=UTF-8
 This commit fixes the bug mentioned in the previous commit.
 The previous implementations of wmemchr in these files relied
 on n * sizeof(wchar_t) not overflowing, which is not guaranteed by the standard.
 The new overflow tests added in the previous commit now
 pass (As well as all the other tests).
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/memchr.S                | 77 +++++++++++++++++++-------
 sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
 2 files changed, 98 insertions(+), 37 deletions(-)
 diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
 index cb320257..24f9a0c5 100644
 --- a/sysdeps/x86_64/memchr.S
 +++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
 #ifdef USE_AS_WMEMCHR
 # define MEMCHR		wmemchr
 # define PCMPEQ		pcmpeqd
 +# define CHAR_PER_VEC	4
 #else
 # define MEMCHR		memchr
 # define PCMPEQ		pcmpeqb
 +# define CHAR_PER_VEC	16
 #endif
 /* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
 	movd	%esi, %xmm1
 	mov	%edi, %ecx
 +#ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +#endif
 #ifdef USE_AS_WMEMCHR
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
 -	shl	$2, %RDX_LP
 #else
 -# ifdef __ILP32__
 -	/* Clear the upper 32 bits.  */
 -	movl	%edx, %edx
 -# endif
 	punpcklbw %xmm1, %xmm1
 	test	%RDX_LP, %RDX_LP
 	jz	L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
 	test	%eax, %eax
 	jnz	L(matches_1)
 -	sub	$16, %rdx
 +	sub	$CHAR_PER_VEC, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 	and	$15, %ecx
 	and	$-16, %rdi
 +#ifdef USE_AS_WMEMCHR
 +	shr	$2, %ecx
 +#endif
 	add	%rcx, %rdx
 -	sub	$64, %rdx
 +	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	jmp	L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 -/* Check if there is a match.  */
 +	/* Check if there is a match.  */
 	pmovmskb %xmm0, %eax
 -/* Remove the leading bytes.  */
 +	/* Remove the leading bytes.  */
 	sar	%cl, %eax
 	test	%eax, %eax
 	je	L(unaligned_no_match)
 -/* Check which byte is a match.  */
 +	/* Check which byte is a match.  */
 	bsf	%eax, %eax
 -
 +#ifdef USE_AS_WMEMCHR
 +	mov	%eax, %esi
 +	shr	$2, %esi
 +	sub	%rsi, %rdx
 +#else
 	sub	%rax, %rdx
 +#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	add	%rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
 	.p2align 4
 L(unaligned_no_match):
 -        /* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 +	/* "rcx" is less than 16.  Calculate "rdx + rcx - 16" by using
 	   "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
 	   possible addition overflow.  */
 	neg	%rcx
 	add	$16, %rcx
 +#ifdef USE_AS_WMEMCHR
 +	shr	$2, %ecx
 +#endif
 	sub	%rcx, %rdx
 	jbe	L(return_null)
 	add	$16, %rdi
 -	sub	$64, %rdx
 +	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
 	test	$0x3f, %rdi
 	jz	L(align64_loop)
 -	sub	$64, %rdx
 +	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
 	mov	%rdi, %rcx
 	and	$-64, %rdi
 	and	$63, %ecx
 +#ifdef USE_AS_WMEMCHR
 +	shr	$2, %ecx
 +#endif
 	add	%rcx, %rdx
 	.p2align 4
 L(align64_loop):
 -	sub	$64, %rdx
 +	sub	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(exit_loop)
 	movdqa	(%rdi), %xmm0
 	movdqa	16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
 	.p2align 4
 L(exit_loop):
 -	add	$32, %edx
 +	add	$(CHAR_PER_VEC * 2), %edx
 	jle	L(exit_loop_32)
 	movdqa	(%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
 	pmovmskb %xmm3, %eax
 	test	%eax, %eax
 	jnz	L(matches32_1)
 -	sub	$16, %edx
 +	sub	$CHAR_PER_VEC, %edx
 	jle	L(return_null)
 	PCMPEQ	48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
 	.p2align 4
 L(exit_loop_32):
 -	add	$32, %edx
 +	add	$(CHAR_PER_VEC * 2), %edx
 	movdqa	(%rdi), %xmm0
 	PCMPEQ	%xmm1, %xmm0
 	pmovmskb %xmm0, %eax
 	test	%eax, %eax
 	jnz	L(matches_1)
 -	sub	$16, %edx
 +	sub	$CHAR_PER_VEC, %edx
 	jbe	L(return_null)
 	PCMPEQ	16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
 	.p2align 4
 L(matches_1):
 	bsf	%eax, %eax
 +#ifdef USE_AS_WMEMCHR
 +	mov	%eax, %esi
 +	shr	$2, %esi
 +	sub	%rsi, %rdx
 +#else
 	sub	%rax, %rdx
 +#endif
 	jbe	L(return_null)
 	add	%rdi, %rax
 	ret
@@ -301,7 +322,13 @@ L(matches_1):
 	.p2align 4
 L(matches16_1):
 	bsf	%eax, %eax
 +#ifdef USE_AS_WMEMCHR
 +	mov	%eax, %esi
 +	shr	$2, %esi
 +	sub	%rsi, %rdx
 +#else
 	sub	%rax, %rdx
 +#endif
 	jbe	L(return_null)
 	lea	16(%rdi, %rax), %rax
 	ret
@@ -309,7 +336,13 @@ L(matches16_1):
 	.p2align 4
 L(matches32_1):
 	bsf	%eax, %eax
 +#ifdef USE_AS_WMEMCHR
 +	mov	%eax, %esi
 +	shr	$2, %esi
 +	sub	%rsi, %rdx
 +#else
 	sub	%rax, %rdx
 +#endif
 	jbe	L(return_null)
 	lea	32(%rdi, %rax), %rax
 	ret
@@ -317,7 +350,13 @@ L(matches32_1):
 	.p2align 4
 L(matches48_1):
 	bsf	%eax, %eax
 +#ifdef USE_AS_WMEMCHR
 +	mov	%eax, %esi
 +	shr	$2, %esi
 +	sub	%rsi, %rdx
 +#else
 	sub	%rax, %rdx
 +#endif
 	jbe	L(return_null)
 	lea	48(%rdi, %rax), %rax
 	ret
 diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
 index b377f22e..16027abb 100644
 --- a/sysdeps/x86_64/multiarch/memchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
 +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
 -	test	%RDX_LP, %RDX_LP
 -	jz	L(null)
 -# endif
 -# ifdef USE_AS_WMEMCHR
 -	shl	$2, %RDX_LP
 -# else
 #  ifdef __ILP32__
 -	/* Clear the upper 32 bits.  */
 -	movl	%edx, %edx
 +	/* Clear upper bits.  */
 +	and	%RDX_LP, %RDX_LP
 +#  else
 +	test	%RDX_LP, %RDX_LP
 #  endif
 +	jz	L(null)
 # endif
 	/* Broadcast CHAR to YMMMATCH.  */
 	vmovd	%esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
 	vpmovmskb %ymm1, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* If length < CHAR_PER_VEC handle special.  */
 -	cmpq	$VEC_SIZE, %rdx
 +	cmpq	$CHAR_PER_VEC, %rdx
 	jbe	L(first_vec_x0)
 # endif
 	testl	%eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
 L(first_vec_x0):
 	/* Check if first match was before length.  */
 	tzcntl	%eax, %eax
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %edx
 +#  endif
 	xorl	%ecx, %ecx
 	cmpl	%eax, %edx
 	leaq	(%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
 # endif
 	.p2align 4
 L(cross_page_boundary):
 -	/* Save pointer before aligning as its original value is necessary
 -	   for computer return address if byte is found or adjusting length
 -	   if it is not and this is memchr.  */
 +	/* Save pointer before aligning as its original value is
 +	   necessary for computer return address if byte is found or
 +	   adjusting length if it is not and this is memchr.  */
 	movq	%rdi, %rcx
 -	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
 -	   rdi for rawmemchr.  */
 +	/* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
 +	   and rdi for rawmemchr.  */
 	orq	$(VEC_SIZE - 1), %ALGN_PTR_REG
 	VPCMPEQ	-(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
 	   match).  */
 	leaq	1(%ALGN_PTR_REG), %rsi
 	subq	%RRAW_PTR_REG, %rsi
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 +	shrl	$2, %esi
 +#  endif
 # endif
 	/* Remove the leading bytes.  */
 	sarxl	%ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
 	orq	$(VEC_SIZE - 1), %rdi
 	/* esi is for adjusting length to see if near the end.  */
 	leal	(VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %esi
 +#  endif
 # else
 	orq	$(VEC_SIZE - 1), %rdi
 L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
 # ifndef USE_AS_RAWMEMCHR
 	/* Check if at last VEC_SIZE * 4 length.  */
 -	subq	$(VEC_SIZE * 4), %rdx
 +	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 	/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
 	   length.  */
@@ -221,6 +231,10 @@ L(cross_page_continue):
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 +#  endif
 	addq	%rcx, %rdx
 # else
 	/* Align data to VEC_SIZE * 4 - 1 for loop.  */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
 	subq	$-(VEC_SIZE * 4), %rdi
 -	subq	$(VEC_SIZE * 4), %rdx
 +	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 -	/* Fall through into less than 4 remaining vectors of length case.
 -	 */
 +	/* Fall through into less than 4 remaining vectors of length
 +	   case.  */
 	VPCMPEQ	(VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 	.p2align 4
 L(last_4x_vec_or_less):
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %edx
 +#  endif
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 	jnz	L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	(VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
 	vpmovmskb %ymm1, %eax
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %edx
 +#  endif
 	subq	$-(VEC_SIZE * 4), %rdi
 	/* Check first VEC regardless.  */
 	testl	%eax, %eax
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-25.patch
+++ b/SOURCES/glibc-RHEL-15696-25.patch
@ -1,767 +0,0 @@
 From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 19 Apr 2021 19:36:07 -0400
 Subject: [PATCH] x86: Optimize strlen-avx2.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes strlen-avx2.S. The optimizations are
 mostly small things but they add up to roughly 10-30% performance
 improvement for strlen. The results for strnlen are a bit more
 ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
 are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  16 +-
 sysdeps/x86_64/multiarch/strlen-avx2.S     | 532 +++++++++++++--------
 2 files changed, 334 insertions(+), 214 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index cbfc1a5d..f1a6460a 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 	      IFUNC_IMPL_ADD (array, i, strlen,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strlen,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strlen,
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
   IFUNC_IMPL (i, name, strnlen,
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strnlen,
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcslen.c.  */
   IFUNC_IMPL (i, name, wcslen,
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcslen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcslen,
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
   IFUNC_IMPL (i, name, wcsnlen,
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcsnlen_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcsnlen_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
 index 82826e10..be8a5db5 100644
 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMPEQ	vpcmpeqd
 #  define VPMINU	vpminud
 +#  define CHAR_SIZE	4
 # else
 #  define VPCMPEQ	vpcmpeqb
 #  define VPMINU	vpminub
 +#  define CHAR_SIZE	1
 # endif
 # ifndef VZEROUPPER
@@ -41,349 +43,459 @@
 # endif
 # define VEC_SIZE 32
 +# define PAGE_SIZE 4096
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 -	/* Check for zero length.  */
 +	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
 +	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 +	mov	%RSI_LP, %R8_LP
 #  ifdef USE_AS_WCSLEN
 	shl	$2, %RSI_LP
 #  elif defined __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
 -	mov	%RSI_LP, %R8_LP
 # endif
 -	movl	%edi, %ecx
 +	movl	%edi, %eax
 	movq	%rdi, %rdx
 	vpxor	%xmm0, %xmm0, %xmm0
 -
 +	/* Clear high bits from edi. Only keeping bits relevant to page
 +	   cross check.  */
 +	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
 -	andl	$(2 * VEC_SIZE - 1), %ecx
 -	cmpl	$VEC_SIZE, %ecx
 -	ja	L(cros_page_boundary)
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.  */
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -
 +	VPCMPEQ	(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 # ifdef USE_AS_STRNLEN
 -	jnz	L(first_vec_x0_check)
 -	/* Adjust length and check the end of data.  */
 -	subq	$VEC_SIZE, %rsi
 -	jbe	L(max)
 -# else
 -	jnz	L(first_vec_x0)
 +	/* If length < VEC_SIZE handle special.  */
 +	cmpq	$VEC_SIZE, %rsi
 +	jbe	L(first_vec_x0)
 # endif
 -
 -	/* Align data for aligned loads in the loop.  */
 -	addq	$VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 +	/* If empty continue to aligned_more. Otherwise return bit
 +	   position of first match.  */
 +	testl	%eax, %eax
 +	jz	L(aligned_more)
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +# endif
 +	VZEROUPPER_RETURN
 # ifdef USE_AS_STRNLEN
 -	/* Adjust length.  */
 -	addq	%rcx, %rsi
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 +	.p2align 4
 +L(first_vec_x0):
 +	/* Set bit for max len so that tzcnt will return min of max len
 +	   and position of first match.  */
 +	btsq	%rsi, %rax
 +	tzcntl	%eax, %eax
 +#  ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +#  endif
 +	VZEROUPPER_RETURN
 # endif
 -	jmp	L(more_4x_vec)
 	.p2align 4
 -L(cros_page_boundary):
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	/* Remove the leading bytes.  */
 -	sarl	%cl, %eax
 -	testl	%eax, %eax
 -	jz	L(aligned_more)
 +L(first_vec_x1):
 	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	subl	$(VEC_SIZE * 4 + 1), %ecx
 +	addl	%ecx, %eax
 +# else
 +	subl	%edx, %edi
 +	incl	%edi
 +	addl	%edi, %eax
 # endif
 -	addq	%rdi, %rax
 -	addq	%rcx, %rax
 -	subq	%rdx, %rax
 # ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	shrl	$2, %eax
 # endif
 -L(return_vzeroupper):
 -	ZERO_UPPER_VEC_REGISTERS_RETURN
 +	VZEROUPPER_RETURN
 	.p2align 4
 -L(aligned_more):
 +L(first_vec_x2):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
 -	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
 -	    to void possible addition overflow.  */
 -	negq	%rcx
 -	addq	$VEC_SIZE, %rcx
 -
 -	/* Check the end of data.  */
 -	subq	%rcx, %rsi
 -	jbe	L(max)
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	subl	$(VEC_SIZE * 3 + 1), %ecx
 +	addl	%ecx, %eax
 +# else
 +	subl	%edx, %edi
 +	addl	$(VEC_SIZE + 1), %edi
 +	addl	%edi, %eax
 # endif
 +# ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +# endif
 +	VZEROUPPER_RETURN
 -	addq	$VEC_SIZE, %rdi
 +	.p2align 4
 +L(first_vec_x3):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 +# ifdef USE_AS_STRNLEN
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	subl	$(VEC_SIZE * 2 + 1), %ecx
 +	addl	%ecx, %eax
 +# else
 +	subl	%edx, %edi
 +	addl	$(VEC_SIZE * 2 + 1), %edi
 +	addl	%edi, %eax
 +# endif
 +# ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +# endif
 +	VZEROUPPER_RETURN
 +	.p2align 4
 +L(first_vec_x4):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	subl	$(VEC_SIZE + 1), %ecx
 +	addl	%ecx, %eax
 +# else
 +	subl	%edx, %edi
 +	addl	$(VEC_SIZE * 3 + 1), %edi
 +	addl	%edi, %eax
 # endif
 +# ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +# endif
 +	VZEROUPPER_RETURN
 -L(more_4x_vec):
 +	.p2align 5
 +L(aligned_more):
 +	/* Align data to VEC_SIZE - 1. This is the same number of
 +	   instructions as using andq with -VEC_SIZE but saves 4 bytes of
 +	   code on the x4 check.  */
 +	orq	$(VEC_SIZE - 1), %rdi
 +L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +# ifdef USE_AS_STRNLEN
 +	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
 +	   it simplies the logic in last_4x_vec_or_less.  */
 +	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 +	subq	%rdx, %rcx
 +# endif
 +	/* Load first VEC regardless.  */
 +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 +# ifdef USE_AS_STRNLEN
 +	/* Adjust length. If near end handle specially.  */
 +	subq	%rcx, %rsi
 +	jb	L(last_4x_vec_or_less)
 +# endif
 +	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 -# ifdef USE_AS_STRNLEN
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 -
 -	/* Align data to 4 * VEC_SIZE.  */
 -	movq	%rdi, %rcx
 -	andl	$(4 * VEC_SIZE - 1), %ecx
 -	andq	$-(4 * VEC_SIZE), %rdi
 +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x4)
 +	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 -	/* Adjust length.  */
 +	/* Before adjusting length check if at last VEC_SIZE * 4.  */
 +	cmpq	$(VEC_SIZE * 4 - 1), %rsi
 +	jbe	L(last_4x_vec_or_less_load)
 +	incq	%rdi
 +	movl	%edi, %ecx
 +	orq	$(VEC_SIZE * 4 - 1), %rdi
 +	andl	$(VEC_SIZE * 4 - 1), %ecx
 +	/* Readjust length.  */
 	addq	%rcx, %rsi
 +# else
 +	incq	%rdi
 +	orq	$(VEC_SIZE * 4 - 1), %rdi
 # endif
 -
 +	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	vmovdqa (%rdi), %ymm1
 -	vmovdqa	VEC_SIZE(%rdi), %ymm2
 -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm3
 -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm4
 -	VPMINU	%ymm1, %ymm2, %ymm5
 -	VPMINU	%ymm3, %ymm4, %ymm6
 -	VPMINU	%ymm5, %ymm6, %ymm5
 -
 -	VPCMPEQ	%ymm5, %ymm0, %ymm5
 -	vpmovmskb %ymm5, %eax
 -	testl	%eax, %eax
 -	jnz	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 -# ifndef USE_AS_STRNLEN
 -	jmp	L(loop_4x_vec)
 -# else
 +# ifdef USE_AS_STRNLEN
 +	/* Break if at end of length.  */
 	subq	$(VEC_SIZE * 4), %rsi
 -	ja	L(loop_4x_vec)
 -
 -L(last_4x_vec_or_less):
 -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 -	addl	$(VEC_SIZE * 2), %esi
 -	jle	L(last_2x_vec)
 +	jb	L(last_4x_vec_or_less_cmpeq)
 +# endif
 +	/* Save some code size by microfusing VPMINU with the load. Since
 +	   the matches in ymm2/ymm4 can only be returned if there were no
 +	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 +	 */
 +	vmovdqa	1(%rdi), %ymm1
 +	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 +	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
 +	VPMINU	(VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
 +
 +	VPMINU	%ymm2, %ymm4, %ymm5
 +	VPCMPEQ	%ymm5, %ymm0, %ymm5
 +	vpmovmskb	%ymm5, %ecx
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	testl	%ecx, %ecx
 +	jz	L(loop_4x_vec)
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	VPCMPEQ	%ymm1, %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	subq	%rdx, %rdi
 	testl	%eax, %eax
 +	jnz	L(last_vec_return_x0)
 -	jnz	L(first_vec_x2_check)
 -	subl	$VEC_SIZE, %esi
 -	jle	L(max)
 -
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	VPCMPEQ	%ymm2, %ymm0, %ymm2
 +	vpmovmskb	%ymm2, %eax
 	testl	%eax, %eax
 -
 -	jnz	L(first_vec_x3_check)
 -	movq	%r8, %rax
 -#  ifdef USE_AS_WCSLEN
 +	jnz	L(last_vec_return_x1)
 +
 +	/* Combine last 2 VEC.  */
 +	VPCMPEQ	%ymm3, %ymm0, %ymm3
 +	vpmovmskb	%ymm3, %eax
 +	/* rcx has combined result from all 4 VEC. It will only be used if
 +	   the first 3 other VEC all did not contain a match.  */
 +	salq	$32, %rcx
 +	orq	%rcx, %rax
 +	tzcntq	%rax, %rax
 +	subq	$(VEC_SIZE * 2 - 1), %rdi
 +	addq	%rdi, %rax
 +# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -#  endif
 +# endif
 	VZEROUPPER_RETURN
 +
 +# ifdef USE_AS_STRNLEN
 	.p2align 4
 -L(last_2x_vec):
 -	addl	$(VEC_SIZE * 2), %esi
 -	VPCMPEQ (%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 +L(last_4x_vec_or_less_load):
 +	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
 +	subq	$-(VEC_SIZE * 4), %rdi
 +L(last_4x_vec_or_less_cmpeq):
 +	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 +L(last_4x_vec_or_less):
 -	jnz	L(first_vec_x0_check)
 -	subl	$VEC_SIZE, %esi
 -	jle	L(max)
 +	vpmovmskb	%ymm1, %eax
 +	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
 +	   VEC_SIZE * 4.  */
 +	testl	$(VEC_SIZE * 2), %esi
 +	jnz	L(last_4x_vec)
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 +	/* length may have been negative or positive by an offset of
 +	   VEC_SIZE * 4 depending on where this was called from. This fixes
 +	   that.  */
 +	andl	$(VEC_SIZE * 4 - 1), %esi
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1_check)
 -	movq	%r8, %rax
 -#  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -#  endif
 -	VZEROUPPER_RETURN
 +	jnz	L(last_vec_x1_check)
 -	.p2align 4
 -L(first_vec_x0_check):
 +	subl	$VEC_SIZE, %esi
 +	jb	L(max)
 +
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 +	cmpl	%eax, %esi
 +	jb	L(max)
 +	subq	%rdx, %rdi
 +	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 +# endif
 	.p2align 4
 -L(first_vec_x1_check):
 +L(last_vec_return_x0):
 	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$VEC_SIZE, %rax
 +	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -#  ifdef USE_AS_WCSLEN
 +# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -#  endif
 +# endif
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x2_check):
 +L(last_vec_return_x1):
 	tzcntl	%eax, %eax
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$(VEC_SIZE * 2), %rax
 +	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -#  ifdef USE_AS_WCSLEN
 +# ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -#  endif
 +# endif
 	VZEROUPPER_RETURN
 +# ifdef USE_AS_STRNLEN
 	.p2align 4
 -L(first_vec_x3_check):
 +L(last_vec_x1_check):
 +
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$(VEC_SIZE * 3), %rax
 +	cmpl	%eax, %esi
 +	jb	L(max)
 +	subq	%rdx, %rdi
 +	incl	%eax
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 -	.p2align 4
 L(max):
 	movq	%r8, %rax
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(last_4x_vec):
 +	/* Test first 2x VEC normally.  */
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x1)
 +
 +	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x2)
 +
 +	/* Normalize length.  */
 +	andl	$(VEC_SIZE * 4 - 1), %esi
 +	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x3)
 +
 +	subl	$(VEC_SIZE * 3), %esi
 +	jb	L(max)
 +
 +	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	tzcntl	%eax, %eax
 +	/* Check the end of data.  */
 +	cmpl	%eax, %esi
 +	jb	L(max)
 +	subq	%rdx, %rdi
 +	addl	$(VEC_SIZE * 3 + 1), %eax
 +	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 -	ret
 -# endif
 	.p2align 4
 -L(first_vec_x0):
 +L(last_vec_x1):
 +	/* essentially duplicates of first_vec_x1 but use 64 bit
 +	   instructions.  */
 	tzcntl	%eax, %eax
 +	subq	%rdx, %rdi
 +	incl	%eax
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -# endif
 +#  endif
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x1):
 +L(last_vec_x2):
 +	/* essentially duplicates of first_vec_x2 but use 64 bit
 +	   instructions.  */
 	tzcntl	%eax, %eax
 -	addq	$VEC_SIZE, %rax
 +	subq	%rdx, %rdi
 +	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -# endif
 +#  endif
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x2):
 +L(last_vec_x3):
 	tzcntl	%eax, %eax
 -	addq	$(VEC_SIZE * 2), %rax
 +	subl	$(VEC_SIZE * 2), %esi
 +	/* Check the end of data.  */
 +	cmpl	%eax, %esi
 +	jb	L(max_end)
 +	subq	%rdx, %rdi
 +	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 +#  ifdef USE_AS_WCSLEN
 	shrq	$2, %rax
 -# endif
 +#  endif
 +	VZEROUPPER_RETURN
 +L(max_end):
 +	movq	%r8, %rax
 	VZEROUPPER_RETURN
 +# endif
 +	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 -L(4x_vec_end):
 -	VPCMPEQ	%ymm1, %ymm0, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	VPCMPEQ %ymm2, %ymm0, %ymm2
 -	vpmovmskb %ymm2, %eax
 +L(cross_page_boundary):
 +	/* Align data to VEC_SIZE - 1.  */
 +	orq	$(VEC_SIZE - 1), %rdi
 +	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
 +	vpmovmskb	%ymm1, %eax
 +	/* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
 +	   so no need to manually mod rdx.  */
 +	sarxl	%edx, %eax, %eax
 +# ifdef USE_AS_STRNLEN
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -	VPCMPEQ %ymm3, %ymm0, %ymm3
 -	vpmovmskb %ymm3, %eax
 +	jnz	L(cross_page_less_vec)
 +	leaq	1(%rdi), %rcx
 +	subq	%rdx, %rcx
 +	/* Check length.  */
 +	cmpq	%rsi, %rcx
 +	jb	L(cross_page_continue)
 +	movq	%r8, %rax
 +# else
 	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -	VPCMPEQ %ymm4, %ymm0, %ymm4
 -	vpmovmskb %ymm4, %eax
 -L(first_vec_x3):
 +	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 -	addq	$(VEC_SIZE * 3), %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +#  ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +#  endif
 # endif
 +L(return_vzeroupper):
 +	ZERO_UPPER_VEC_REGISTERS_RETURN
 +
 +# ifdef USE_AS_STRNLEN
 +	.p2align 4
 +L(cross_page_less_vec):
 +	tzcntl	%eax, %eax
 +	cmpq	%rax, %rsi
 +	cmovb	%esi, %eax
 +#  ifdef USE_AS_WCSLEN
 +	shrl	$2, %eax
 +#  endif
 	VZEROUPPER_RETURN
 +# endif
 END (STRLEN)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-26.patch
+++ b/SOURCES/glibc-RHEL-15696-26.patch
@ -1,701 +0,0 @@
 From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 3 May 2021 03:03:19 -0400
 Subject: [PATCH] x86: Optimize memchr-evex.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes memchr-evex.S. The optimizations include
 replacing some branches with cmovcc, avoiding some branches entirely
 in the less_4x_vec case, making the page cross logic less strict,
 saving some ALU in the alignment process, and most importantly
 increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
 test-wmemchr are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
 1 file changed, 322 insertions(+), 225 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
 index 6dd5d67b..81d5cd64 100644
 --- a/sysdeps/x86_64/multiarch/memchr-evex.S
 +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
 # ifdef USE_AS_WMEMCHR
 #  define VPBROADCAST	vpbroadcastd
 -#  define VPCMP		vpcmpd
 -#  define SHIFT_REG	r8d
 +#  define VPMINU	vpminud
 +#  define VPCMP	vpcmpd
 +#  define VPCMPEQ	vpcmpeqd
 +#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 -#  define VPCMP		vpcmpb
 -#  define SHIFT_REG	ecx
 +#  define VPMINU	vpminub
 +#  define VPCMP	vpcmpb
 +#  define VPCMPEQ	vpcmpeqb
 +#  define CHAR_SIZE	1
 # endif
 +# ifdef USE_AS_RAWMEMCHR
 +#  define RAW_PTR_REG	rcx
 +#  define ALGN_PTR_REG	rdi
 +# else
 +#  define RAW_PTR_REG	rdi
 +#  define ALGN_PTR_REG	rcx
 +# endif
 +
 +# define XMMZERO	xmm23
 +# define YMMZERO	ymm23
 # define XMMMATCH	xmm16
 # define YMMMATCH	ymm16
 # define YMM1		ymm17
@@ -44,6 +58,8 @@
 # define YMM6		ymm22
 # define VEC_SIZE 32
 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 +# define PAGE_SIZE 4096
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
 	/* Check for zero length.  */
 	test	%RDX_LP, %RDX_LP
 	jz	L(zero)
 -# endif
 -	movl	%edi, %ecx
 -# ifdef USE_AS_WMEMCHR
 -	shl	$2, %RDX_LP
 -# else
 +
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
 	/* Broadcast CHAR to YMMMATCH.  */
 	VPBROADCAST %esi, %YMMMATCH
 	/* Check if we may cross page boundary with one vector load.  */
 -	andl	$(2 * VEC_SIZE - 1), %ecx
 -	cmpl	$VEC_SIZE, %ecx
 -	ja	L(cros_page_boundary)
 +	movl	%edi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.  */
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -
 +	VPCMP	$0, (%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 # ifndef USE_AS_RAWMEMCHR
 -	jnz	L(first_vec_x0_check)
 -	/* Adjust length and check the end of data.  */
 -	subq	$VEC_SIZE, %rdx
 -	jbe	L(zero)
 +	/* If length < CHAR_PER_VEC handle special.  */
 +	cmpq	$CHAR_PER_VEC, %rdx
 +	jbe	L(first_vec_x0)
 +# endif
 +	testl	%eax, %eax
 +	jz	L(aligned_more)
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 -	jnz	L(first_vec_x0)
 +	addq	%rdi, %rax
 # endif
 -
 -	/* Align data for aligned loads in the loop.  */
 -	addq	$VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 +	ret
 # ifndef USE_AS_RAWMEMCHR
 -	/* Adjust length.  */
 -	addq	%rcx, %rdx
 -
 -	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 -	jmp	L(more_4x_vec)
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 +	.p2align 5
 +L(first_vec_x0):
 +	/* Check if first match was before length.  */
 +	tzcntl	%eax, %eax
 +	xorl	%ecx, %ecx
 +	cmpl	%eax, %edx
 +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 +	cmovle	%rcx, %rax
 +	ret
 +# else
 +	/* NB: first_vec_x0 is 17 bytes which will leave
 +	   cross_page_boundary (which is relatively cold) close enough
 +	   to ideal alignment. So only realign L(cross_page_boundary) if
 +	   rawmemchr.  */
 	.p2align 4
 -L(cros_page_boundary):
 -	andl	$(VEC_SIZE - 1), %ecx
 +# endif
 +L(cross_page_boundary):
 +	/* Save pointer before aligning as its original value is
 +	   necessary for computing return address if byte is found or
 +	   adjusting length if it is not and this is memchr.  */
 +	movq	%rdi, %rcx
 +	/* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
 +	   for rawmemchr.  */
 +	andq	$-VEC_SIZE, %ALGN_PTR_REG
 +	VPCMP	$0, (%ALGN_PTR_REG), %YMMMATCH, %k0
 +	kmovd	%k0, %r8d
 # ifdef USE_AS_WMEMCHR
 -	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 +	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 	   bytes.  */
 -	movl	%ecx, %SHIFT_REG
 -	sarl	$2, %SHIFT_REG
 +	sarl	$2, %eax
 +# endif
 +# ifndef USE_AS_RAWMEMCHR
 +	movl	$(PAGE_SIZE / CHAR_SIZE), %esi
 +	subl	%eax, %esi
 # endif
 -	andq	$-VEC_SIZE, %rdi
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	/* Remove the leading bytes.  */
 -	sarxl	%SHIFT_REG, %eax, %eax
 -	testl	%eax, %eax
 -	jz	L(aligned_more)
 -	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 +	andl	$(CHAR_PER_VEC - 1), %eax
 # endif
 +	/* Remove the leading bytes.  */
 +	sarxl	%eax, %r8d, %eax
 # ifndef USE_AS_RAWMEMCHR
 	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 +	cmpq	%rsi, %rdx
 +	jbe	L(first_vec_x0)
 +# endif
 +	testl	%eax, %eax
 +	jz	L(cross_page_continue)
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
 +# else
 +	addq	%RAW_PTR_REG, %rax
 # endif
 -	addq	%rdi, %rax
 -	addq	%rcx, %rax
 	ret
 	.p2align 4
 -L(aligned_more):
 -# ifndef USE_AS_RAWMEMCHR
 -        /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
 -	   instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
 -	   overflow.  */
 -	negq	%rcx
 -	addq	$VEC_SIZE, %rcx
 +L(first_vec_x1):
 +	tzcntl	%eax, %eax
 +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 -	/* Check the end of data.  */
 -	subq	%rcx, %rdx
 -	jbe	L(zero)
 -# endif
 +	.p2align 4
 +L(first_vec_x2):
 +	tzcntl	%eax, %eax
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 -	addq	$VEC_SIZE, %rdi
 +	.p2align 4
 +L(first_vec_x3):
 +	tzcntl	%eax, %eax
 +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 -# ifndef USE_AS_RAWMEMCHR
 -	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 +	.p2align 4
 +L(first_vec_x4):
 +	tzcntl	%eax, %eax
 +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 -L(more_4x_vec):
 +	.p2align 5
 +L(aligned_more):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 +# ifndef USE_AS_RAWMEMCHR
 +	/* Align data to VEC_SIZE.  */
 +L(cross_page_continue):
 +	xorl	%ecx, %ecx
 +	subl	%edi, %ecx
 +	andq	$-VEC_SIZE, %rdi
 +	/* esi is for adjusting length to see if near the end.  */
 +	leal	(VEC_SIZE * 5)(%rdi, %rcx), %esi
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %esi
 +#  endif
 +# else
 +	andq	$-VEC_SIZE, %rdi
 +L(cross_page_continue):
 +# endif
 +	/* Load first VEC regardless.  */
 +	VPCMP	$0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +# ifndef USE_AS_RAWMEMCHR
 +	/* Adjust length. If near end handle specially.  */
 +	subq	%rsi, %rdx
 +	jbe	L(last_4x_vec_or_less)
 +# endif
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 -	addq	$(VEC_SIZE * 4), %rdi
 +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x4)
 +
 # ifndef USE_AS_RAWMEMCHR
 -	subq	$(VEC_SIZE * 4), %rdx
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 +	/* Check if at last CHAR_PER_VEC * 4 length.  */
 +	subq	$(CHAR_PER_VEC * 4), %rdx
 +	jbe	L(last_4x_vec_or_less_cmpeq)
 +	addq	$VEC_SIZE, %rdi
 -	/* Align data to 4 * VEC_SIZE.  */
 -	movq	%rdi, %rcx
 -	andl	$(4 * VEC_SIZE - 1), %ecx
 +	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
 +	 */
 +#  ifdef USE_AS_WMEMCHR
 +	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
 -
 -# ifndef USE_AS_RAWMEMCHR
 -	/* Adjust length.  */
 +	andl	$(VEC_SIZE * 4 - 1), %ecx
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 	addq	%rcx, %rdx
 +#  else
 +	addq	%rdi, %rdx
 +	andq	$-(4 * VEC_SIZE), %rdi
 +	subq	%rdi, %rdx
 +#  endif
 +# else
 +	addq	$VEC_SIZE, %rdi
 +	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 +	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 +
 +	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k2
 -	kord	%k1, %k2, %k5
 -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
 -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
 -
 -	kord	%k3, %k4, %k6
 -	kortestd %k5, %k6
 -	jnz	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 +	/* It would be possible to save some instructions using 4x VPCMP
 +	   but bottleneck on port 5 makes it not worth it.  */
 +	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
 +	/* xor will set bytes matching esi to zero.  */
 +	vpxorq	(VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
 +	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
 +	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
 +	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
 +	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
 +	VPCMP	$0, %YMM3, %YMMZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
 -	jmp	L(loop_4x_vec)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	kortestd %k2, %k3
 +	jz	L(loop_4x_vec)
 # else
 -	subq	$(VEC_SIZE * 4), %rdx
 +	kortestd %k2, %k3
 +	jnz	L(loop_4x_vec_end)
 +
 +	subq	$-(VEC_SIZE * 4), %rdi
 +
 +	subq	$(CHAR_PER_VEC * 4), %rdx
 	ja	L(loop_4x_vec)
 +	/* Fall through into less than 4 remaining vectors of length case.
 +	 */
 +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	addq	$(VEC_SIZE * 3), %rdi
 +	.p2align 4
 L(last_4x_vec_or_less):
 -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 -	addl	$(VEC_SIZE * 2), %edx
 -	jle	L(last_2x_vec)
 -
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 +	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	jnz	L(first_vec_x1_check)
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 +	/* If remaining length > CHAR_PER_VEC * 2.  */
 +	addl	$(CHAR_PER_VEC * 2), %edx
 +	jg	L(last_4x_vec)
 -	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 +L(last_2x_vec):
 +	/* If remaining length < CHAR_PER_VEC.  */
 +	addl	$CHAR_PER_VEC, %edx
 +	jle	L(zero_end)
 -	jnz	L(first_vec_x2_check)
 -	subl	$VEC_SIZE, %edx
 -	jle	L(zero)
 +	/* Check VEC2 and compare any match with remaining length.  */
 +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	tzcntl	%eax, %eax
 +	cmpl	%eax, %edx
 +	jbe	L(set_zero_end)
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 +L(zero_end):
 +	ret
 -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x3_check)
 +	.p2align 4
 +L(first_vec_x1_check):
 +	tzcntl	%eax, %eax
 +	/* Adjust length.  */
 +	subl	$-(CHAR_PER_VEC * 4), %edx
 +	/* Check if match within remaining length.  */
 +	cmpl	%eax, %edx
 +	jbe	L(set_zero_end)
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 +L(set_zero_end):
 	xorl	%eax, %eax
 	ret
 	.p2align 4
 -L(last_2x_vec):
 -	addl	$(VEC_SIZE * 2), %edx
 -	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 +L(loop_4x_vec_end):
 +# endif
 +	/* rawmemchr will fall through into this if match was found in
 +	   loop.  */
 +
 +	/* k1 has the inverse (not-equal mask) of matches with VEC1.  */
 	kmovd	%k1, %eax
 -	testl	%eax, %eax
 +# ifdef USE_AS_WMEMCHR
 +	subl	$((1 << CHAR_PER_VEC) - 1), %eax
 +# else
 +	incl	%eax
 +# endif
 +	jnz	L(last_vec_x1_return)
 -	jnz	L(first_vec_x0_check)
 -	subl	$VEC_SIZE, %edx
 -	jle	L(zero)
 +	VPCMP	$0, %YMM2, %YMMZERO, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x2_return)
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMMATCH, %k1
 -	kmovd	%k1, %eax
 +	kmovd	%k2, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1_check)
 -	xorl	%eax, %eax
 -	ret
 +	jnz	L(last_vec_x3_return)
 -	.p2align 4
 -L(first_vec_x0_check):
 +	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 +# ifdef USE_AS_RAWMEMCHR
 +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 +# else
 +	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	%rdi, %rax
 	ret
 	.p2align 4
 -L(first_vec_x1_check):
 +L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$VEC_SIZE, %rax
 +# ifdef USE_AS_RAWMEMCHR
 +#  ifdef USE_AS_WMEMCHR
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 +#  else
 	addq	%rdi, %rax
 -	ret
 -
 -	.p2align 4
 -L(first_vec_x2_check):
 -	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 +#  endif
 +# else
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$(VEC_SIZE * 2), %rax
 -	addq	%rdi, %rax
 	ret
 	.p2align 4
 -L(first_vec_x3_check):
 +L(last_vec_x2_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 +# ifdef USE_AS_RAWMEMCHR
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 +# else
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rdx
 -	jbe	L(zero)
 -	addq	$(VEC_SIZE * 3), %rax
 -	addq	%rdi, %rax
 	ret
 	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 -	ret
 -# endif
 -
 -	.p2align 4
 -L(first_vec_x0):
 +L(last_vec_x3_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(%rdi, %rax, 4), %rax
 +# ifdef USE_AS_RAWMEMCHR
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 # else
 -	addq	%rdi, %rax
 +	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 +	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 # endif
 	ret
 +
 +# ifndef USE_AS_RAWMEMCHR
 +L(last_4x_vec_or_less_cmpeq):
 +	VPCMP	$0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	/* Check first VEC regardless.  */
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x1_check)
 +
 +	/* If remaining length <= CHAR_PER_VEC * 2.  */
 +	addl	$(CHAR_PER_VEC * 2), %edx
 +	jle	L(last_2x_vec)
 +
 	.p2align 4
 -L(first_vec_x1):
 +L(last_4x_vec):
 +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x2)
 +
 +
 +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	/* Create mask for possible matches within remaining length.  */
 +#  ifdef USE_AS_WMEMCHR
 +	movl	$((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
 +	bzhil	%edx, %ecx, %ecx
 +#  else
 +	movq	$-1, %rcx
 +	bzhiq	%rdx, %rcx, %rcx
 +#  endif
 +	/* Test matches in data against length match.  */
 +	andl	%ecx, %eax
 +	jnz	L(last_vec_x3)
 +
 +	/* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
 +	   remaining length was found to be > CHAR_PER_VEC * 2).  */
 +	subl	$CHAR_PER_VEC, %edx
 +	jbe	L(zero_end2)
 +
 +
 +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 +	kmovd	%k0, %eax
 +	/* Shift remaining length mask for last VEC.  */
 +#  ifdef USE_AS_WMEMCHR
 +	shrl	$CHAR_PER_VEC, %ecx
 +#  else
 +	shrq	$CHAR_PER_VEC, %rcx
 +#  endif
 +	andl	%ecx, %eax
 +	jz	L(zero_end2)
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
 -# else
 -	addq	$VEC_SIZE, %rax
 -	addq	%rdi, %rax
 -# endif
 +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 +L(zero_end2):
 	ret
 -	.p2align 4
 -L(first_vec_x2):
 +L(last_vec_x2):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
 -# else
 -	addq	$(VEC_SIZE * 2), %rax
 -	addq	%rdi, %rax
 -# endif
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 	.p2align 4
 -L(4x_vec_end):
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	kmovd	%k2, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -	kmovd	%k3, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -	kmovd	%k4, %eax
 -	testl	%eax, %eax
 -L(first_vec_x3):
 +L(last_vec_x3):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WMEMCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
 -# else
 -	addq	$(VEC_SIZE * 3), %rax
 -	addq	%rdi, %rax
 -# endif
 +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 +# endif
 END (MEMCHR)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-27.patch
+++ b/SOURCES/glibc-RHEL-15696-27.patch
@ -1,30 +0,0 @@
 From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
 From: Alice Xu <alice.d.xu@gmail.com>
 Date: Fri, 7 May 2021 19:03:21 -0700
 Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
 Content-type: text/plain; charset=UTF-8
 An unknown vector operation occurred in commit 2a76821c308. Fixed it
 by using "ymm{k1}{z}" instead of "ymm {k1} {z}".
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
 index 81d5cd64..f3fdad4f 100644
 --- a/sysdeps/x86_64/multiarch/memchr-evex.S
 +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -271,7 +271,7 @@ L(loop_4x_vec):
 	vpxorq	(VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
 	VPCMP	$0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
 -	VPMINU	%YMM2, %YMM3, %YMM3 {%k1} {z}
 +	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
 	VPCMP	$0, %YMM3, %YMMZERO, %k2
 # ifdef USE_AS_RAWMEMCHR
 	subq	$-(VEC_SIZE * 4), %rdi
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-28.patch
+++ b/SOURCES/glibc-RHEL-15696-28.patch
@ -1,566 +0,0 @@
 From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Tue, 22 Jun 2021 20:42:10 -0700
 Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
 Content-type: text/plain; charset=UTF-8
 Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
 version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
 and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
 This also removes the unused symbols, __GI___strlen_sse2 and
 __GI___wcsnlen_sse4_1.
 ---
 sysdeps/x86_64/multiarch/strlen-sse2.S    |   2 +-
 sysdeps/x86_64/multiarch/strlen-vec.S     | 257 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S |   2 +-
 sysdeps/x86_64/strlen.S                   | 243 +-------------------
 4 files changed, 262 insertions(+), 242 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
 Conflicts:
 	sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
 	(Copyright dates, URL)
 diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
 index 7bc57b8d..449c8a7f 100644
 --- a/sysdeps/x86_64/multiarch/strlen-sse2.S
 +++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
 # define strlen __strlen_sse2
 #endif
 -#include "../strlen.S"
 +#include "strlen-vec.S"
 diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 new file mode 100644
 index 00000000..8f660bb9
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
 +/* SSE2 version of strlen and SSE4.1 version of wcslen.
 +   Copyright (C) 2012-2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +#ifdef AS_WCSLEN
 +# define PMINU		pminud
 +# define PCMPEQ		pcmpeqd
 +# define SHIFT_RETURN	shrq $2, %rax
 +#else
 +# define PMINU		pminub
 +# define PCMPEQ		pcmpeqb
 +# define SHIFT_RETURN
 +#endif
 +
 +/* Long lived register in strlen(s), strnlen(s, n) are:
 +
 +	%xmm3 - zero
 +	%rdi   - s
 +	%r10  (s+n) & (~(64-1))
 +	%r11   s+n
 +*/
 +
 +
 +.text
 +ENTRY(strlen)
 +
 +/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 +#define FIND_ZERO	\
 +	PCMPEQ	(%rax), %xmm0;	\
 +	PCMPEQ	16(%rax), %xmm1;	\
 +	PCMPEQ	32(%rax), %xmm2;	\
 +	PCMPEQ	48(%rax), %xmm3;	\
 +	pmovmskb	%xmm0, %esi;	\
 +	pmovmskb	%xmm1, %edx;	\
 +	pmovmskb	%xmm2, %r8d;	\
 +	pmovmskb	%xmm3, %ecx;	\
 +	salq	$16, %rdx;	\
 +	salq	$16, %rcx;	\
 +	orq	%rsi, %rdx;	\
 +	orq	%r8, %rcx;	\
 +	salq	$32, %rcx;	\
 +	orq	%rcx, %rdx;
 +
 +#ifdef AS_STRNLEN
 +/* Do not read anything when n==0.  */
 +	test	%RSI_LP, %RSI_LP
 +	jne	L(n_nonzero)
 +	xor	%rax, %rax
 +	ret
 +L(n_nonzero):
 +# ifdef AS_WCSLEN
 +	shl	$2, %RSI_LP
 +# endif
 +
 +/* Initialize long lived registers.  */
 +
 +	add	%RDI_LP, %RSI_LP
 +	mov	%RSI_LP, %R10_LP
 +	and	$-64, %R10_LP
 +	mov	%RSI_LP, %R11_LP
 +#endif
 +
 +	pxor	%xmm0, %xmm0
 +	pxor	%xmm1, %xmm1
 +	pxor	%xmm2, %xmm2
 +	pxor	%xmm3, %xmm3
 +	movq	%rdi, %rax
 +	movq	%rdi, %rcx
 +	andq	$4095, %rcx
 +/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
 +	cmpq	$4047, %rcx
 +/* We cannot unify this branching as it would be ~6 cycles slower.  */
 +	ja	L(cross_page)
 +
 +#ifdef AS_STRNLEN
 +/* Test if end is among first 64 bytes.  */
 +# define STRNLEN_PROLOG	\
 +	mov	%r11, %rsi;	\
 +	subq	%rax, %rsi;	\
 +	andq	$-64, %rax;	\
 +	testq	$-64, %rsi;	\
 +	je	L(strnlen_ret)
 +#else
 +# define STRNLEN_PROLOG  andq $-64, %rax;
 +#endif
 +
 +/* Ignore bits in mask that come before start of string.  */
 +#define PROLOG(lab)	\
 +	movq	%rdi, %rcx;	\
 +	xorq	%rax, %rcx;	\
 +	STRNLEN_PROLOG;	\
 +	sarq	%cl, %rdx;	\
 +	test	%rdx, %rdx;	\
 +	je	L(lab);	\
 +	bsfq	%rdx, %rax;	\
 +	SHIFT_RETURN;		\
 +	ret
 +
 +#ifdef AS_STRNLEN
 +	andq	$-16, %rax
 +	FIND_ZERO
 +#else
 +	/* Test first 16 bytes unaligned.  */
 +	movdqu	(%rax), %xmm4
 +	PCMPEQ	%xmm0, %xmm4
 +	pmovmskb	%xmm4, %edx
 +	test	%edx, %edx
 +	je 	L(next48_bytes)
 +	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
 +	SHIFT_RETURN
 +	ret
 +
 +L(next48_bytes):
 +/* Same as FIND_ZERO except we do not check first 16 bytes.  */
 +	andq	$-16, %rax
 +	PCMPEQ 16(%rax), %xmm1
 +	PCMPEQ 32(%rax), %xmm2
 +	PCMPEQ 48(%rax), %xmm3
 +	pmovmskb	%xmm1, %edx
 +	pmovmskb	%xmm2, %r8d
 +	pmovmskb	%xmm3, %ecx
 +	salq	$16, %rdx
 +	salq	$16, %rcx
 +	orq	%r8, %rcx
 +	salq	$32, %rcx
 +	orq	%rcx, %rdx
 +#endif
 +
 +	/* When no zero byte is found xmm1-3 are zero so we do not have to
 +	   zero them.  */
 +	PROLOG(loop)
 +
 +	.p2align 4
 +L(cross_page):
 +	andq	$-64, %rax
 +	FIND_ZERO
 +	PROLOG(loop_init)
 +
 +#ifdef AS_STRNLEN
 +/* We must do this check to correctly handle strnlen (s, -1).  */
 +L(strnlen_ret):
 +	bts	%rsi, %rdx
 +	sarq	%cl, %rdx
 +	test	%rdx, %rdx
 +	je	L(loop_init)
 +	bsfq	%rdx, %rax
 +	SHIFT_RETURN
 +	ret
 +#endif
 +	.p2align 4
 +L(loop_init):
 +	pxor	%xmm1, %xmm1
 +	pxor	%xmm2, %xmm2
 +	pxor	%xmm3, %xmm3
 +#ifdef AS_STRNLEN
 +	.p2align 4
 +L(loop):
 +
 +	addq	$64, %rax
 +	cmpq	%rax, %r10
 +	je	L(exit_end)
 +
 +	movdqa	(%rax), %xmm0
 +	PMINU	16(%rax), %xmm0
 +	PMINU	32(%rax), %xmm0
 +	PMINU	48(%rax), %xmm0
 +	PCMPEQ	%xmm3, %xmm0
 +	pmovmskb	%xmm0, %edx
 +	testl	%edx, %edx
 +	jne	L(exit)
 +	jmp	L(loop)
 +
 +	.p2align 4
 +L(exit_end):
 +	cmp	%rax, %r11
 +	je	L(first) /* Do not read when end is at page boundary.  */
 +	pxor	%xmm0, %xmm0
 +	FIND_ZERO
 +
 +L(first):
 +	bts	%r11, %rdx
 +	bsfq	%rdx, %rdx
 +	addq	%rdx, %rax
 +	subq	%rdi, %rax
 +	SHIFT_RETURN
 +	ret
 +
 +	.p2align 4
 +L(exit):
 +	pxor	%xmm0, %xmm0
 +	FIND_ZERO
 +
 +	bsfq	%rdx, %rdx
 +	addq	%rdx, %rax
 +	subq	%rdi, %rax
 +	SHIFT_RETURN
 +	ret
 +
 +#else
 +
 +	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 +	.p2align 4
 +L(loop):
 +
 +	movdqa	64(%rax), %xmm0
 +	PMINU	80(%rax), %xmm0
 +	PMINU	96(%rax), %xmm0
 +	PMINU	112(%rax), %xmm0
 +	PCMPEQ	%xmm3, %xmm0
 +	pmovmskb	%xmm0, %edx
 +	testl	%edx, %edx
 +	jne	L(exit64)
 +
 +	subq	$-128, %rax
 +
 +	movdqa	(%rax), %xmm0
 +	PMINU	16(%rax), %xmm0
 +	PMINU	32(%rax), %xmm0
 +	PMINU	48(%rax), %xmm0
 +	PCMPEQ	%xmm3, %xmm0
 +	pmovmskb	%xmm0, %edx
 +	testl	%edx, %edx
 +	jne	L(exit0)
 +	jmp	L(loop)
 +
 +	.p2align 4
 +L(exit64):
 +	addq	$64, %rax
 +L(exit0):
 +	pxor	%xmm0, %xmm0
 +	FIND_ZERO
 +
 +	bsfq	%rdx, %rdx
 +	addq	%rdx, %rax
 +	subq	%rdi, %rax
 +	SHIFT_RETURN
 +	ret
 +
 +#endif
 +
 +END(strlen)
 diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
 index a8cab0cb..5fa51fe0 100644
 --- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
 +++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
 #define AS_STRNLEN
 #define strlen	__wcsnlen_sse4_1
 -#include "../strlen.S"
 +#include "strlen-vec.S"
 diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
 index f845f3d4..ad047d84 100644
 --- a/sysdeps/x86_64/strlen.S
 +++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
 -/* SSE2 version of strlen/wcslen.
 -   Copyright (C) 2012-2018 Free Software Foundation, Inc.
 +/* SSE2 version of strlen.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 -#include <sysdep.h>
 +#include "multiarch/strlen-vec.S"
 -#ifdef AS_WCSLEN
 -# define PMINU		pminud
 -# define PCMPEQ		pcmpeqd
 -# define SHIFT_RETURN	shrq $2, %rax
 -#else
 -# define PMINU		pminub
 -# define PCMPEQ		pcmpeqb
 -# define SHIFT_RETURN
 -#endif
 -
 -/* Long lived register in strlen(s), strnlen(s, n) are:
 -
 -	%xmm3 - zero
 -	%rdi   - s
 -	%r10  (s+n) & (~(64-1))
 -	%r11   s+n
 -*/
 -
 -
 -.text
 -ENTRY(strlen)
 -
 -/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx.  */
 -#define FIND_ZERO	\
 -	PCMPEQ	(%rax), %xmm0;	\
 -	PCMPEQ	16(%rax), %xmm1;	\
 -	PCMPEQ	32(%rax), %xmm2;	\
 -	PCMPEQ	48(%rax), %xmm3;	\
 -	pmovmskb	%xmm0, %esi;	\
 -	pmovmskb	%xmm1, %edx;	\
 -	pmovmskb	%xmm2, %r8d;	\
 -	pmovmskb	%xmm3, %ecx;	\
 -	salq	$16, %rdx;	\
 -	salq	$16, %rcx;	\
 -	orq	%rsi, %rdx;	\
 -	orq	%r8, %rcx;	\
 -	salq	$32, %rcx;	\
 -	orq	%rcx, %rdx;
 -
 -#ifdef AS_STRNLEN
 -/* Do not read anything when n==0.  */
 -	test	%RSI_LP, %RSI_LP
 -	jne	L(n_nonzero)
 -	xor	%rax, %rax
 -	ret
 -L(n_nonzero):
 -# ifdef AS_WCSLEN
 -	shl	$2, %RSI_LP
 -# endif
 -
 -/* Initialize long lived registers.  */
 -
 -	add	%RDI_LP, %RSI_LP
 -	mov	%RSI_LP, %R10_LP
 -	and	$-64, %R10_LP
 -	mov	%RSI_LP, %R11_LP
 -#endif
 -
 -	pxor	%xmm0, %xmm0
 -	pxor	%xmm1, %xmm1
 -	pxor	%xmm2, %xmm2
 -	pxor	%xmm3, %xmm3
 -	movq	%rdi, %rax
 -	movq	%rdi, %rcx
 -	andq	$4095, %rcx
 -/* Offsets 4032-4047 will be aligned into 4032 thus fit into page.  */
 -	cmpq	$4047, %rcx
 -/* We cannot unify this branching as it would be ~6 cycles slower.  */
 -	ja	L(cross_page)
 -
 -#ifdef AS_STRNLEN
 -/* Test if end is among first 64 bytes.  */
 -# define STRNLEN_PROLOG	\
 -	mov	%r11, %rsi;	\
 -	subq	%rax, %rsi;	\
 -	andq	$-64, %rax;	\
 -	testq	$-64, %rsi;	\
 -	je	L(strnlen_ret)
 -#else
 -# define STRNLEN_PROLOG  andq $-64, %rax;
 -#endif
 -
 -/* Ignore bits in mask that come before start of string.  */
 -#define PROLOG(lab)	\
 -	movq	%rdi, %rcx;	\
 -	xorq	%rax, %rcx;	\
 -	STRNLEN_PROLOG;	\
 -	sarq	%cl, %rdx;	\
 -	test	%rdx, %rdx;	\
 -	je	L(lab);	\
 -	bsfq	%rdx, %rax;	\
 -	SHIFT_RETURN;		\
 -	ret
 -
 -#ifdef AS_STRNLEN
 -	andq	$-16, %rax
 -	FIND_ZERO
 -#else
 -	/* Test first 16 bytes unaligned.  */
 -	movdqu	(%rax), %xmm4
 -	PCMPEQ	%xmm0, %xmm4
 -	pmovmskb	%xmm4, %edx
 -	test	%edx, %edx
 -	je 	L(next48_bytes)
 -	bsf	%edx, %eax /* If eax is zeroed 16bit bsf can be used.  */
 -	SHIFT_RETURN
 -	ret
 -
 -L(next48_bytes):
 -/* Same as FIND_ZERO except we do not check first 16 bytes.  */
 -	andq	$-16, %rax
 -	PCMPEQ 16(%rax), %xmm1
 -	PCMPEQ 32(%rax), %xmm2
 -	PCMPEQ 48(%rax), %xmm3
 -	pmovmskb	%xmm1, %edx
 -	pmovmskb	%xmm2, %r8d
 -	pmovmskb	%xmm3, %ecx
 -	salq	$16, %rdx
 -	salq	$16, %rcx
 -	orq	%r8, %rcx
 -	salq	$32, %rcx
 -	orq	%rcx, %rdx
 -#endif
 -
 -	/* When no zero byte is found xmm1-3 are zero so we do not have to
 -	   zero them.  */
 -	PROLOG(loop)
 -
 -	.p2align 4
 -L(cross_page):
 -	andq	$-64, %rax
 -	FIND_ZERO
 -	PROLOG(loop_init)
 -
 -#ifdef AS_STRNLEN
 -/* We must do this check to correctly handle strnlen (s, -1).  */
 -L(strnlen_ret):
 -	bts	%rsi, %rdx
 -	sarq	%cl, %rdx
 -	test	%rdx, %rdx
 -	je	L(loop_init)
 -	bsfq	%rdx, %rax
 -	SHIFT_RETURN
 -	ret
 -#endif
 -	.p2align 4
 -L(loop_init):
 -	pxor	%xmm1, %xmm1
 -	pxor	%xmm2, %xmm2
 -	pxor	%xmm3, %xmm3
 -#ifdef AS_STRNLEN
 -	.p2align 4
 -L(loop):
 -
 -	addq	$64, %rax
 -	cmpq	%rax, %r10
 -	je	L(exit_end)
 -
 -	movdqa	(%rax), %xmm0
 -	PMINU	16(%rax), %xmm0
 -	PMINU	32(%rax), %xmm0
 -	PMINU	48(%rax), %xmm0
 -	PCMPEQ	%xmm3, %xmm0
 -	pmovmskb	%xmm0, %edx
 -	testl	%edx, %edx
 -	jne	L(exit)
 -	jmp	L(loop)
 -
 -	.p2align 4
 -L(exit_end):
 -	cmp	%rax, %r11
 -	je	L(first) /* Do not read when end is at page boundary.  */
 -	pxor	%xmm0, %xmm0
 -	FIND_ZERO
 -
 -L(first):
 -	bts	%r11, %rdx
 -	bsfq	%rdx, %rdx
 -	addq	%rdx, %rax
 -	subq	%rdi, %rax
 -	SHIFT_RETURN
 -	ret
 -
 -	.p2align 4
 -L(exit):
 -	pxor	%xmm0, %xmm0
 -	FIND_ZERO
 -
 -	bsfq	%rdx, %rdx
 -	addq	%rdx, %rax
 -	subq	%rdi, %rax
 -	SHIFT_RETURN
 -	ret
 -
 -#else
 -
 -	/* Main loop.  Unrolled twice to improve L2 cache performance on core2.  */
 -	.p2align 4
 -L(loop):
 -
 -	movdqa	64(%rax), %xmm0
 -	PMINU	80(%rax), %xmm0
 -	PMINU	96(%rax), %xmm0
 -	PMINU	112(%rax), %xmm0
 -	PCMPEQ	%xmm3, %xmm0
 -	pmovmskb	%xmm0, %edx
 -	testl	%edx, %edx
 -	jne	L(exit64)
 -
 -	subq	$-128, %rax
 -
 -	movdqa	(%rax), %xmm0
 -	PMINU	16(%rax), %xmm0
 -	PMINU	32(%rax), %xmm0
 -	PMINU	48(%rax), %xmm0
 -	PCMPEQ	%xmm3, %xmm0
 -	pmovmskb	%xmm0, %edx
 -	testl	%edx, %edx
 -	jne	L(exit0)
 -	jmp	L(loop)
 -
 -	.p2align 4
 -L(exit64):
 -	addq	$64, %rax
 -L(exit0):
 -	pxor	%xmm0, %xmm0
 -	FIND_ZERO
 -
 -	bsfq	%rdx, %rdx
 -	addq	%rdx, %rax
 -	subq	%rdi, %rax
 -	SHIFT_RETURN
 -	ret
 -
 -#endif
 -
 -END(strlen)
 libc_hidden_builtin_def (strlen)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-29.patch
+++ b/SOURCES/glibc-RHEL-15696-29.patch
@ -1,181 +0,0 @@
 From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Wed, 23 Jun 2021 01:19:34 -0400
 Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
 Content-type: text/plain; charset=UTF-8
 No bug. This commit adds the ifunc / build infrastructure
 necessary for wcslen to prefer the sse4.1 implementation
 in strlen-vec.S. test-wcslen.c is passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/Makefile          |  4 +-
 sysdeps/x86_64/multiarch/ifunc-impl-list.c |  3 ++
 sysdeps/x86_64/multiarch/ifunc-wcslen.h    | 52 ++++++++++++++++++++++
 sysdeps/x86_64/multiarch/wcslen-sse4_1.S   |  4 ++
 sysdeps/x86_64/multiarch/wcslen.c          |  2 +-
 sysdeps/x86_64/multiarch/wcsnlen.c         | 34 +-------------
 6 files changed, 63 insertions(+), 36 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
 create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 index 491c7698..65fde4eb 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcscpy-ssse3 wcscpy-c \
 		   wcschr-sse2 wcschr-avx2 \
 		   wcsrchr-sse2 wcsrchr-avx2 \
 -		   wcsnlen-sse4_1 wcsnlen-c \
 -		   wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
 +		   wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
 +		   wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
 		   wcschr-avx2-rtm \
 		   wcscmp-avx2-rtm \
 		   wcslen-avx2-rtm \
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index f1a6460a..580913ca 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
 +	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 +			      CPU_FEATURE_USABLE (SSE4_1),
 +			      __wcsnlen_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
 diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
 new file mode 100644
 index 00000000..39e33473
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
 +/* Common definition for ifunc selections for wcslen and wcsnlen
 +   All versions must be listed in ifunc-impl-list.c.
 +   Copyright (C) 2017-2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <init-arch.h>
 +
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 +
 +static inline void *
 +IFUNC_SELECTOR (void)
 +{
 +  const struct cpu_features* cpu_features = __get_cpu_features ();
 +
 +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 +    {
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +	return OPTIMIZE (evex);
 +
 +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 +	return OPTIMIZE (avx2_rtm);
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 +	return OPTIMIZE (avx2);
 +    }
 +
 +  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
 +    return OPTIMIZE (sse4_1);
 +
 +  return OPTIMIZE (sse2);
 +}
 diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
 new file mode 100644
 index 00000000..7e62621a
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
 +#define AS_WCSLEN
 +#define strlen	__wcslen_sse4_1
 +
 +#include "strlen-vec.S"
 diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
 index 6d06e47c..3b04b75b 100644
 --- a/sysdeps/x86_64/multiarch/wcslen.c
 +++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
 # undef __wcslen
 # define SYMBOL_NAME wcslen
 -# include "ifunc-avx2.h"
 +# include "ifunc-wcslen.h"
 libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
 weak_alias (__wcslen, wcslen);
 diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
 index 20b731ae..06736410 100644
 --- a/sysdeps/x86_64/multiarch/wcsnlen.c
 +++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,39 +24,7 @@
 # undef __wcsnlen
 # define SYMBOL_NAME wcsnlen
 -# include <init-arch.h>
 -
 -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 -extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
 -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 -extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 -extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 -
 -static inline void *
 -IFUNC_SELECTOR (void)
 -{
 -  const struct cpu_features* cpu_features = __get_cpu_features ();
 -
 -  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 -      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 -    {
 -      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 -	return OPTIMIZE (evex);
 -
 -      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 -	return OPTIMIZE (avx2_rtm);
 -
 -      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 -	return OPTIMIZE (avx2);
 -    }
 -
 -  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
 -    return OPTIMIZE (sse4_1);
 -
 -  return OPTIMIZE (sse2);
 -}
 +# include "ifunc-wcslen.h"
 libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
 weak_alias (__wcsnlen, wcsnlen);
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-3.patch
+++ b/SOURCES/glibc-RHEL-15696-3.patch
@ -1,396 +0,0 @@
 From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:27:25 -0800
 Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes memcpy for x32.  Tested on x86-64 and x32.  On x86-64,
 libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
 	length.  Clear the upper 32 bits of RDX register.
 	* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
 	* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
 	Likewise.
 	* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
 	Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
 	tst-size_t-wmemchr.
 	* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
 ---
 sysdeps/x86_64/multiarch/memcpy-ssse3-back.S  | 17 ++++--
 sysdeps/x86_64/multiarch/memcpy-ssse3.S       | 17 ++++--
 .../multiarch/memmove-avx512-no-vzeroupper.S  | 16 +++--
 .../multiarch/memmove-vec-unaligned-erms.S    | 54 +++++++++--------
 sysdeps/x86_64/x32/Makefile                   |  2 +-
 sysdeps/x86_64/x32/tst-size_t-memcpy.c        | 58 +++++++++++++++++++
 6 files changed, 122 insertions(+), 42 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 index 3cd11233..568eebd3 100644
 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
 +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -45,28 +45,33 @@
 	.section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 ENTRY (MEMPCPY)
 -	movq	%rdi, %rax
 -	addq	%rdx, %rax
 +	mov	%RDI_LP, %RAX_LP
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY)
 #endif
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 ENTRY (MEMCPY)
 -	mov	%rdi, %rax
 +	mov	%RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
 -	add	%rdx, %rax
 +	add	%RDX_LP, %RAX_LP
 +#endif
 +
 +#ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 #endif
 #ifdef USE_AS_MEMMOVE
 diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
 index 0240bfa3..0bd5ee99 100644
 --- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
 +++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -45,28 +45,33 @@
 	.section .text.ssse3,"ax",@progbits
 #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
 ENTRY (MEMPCPY_CHK)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMPCPY_CHK)
 ENTRY (MEMPCPY)
 -	movq	%rdi, %rax
 -	addq	%rdx, %rax
 +	mov	%RDI_LP, %RAX_LP
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY)
 #endif
 #if !defined USE_AS_BCOPY
 ENTRY (MEMCPY_CHK)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMCPY_CHK)
 #endif
 ENTRY (MEMCPY)
 -	mov	%rdi, %rax
 +	mov	%RDI_LP, %RAX_LP
 #ifdef USE_AS_MEMPCPY
 -	add	%rdx, %rax
 +	add	%RDX_LP, %RAX_LP
 +#endif
 +
 +#ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 #endif
 #ifdef USE_AS_MEMMOVE
 diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
 index effc3ac2..6ca2bbc9 100644
 --- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
 +++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -24,27 +24,31 @@
 	.section .text.avx512,"ax",@progbits
 ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_avx512_no_vzeroupper)
 ENTRY (__mempcpy_avx512_no_vzeroupper)
 -	movq	%rdi, %rax
 -	addq	%rdx, %rax
 +	mov	%RDI_LP, %RAX_LP
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (__mempcpy_avx512_no_vzeroupper)
 ENTRY (__memmove_chk_avx512_no_vzeroupper)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_avx512_no_vzeroupper)
 ENTRY (__memmove_avx512_no_vzeroupper)
 -	mov	%rdi, %rax
 +	mov	%RDI_LP, %RAX_LP
 # ifdef USE_AS_MEMPCPY
 -	add	%rdx, %rax
 +	add	%RDX_LP, %RAX_LP
 # endif
 L(start):
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 +# endif
 	lea	(%rsi, %rdx), %rcx
 	lea	(%rdi, %rdx), %r9
 	cmp	$512, %rdx
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index c952576c..274aa1c7 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,20 +95,20 @@
 	.section SECTION(.text),"ax",@progbits
 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
 #endif
 ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 -	movq	%rdi, %rax
 -	addq	%rdx, %rax
 +	mov	%RDI_LP, %RAX_LP
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start)
 END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
 #if defined SHARED && IS_IN (libc)
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 #endif
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
 	movq	%rdi, %rax
 L(start):
 -	cmpq	$VEC_SIZE, %rdx
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +# endif
 +	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 -	cmpq	$(VEC_SIZE * 2), %rdx
 +	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(more_2x_vec)
 #if !defined USE_MULTIARCH || !IS_IN (libc)
 L(last_2x_vec):
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
 # if VEC_SIZE == 16
 ENTRY (__mempcpy_chk_erms)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__mempcpy_chk_erms)
 /* Only used to measure performance of REP MOVSB.  */
 ENTRY (__mempcpy_erms)
 -	movq	%rdi, %rax
 +	mov	%RDI_LP, %RAX_LP
 	/* Skip zero length.  */
 -	testq	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jz	2f
 -	addq	%rdx, %rax
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start_movsb)
 END (__mempcpy_erms)
 ENTRY (__memmove_chk_erms)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memmove_chk_erms)
 ENTRY (__memmove_erms)
 	movq	%rdi, %rax
 	/* Skip zero length.  */
 -	testq	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jz	2f
 L(start_movsb):
 -	movq	%rdx, %rcx
 -	cmpq	%rsi, %rdi
 +	mov	%RDX_LP, %RCX_LP
 +	cmp	%RSI_LP, %RDI_LP
 	jb	1f
 	/* Source == destination is less common.  */
 	je	2f
 -	leaq	(%rsi,%rcx), %rdx
 -	cmpq	%rdx, %rdi
 +	lea	(%rsi,%rcx), %RDX_LP
 +	cmp	%RDX_LP, %RDI_LP
 	jb	L(movsb_backward)
 1:
 	rep movsb
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
 # endif
 ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 -	movq	%rdi, %rax
 -	addq	%rdx, %rax
 +	mov	%RDI_LP, %RAX_LP
 +	add	%RDX_LP, %RAX_LP
 	jmp	L(start_erms)
 END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
 # ifdef SHARED
 ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 # endif
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
 ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 	movq	%rdi, %rax
 L(start_erms):
 -	cmpq	$VEC_SIZE, %rdx
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +# endif
 +	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 -	cmpq	$(VEC_SIZE * 2), %rdx
 +	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(movsb_more_2x_vec)
 L(last_2x_vec):
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE. */
@@ -236,7 +244,7 @@ L(movsb):
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 1:
 -	movq	%rdx, %rcx
 +	mov	%RDX_LP, %RCX_LP
 	rep movsb
 L(nop):
 	ret
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index ddec7f04..2fe1e5ac 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif
 ifeq ($(subdir),string)
 -tests += tst-size_t-memchr tst-size_t-memcmp
 +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
 endif
 ifeq ($(subdir),wcsmbs)
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
 new file mode 100644
 index 00000000..66b71e17
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
 +/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_NAME "memcpy"
 +#include "test-size_t.h"
 +
 +IMPL (memcpy, 1)
 +
 +typedef void *(*proto_t) (void *, const void *, size_t);
 +
 +static void *
 +__attribute__ ((noinline, noclone))
 +do_memcpy (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t dest = { { page_size }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      do_memcpy (dest, src);
 +      int res = memcmp (dest.p, src.p, dest.len);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i != 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-30.patch
+++ b/SOURCES/glibc-RHEL-15696-30.patch
@ -1,497 +0,0 @@
 From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Wed, 23 Jun 2021 01:56:29 -0400
 Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
 #27974]
 Content-type: text/plain; charset=UTF-8
 This commit fixes the bug mentioned in the previous commit.
 The previous implementations of wcsnlen in these files relied
 on maxlen * sizeof(wchar_t) which was not guaranteed by the standard.
 The new overflow tests added in the previous commit now
 pass (As well as all the other tests).
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
 sysdeps/x86_64/multiarch/strlen-vec.S  |  15 ++-
 2 files changed, 107 insertions(+), 38 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
 index be8a5db5..37688966 100644
 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
 +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check zero length.  */
 +#  ifdef __ILP32__
 +	/* Clear upper bits.  */
 +	and	%RSI_LP, %RSI_LP
 +#  else
 	test	%RSI_LP, %RSI_LP
 +#  endif
 	jz	L(zero)
 	/* Store max len in R8_LP before adjusting if using WCSLEN.  */
 	mov	%RSI_LP, %R8_LP
 -#  ifdef USE_AS_WCSLEN
 -	shl	$2, %RSI_LP
 -#  elif defined __ILP32__
 -	/* Clear the upper 32 bits.  */
 -	movl	%esi, %esi
 -#  endif
 # endif
 	movl	%edi, %eax
 	movq	%rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
 	/* Check the first VEC_SIZE bytes.  */
 	VPCMPEQ	(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 # ifdef USE_AS_STRNLEN
 	/* If length < VEC_SIZE handle special.  */
 -	cmpq	$VEC_SIZE, %rsi
 +	cmpq	$CHAR_PER_VEC, %rsi
 	jbe	L(first_vec_x0)
 # endif
 	/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
 	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
 L(first_vec_x0):
 	/* Set bit for max len so that tzcnt will return min of max len
 	   and position of first match.  */
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %esi
 +#  endif
 	btsq	%rsi, %rax
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 	VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
 +#  ifdef USE_AS_WCSLEN
 +	leal	-(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
 +#  else
 	subl	$(VEC_SIZE * 4 + 1), %ecx
 	addl	%ecx, %eax
 +#  endif
 # else
 	subl	%edx, %edi
 	incl	%edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
 +#  ifdef USE_AS_WCSLEN
 +	leal	-(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
 +#  else
 	subl	$(VEC_SIZE * 3 + 1), %ecx
 	addl	%ecx, %eax
 +#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
 +#  ifdef USE_AS_WCSLEN
 +	leal	-(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
 +#  else
 	subl	$(VEC_SIZE * 2 + 1), %ecx
 	addl	%ecx, %eax
 +#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 2 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
 # ifdef USE_AS_STRNLEN
 	/* Use ecx which was computed earlier to compute correct value.
 	 */
 +#  ifdef USE_AS_WCSLEN
 +	leal	-(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
 +#  else
 	subl	$(VEC_SIZE + 1), %ecx
 	addl	%ecx, %eax
 +#  endif
 # else
 	subl	%edx, %edi
 	addl	$(VEC_SIZE * 3 + 1), %edi
 	addl	%edi, %eax
 # endif
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 # endif
 	VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 # ifdef USE_AS_STRNLEN
 -	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
 -	   it simplies the logic in last_4x_vec_or_less.  */
 +	/* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
 +	   because it simplies the logic in last_4x_vec_or_less.  */
 	leaq	(VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
 	subq	%rdx, %rcx
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 +#  endif
 # endif
 	/* Load first VEC regardless.  */
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
 	subq	%rcx, %rsi
 	jb	L(last_4x_vec_or_less)
 # endif
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x2)
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x4)
 	/* Align data to VEC_SIZE * 4 - 1.  */
 # ifdef USE_AS_STRNLEN
 	/* Before adjusting length check if at last VEC_SIZE * 4.  */
 -	cmpq	$(VEC_SIZE * 4 - 1), %rsi
 +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 	jbe	L(last_4x_vec_or_less_load)
 	incq	%rdi
 	movl	%edi, %ecx
 	orq	$(VEC_SIZE * 4 - 1), %rdi
 	andl	$(VEC_SIZE * 4 - 1), %ecx
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 +#  endif
 	/* Readjust length.  */
 	addq	%rcx, %rsi
 # else
@@ -246,13 +280,13 @@ L(cross_page_continue):
 L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	/* Break if at end of length.  */
 -	subq	$(VEC_SIZE * 4), %rsi
 +	subq	$(CHAR_PER_VEC * 4), %rsi
 	jb	L(last_4x_vec_or_less_cmpeq)
 # endif
 -	/* Save some code size by microfusing VPMINU with the load. Since
 -	   the matches in ymm2/ymm4 can only be returned if there where no
 -	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 -	 */
 +	/* Save some code size by microfusing VPMINU with the load.
 +	   Since the matches in ymm2/ymm4 can only be returned if there
 +	   where no matches in ymm1/ymm3 respectively there is no issue
 +	   with overlap.  */
 	vmovdqa	1(%rdi), %ymm1
 	VPMINU	(VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
 	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
 	VPMINU	%ymm2, %ymm4, %ymm5
 	VPCMPEQ	%ymm5, %ymm0, %ymm5
 -	vpmovmskb	%ymm5, %ecx
 +	vpmovmskb %ymm5, %ecx
 	subq	$-(VEC_SIZE * 4), %rdi
 	testl	%ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
 	VPCMPEQ	%ymm1, %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	subq	%rdx, %rdi
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x0)
 	VPCMPEQ	%ymm2, %ymm0, %ymm2
 -	vpmovmskb	%ymm2, %eax
 +	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_return_x1)
 	/* Combine last 2 VEC.  */
 	VPCMPEQ	%ymm3, %ymm0, %ymm3
 -	vpmovmskb	%ymm3, %eax
 -	/* rcx has combined result from all 4 VEC. It will only be used if
 -	   the first 3 other VEC all did not contain a match.  */
 +	vpmovmskb %ymm3, %eax
 +	/* rcx has combined result from all 4 VEC. It will only be used
 +	   if the first 3 other VEC all did not contain a match.  */
 	salq	$32, %rcx
 	orq	%rcx, %rax
 	tzcntq	%rax, %rax
 	subq	$(VEC_SIZE * 2 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
 # ifdef USE_AS_STRNLEN
 	.p2align 4
 L(last_4x_vec_or_less_load):
 -	/* Depending on entry adjust rdi / prepare first VEC in ymm1.  */
 +	/* Depending on entry adjust rdi / prepare first VEC in ymm1.
 +	 */
 	subq	$-(VEC_SIZE * 4), %rdi
 L(last_4x_vec_or_less_cmpeq):
 	VPCMPEQ	1(%rdi), %ymm0, %ymm1
 L(last_4x_vec_or_less):
 -
 -	vpmovmskb	%ymm1, %eax
 -	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
 -	   VEC_SIZE * 4.  */
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %esi
 +#  endif
 +	vpmovmskb %ymm1, %eax
 +	/* If remaining length > VEC_SIZE * 2. This works if esi is off
 +	   by VEC_SIZE * 4.  */
 	testl	$(VEC_SIZE * 2), %esi
 	jnz	L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
 	jb	L(max)
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
 	subq	$(VEC_SIZE * 4 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
 	subq	$(VEC_SIZE * 3 - 1), %rdi
 	addq	%rdi, %rax
 # ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
 	jnz	L(last_vec_x1)
 	VPCMPEQ	(VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x2)
 	/* Normalize length.  */
 	andl	$(VEC_SIZE * 4 - 1), %esi
 	VPCMPEQ	(VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
 	jb	L(max)
 	VPCMPEQ	(VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	tzcntl	%eax, %eax
 	/* Check the end of data.  */
 	cmpl	%eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
 	addl	$(VEC_SIZE * 3 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
 	incl	%eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
 	addl	$(VEC_SIZE + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
 	addl	$(VEC_SIZE * 2 + 1), %eax
 	addq	%rdi, %rax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 	shrq	$2, %rax
 #  endif
 	VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
 	VZEROUPPER_RETURN
 # endif
 -	/* Cold case for crossing page with first load.	 */
 +	/* Cold case for crossing page with first load.  */
 	.p2align 4
 L(cross_page_boundary):
 	/* Align data to VEC_SIZE - 1.  */
 	orq	$(VEC_SIZE - 1), %rdi
 	VPCMPEQ	-(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
 -	vpmovmskb	%ymm1, %eax
 +	vpmovmskb %ymm1, %eax
 	/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
 	   so no need to manually mod rdx.  */
 	sarxl	%edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
 	jnz	L(cross_page_less_vec)
 	leaq	1(%rdi), %rcx
 	subq	%rdx, %rcx
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get wchar_t count.  */
 +	shrl	$2, %ecx
 +#  endif
 	/* Check length.  */
 	cmpq	%rsi, %rcx
 	jb	L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
 	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 #  ifdef USE_AS_WCSLEN
 +	/* NB: Divide length by 4 to get wchar_t count.  */
 	shrl	$2, %eax
 #  endif
 # endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
 	.p2align 4
 L(cross_page_less_vec):
 	tzcntl	%eax, %eax
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Multiply length by 4 to get byte count.  */
 +	sall	$2, %esi
 +#  endif
 	cmpq	%rax, %rsi
 	cmovb	%esi, %eax
 #  ifdef USE_AS_WCSLEN
 diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 index 8f660bb9..439e486a 100644
 --- a/sysdeps/x86_64/multiarch/strlen-vec.S
 +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
 -	shl	$2, %RSI_LP
 +/* Check for overflow from maxlen * sizeof(wchar_t). If it would
 +   overflow the only way this program doesn't have undefined behavior 
 +   is if there is a null terminator in valid memory so wcslen will 
 +   suffice.  */
 +	mov	%RSI_LP, %R10_LP
 +	sar	$62, %R10_LP
 +	test	%R10_LP, %R10_LP
 +	jnz	__wcslen_sse4_1
 +	sal	$2, %RSI_LP
 # endif
 +
 /* Initialize long lived registers.  */
 	add	%RDI_LP, %RSI_LP
 +# ifdef AS_WCSLEN
 +/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
 +	jbe	__wcslen_sse4_1
 +# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-31.patch
+++ b/SOURCES/glibc-RHEL-15696-31.patch
@ -1,745 +0,0 @@
 From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 19 Apr 2021 19:36:06 -0400
 Subject: [PATCH] x86: Optimize strlen-evex.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes strlen-evex.S. The
 optimizations are mostly small things but they add up to roughly
 10-30% performance improvement for strlen. The results for strnlen are
 a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
 test-wcsnlen are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
 1 file changed, 317 insertions(+), 264 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
 index 05838190..4bf6874b 100644
 --- a/sysdeps/x86_64/multiarch/strlen-evex.S
 +++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -29,11 +29,13 @@
 # ifdef USE_AS_WCSLEN
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
 -#  define SHIFT_REG	r9d
 +#  define SHIFT_REG ecx
 +#  define CHAR_SIZE	4
 # else
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
 -#  define SHIFT_REG	ecx
 +#  define SHIFT_REG edx
 +#  define CHAR_SIZE	1
 # endif
 # define XMMZERO	xmm16
@@ -46,132 +48,165 @@
 # define YMM6		ymm22
 # define VEC_SIZE 32
 +# define PAGE_SIZE 4096
 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 	.section .text.evex,"ax",@progbits
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 -	/* Check for zero length.  */
 +	/* Check zero length.  */
 	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
 -#  ifdef USE_AS_WCSLEN
 -	shl	$2, %RSI_LP
 -#  elif defined __ILP32__
 +#  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%esi, %esi
 #  endif
 	mov	%RSI_LP, %R8_LP
 # endif
 -	movl	%edi, %ecx
 -	movq	%rdi, %rdx
 +	movl	%edi, %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 -
 +	/* Clear high bits from edi. Only keeping bits relevant to page
 +	   cross check.  */
 +	andl	$(PAGE_SIZE - 1), %eax
 	/* Check if we may cross page boundary with one vector load.  */
 -	andl	$(2 * VEC_SIZE - 1), %ecx
 -	cmpl	$VEC_SIZE, %ecx
 -	ja	L(cros_page_boundary)
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.  Each bit in K0 represents a
 	   null byte.  */
 	VPCMP	$0, (%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -
 # ifdef USE_AS_STRNLEN
 -	jnz	L(first_vec_x0_check)
 -	/* Adjust length and check the end of data.  */
 -	subq	$VEC_SIZE, %rsi
 -	jbe	L(max)
 -# else
 -	jnz	L(first_vec_x0)
 +	/* If length < CHAR_PER_VEC handle special.  */
 +	cmpq	$CHAR_PER_VEC, %rsi
 +	jbe	L(first_vec_x0)
 # endif
 -
 -	/* Align data for aligned loads in the loop.  */
 -	addq	$VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 -
 +	testl	%eax, %eax
 +	jz	L(aligned_more)
 +	tzcntl	%eax, %eax
 +	ret
 # ifdef USE_AS_STRNLEN
 -	/* Adjust length.  */
 -	addq	%rcx, %rsi
 +L(zero):
 +	xorl	%eax, %eax
 +	ret
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 +	.p2align 4
 +L(first_vec_x0):
 +	/* Set bit for max len so that tzcnt will return min of max len
 +	   and position of first match.  */
 +	btsq	%rsi, %rax
 +	tzcntl	%eax, %eax
 +	ret
 # endif
 -	jmp	L(more_4x_vec)
 	.p2align 4
 -L(cros_page_boundary):
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 -
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 -	   bytes.  */
 -	movl	%ecx, %SHIFT_REG
 -	sarl	$2, %SHIFT_REG
 +L(first_vec_x1):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 +# ifdef USE_AS_STRNLEN
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	leal	-(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
 +# else
 +	subl	%edx, %edi
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %edi
 +#  endif
 +	leal	CHAR_PER_VEC(%rdi, %rax), %eax
 # endif
 -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 -	kmovd	%k0, %eax
 +	ret
 -	/* Remove the leading bytes.  */
 -	sarxl	%SHIFT_REG, %eax, %eax
 -	testl	%eax, %eax
 -	jz	L(aligned_more)
 +	.p2align 4
 +L(first_vec_x2):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -# endif
 -	addq	%rdi, %rax
 -	addq	%rcx, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	leal	-(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
 +# else
 +	subl	%edx, %edi
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %edi
 +#  endif
 +	leal	(CHAR_PER_VEC * 2)(%rdi, %rax), %eax
 # endif
 	ret
 	.p2align 4
 -L(aligned_more):
 +L(first_vec_x3):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -        /* "rcx" is less than VEC_SIZE.  Calculate "rdx + rcx - VEC_SIZE"
 -	    with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
 -	    to void possible addition overflow.  */
 -	negq	%rcx
 -	addq	$VEC_SIZE, %rcx
 -
 -	/* Check the end of data.  */
 -	subq	%rcx, %rsi
 -	jbe	L(max)
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	leal	-(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
 +# else
 +	subl	%edx, %edi
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %edi
 +#  endif
 +	leal	(CHAR_PER_VEC * 3)(%rdi, %rax), %eax
 # endif
 +	ret
 -	addq	$VEC_SIZE, %rdi
 -
 +	.p2align 4
 +L(first_vec_x4):
 +	tzcntl	%eax, %eax
 +	/* Safe to use 32 bit instructions as these are only called for
 +	   size = [1, 159].  */
 # ifdef USE_AS_STRNLEN
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 +	/* Use ecx which was computed earlier to compute correct value.
 +	 */
 +	leal	-(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
 +# else
 +	subl	%edx, %edi
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %edi
 +#  endif
 +	leal	(CHAR_PER_VEC * 4)(%rdi, %rax), %eax
 # endif
 +	ret
 -L(more_4x_vec):
 +	.p2align 5
 +L(aligned_more):
 +	movq	%rdi, %rdx
 +	/* Align data to VEC_SIZE.  */
 +	andq	$-(VEC_SIZE), %rdi
 +L(cross_page_continue):
 	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 	   since data is only aligned to VEC_SIZE.  */
 -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -
 +# ifdef USE_AS_STRNLEN
 +	/* + CHAR_SIZE because it simplies the logic in
 +	   last_4x_vec_or_less.  */
 +	leaq	(VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
 +	subq	%rdx, %rcx
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 +#  endif
 +# endif
 +	/* Load first VEC regardless.  */
 	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
 +# ifdef USE_AS_STRNLEN
 +	/* Adjust length. If near end handle specially.  */
 +	subq	%rcx, %rsi
 +	jb	L(last_4x_vec_or_less)
 +# endif
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 -	testl	%eax, %eax
 +	test	%eax, %eax
 	jnz	L(first_vec_x2)
 	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
@@ -179,258 +214,276 @@ L(more_4x_vec):
 	testl	%eax, %eax
 	jnz	L(first_vec_x3)
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 -# ifdef USE_AS_STRNLEN
 -	subq	$(VEC_SIZE * 4), %rsi
 -	jbe	L(last_4x_vec_or_less)
 -# endif
 -
 -	/* Align data to 4 * VEC_SIZE.  */
 -	movq	%rdi, %rcx
 -	andl	$(4 * VEC_SIZE - 1), %ecx
 -	andq	$-(4 * VEC_SIZE), %rdi
 +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x4)
 +	addq	$VEC_SIZE, %rdi
 # ifdef USE_AS_STRNLEN
 -	/* Adjust length.  */
 +	/* Check if at last VEC_SIZE * 4 length.  */
 +	cmpq	$(CHAR_PER_VEC * 4 - 1), %rsi
 +	jbe	L(last_4x_vec_or_less_load)
 +	movl	%edi, %ecx
 +	andl	$(VEC_SIZE * 4 - 1), %ecx
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarl	$2, %ecx
 +#  endif
 +	/* Readjust length.  */
 	addq	%rcx, %rsi
 # endif
 +	/* Align data to VEC_SIZE * 4.  */
 +	andq	$-(VEC_SIZE * 4), %rdi
 +	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	VMOVA	(%rdi), %YMM1
 -	VMOVA	VEC_SIZE(%rdi), %YMM2
 -	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM3
 -	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM4
 -
 -	VPMINU	%YMM1, %YMM2, %YMM5
 -	VPMINU	%YMM3, %YMM4, %YMM6
 +	/* Load first VEC regardless.  */
 +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 +# ifdef USE_AS_STRNLEN
 +	/* Break if at end of length.  */
 +	subq	$(CHAR_PER_VEC * 4), %rsi
 +	jb	L(last_4x_vec_or_less_cmpeq)
 +# endif
 +	/* Save some code size by microfusing VPMINU with the load. Since
 +	   the matches in ymm2/ymm4 can only be returned if there where no
 +	   matches in ymm1/ymm3 respectively there is no issue with overlap.
 +	 */
 +	VPMINU	(VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
 +	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 +	VPMINU	(VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
 +
 +	VPCMP	$0, %YMM2, %YMMZERO, %k0
 +	VPCMP	$0, %YMM4, %YMMZERO, %k1
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	kortestd	%k0, %k1
 +	jz	L(loop_4x_vec)
 +
 +	/* Check if end was in first half.  */
 +	kmovd	%k0, %eax
 +	subq	%rdx, %rdi
 +# ifdef USE_AS_WCSLEN
 +	shrq	$2, %rdi
 +# endif
 +	testl	%eax, %eax
 +	jz	L(second_vec_return)
 -	VPMINU	%YMM5, %YMM6, %YMM5
 -	VPCMP	$0, %YMM5, %YMMZERO, %k0
 -	ktestd	%k0, %k0
 -	jnz	L(4x_vec_end)
 +	VPCMP	$0, %YMM1, %YMMZERO, %k2
 +	kmovd	%k2, %edx
 +	/* Combine VEC1 matches (edx) with VEC2 matches (eax).  */
 +# ifdef USE_AS_WCSLEN
 +	sall	$CHAR_PER_VEC, %eax
 +	orl	%edx, %eax
 +	tzcntl	%eax, %eax
 +# else
 +	salq	$CHAR_PER_VEC, %rax
 +	orq	%rdx, %rax
 +	tzcntq	%rax, %rax
 +# endif
 +	addq	%rdi, %rax
 +	ret
 -	addq	$(VEC_SIZE * 4), %rdi
 -# ifndef USE_AS_STRNLEN
 -	jmp	L(loop_4x_vec)
 -# else
 -	subq	$(VEC_SIZE * 4), %rsi
 -	ja	L(loop_4x_vec)
 +# ifdef USE_AS_STRNLEN
 +L(last_4x_vec_or_less_load):
 +	/* Depending on entry adjust rdi / prepare first VEC in YMM1.  */
 +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 +L(last_4x_vec_or_less_cmpeq):
 +	VPCMP	$0, %YMM1, %YMMZERO, %k0
 +	addq	$(VEC_SIZE * 3), %rdi
 L(last_4x_vec_or_less):
 -	/* Less than 4 * VEC and aligned to VEC_SIZE.  */
 -	addl	$(VEC_SIZE * 2), %esi
 -	jle	L(last_2x_vec)
 -
 -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 +	/* If remaining length > VEC_SIZE * 2. This works if esi is off by
 +	   VEC_SIZE * 4.  */
 +	testl	$(CHAR_PER_VEC * 2), %esi
 +	jnz	L(last_4x_vec)
 +
 +	/* length may have been negative or positive by an offset of
 +	   CHAR_PER_VEC * 4 depending on where this was called from. This
 +	   fixes that.  */
 +	andl	$(CHAR_PER_VEC * 4 - 1), %esi
 	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	jnz	L(last_vec_x1_check)
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 +	/* Check the end of data.  */
 +	subl	$CHAR_PER_VEC, %esi
 +	jb	L(max)
 	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2_check)
 -	subl	$VEC_SIZE, %esi
 -	jle	L(max)
 +	tzcntl	%eax, %eax
 +	/* Check the end of data.  */
 +	cmpl	%eax, %esi
 +	jb	L(max)
 -	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x3_check)
 +	subq	%rdx, %rdi
 +#  ifdef USE_AS_WCSLEN
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 +#  endif
 +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 +	ret
 +L(max):
 	movq	%r8, %rax
 +	ret
 +# endif
 +
 +	/* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
 +	   in the 4x VEC loop can use 2 byte encoding.  */
 +	.p2align 4
 +L(second_vec_return):
 +	VPCMP	$0, %YMM3, %YMMZERO, %k0
 +	/* Combine YMM3 matches (k0) with YMM4 matches (k1).  */
 +# ifdef USE_AS_WCSLEN
 +	kunpckbw	%k0, %k1, %k0
 +	kmovd	%k0, %eax
 +	tzcntl	%eax, %eax
 +# else
 +	kunpckdq	%k0, %k1, %k0
 +	kmovq	%k0, %rax
 +	tzcntq	%rax, %rax
 +# endif
 +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 +	ret
 +
 +
 +# ifdef USE_AS_STRNLEN
 +L(last_vec_x1_check):
 +	tzcntl	%eax, %eax
 +	/* Check the end of data.  */
 +	cmpl	%eax, %esi
 +	jb	L(max)
 +	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 #  endif
 +	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 	.p2align 4
 -L(last_2x_vec):
 -	addl	$(VEC_SIZE * 2), %esi
 +L(last_4x_vec):
 +	/* Test first 2x VEC normally.  */
 +	testl	%eax, %eax
 +	jnz	L(last_vec_x1)
 -	VPCMP	$0, (%rdi), %YMMZERO, %k0
 +	VPCMP	$0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x0_check)
 -	subl	$VEC_SIZE, %esi
 -	jle	L(max)
 +	jnz	L(last_vec_x2)
 -	VPCMP	$0, VEC_SIZE(%rdi), %YMMZERO, %k0
 +	/* Normalize length.  */
 +	andl	$(CHAR_PER_VEC * 4 - 1), %esi
 +	VPCMP	$0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1_check)
 -	movq	%r8, %rax
 -#  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -#  endif
 -	ret
 +	jnz	L(last_vec_x3)
 -	.p2align 4
 -L(first_vec_x0_check):
 +	/* Check the end of data.  */
 +	subl	$(CHAR_PER_VEC * 3), %esi
 +	jb	L(max)
 +
 +	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
 +	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 -#  ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -#  endif
 	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 +	cmpl	%eax, %esi
 +	jb	L(max_end)
 +
 +	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 #  endif
 +	leaq	(CHAR_PER_VEC * 4)(%rdi, %rax), %rax
 	ret
 	.p2align 4
 -L(first_vec_x1_check):
 +L(last_vec_x1):
 	tzcntl	%eax, %eax
 +	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -#  endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$VEC_SIZE, %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -#  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 #  endif
 +	leaq	(CHAR_PER_VEC)(%rdi, %rax), %rax
 	ret
 	.p2align 4
 -L(first_vec_x2_check):
 +L(last_vec_x2):
 	tzcntl	%eax, %eax
 +	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -#  endif
 -	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$(VEC_SIZE * 2), %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -#  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 #  endif
 +	leaq	(CHAR_PER_VEC * 2)(%rdi, %rax), %rax
 	ret
 	.p2align 4
 -L(first_vec_x3_check):
 +L(last_vec_x3):
 	tzcntl	%eax, %eax
 -#  ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -#  endif
 +	subl	$(CHAR_PER_VEC * 2), %esi
 	/* Check the end of data.  */
 -	cmpq	%rax, %rsi
 -	jbe	L(max)
 -	addq	$(VEC_SIZE * 3), %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 +	cmpl	%eax, %esi
 +	jb	L(max_end)
 +	subq	%rdx, %rdi
 #  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 +	sarq	$2, %rdi
 #  endif
 +	leaq	(CHAR_PER_VEC * 3)(%rdi, %rax), %rax
 	ret
 -
 -	.p2align 4
 -L(max):
 +L(max_end):
 	movq	%r8, %rax
 -#  ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -#  endif
 -	ret
 -
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 	ret
 # endif
 +	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 -L(first_vec_x0):
 -	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 +L(cross_page_boundary):
 +	movq	%rdi, %rdx
 +	/* Align data to VEC_SIZE.  */
 +	andq	$-VEC_SIZE, %rdi
 +	VPCMP	$0, (%rdi), %YMMZERO, %k0
 +	kmovd	%k0, %eax
 +	/* Remove the leading bytes.  */
 # ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 +	/* NB: Divide shift count by 4 since each bit in K0 represent 4
 +	   bytes.  */
 +	movl	%edx, %ecx
 +	shrl	$2, %ecx
 +	andl	$(CHAR_PER_VEC - 1), %ecx
 # endif
 -	ret
 -
 -	.p2align 4
 -L(first_vec_x1):
 +	/* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise.  */
 +	sarxl	%SHIFT_REG, %eax, %eax
 +	testl	%eax, %eax
 +# ifndef USE_AS_STRNLEN
 +	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 -	addq	$VEC_SIZE, %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -# endif
 	ret
 -
 -	.p2align 4
 -L(first_vec_x2):
 -	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 -	addq	$(VEC_SIZE * 2), %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -# endif
 +# else
 +	jnz	L(cross_page_less_vec)
 +#  ifndef USE_AS_WCSLEN
 +	movl	%edx, %ecx
 +	andl	$(CHAR_PER_VEC - 1), %ecx
 +#  endif
 +	movl	$CHAR_PER_VEC, %eax
 +	subl	%ecx, %eax
 +	/* Check the end of data.  */
 +	cmpq	%rax, %rsi
 +	ja	L(cross_page_continue)
 +	movl	%esi, %eax
 	ret
 -
 -	.p2align 4
 -L(4x_vec_end):
 -	VPCMP	$0, %YMM1, %YMMZERO, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -	VPCMP	$0, %YMM2, %YMMZERO, %k1
 -	kmovd	%k1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -	VPCMP	$0, %YMM3, %YMMZERO, %k2
 -	kmovd	%k2, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -	VPCMP	$0, %YMM4, %YMMZERO, %k3
 -	kmovd	%k3, %eax
 -L(first_vec_x3):
 +L(cross_page_less_vec):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_WCSLEN
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	sall	$2, %eax
 -# endif
 -	addq	$(VEC_SIZE * 3), %rax
 -	addq	%rdi, %rax
 -	subq	%rdx, %rax
 -# ifdef USE_AS_WCSLEN
 -	shrq	$2, %rax
 -# endif
 +	/* Select min of length and position of first null.  */
 +	cmpq	%rax, %rsi
 +	cmovb	%esi, %eax
 	ret
 +# endif
 END (STRLEN)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-32.patch
+++ b/SOURCES/glibc-RHEL-15696-32.patch
@ -1,158 +0,0 @@
 From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Wed, 30 Jun 2021 10:47:06 -0700
 Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
 Content-type: text/plain; charset=UTF-8
 From
 https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
 * Intel TSX will be disabled by default.
 * The processor will force abort all Restricted Transactional Memory (RTM)
  transactions by default.
 * A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
  which is set to indicate to updated software that the loaded microcode is
  forcing RTM abort.
 * On processors that enumerate support for RTM, the CPUID enumeration bits
  for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
  be set by default after microcode update.
 * Workloads that were benefited from Intel TSX might experience a change
  in performance.
 * System software may use a new bit in Model-Specific Register (MSR) 0x10F
  TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
  Elision (HLE) and RTM bits to indicate to software that Intel TSX is
  disabled.
 1. Add RTM_ALWAYS_ABORT to CPUID features.
 2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set.  This skips the
 string/tst-memchr-rtm etc. testcases on the affected processors, which
 always fail after a microcode update.
 3. Check RTM feature, instead of usability, against /proc/cpuinfo.
 This fixes BZ #28033.
 ---
 manual/platform.texi                    | 3 +++
 sysdeps/x86/cpu-features.c              | 5 ++++-
 sysdeps/x86/sys/platform/x86.h          | 6 +++---
 sysdeps/x86/tst-cpu-features-supports.c | 2 +-
 sysdeps/x86/tst-get-cpu-features.c      | 2 ++
 5 files changed, 13 insertions(+), 5 deletions(-)
 Conflicts:
 	sysdeps/x86/bits/platform/x86.h
 	(doesn't exist)
 	sysdeps/x86/bits/platform/x86.h
 	(account for lack of upstream renames)
 diff --git a/manual/platform.texi b/manual/platform.texi
 index 8fec2933..b7e8aef7 100644
 --- a/manual/platform.texi
 +++ b/manual/platform.texi
@@ -510,6 +510,9 @@ capability.
 @item
 @code{RTM} -- RTM instruction extensions.
 +@item
 +@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
 +
 @item
 @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
 diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
 index 3610ee5c..4889f062 100644
 --- a/sysdeps/x86/cpu-features.c
 +++ b/sysdeps/x86/cpu-features.c
@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, HLE);
   CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
   CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
 -  CPU_FEATURE_SET_USABLE (cpu_features, RTM);
   CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
   CPU_FEATURE_SET_USABLE (cpu_features, ADX);
   CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
   CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
   CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
 +  CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
   CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
   CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
   CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
@@ -779,6 +779,9 @@ no_cpuid:
     GLRO(dl_platform) = "i586";
 #endif
 +  if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
 +    CPU_FEATURE_SET_USABLE (cpu_features, RTM);
 +
 #if CET_ENABLED
 # if HAVE_TUNABLES
   TUNABLE_GET (x86_ibt, tunable_val_t *,
 diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
 index e5cc7c68..7a434926 100644
 --- a/sysdeps/x86/sys/platform/x86.h
 +++ b/sysdeps/x86/sys/platform/x86.h
@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 #define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
 #define bit_cpu_INDEX_7_EDX_9	(1u << 9)
 #define bit_cpu_MD_CLEAR	(1u << 10)
 -#define bit_cpu_INDEX_7_EDX_11	(1u << 11)
 +#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
 #define bit_cpu_INDEX_7_EDX_12	(1u << 12)
 #define bit_cpu_INDEX_7_EDX_13	(1u << 13)
 #define bit_cpu_SERIALIZE	(1u << 14)
@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 #define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_9	COMMON_CPUID_INDEX_7
 #define index_cpu_MD_CLEAR	COMMON_CPUID_INDEX_7
 -#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
 +#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
 #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
 #define index_cpu_SERIALIZE	COMMON_CPUID_INDEX_7
@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
 #define reg_AVX512_VP2INTERSECT	edx
 #define reg_INDEX_7_EDX_9	edx
 #define reg_MD_CLEAR		edx
 -#define reg_INDEX_7_EDX_11	edx
 +#define reg_RTM_ALWAYS_ABORT	edx
 #define reg_INDEX_7_EDX_12	edx
 #define reg_INDEX_7_EDX_13	edx
 #define reg_SERIALIZE		edx
 diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
 index 287cf01f..8100a319 100644
 --- a/sysdeps/x86/tst-cpu-features-supports.c
 +++ b/sysdeps/x86/tst-cpu-features-supports.c
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
   fails += CHECK_SUPPORTS (rdpid, RDPID);
   fails += CHECK_SUPPORTS (rdrnd, RDRAND);
   fails += CHECK_SUPPORTS (rdseed, RDSEED);
 -  fails += CHECK_SUPPORTS (rtm, RTM);
 +  fails += CHECK_CPU_SUPPORTS (rtm, RTM);
   fails += CHECK_SUPPORTS (serialize, SERIALIZE);
   fails += CHECK_SUPPORTS (sha, SHA);
   fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
 diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
 index 2763deb6..0717e5d8 100644
 --- a/sysdeps/x86/tst-get-cpu-features.c
 +++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -183,6 +183,7 @@ do_test (void)
   CHECK_CPU_FEATURE (UINTR);
   CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
   CHECK_CPU_FEATURE (MD_CLEAR);
 +  CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
   CHECK_CPU_FEATURE (SERIALIZE);
   CHECK_CPU_FEATURE (HYBRID);
   CHECK_CPU_FEATURE (TSXLDTRK);
@@ -344,6 +345,7 @@ do_test (void)
   CHECK_CPU_FEATURE_USABLE (FSRM);
   CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
   CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
 +  CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
   CHECK_CPU_FEATURE_USABLE (SERIALIZE);
   CHECK_CPU_FEATURE_USABLE (HYBRID);
   CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-33.patch
+++ b/SOURCES/glibc-RHEL-15696-33.patch
@ -1,51 +0,0 @@
 From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Thu, 8 Jul 2021 16:13:19 -0400
 Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
 #28064]
 Content-type: text/plain; charset=UTF-8
 The following commit
 commit 6f573a27b6c8b4236445810a44660612323f5a73
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Wed Jun 23 01:19:34 2021 -0400
    x86-64: Add wcslen optimize for sse4.1
 Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
 not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
 fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
 implementation list and adding wcslen-sse4.1 to the ifunc
 implementation list.
 Testing:
 test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
 well as all other tests in wcsmbs and string.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 580913ca..695cdba6 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcslen_evex)
 -	      IFUNC_IMPL_ADD (array, i, wcsnlen,
 +	      IFUNC_IMPL_ADD (array, i, wcslen,
 			      CPU_FEATURE_USABLE (SSE4_1),
 -			      __wcsnlen_sse4_1)
 +			      __wcslen_sse4_1)
 	      IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
   /* Support sysdeps/x86_64/multiarch/wcsnlen.c.  */
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-35.patch
+++ b/SOURCES/glibc-RHEL-15696-35.patch
@ -1,51 +0,0 @@
 From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Sat, 9 May 2020 12:04:23 -0700
 Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
 #25966]
 Content-type: text/plain; charset=UTF-8
 Since __x86_shared_non_temporal_threshold is defined as
 long int __x86_shared_non_temporal_threshold;
 and long int is 4 bytes for x32, use RDX_LP to compare against
 __x86_shared_non_temporal_threshold in assembly code.
 ---
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index 71f5954d..673b73aa 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -245,7 +245,7 @@ L(return):
 #endif
 L(movsb):
 -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	jae	L(more_8x_vec)
 	cmpq	%rsi, %rdi
 	jb	1f
@@ -397,7 +397,7 @@ L(more_8x_vec):
 	addq	%r8, %rdx
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
 -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_forward)
 #endif
 L(loop_4x_vec_forward):
@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
 	subq	%r8, %rdx
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 	/* Check non-temporal store threshold.  */
 -	cmpq	__x86_shared_non_temporal_threshold(%rip), %rdx
 +	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 	ja	L(large_backward)
 #endif
 L(loop_4x_vec_backward):
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-36.patch
+++ b/SOURCES/glibc-RHEL-15696-36.patch
@ -1,44 +0,0 @@
 From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Thu, 11 Jun 2020 12:41:18 -0700
 Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
 Content-type: text/plain; charset=UTF-8
 Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
 %xmmN, instead of %ymmN, with vpxor to clear a vector register.
 ---
 sysdeps/x86_64/multiarch/strcmp-avx2.S  | 4 ++--
 sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 index 433ae047..70d8499b 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -105,8 +105,8 @@ ENTRY (STRCMP)
 # endif
 	movl	%edi, %eax
 	xorl	%edx, %edx
 -	/* Make %ymm7 all zeros in this function.  */
 -	vpxor	%ymm7, %ymm7, %ymm7
 +	/* Make %xmm7 (%ymm7) all zeros in this function.  */
 +	vpxor	%xmm7, %xmm7, %xmm7
 	orl	%esi, %eax
 	andl	$(PAGE_SIZE - 1), %eax
 	cmpl	$(PAGE_SIZE - (VEC_SIZE * 4)), %eax
 diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
 index 9f22a15e..c949410b 100644
 --- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
 	movl	%edi, %ecx
 	/* Broadcast CHAR to YMM4.  */
 	VPBROADCAST %xmm4, %ymm4
 -	vpxor	%ymm0, %ymm0, %ymm0
 +	vpxor	%xmm0, %xmm0, %xmm0
 	/* Check if we may cross page boundary with one vector load.  */
 	andl	$(2 * VEC_SIZE - 1), %ecx
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-37.patch
+++ b/SOURCES/glibc-RHEL-15696-37.patch
@ -1,359 +0,0 @@
 From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
 From: noah <goldstein.w.n@gmail.com>
 Date: Wed, 3 Feb 2021 00:38:59 -0500
 Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
 Content-type: text/plain; charset=UTF-8
 No bug. Just seemed the performance could be improved a bit. Observed
 and expected behavior are unchanged. Optimized body of main
 loop. Updated page cross logic and optimized accordingly. Made a few
 minor instruction selection modifications. No regressions in test
 suite. Both test-strchrnul and test-strchr passed.
 ---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
 sysdeps/x86_64/multiarch/strchr.c      |   4 +-
 2 files changed, 114 insertions(+), 115 deletions(-)
 Conflicts:
 	sysdeps/x86_64/multiarch/strchr.c
 	(account for missing upstream macros)
 diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
 index da7d2620..919d256c 100644
 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
 # ifdef USE_AS_WCSCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
 +#  define VPMINU	vpminud
 #  define CHAR_REG	esi
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
 +#  define VPMINU	vpminub
 #  define CHAR_REG	sil
 # endif
@@ -43,71 +45,54 @@
 # endif
 # define VEC_SIZE 32
 +# define PAGE_SIZE 4096
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
 	movl	%edi, %ecx
 -	/* Broadcast CHAR to YMM0.  */
 +# ifndef USE_AS_STRCHRNUL
 +	xorl	%edx, %edx
 +# endif
 +
 +	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 	vpxor	%xmm9, %xmm9, %xmm9
 	VPBROADCAST %xmm0, %ymm0
 -	/* Check if we may cross page boundary with one vector load.  */
 -	andl	$(2 * VEC_SIZE - 1), %ecx
 -	cmpl	$VEC_SIZE, %ecx
 -	ja	L(cros_page_boundary)
 -	/* Check the first VEC_SIZE bytes.  Search for both CHAR and the
 -	   null byte.  */
 -	vmovdqu	(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 -	vpor	%ymm1, %ymm2, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	/* Check if we cross page boundary with one vector load.  */
 +	andl	$(PAGE_SIZE - 1), %ecx
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
 +	ja  L(cross_page_boundary)
 -	/* Align data for aligned loads in the loop.  */
 -	addq	$VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 -
 -	jmp	L(more_4x_vec)
 -
 -	.p2align 4
 -L(cros_page_boundary):
 -	andl	$(VEC_SIZE - 1), %ecx
 -	andq	$-VEC_SIZE, %rdi
 +	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 +	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 -	/* Remove the leading bytes.  */
 -	sarl	%cl, %eax
 	testl	%eax, %eax
 -	jz	L(aligned_more)
 -	/* Found CHAR or the null byte.  */
 +	jz	L(more_vecs)
 	tzcntl	%eax, %eax
 -	addq	%rcx, %rax
 -# ifdef USE_AS_STRCHRNUL
 +	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
 -# else
 -	xorl	%edx, %edx
 -	leaq	(%rdi, %rax), %rax
 -	cmp	(%rax), %CHAR_REG
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 	.p2align 4
 +L(more_vecs):
 +	/* Align data for aligned loads in the loop.  */
 +	andq	$-VEC_SIZE, %rdi
 L(aligned_more):
 -	addq	$VEC_SIZE, %rdi
 -L(more_4x_vec):
 -	/* Check the first 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 -	   since data is only aligned to VEC_SIZE.  */
 -	vmovdqa	(%rdi), %ymm8
 +	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
 +	   since data is only aligned to VEC_SIZE.	*/
 +	vmovdqa	VEC_SIZE(%rdi), %ymm8
 +	addq	$VEC_SIZE, %rdi
 	VPCMPEQ %ymm8, %ymm0, %ymm1
 	VPCMPEQ %ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
@@ -137,61 +122,24 @@ L(more_4x_vec):
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x3)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 -	/* Align data to 4 * VEC_SIZE.  */
 -	movq	%rdi, %rcx
 -	andl	$(4 * VEC_SIZE - 1), %ecx
 -	andq	$-(4 * VEC_SIZE), %rdi
 -
 -	.p2align 4
 -L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	vmovdqa	(%rdi), %ymm5
 -	vmovdqa	VEC_SIZE(%rdi), %ymm6
 -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
 -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 -
 -	VPCMPEQ %ymm5, %ymm0, %ymm1
 -	VPCMPEQ %ymm6, %ymm0, %ymm2
 -	VPCMPEQ %ymm7, %ymm0, %ymm3
 -	VPCMPEQ %ymm8, %ymm0, %ymm4
 -
 -	VPCMPEQ %ymm5, %ymm9, %ymm5
 -	VPCMPEQ %ymm6, %ymm9, %ymm6
 -	VPCMPEQ %ymm7, %ymm9, %ymm7
 -	VPCMPEQ %ymm8, %ymm9, %ymm8
 -
 -	vpor	%ymm1, %ymm5, %ymm1
 -	vpor	%ymm2, %ymm6, %ymm2
 -	vpor	%ymm3, %ymm7, %ymm3
 -	vpor	%ymm4, %ymm8, %ymm4
 -
 -	vpor	%ymm1, %ymm2, %ymm5
 -	vpor	%ymm3, %ymm4, %ymm6
 -
 -	vpor	%ymm5, %ymm6, %ymm5
 -
 -	vpmovmskb %ymm5, %eax
 -	testl	%eax, %eax
 -	jnz	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 +	jz	L(prep_loop_4x)
 -	jmp	L(loop_4x_vec)
 +	tzcntl	%eax, %eax
 +	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 +	cmovne	%rdx, %rax
 +# endif
 +	VZEROUPPER
 +	ret
 	.p2align 4
 L(first_vec_x0):
 -	/* Found CHAR or the null byte.  */
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_STRCHRNUL
 +	/* Found CHAR or the null byte.	 */
 	addq	%rdi, %rax
 -# else
 -	xorl	%edx, %edx
 -	leaq	(%rdi, %rax), %rax
 -	cmp	(%rax), %CHAR_REG
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -199,13 +147,9 @@ L(first_vec_x0):
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_STRCHRNUL
 -	addq	$VEC_SIZE, %rax
 -	addq	%rdi, %rax
 -# else
 -	xorl	%edx, %edx
 	leaq	VEC_SIZE(%rdi, %rax), %rax
 -	cmp	(%rax), %CHAR_REG
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER_RETURN
@@ -213,42 +157,97 @@ L(first_vec_x1):
 	.p2align 4
 L(first_vec_x2):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_STRCHRNUL
 -	addq	$(VEC_SIZE * 2), %rax
 -	addq	%rdi, %rax
 -# else
 -	xorl	%edx, %edx
 +	/* Found CHAR or the null byte.	 */
 	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 -	cmp	(%rax), %CHAR_REG
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER_RETURN
 +L(prep_loop_4x):
 +	/* Align data to 4 * VEC_SIZE.	*/
 +	andq	$-(VEC_SIZE * 4), %rdi
 +
 	.p2align 4
 -L(4x_vec_end):
 +L(loop_4x_vec):
 +	/* Compare 4 * VEC at a time forward.  */
 +	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
 +	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
 +	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
 +	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
 +
 +	/* Leaves only CHARS matching esi as 0.	 */
 +	vpxor	%ymm5, %ymm0, %ymm1
 +	vpxor	%ymm6, %ymm0, %ymm2
 +	vpxor	%ymm7, %ymm0, %ymm3
 +	vpxor	%ymm8, %ymm0, %ymm4
 +
 +	VPMINU	%ymm1, %ymm5, %ymm1
 +	VPMINU	%ymm2, %ymm6, %ymm2
 +	VPMINU	%ymm3, %ymm7, %ymm3
 +	VPMINU	%ymm4, %ymm8, %ymm4
 +
 +	VPMINU	%ymm1, %ymm2, %ymm5
 +	VPMINU	%ymm3, %ymm4, %ymm6
 +
 +	VPMINU	%ymm5, %ymm6, %ymm5
 +
 +	VPCMPEQ %ymm5, %ymm9, %ymm5
 +	vpmovmskb %ymm5, %eax
 +
 +	addq	$(VEC_SIZE * 4), %rdi
 +	testl	%eax, %eax
 +	jz  L(loop_4x_vec)
 +
 +	VPCMPEQ %ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x0)
 +
 +	VPCMPEQ %ymm2, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 	jnz	L(first_vec_x1)
 -	vpmovmskb %ymm3, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 +
 +	VPCMPEQ %ymm3, %ymm9, %ymm3
 +	VPCMPEQ %ymm4, %ymm9, %ymm4
 +	vpmovmskb %ymm3, %ecx
 	vpmovmskb %ymm4, %eax
 +	salq	$32, %rax
 +	orq %rcx, %rax
 +	tzcntq  %rax, %rax
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 +	cmovne	%rdx, %rax
 +# endif
 +	VZEROUPPER
 +	ret
 +
 +	/* Cold case for crossing page with first load.	 */
 +	.p2align 4
 +L(cross_page_boundary):
 +	andq	$-VEC_SIZE, %rdi
 +	andl	$(VEC_SIZE - 1), %ecx
 +
 +	vmovdqa	(%rdi), %ymm8
 +	VPCMPEQ %ymm8, %ymm0, %ymm1
 +	VPCMPEQ %ymm8, %ymm9, %ymm2
 +	vpor	%ymm1, %ymm2, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	/* Remove the leading bits.	 */
 +	sarxl	%ecx, %eax, %eax
 	testl	%eax, %eax
 -L(first_vec_x3):
 +	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_STRCHRNUL
 -	addq	$(VEC_SIZE * 3), %rax
 +	addq	%rcx, %rdi
 	addq	%rdi, %rax
 -# else
 -	xorl	%edx, %edx
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 -	cmp	(%rax), %CHAR_REG
 +# ifndef USE_AS_STRCHRNUL
 +	cmp (%rax), %CHAR_REG
 	cmovne	%rdx, %rax
 # endif
 	VZEROUPPER_RETURN
 END (STRCHR)
 -#endif
 +# endif
 diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
 index 7e582f02..5225bd4f 100644
 --- a/sysdeps/x86_64/multiarch/strchr.c
 +++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
   const struct cpu_features* cpu_features = __get_cpu_features ();
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-38.patch
+++ b/SOURCES/glibc-RHEL-15696-38.patch
@ -1,67 +0,0 @@
 From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Sat, 25 Jan 2020 14:19:40 -0800
 Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
 Content-type: text/plain; charset=UTF-8
 When copying with "rep movsb", if the distance between source and
 destination is N*4GB + [1..63] with N >= 0, performance may be very
 slow.  This patch updates memmove-vec-unaligned-erms.S for AVX and
 AVX512 versions with the distance in RCX:
 	cmpl	$63, %ecx
 	// Don't use "rep movsb" if ECX <= 63
 	jbe	L(Don't use "rep movsb")
 	Use "rep movsb"
 Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
 and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
 performance impact is within noise range as "rep movsb" is only used for
 data size >= 4KB.
 ---
 .../multiarch/memmove-vec-unaligned-erms.S    | 21 +++++++++++++++++++
 1 file changed, 21 insertions(+)
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index 673b73aa..c475fed4 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -64,6 +64,13 @@
 # endif
 #endif
 +/* Avoid short distance rep movsb only with non-SSE vector.  */
 +#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 +# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
 +#else
 +# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
 +#endif
 +
 #ifndef PREFETCH
 # define PREFETCH(addr) prefetcht0 addr
 #endif
@@ -255,7 +262,21 @@ L(movsb):
 	cmpq	%r9, %rdi
 	/* Avoid slow backward REP MOVSB.  */
 	jb	L(more_8x_vec_backward)
 +# if AVOID_SHORT_DISTANCE_REP_MOVSB
 +	movq	%rdi, %rcx
 +	subq	%rsi, %rcx
 +	jmp	2f
 +# endif
 1:
 +# if AVOID_SHORT_DISTANCE_REP_MOVSB
 +	movq	%rsi, %rcx
 +	subq	%rdi, %rcx
 +2:
 +/* Avoid "rep movsb" if RCX, the distance between source and destination,
 +   is N*4GB + [1..63] with N >= 0.  */
 +	cmpl	$63, %ecx
 +	jbe	L(more_2x_vec)	/* Avoid "rep movsb" if ECX <= 63.  */
 +# endif
 	mov	%RDX_LP, %RCX_LP
 	rep movsb
 L(nop):
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-39.patch
+++ b/SOURCES/glibc-RHEL-15696-39.patch
@ -1,449 +0,0 @@
 From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
 From: noah <goldstein.w.n@gmail.com>
 Date: Sat, 3 Apr 2021 04:12:15 -0400
 Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
 Content-type: text/plain; charset=UTF-8
 No Bug. This commit updates the large memcpy case (no overlap). The
 update is to perform memcpy on either 2 or 4 contiguous pages at
 once. This 1) helps to alleviate the effects of false memory aliasing
 when destination and source have a close 4k alignment and 2) In most
 cases and for most DRAM units is a modestly more efficient access
 pattern. These changes are a clear performance improvement for
 VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
 test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
 pass.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
 1 file changed, 265 insertions(+), 73 deletions(-)
 Conflicts:
 	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 	(different number of sections)
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index c475fed4..3e2dd6bc 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,7 +32,16 @@
       overlapping addresses.
    6. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
 -      instead of aligned store.  */
 +      instead of aligned store copying from either 2 or 4 pages at
 +      once.
 +   8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
 +      and source and destination do not page alias, copy from 2 pages
 +      at once using non-temporal stores. Page aliasing in this case is
 +      considered true if destination's page alignment - sources' page
 +      alignment is less than 8 * VEC_SIZE.
 +   9. If size >= 16 * __x86_shared_non_temporal_threshold or source
 +      and destination do page alias copy from 4 pages at once using
 +      non-temporal stores.  */
 #include <sysdep.h>
@@ -64,6 +73,34 @@
 # endif
 #endif
 +#ifndef PAGE_SIZE
 +# define PAGE_SIZE 4096
 +#endif
 +
 +#if PAGE_SIZE != 4096
 +# error Unsupported PAGE_SIZE
 +#endif
 +
 +#ifndef LOG_PAGE_SIZE
 +# define LOG_PAGE_SIZE 12
 +#endif
 +
 +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
 +# error Invalid LOG_PAGE_SIZE
 +#endif
 +
 +/* Byte per page for large_memcpy inner loop.  */
 +#if VEC_SIZE == 64
 +# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
 +#else
 +# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
 +#endif
 +
 +/* Amount to shift rdx by to compare for memcpy_large_4x.  */
 +#ifndef LOG_4X_MEMCPY_THRESH
 +# define LOG_4X_MEMCPY_THRESH 4
 +#endif
 +
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -103,6 +140,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
 +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
 +	VMOVU	(offset)base, vec0; \
 +	VMOVU	((offset) + VEC_SIZE)base, vec1;
 +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
 +	VMOVNT  vec0, (offset)base; \
 +	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
 +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
 +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 +	VMOVU	(offset)base, vec0; \
 +	VMOVU	((offset) + VEC_SIZE)base, vec1; \
 +	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
 +	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
 +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
 +	VMOVNT	vec0, (offset)base; \
 +	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
 +	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
 +	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
 +#else
 +# error Invalid LARGE_LOAD_SIZE
 +#endif
 +
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -390,6 +449,15 @@ L(last_4x_vec):
 	VZEROUPPER_RETURN
 L(more_8x_vec):
 +	/* Check if non-temporal move candidate.  */
 +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 +	/* Check non-temporal store threshold.  */
 +	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
 +	ja	L(large_memcpy_2x)
 +#endif
 +	/* Entry if rdx is greater than non-temporal threshold but there
 +       is overlap.  */
 +L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -416,24 +484,21 @@ L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
 -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 -	/* Check non-temporal store threshold.  */
 -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 -	ja	L(large_forward)
 -#endif
 +
 +	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 -	addq	$(VEC_SIZE * 4), %rsi
 -	subq	$(VEC_SIZE * 4), %rdx
 +	subq	$-(VEC_SIZE * 4), %rsi
 +	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 -	addq	$(VEC_SIZE * 4), %rdi
 +	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
 -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 -	/* Check non-temporal store threshold.  */
 -	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
 -	ja	L(large_backward)
 -#endif
 +
 +	.p2align 4
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
 -	subq	$(VEC_SIZE * 4), %rcx
 -	subq	$(VEC_SIZE * 4), %rdx
 +	addq	$-(VEC_SIZE * 4), %rcx
 +	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%r9)
 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
 -	subq	$(VEC_SIZE * 4), %r9
 +	addq	$-(VEC_SIZE * 4), %r9
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
 -L(large_forward):
 +	.p2align 4
 +L(large_memcpy_2x):
 +	/* Compute absolute value of difference between source and
 +	   destination.  */
 +	movq	%rdi, %r9
 +	subq	%rsi, %r9
 +	movq	%r9, %r8
 +	leaq	-1(%r9), %rcx
 +	sarq	$63, %r8
 +	xorq	%r8, %r9
 +	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
 -	   destination and source since destination may be in cache
 -	   when source is loaded.  */
 -	leaq    (%rdi, %rdx), %r10
 -	cmpq    %r10, %rsi
 -	jb	L(loop_4x_vec_forward)
 -L(loop_large_forward):
 +	   destination and source since destination may be in cache when
 +	   source is loaded.  */
 +	cmpq	%r9, %rdx
 +	ja	L(more_8x_vec_check)
 +
 +	/* Cache align destination. First store the first 64 bytes then
 +	   adjust alignments.  */
 +	VMOVU	(%rsi), %VEC(8)
 +#if VEC_SIZE < 64
 +	VMOVU	VEC_SIZE(%rsi), %VEC(9)
 +#if VEC_SIZE < 32
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
 +#endif
 +#endif
 +	VMOVU	%VEC(8), (%rdi)
 +#if VEC_SIZE < 64
 +	VMOVU	%VEC(9), VEC_SIZE(%rdi)
 +#if VEC_SIZE < 32
 +	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
 +	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
 +#endif
 +#endif
 +	/* Adjust source, destination, and size.  */
 +	movq	%rdi, %r8
 +	andq	$63, %r8
 +	/* Get the negative of offset for alignment.  */
 +	subq	$64, %r8
 +	/* Adjust source.  */
 +	subq	%r8, %rsi
 +	/* Adjust destination which should be aligned now.  */
 +	subq	%r8, %rdi
 +	/* Adjust length.  */
 +	addq	%r8, %rdx
 +
 +	/* Test if source and destination addresses will alias. If they do
 +	   the larger pipeline in large_memcpy_4x alleviated the
 +	   performance drop.  */
 +	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
 +	jz	L(large_memcpy_4x)
 +
 +	movq	%rdx, %r10
 +	shrq	$LOG_4X_MEMCPY_THRESH, %r10
 +	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
 +	jae	L(large_memcpy_4x)
 +
 +	/* edx will store remainder size for copying tail.  */
 +	andl	$(PAGE_SIZE * 2 - 1), %edx
 +	/* r10 stores outer loop counter.  */
 +	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
 +	/* Copy 4x VEC at a time from 2 pages.  */
 +	.p2align 4
 +L(loop_large_memcpy_2x_outer):
 +	/* ecx stores inner loop counter.  */
 +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 +L(loop_large_memcpy_2x_inner):
 +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
 +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
 +	/* Load vectors from rsi.  */
 +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 +	subq	$-LARGE_LOAD_SIZE, %rsi
 +	/* Non-temporal store vectors to rdi.  */
 +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 +	subq	$-LARGE_LOAD_SIZE, %rdi
 +	decl	%ecx
 +	jnz	L(loop_large_memcpy_2x_inner)
 +	addq	$PAGE_SIZE, %rdi
 +	addq	$PAGE_SIZE, %rsi
 +	decq	%r10
 +	jne	L(loop_large_memcpy_2x_outer)
 +	sfence
 +
 +	/* Check if only last 4 loads are needed.  */
 +	cmpl	$(VEC_SIZE * 4), %edx
 +	jbe	L(large_memcpy_2x_end)
 +
 +	/* Handle the last 2 * PAGE_SIZE bytes.  */
 +L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
 -	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
 +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 -	addq	$PREFETCHED_LOAD_SIZE, %rsi
 -	subq	$PREFETCHED_LOAD_SIZE, %rdx
 -	VMOVNT	%VEC(0), (%rdi)
 -	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
 -	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
 -	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
 -	addq	$PREFETCHED_LOAD_SIZE, %rdi
 -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
 -	ja	L(loop_large_forward)
 -	sfence
 +	subq	$-(VEC_SIZE * 4), %rsi
 +	addl	$-(VEC_SIZE * 4), %edx
 +	VMOVA	%VEC(0), (%rdi)
 +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	cmpl	$(VEC_SIZE * 4), %edx
 +	ja	L(loop_large_memcpy_2x_tail)
 +
 +L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
 -	VMOVU	%VEC(5), (%rcx)
 -	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
 -	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
 -	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
 -	/* Store the first VEC.  */
 -	VMOVU	%VEC(4), (%r11)
 +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
 +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
 +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
 +
 +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 -L(large_backward):
 -	/* Don't use non-temporal store if there is overlap between
 -	   destination and source since destination may be in cache
 -	   when source is loaded.  */
 -	leaq    (%rcx, %rdx), %r10
 -	cmpq    %r10, %r9
 -	jb	L(loop_4x_vec_backward)
 -L(loop_large_backward):
 -	/* Copy 4 * VEC a time backward with non-temporal stores.  */
 -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
 -	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
 -	VMOVU	(%rcx), %VEC(0)
 -	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 -	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 -	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
 -	subq	$PREFETCHED_LOAD_SIZE, %rcx
 -	subq	$PREFETCHED_LOAD_SIZE, %rdx
 -	VMOVNT	%VEC(0), (%r9)
 -	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
 -	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
 -	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
 -	subq	$PREFETCHED_LOAD_SIZE, %r9
 -	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
 -	ja	L(loop_large_backward)
 +	.p2align 4
 +L(large_memcpy_4x):
 +	movq	%rdx, %r10
 +	/* edx will store remainder size for copying tail.  */
 +	andl	$(PAGE_SIZE * 4 - 1), %edx
 +	/* r10 stores outer loop counter.  */
 +	shrq	$(LOG_PAGE_SIZE + 2), %r10
 +	/* Copy 4x VEC at a time from 4 pages.  */
 +	.p2align 4
 +L(loop_large_memcpy_4x_outer):
 +	/* ecx stores inner loop counter.  */
 +	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
 +L(loop_large_memcpy_4x_inner):
 +	/* Only one prefetch set per page as doing 4 pages give more time
 +	   for prefetcher to keep up.  */
 +	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
 +	/* Load vectors from rsi.  */
 +	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 +	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
 +	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
 +	subq	$-LARGE_LOAD_SIZE, %rsi
 +	/* Non-temporal store vectors to rdi.  */
 +	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
 +	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
 +	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
 +	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
 +	subq	$-LARGE_LOAD_SIZE, %rdi
 +	decl	%ecx
 +	jnz	L(loop_large_memcpy_4x_inner)
 +	addq	$(PAGE_SIZE * 3), %rdi
 +	addq	$(PAGE_SIZE * 3), %rsi
 +	decq	%r10
 +	jne	L(loop_large_memcpy_4x_outer)
 	sfence
 -	/* Store the first 4 * VEC.  */
 -	VMOVU	%VEC(4), (%rdi)
 -	VMOVU	%VEC(5), VEC_SIZE(%rdi)
 -	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
 -	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
 -	/* Store the last VEC.  */
 -	VMOVU	%VEC(8), (%r11)
 +	/* Check if only last 4 loads are needed.  */
 +	cmpl	$(VEC_SIZE * 4), %edx
 +	jbe	L(large_memcpy_4x_end)
 +
 +	/* Handle the last 4  * PAGE_SIZE bytes.  */
 +L(loop_large_memcpy_4x_tail):
 +	/* Copy 4 * VEC a time forward with non-temporal stores.  */
 +	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
 +	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 +	VMOVU	(%rsi), %VEC(0)
 +	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 +	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 +	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
 +	subq	$-(VEC_SIZE * 4), %rsi
 +	addl	$-(VEC_SIZE * 4), %edx
 +	VMOVA	%VEC(0), (%rdi)
 +	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 +	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 +	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	cmpl	$(VEC_SIZE * 4), %edx
 +	ja	L(loop_large_memcpy_4x_tail)
 +
 +L(large_memcpy_4x_end):
 +	/* Store the last 4 * VEC.  */
 +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
 +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
 +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
 +	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
 +
 +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
 +	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
 +	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
 +	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-4.patch
+++ b/SOURCES/glibc-RHEL-15696-4.patch
@ -1,151 +0,0 @@
 From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:29:58 -0800
 Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes memrchr for x32.  Tested on x86-64 and x32.  On x86-64,
 libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
 	* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
 	* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
 ---
 sysdeps/x86_64/memrchr.S                |  4 +-
 sysdeps/x86_64/multiarch/memrchr-avx2.S |  4 +-
 sysdeps/x86_64/x32/Makefile             |  3 +-
 sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
 4 files changed, 63 insertions(+), 5 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
 index b8e3fa1d..dc82f8f7 100644
 --- a/sysdeps/x86_64/memrchr.S
 +++ b/sysdeps/x86_64/memrchr.S
@@ -24,13 +24,13 @@
 ENTRY (__memrchr)
 	movd	%esi, %xmm1
 -	sub	$16, %rdx
 +	sub	$16, %RDX_LP
 	jbe	L(length_less16)
 	punpcklbw	%xmm1, %xmm1
 	punpcklbw	%xmm1, %xmm1
 -	add	%rdx, %rdi
 +	add	%RDX_LP, %RDI_LP
 	pshufd	$0, %xmm1, %xmm1
 	movdqu	(%rdi), %xmm0
 diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
 index b41a58bc..ce488dd9 100644
 --- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
 	vmovd	%esi, %xmm0
 	vpbroadcastb %xmm0, %ymm0
 -	subq	$VEC_SIZE, %rdx
 +	sub	$VEC_SIZE, %RDX_LP
 	jbe	L(last_vec_or_less)
 -	addq	%rdx, %rdi
 +	add	%RDX_LP, %RDI_LP
 	/* Check the last VEC_SIZE bytes.  */
 	vpcmpeqb (%rdi), %ymm0, %ymm1
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index 2fe1e5ac..e99dbd7c 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
 endif
 ifeq ($(subdir),string)
 -tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
 +tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 +	 tst-size_t-memrchr
 endif
 ifeq ($(subdir),wcsmbs)
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
 new file mode 100644
 index 00000000..c83699c0
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
@@ -0,0 +1,57 @@
 +/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_NAME "memrchr"
 +#include "test-size_t.h"
 +
 +IMPL (memchr, 1)
 +
 +typedef void * (*proto_t) (const void *, int, size_t);
 +
 +static void *
 +__attribute__ ((noinline, noclone))
 +do_memrchr (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t src = { { page_size }, buf2 };
 +  parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      c.fn = impl->fn;
 +      void * res = do_memrchr (src, c);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %p != NULL",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-40.patch
+++ b/SOURCES/glibc-RHEL-15696-40.patch
@ -1,92 +0,0 @@
 From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 19 Apr 2021 10:45:07 -0700
 Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
 Content-type: text/plain; charset=UTF-8
 Since strchr-avx2.S updated by
 commit 1f745ecc2109890886b161d4791e1406fdfc29b8
 Author: noah <goldstein.w.n@gmail.com>
 Date:   Wed Feb 3 00:38:59 2021 -0500
    x86-64: Refactor and improve performance of strchr-avx2.S
 uses sarx:
 c4 e2 72 f7 c0       	sarx   %ecx,%eax,%eax
 for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
 ifunc-avx2.h.
 ---
 sysdeps/x86_64/multiarch/ifunc-avx2.h      |  4 ++--
 sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
 2 files changed, 11 insertions(+), 5 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
 index e0f30e61..ef72b73f 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
   const struct cpu_features* cpu_features = __get_cpu_features ();
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 	return OPTIMIZE (evex);
       if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 695cdba6..85b8863a 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchr.c.  */
   IFUNC_IMPL (i, name, strchr,
 	      IFUNC_IMPL_ADD (array, i, strchr,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchr_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchr,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strchr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchr,
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/strchrnul.c.  */
   IFUNC_IMPL (i, name, strchrnul,
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __strchrnul_avx2)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __strchrnul_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, strchrnul,
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/x86_64/multiarch/wcschr.c.  */
   IFUNC_IMPL (i, name, wcschr,
 	      IFUNC_IMPL_ADD (array, i, wcschr,
 -			      CPU_FEATURE_USABLE (AVX2),
 +			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wcschr_avx2)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wcschr_avx2_rtm)
 	      IFUNC_IMPL_ADD (array, i, wcschr,
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-41.patch
+++ b/SOURCES/glibc-RHEL-15696-41.patch
@ -1,265 +0,0 @@
 From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 19 Apr 2021 17:48:10 -0400
 Subject: [PATCH] x86: Optimize less_vec evex and avx512
 memset-vec-unaligned-erms.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit adds an optimized case for the less_vec memset path
 that uses the avx512vl/avx512bw mask store, avoiding the excessive
 branches. test-memset and test-wmemset are passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    | 40 ++++++++++-----
 sysdeps/x86_64/multiarch/ifunc-memset.h       |  6 ++-
 .../multiarch/memset-avx512-unaligned-erms.S  |  2 +-
 .../multiarch/memset-evex-unaligned-erms.S    |  2 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 51 +++++++++++++++----
 5 files changed, 74 insertions(+), 27 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index 85b8863a..d59d65f8 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_chk_avx2_unaligned_erms_rtm)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_chk_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __memset_chk,
 			      CPU_FEATURE_USABLE (AVX512F),
@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      __memset_avx2_unaligned_erms_rtm)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_evex_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_avx512_unaligned_erms)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 -			       && CPU_FEATURE_USABLE (AVX512BW)),
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memset_avx512_unaligned)
 	      IFUNC_IMPL_ADD (array, i, memset,
 			      CPU_FEATURE_USABLE (AVX512F),
@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wmemset_avx2_unaligned_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 -			      CPU_FEATURE_USABLE (AVX512VL),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, wmemset,
 -			      CPU_FEATURE_USABLE (AVX512VL),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_avx512_unaligned))
 #ifdef SHARED
@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			      CPU_FEATURE_USABLE (AVX2),
 			      __wmemset_chk_avx2_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 -			      CPU_FEATURE_USABLE (AVX512VL),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_chk_evex_unaligned)
 	      IFUNC_IMPL_ADD (array, i, __wmemset_chk,
 -			      CPU_FEATURE_USABLE (AVX512F),
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemset_chk_avx512_unaligned))
 #endif
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
 index 19795938..100e3707 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memset.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
       && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 	    return OPTIMIZE (avx512_unaligned_erms);
@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 -	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
 +          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
 	{
 	  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 	    return OPTIMIZE (evex_unaligned_erms);
 diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 index 22e7b187..8ad842fc 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,6 @@
 # define SECTION(p)		p##.evex512
 # define MEMSET_SYMBOL(p,s)	p##_avx512_##s
 # define WMEMSET_SYMBOL(p,s)	p##_avx512_##s
 -
 +# define USE_LESS_VEC_MASK_STORE	1
 # include "memset-vec-unaligned-erms.S"
 #endif
 diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 index ae0a4d6e..640f0929 100644
 --- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,6 @@
 # define SECTION(p)		p##.evex
 # define MEMSET_SYMBOL(p,s)	p##_evex_##s
 # define WMEMSET_SYMBOL(p,s)	p##_evex_##s
 -
 +# define USE_LESS_VEC_MASK_STORE	1
 # include "memset-vec-unaligned-erms.S"
 #endif
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index bae5cba4..f877ac9d 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,6 +63,8 @@
 # endif
 #endif
 +#define PAGE_SIZE 4096
 +
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -213,11 +215,38 @@ L(loop):
 	cmpq	%rcx, %rdx
 	jne	L(loop)
 	VZEROUPPER_SHORT_RETURN
 +
 +	.p2align 4
 L(less_vec):
 	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
 #  error Unsupported VEC_SIZE!
 # endif
 +# ifdef USE_LESS_VEC_MASK_STORE
 +	/* Clear high bits from edi. Only keeping bits relevant to page
 +	   cross check. Note that we are using rax which is set in
 +	   MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
 +	 */
 +	andl	$(PAGE_SIZE - 1), %edi
 +	/* Check if VEC_SIZE store cross page. Mask stores suffer serious
 +	   performance degradation when it has to fault supress.  */
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %edi
 +	ja	L(cross_page)
 +# if VEC_SIZE > 32
 +	movq	$-1, %rcx
 +	bzhiq	%rdx, %rcx, %rcx
 +	kmovq	%rcx, %k1
 +# else
 +	movl	$-1, %ecx
 +	bzhil	%edx, %ecx, %ecx
 +	kmovd	%ecx, %k1
 +# endif
 +	vmovdqu8	%VEC(0), (%rax) {%k1}
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(cross_page):
 +# endif
 # if VEC_SIZE > 32
 	cmpb	$32, %dl
 	jae	L(between_32_63)
@@ -234,36 +263,36 @@ L(less_vec):
 	cmpb	$1, %dl
 	ja	L(between_2_3)
 	jb	1f
 -	movb	%cl, (%rdi)
 +	movb	%cl, (%rax)
 1:
 	VZEROUPPER_RETURN
 # if VEC_SIZE > 32
 	/* From 32 to 63.  No branch when size == 32.  */
 L(between_32_63):
 -	VMOVU	%YMM0, -32(%rdi,%rdx)
 -	VMOVU	%YMM0, (%rdi)
 +	VMOVU	%YMM0, -32(%rax,%rdx)
 +	VMOVU	%YMM0, (%rax)
 	VZEROUPPER_RETURN
 # endif
 # if VEC_SIZE > 16
 	/* From 16 to 31.  No branch when size == 16.  */
 L(between_16_31):
 -	VMOVU	%XMM0, -16(%rdi,%rdx)
 -	VMOVU	%XMM0, (%rdi)
 +	VMOVU	%XMM0, -16(%rax,%rdx)
 +	VMOVU	%XMM0, (%rax)
 	VZEROUPPER_RETURN
 # endif
 	/* From 8 to 15.  No branch when size == 8.  */
 L(between_8_15):
 -	movq	%rcx, -8(%rdi,%rdx)
 -	movq	%rcx, (%rdi)
 +	movq	%rcx, -8(%rax,%rdx)
 +	movq	%rcx, (%rax)
 	VZEROUPPER_RETURN
 L(between_4_7):
 	/* From 4 to 7.  No branch when size == 4.  */
 -	movl	%ecx, -4(%rdi,%rdx)
 -	movl	%ecx, (%rdi)
 +	movl	%ecx, -4(%rax,%rdx)
 +	movl	%ecx, (%rax)
 	VZEROUPPER_RETURN
 L(between_2_3):
 	/* From 2 to 3.  No branch when size == 2.  */
 -	movw	%cx, -2(%rdi,%rdx)
 -	movw	%cx, (%rdi)
 +	movw	%cx, -2(%rax,%rdx)
 +	movw	%cx, (%rax)
 	VZEROUPPER_RETURN
 END (MEMSET_SYMBOL (__memset, unaligned_erms))
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-42.patch
+++ b/SOURCES/glibc-RHEL-15696-42.patch
@ -1,396 +0,0 @@
 From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Fri, 23 Apr 2021 15:56:24 -0400
 Subject: [PATCH] x86: Optimize strchr-avx2.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes strchr-avx2.S. The optimizations are all
 small things such as save an ALU in the alignment process, saving a
 few instructions in the loop return, saving some bytes in the main
 loop, and increasing the ILP in the return cases. test-strchr,
 test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
 1 file changed, 170 insertions(+), 120 deletions(-)
 Conflicts:
 	sysdeps/x86_64/multiarch/strchr-avx2.S
 	(rearranged to account for branch changes)
 diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
 index 919d256c..5884726b 100644
 --- a/sysdeps/x86_64/multiarch/strchr-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,133 +49,144 @@
 	.section SECTION(.text),"ax",@progbits
 ENTRY (STRCHR)
 -	movl	%edi, %ecx
 -# ifndef USE_AS_STRCHRNUL
 -	xorl	%edx, %edx
 -# endif
 -
 	/* Broadcast CHAR to YMM0.	*/
 	vmovd	%esi, %xmm0
 +	movl	%edi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	VPBROADCAST	%xmm0, %ymm0
 	vpxor	%xmm9, %xmm9, %xmm9
 -	VPBROADCAST %xmm0, %ymm0
 	/* Check if we cross page boundary with one vector load.  */
 -	andl	$(PAGE_SIZE - 1), %ecx
 -	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
 -	ja  L(cross_page_boundary)
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes.	Search for both CHAR and the
 	   null byte.  */
 	vmovdqu	(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 -	jz	L(more_vecs)
 +	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 +# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero)
 +# endif
 	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 +
 +	/* .p2align 5 helps keep performance more consistent if ENTRY()
 +	   alignment % 32 was either 16 or 0. As well this makes the
 +	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
 +	   easier.  */
 +	.p2align 5
 +L(first_vec_x4):
 +	tzcntl	%eax, %eax
 +	addq	$(VEC_SIZE * 3 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero)
 # endif
 -L(return_vzeroupper):
 -	ZERO_UPPER_VEC_REGISTERS_RETURN
 -
 -	.p2align 4
 -L(more_vecs):
 -	/* Align data for aligned loads in the loop.  */
 -	andq	$-VEC_SIZE, %rdi
 -L(aligned_more):
 -
 -	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
 -	   since data is only aligned to VEC_SIZE.	*/
 -	vmovdqa	VEC_SIZE(%rdi), %ymm8
 -	addq	$VEC_SIZE, %rdi
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 -	vpor	%ymm1, %ymm2, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -
 -	vmovdqa	VEC_SIZE(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 -	vpor	%ymm1, %ymm2, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -
 -	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 -	vpor	%ymm1, %ymm2, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -
 -	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 -	vpor	%ymm1, %ymm2, %ymm1
 -	vpmovmskb %ymm1, %eax
 -	testl	%eax, %eax
 -	jz	L(prep_loop_4x)
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 -	tzcntl	%eax, %eax
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +L(zero):
 +	xorl	%eax, %eax
 +	VZEROUPPER_RETURN
 # endif
 -	VZEROUPPER
 -	ret
 +
 	.p2align 4
 -L(first_vec_x0):
 +L(first_vec_x1):
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 -	addq	%rdi, %rax
 +	incq	%rdi
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero)
 # endif
 +	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x1):
 +L(first_vec_x2):
 	tzcntl	%eax, %eax
 -	leaq	VEC_SIZE(%rdi, %rax), %rax
 +	addq	$(VEC_SIZE + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero)
 # endif
 +	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x2):
 +L(first_vec_x3):
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 +	addq	$(VEC_SIZE * 2 + 1), %rdi
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero)
 # endif
 +	addq	%rdi, %rax
 	VZEROUPPER_RETURN
 -L(prep_loop_4x):
 -	/* Align data to 4 * VEC_SIZE.	*/
 -	andq	$-(VEC_SIZE * 4), %rdi
 +	.p2align 4
 +L(aligned_more):
 +	/* Align data to VEC_SIZE - 1. This is the same number of
 +	   instructions as using andq -VEC_SIZE but saves 4 bytes of code
 +	   on x4 check.  */
 +	orq	$(VEC_SIZE - 1), %rdi
 +L(cross_page_continue):
 +	/* Check the next 4 * VEC_SIZE.  Only one VEC_SIZE at a time
 +	   since data is only aligned to VEC_SIZE.  */
 +	vmovdqa	1(%rdi), %ymm8
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 +	vpor	%ymm1, %ymm2, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x1)
 +
 +	vmovdqa	(VEC_SIZE + 1)(%rdi), %ymm8
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 +	vpor	%ymm1, %ymm2, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x2)
 +
 +	vmovdqa	(VEC_SIZE * 2 + 1)(%rdi), %ymm8
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 +	vpor	%ymm1, %ymm2, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x3)
 +	vmovdqa	(VEC_SIZE * 3 + 1)(%rdi), %ymm8
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 +	vpor	%ymm1, %ymm2, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x4)
 +	/* Align data to VEC_SIZE * 4 - 1.	*/
 +	addq	$(VEC_SIZE * 4 + 1), %rdi
 +	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 	/* Compare 4 * VEC at a time forward.  */
 -	vmovdqa	(VEC_SIZE * 4)(%rdi), %ymm5
 -	vmovdqa	(VEC_SIZE * 5)(%rdi), %ymm6
 -	vmovdqa	(VEC_SIZE * 6)(%rdi), %ymm7
 -	vmovdqa	(VEC_SIZE * 7)(%rdi), %ymm8
 +	vmovdqa	(%rdi), %ymm5
 +	vmovdqa	(VEC_SIZE)(%rdi), %ymm6
 +	vmovdqa	(VEC_SIZE * 2)(%rdi), %ymm7
 +	vmovdqa	(VEC_SIZE * 3)(%rdi), %ymm8
 	/* Leaves only CHARS matching esi as 0.	 */
 	vpxor	%ymm5, %ymm0, %ymm1
@@ -191,63 +202,102 @@ L(loop_4x_vec):
 	VPMINU	%ymm1, %ymm2, %ymm5
 	VPMINU	%ymm3, %ymm4, %ymm6
 -	VPMINU	%ymm5, %ymm6, %ymm5
 +	VPMINU	%ymm5, %ymm6, %ymm6
 -	VPCMPEQ %ymm5, %ymm9, %ymm5
 -	vpmovmskb %ymm5, %eax
 +	VPCMPEQ	%ymm6, %ymm9, %ymm6
 +	vpmovmskb %ymm6, %ecx
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	testl	%ecx, %ecx
 +	jz	L(loop_4x_vec)
 -	addq	$(VEC_SIZE * 4), %rdi
 -	testl	%eax, %eax
 -	jz  L(loop_4x_vec)
 -	VPCMPEQ %ymm1, %ymm9, %ymm1
 +	VPCMPEQ	%ymm1, %ymm9, %ymm1
 	vpmovmskb %ymm1, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	jnz	L(last_vec_x0)
 +
 -	VPCMPEQ %ymm2, %ymm9, %ymm2
 +	VPCMPEQ	%ymm5, %ymm9, %ymm2
 	vpmovmskb %ymm2, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 +	jnz	L(last_vec_x1)
 +
 +	VPCMPEQ	%ymm3, %ymm9, %ymm3
 +	vpmovmskb %ymm3, %eax
 +	/* rcx has combined result from all 4 VEC. It will only be used
 +	   if the first 3 other VEC all did not contain a match.  */
 +	salq	$32, %rcx
 +	orq	%rcx, %rax
 +	tzcntq	%rax, %rax
 +	subq	$(VEC_SIZE * 2), %rdi
 +# ifndef USE_AS_STRCHRNUL
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero_end)
 +# endif
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 +
 +
 +	.p2align 4
 +L(last_vec_x0):
 +	tzcntl	%eax, %eax
 +	addq	$-(VEC_SIZE * 4), %rdi
 +# ifndef USE_AS_STRCHRNUL
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero_end)
 +# endif
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 -	VPCMPEQ %ymm3, %ymm9, %ymm3
 -	VPCMPEQ %ymm4, %ymm9, %ymm4
 -	vpmovmskb %ymm3, %ecx
 -	vpmovmskb %ymm4, %eax
 -	salq	$32, %rax
 -	orq %rcx, %rax
 -	tzcntq  %rax, %rax
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +L(zero_end):
 +	xorl	%eax, %eax
 +	VZEROUPPER_RETURN
 # endif
 -	VZEROUPPER
 -	ret
 +
 +	.p2align 4
 +L(last_vec_x1):
 +	tzcntl	%eax, %eax
 +	subq	$(VEC_SIZE * 3), %rdi
 +# ifndef USE_AS_STRCHRNUL
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdi, %rax), %CHAR_REG
 +	jne	L(zero_end)
 +# endif
 +	addq	%rdi, %rax
 +	VZEROUPPER_RETURN
 +
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
 -	andq	$-VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -
 -	vmovdqa	(%rdi), %ymm8
 -	VPCMPEQ %ymm8, %ymm0, %ymm1
 -	VPCMPEQ %ymm8, %ymm9, %ymm2
 +	movq	%rdi, %rdx
 +	/* Align rdi to VEC_SIZE - 1.  */
 +	orq	$(VEC_SIZE - 1), %rdi
 +	vmovdqa	-(VEC_SIZE - 1)(%rdi), %ymm8
 +	VPCMPEQ	%ymm8, %ymm0, %ymm1
 +	VPCMPEQ	%ymm8, %ymm9, %ymm2
 	vpor	%ymm1, %ymm2, %ymm1
 	vpmovmskb %ymm1, %eax
 -	/* Remove the leading bits.	 */
 -	sarxl	%ecx, %eax, %eax
 +	/* Remove the leading bytes. sarxl only uses bits [4:0] of COUNT
 +	   so no need to manually mod edx.  */
 +	sarxl	%edx, %eax, %eax
 	testl	%eax, %eax
 -	jz	L(aligned_more)
 +	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 -	addq	%rcx, %rdi
 -	addq	%rdi, %rax
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	xorl	%ecx, %ecx
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rdx, %rax), %CHAR_REG
 +	leaq	(%rdx, %rax), %rax
 +	cmovne	%rcx, %rax
 +# else
 +	addq	%rdx, %rax
 # endif
 -	VZEROUPPER_RETURN
 +L(return_vzeroupper):
 +	ZERO_UPPER_VEC_REGISTERS_RETURN
 END (STRCHR)
 # endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-43.patch
+++ b/SOURCES/glibc-RHEL-15696-43.patch
@ -1,532 +0,0 @@
 From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Fri, 23 Apr 2021 15:56:25 -0400
 Subject: [PATCH] x86: Optimize strchr-evex.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes strchr-evex.S. The optimizations are
 mostly small things such as save an ALU in the alignment process,
 saving a few instructions in the loop return. The one significant
 change is saving 2 instructions in the 4x loop. test-strchr,
 test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
 1 file changed, 218 insertions(+), 174 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
 index ddc86a70..7f9d4ee4 100644
 --- a/sysdeps/x86_64/multiarch/strchr-evex.S
 +++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -32,13 +32,15 @@
 #  define VPCMP		vpcmpd
 #  define VPMINU	vpminud
 #  define CHAR_REG	esi
 -#  define SHIFT_REG	r8d
 +#  define SHIFT_REG	ecx
 +#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMP		vpcmpb
 #  define VPMINU	vpminub
 #  define CHAR_REG	sil
 -#  define SHIFT_REG	ecx
 +#  define SHIFT_REG	edx
 +#  define CHAR_SIZE	1
 # endif
 # define XMMZERO	xmm16
@@ -56,23 +58,20 @@
 # define VEC_SIZE 32
 # define PAGE_SIZE 4096
 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 	.section .text.evex,"ax",@progbits
 ENTRY (STRCHR)
 -	movl	%edi, %ecx
 -# ifndef USE_AS_STRCHRNUL
 -	xorl	%edx, %edx
 -# endif
 -
 	/* Broadcast CHAR to YMM0.	*/
 -	VPBROADCAST %esi, %YMM0
 -
 +	VPBROADCAST	%esi, %YMM0
 +	movl	%edi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 -	/* Check if we cross page boundary with one vector load.  */
 -	andl	$(PAGE_SIZE - 1), %ecx
 -	cmpl	$(PAGE_SIZE - VEC_SIZE), %ecx
 -	ja  L(cross_page_boundary)
 +	/* Check if we cross page boundary with one vector load.
 +	   Otherwise it is safe to use an unaligned load.  */
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	ja	L(cross_page_boundary)
 	/* Check the first VEC_SIZE bytes. Search for both CHAR and the
 	   null bytes.  */
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
 -	ktestd	%k0, %k0
 -	jz	L(more_vecs)
 	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jz	L(aligned_more)
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 # ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(%rdi, %rax, 4), %rax
 +	/* NB: Multiply wchar_t count by 4 to get the number of bytes.
 +	 */
 +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 # else
 	addq	%rdi, %rax
 # endif
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(%rax), %CHAR_REG
 +	jne	L(zero)
 # endif
 	ret
 -	.p2align 4
 -L(more_vecs):
 -	/* Align data for aligned loads in the loop.  */
 -	andq	$-VEC_SIZE, %rdi
 -L(aligned_more):
 -
 -	/* Check the next 4 * VEC_SIZE.	 Only one VEC_SIZE at a time
 -	   since data is only aligned to VEC_SIZE.	*/
 -	VMOVA	VEC_SIZE(%rdi), %YMM1
 -	addq	$VEC_SIZE, %rdi
 -
 -	/* Leaves only CHARS matching esi as 0.  */
 -	vpxorq	%YMM1, %YMM0, %YMM2
 -	VPMINU	%YMM2, %YMM1, %YMM2
 -	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM2, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 -
 -	VMOVA	VEC_SIZE(%rdi), %YMM1
 -	/* Leaves only CHARS matching esi as 0.  */
 -	vpxorq	%YMM1, %YMM0, %YMM2
 -	VPMINU	%YMM2, %YMM1, %YMM2
 -	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM2, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -
 -	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
 -	/* Leaves only CHARS matching esi as 0.  */
 -	vpxorq	%YMM1, %YMM0, %YMM2
 -	VPMINU	%YMM2, %YMM1, %YMM2
 -	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM2, %k0
 -	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -	jnz	L(first_vec_x2)
 -
 -	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
 -	/* Leaves only CHARS matching esi as 0.  */
 -	vpxorq	%YMM1, %YMM0, %YMM2
 -	VPMINU	%YMM2, %YMM1, %YMM2
 -	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM2, %k0
 -	ktestd	%k0, %k0
 -	jz	L(prep_loop_4x)
 -
 -	kmovd	%k0, %eax
 +	/* .p2align 5 helps keep performance more consistent if ENTRY()
 +	   alignment % 32 was either 16 or 0. As well this makes the
 +	   alignment % 32 of the loop_4x_vec fixed which makes tuning it
 +	   easier.  */
 +	.p2align 5
 +L(first_vec_x3):
 	tzcntl	%eax, %eax
 +# ifndef USE_AS_STRCHRNUL
 	/* Found CHAR or the null byte.	 */
 -# ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax, 4), %rax
 -# else
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax), %rax
 +	cmp	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero)
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 +
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 -# endif
 +L(zero):
 +	xorl	%eax, %eax
 	ret
 +# endif
 	.p2align 4
 -L(first_vec_x0):
 +L(first_vec_x4):
 +# ifndef USE_AS_STRCHRNUL
 +	/* Check to see if first match was CHAR (k0) or null (k1).  */
 +	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 -# ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(%rdi, %rax, 4), %rax
 +	kmovd	%k1, %ecx
 +	/* bzhil will not be 0 if first match was null.  */
 +	bzhil	%eax, %ecx, %ecx
 +	jne	L(zero)
 # else
 -	addq	%rdi, %rax
 -# endif
 -# ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Combine CHAR and null matches.  */
 +	kord	%k0, %k1, %k0
 +	kmovd	%k0, %eax
 +	tzcntl	%eax, %eax
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 	.p2align 4
 L(first_vec_x1):
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 -# ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	VEC_SIZE(%rdi, %rax, 4), %rax
 -# else
 -	leaq	VEC_SIZE(%rdi, %rax), %rax
 -# endif
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Found CHAR or the null byte.	 */
 +	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero)
 +
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 	.p2align 4
 L(first_vec_x2):
 +# ifndef USE_AS_STRCHRNUL
 +	/* Check to see if first match was CHAR (k0) or null (k1).  */
 +	kmovd	%k0, %eax
 	tzcntl	%eax, %eax
 -	/* Found CHAR or the null byte.	 */
 -# ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
 +	kmovd	%k1, %ecx
 +	/* bzhil will not be 0 if first match was null.  */
 +	bzhil	%eax, %ecx, %ecx
 +	jne	L(zero)
 # else
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 -# endif
 -# ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Combine CHAR and null matches.  */
 +	kord	%k0, %k1, %k0
 +	kmovd	%k0, %eax
 +	tzcntl	%eax, %eax
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 -L(prep_loop_4x):
 -	/* Align data to 4 * VEC_SIZE.	*/
 +	.p2align 4
 +L(aligned_more):
 +	/* Align data to VEC_SIZE.  */
 +	andq	$-VEC_SIZE, %rdi
 +L(cross_page_continue):
 +	/* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
 +	   data is only aligned to VEC_SIZE. Use two alternating methods
 +	   for checking VEC to balance latency and port contention.  */
 +
 +	/* This method has higher latency but has better port
 +	   distribution.  */
 +	VMOVA	(VEC_SIZE)(%rdi), %YMM1
 +	/* Leaves only CHARS matching esi as 0.  */
 +	vpxorq	%YMM1, %YMM0, %YMM2
 +	VPMINU	%YMM2, %YMM1, %YMM2
 +	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 +	VPCMP	$0, %YMMZERO, %YMM2, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x1)
 +
 +	/* This method has lower latency but has worse port
 +	   distribution.  */
 +	VMOVA	(VEC_SIZE * 2)(%rdi), %YMM1
 +	/* Each bit in K0 represents a CHAR in YMM1.  */
 +	VPCMP	$0, %YMM1, %YMM0, %k0
 +	/* Each bit in K1 represents a null byte in YMM1.  */
 +	VPCMP	$0, %YMM1, %YMMZERO, %k1
 +	kortestd	%k0, %k1
 +	jnz	L(first_vec_x2)
 +
 +	VMOVA	(VEC_SIZE * 3)(%rdi), %YMM1
 +	/* Leaves only CHARS matching esi as 0.  */
 +	vpxorq	%YMM1, %YMM0, %YMM2
 +	VPMINU	%YMM2, %YMM1, %YMM2
 +	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 +	VPCMP	$0, %YMMZERO, %YMM2, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(first_vec_x3)
 +
 +	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 +	/* Each bit in K0 represents a CHAR in YMM1.  */
 +	VPCMP	$0, %YMM1, %YMM0, %k0
 +	/* Each bit in K1 represents a null byte in YMM1.  */
 +	VPCMP	$0, %YMM1, %YMMZERO, %k1
 +	kortestd	%k0, %k1
 +	jnz	L(first_vec_x4)
 +
 +	/* Align data to VEC_SIZE * 4 for the loop.  */
 +	addq	$VEC_SIZE, %rdi
 	andq	$-(VEC_SIZE * 4), %rdi
 	.p2align 4
 L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 +	/* Check 4x VEC at a time. No penalty to imm32 offset with evex
 +	   encoding.  */
 	VMOVA	(VEC_SIZE * 4)(%rdi), %YMM1
 	VMOVA	(VEC_SIZE * 5)(%rdi), %YMM2
 	VMOVA	(VEC_SIZE * 6)(%rdi), %YMM3
 	VMOVA	(VEC_SIZE * 7)(%rdi), %YMM4
 -	/* Leaves only CHARS matching esi as 0.  */
 +	/* For YMM1 and YMM3 use xor to set the CHARs matching esi to
 +	   zero.  */
 	vpxorq	%YMM1, %YMM0, %YMM5
 -	vpxorq	%YMM2, %YMM0, %YMM6
 +	/* For YMM2 and YMM4 cmp not equals to CHAR and store result in
 +	   k register. It's possible to save either 1 or 2 instructions
 +	   using cmp not equals method for either YMM1 or YMM1 and YMM3
 +	   respectively but bottleneck on p5 makes it not worth it.  */
 +	VPCMP	$4, %YMM0, %YMM2, %k2
 	vpxorq	%YMM3, %YMM0, %YMM7
 -	vpxorq	%YMM4, %YMM0, %YMM8
 -
 -	VPMINU	%YMM5, %YMM1, %YMM5
 -	VPMINU	%YMM6, %YMM2, %YMM6
 -	VPMINU	%YMM7, %YMM3, %YMM7
 -	VPMINU	%YMM8, %YMM4, %YMM8
 -
 -	VPMINU	%YMM5, %YMM6, %YMM1
 -	VPMINU	%YMM7, %YMM8, %YMM2
 -
 -	VPMINU	%YMM1, %YMM2, %YMM1
 -
 -	/* Each bit in K0 represents a CHAR or a null byte.  */
 -	VPCMP	$0, %YMMZERO, %YMM1, %k0
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -
 -	ktestd	%k0, %k0
 +	VPCMP	$4, %YMM0, %YMM4, %k4
 +
 +	/* Use min to select all zeros (from either xor or end of
 +	   string).  */
 +	VPMINU	%YMM1, %YMM5, %YMM1
 +	VPMINU	%YMM3, %YMM7, %YMM3
 +
 +	/* Use min + zeromask to select for zeros. k2 and k4 will have
 +	   0 at positions that matched CHAR, which zeroes the
 +	   corresponding destination bytes in YMM2 / YMM4.
 +	 */
 +	VPMINU	%YMM1, %YMM2, %YMM2{%k2}{z}
 +	VPMINU	%YMM3, %YMM4, %YMM4
 +	VPMINU	%YMM2, %YMM4, %YMM4{%k4}{z}
 +
 +	VPCMP	$0, %YMMZERO, %YMM4, %k1
 +	kmovd	%k1, %ecx
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	testl	%ecx, %ecx
 	jz	L(loop_4x_vec)
 -	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 -	VPCMP	$0, %YMMZERO, %YMM5, %k0
 +	VPCMP	$0, %YMMZERO, %YMM1, %k0
 	kmovd	%k0, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x0)
 +	jnz	L(last_vec_x1)
 -	/* Each bit in K1 represents a CHAR or a null byte in YMM2.  */
 -	VPCMP	$0, %YMMZERO, %YMM6, %k1
 -	kmovd	%k1, %eax
 +	VPCMP	$0, %YMMZERO, %YMM2, %k0
 +	kmovd	%k0, %eax
 	testl	%eax, %eax
 -	jnz	L(first_vec_x1)
 -
 -	/* Each bit in K2 represents a CHAR or a null byte in YMM3.  */
 -	VPCMP	$0, %YMMZERO, %YMM7, %k2
 -	/* Each bit in K3 represents a CHAR or a null byte in YMM4.  */
 -	VPCMP	$0, %YMMZERO, %YMM8, %k3
 +	jnz	L(last_vec_x2)
 +	VPCMP	$0, %YMMZERO, %YMM3, %k0
 +	kmovd	%k0, %eax
 +	/* Combine YMM3 matches (eax) with YMM4 matches (ecx).  */
 # ifdef USE_AS_WCSCHR
 -	/* NB: Each bit in K2/K3 represents 4-byte element.  */
 -	kshiftlw $8, %k3, %k1
 +	sall	$8, %ecx
 +	orl	%ecx, %eax
 +	tzcntl	%eax, %eax
 # else
 -	kshiftlq $32, %k3, %k1
 +	salq	$32, %rcx
 +	orq	%rcx, %rax
 +	tzcntq	%rax, %rax
 # endif
 +# ifndef USE_AS_STRCHRNUL
 +	/* Check if match was CHAR or null.  */
 +	cmp	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero_end)
 +# endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 -	/* Each bit in K1 represents a NULL or a mismatch.  */
 -	korq	%k1, %k2, %k1
 -	kmovq	%k1, %rax
 +# ifndef USE_AS_STRCHRNUL
 +L(zero_end):
 +	xorl	%eax, %eax
 +	ret
 +# endif
 -	tzcntq  %rax, %rax
 -# ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax, 4), %rax
 -# else
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax), %rax
 +	.p2align 4
 +L(last_vec_x1):
 +	tzcntl	%eax, %eax
 +# ifndef USE_AS_STRCHRNUL
 +	/* Check if match was null.  */
 +	cmp	(%rdi, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero_end)
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 +	ret
 +
 +	.p2align 4
 +L(last_vec_x2):
 +	tzcntl	%eax, %eax
 # ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	/* Check if match was null.  */
 +	cmp	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero_end)
 # endif
 +	/* NB: Multiply sizeof char type (1 or 4) to get the number of
 +	   bytes.  */
 +	leaq	(VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 	/* Cold case for crossing page with first load.	 */
 	.p2align 4
 L(cross_page_boundary):
 +	movq	%rdi, %rdx
 +	/* Align rdi.  */
 	andq	$-VEC_SIZE, %rdi
 -	andl	$(VEC_SIZE - 1), %ecx
 -
 	VMOVA	(%rdi), %YMM1
 -
 	/* Leaves only CHARS matching esi as 0.  */
 	vpxorq	%YMM1, %YMM0, %YMM2
 	VPMINU	%YMM2, %YMM1, %YMM2
 	/* Each bit in K0 represents a CHAR or a null byte in YMM1.  */
 	VPCMP	$0, %YMMZERO, %YMM2, %k0
 	kmovd	%k0, %eax
 -	testl	%eax, %eax
 -
 +	/* Remove the leading bits.	 */
 # ifdef USE_AS_WCSCHR
 +	movl	%edx, %SHIFT_REG
 	/* NB: Divide shift count by 4 since each bit in K1 represent 4
 	   bytes.  */
 -	movl	%ecx, %SHIFT_REG
 -	sarl    $2, %SHIFT_REG
 +	sarl	$2, %SHIFT_REG
 +	andl	$(CHAR_PER_VEC - 1), %SHIFT_REG
 # endif
 -
 -	/* Remove the leading bits.	 */
 	sarxl	%SHIFT_REG, %eax, %eax
 +	/* If eax is zero continue.  */
 	testl	%eax, %eax
 -
 -	jz	L(aligned_more)
 +	jz	L(cross_page_continue)
 	tzcntl	%eax, %eax
 -	addq	%rcx, %rdi
 +# ifndef USE_AS_STRCHRNUL
 +	/* Check to see if match was CHAR or null.  */
 +	cmp	(%rdx, %rax, CHAR_SIZE), %CHAR_REG
 +	jne	L(zero_end)
 +# endif
 # ifdef USE_AS_WCSCHR
 -	/* NB: Multiply wchar_t count by 4 to get the number of bytes.  */
 -	leaq	(%rdi, %rax, 4), %rax
 +	/* NB: Multiply wchar_t count by 4 to get the number of
 +	   bytes.  */
 +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 # else
 -	addq	%rdi, %rax
 -# endif
 -# ifndef USE_AS_STRCHRNUL
 -	cmp (%rax), %CHAR_REG
 -	cmovne	%rdx, %rax
 +	addq	%rdx, %rax
 # endif
 	ret
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-44.patch
+++ b/SOURCES/glibc-RHEL-15696-44.patch
@ -1,536 +0,0 @@
 From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Tue, 4 May 2021 19:02:40 -0400
 Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
 Content-type: text/plain; charset=UTF-8
 No bug.
 This commit adds a new implementation for EVEX memchr that is not safe
 for RTM because it uses vzeroupper. The benefit is that by using
 ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
 faster than the RTM safe version which cannot use vpcmpeq because
 there is no EVEX encoding for the instruction. All parts of the
 implementation aside from the 4x loop are the same for the two
 versions and the optimization is only relevant for large sizes.
 Tigerlake:
 size  , algn  , Pos   , Cur T , New T , Win     , Dif
 512   , 6     , 192   , 9.2   , 9.04  , no-RTM  , 0.16
 512   , 7     , 224   , 9.19  , 8.98  , no-RTM  , 0.21
 2048  , 0     , 256   , 10.74 , 10.54 , no-RTM  , 0.2
 2048  , 0     , 512   , 14.81 , 14.87 , RTM     , 0.06
 2048  , 0     , 1024  , 22.97 , 22.57 , no-RTM  , 0.4
 2048  , 0     , 2048  , 37.49 , 34.51 , no-RTM  , 2.98   <--
 Icelake:
 size  , algn  , Pos   , Cur T , New T , Win     , Dif
 512   , 6     , 192   , 7.6   , 7.3   , no-RTM  , 0.3
 512   , 7     , 224   , 7.63  , 7.27  , no-RTM  , 0.36
 2048  , 0     , 256   , 8.48  , 8.38  , no-RTM  , 0.1
 2048  , 0     , 512   , 11.57 , 11.42 , no-RTM  , 0.15
 2048  , 0     , 1024  , 17.92 , 17.38 , no-RTM  , 0.54
 2048  , 0     , 2048  , 30.37 , 27.34 , no-RTM  , 3.03   <--
 test-memchr, test-wmemchr, and test-rawmemchr are all passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/Makefile             |   7 +-
 sysdeps/x86_64/multiarch/ifunc-evex.h         |  55 ++++++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |  15 ++
 sysdeps/x86_64/multiarch/memchr-evex-rtm.S    |   8 +
 sysdeps/x86_64/multiarch/memchr-evex.S        | 161 ++++++++++++++----
 sysdeps/x86_64/multiarch/memchr.c             |   2 +-
 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S |   3 +
 sysdeps/x86_64/multiarch/rawmemchr.c          |   2 +-
 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S   |   3 +
 sysdeps/x86_64/multiarch/wmemchr.c            |   2 +-
 10 files changed, 217 insertions(+), 41 deletions(-)
 create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
 create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
 create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
 diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
 index 65fde4eb..26be4095 100644
 --- a/sysdeps/x86_64/multiarch/Makefile
 +++ b/sysdeps/x86_64/multiarch/Makefile
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
 		   strncmp-evex \
 		   strncpy-evex \
 		   strnlen-evex \
 -		   strrchr-evex
 +		   strrchr-evex \
 +		   memchr-evex-rtm \
 +		   rawmemchr-evex-rtm
 CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
 		   wcsnlen-evex \
 		   wcsrchr-evex \
 		   wmemchr-evex \
 -		   wmemcmp-evex-movbe
 +		   wmemcmp-evex-movbe \
 +		   wmemchr-evex-rtm
 endif
 ifeq ($(subdir),debug)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
 new file mode 100644
 index 00000000..fc391edb
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
@@ -0,0 +1,55 @@
 +/* Common definition for ifunc selection optimized with EVEX.
 +   All versions must be listed in ifunc-impl-list.c.
 +   Copyright (C) 2017-2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <init-arch.h>
 +
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
 +extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
 +
 +
 +static inline void *
 +IFUNC_SELECTOR (void)
 +{
 +  const struct cpu_features* cpu_features = __get_cpu_features ();
 +
 +  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
 +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
 +      && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
 +    {
 +      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 +	  && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
 +	{
 +	  if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 +	    return OPTIMIZE (evex_rtm);
 +
 +	  return OPTIMIZE (evex);
 +	}
 +
 +      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
 +	return OPTIMIZE (avx2_rtm);
 +
 +      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
 +	return OPTIMIZE (avx2);
 +    }
 +
 +  return OPTIMIZE (sse2);
 +}
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index d59d65f8..ac097e8d 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __memchr_evex)
 +	      IFUNC_IMPL_ADD (array, i, memchr,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 +			      __memchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
   /* Support sysdeps/x86_64/multiarch/memcmp.c.  */
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __rawmemchr_evex)
 +	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 +			      __rawmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
   /* Support sysdeps/x86_64/multiarch/strlen.c.  */
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 			       && CPU_FEATURE_USABLE (AVX512BW)
 			       && CPU_FEATURE_USABLE (BMI2)),
 			      __wmemchr_evex)
 +	      IFUNC_IMPL_ADD (array, i, wmemchr,
 +			      (CPU_FEATURE_USABLE (AVX512VL)
 +			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)),
 +			      __wmemchr_evex_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
   /* Support sysdeps/x86_64/multiarch/wmemcmp.c.  */
 diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
 new file mode 100644
 index 00000000..19871882
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
@@ -0,0 +1,8 @@
 +#ifndef MEMCHR
 +# define MEMCHR __memchr_evex_rtm
 +#endif
 +
 +#define USE_IN_RTM 1
 +#define SECTION(p) p##.evex.rtm
 +
 +#include "memchr-evex.S"
 diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
 index f3fdad4f..4d0ed6d1 100644
 --- a/sysdeps/x86_64/multiarch/memchr-evex.S
 +++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -38,10 +38,32 @@
 #  define CHAR_SIZE	1
 # endif
 +	/* In the 4x loop the RTM and non-RTM versions have data pointer
 +	   off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
 +	   This is represented by BASE_OFFSET. As well because the RTM
 +	   version uses vpcmp which stores a bit per element compared where
 +	   the non-RTM version uses vpcmpeq which stores a bit per byte
 +	   compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
 +	   version.  */
 +# ifdef USE_IN_RTM
 +#  define VZEROUPPER
 +#  define BASE_OFFSET	(VEC_SIZE * 4)
 +#  define RET_SCALE	CHAR_SIZE
 +# else
 +#  define VZEROUPPER	vzeroupper
 +#  define BASE_OFFSET	0
 +#  define RET_SCALE	1
 +# endif
 +
 +	/* In the return from 4x loop memchr and rawmemchr versions have
 +	   data pointers off by VEC_SIZE * 4 with memchr version being
 +	   VEC_SIZE * 4 greater.  */
 # ifdef USE_AS_RAWMEMCHR
 +#  define RET_OFFSET	(BASE_OFFSET - (VEC_SIZE * 4))
 #  define RAW_PTR_REG	rcx
 #  define ALGN_PTR_REG	rdi
 # else
 +#  define RET_OFFSET	BASE_OFFSET
 #  define RAW_PTR_REG	rdi
 #  define ALGN_PTR_REG	rcx
 # endif
@@ -57,11 +79,15 @@
 # define YMM5		ymm21
 # define YMM6		ymm22
 +# ifndef SECTION
 +#  define SECTION(p)	p##.evex
 +# endif
 +
 # define VEC_SIZE 32
 # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
 # define PAGE_SIZE 4096
 -	.section .text.evex,"ax",@progbits
 +	.section SECTION(.text),"ax",@progbits
 ENTRY (MEMCHR)
 # ifndef USE_AS_RAWMEMCHR
 	/* Check for zero length.  */
@@ -237,14 +263,15 @@ L(cross_page_continue):
 	/* Check if at last CHAR_PER_VEC * 4 length.  */
 	subq	$(CHAR_PER_VEC * 4), %rdx
 	jbe	L(last_4x_vec_or_less_cmpeq)
 -	addq	$VEC_SIZE, %rdi
 +	/* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5.  */
 +	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
 	/* Align data to VEC_SIZE * 4 for the loop and readjust length.
 	 */
 #  ifdef USE_AS_WMEMCHR
 	movl	%edi, %ecx
 	andq	$-(4 * VEC_SIZE), %rdi
 -	andl	$(VEC_SIZE * 4 - 1), %ecx
 +	subl	%edi, %ecx
 	/* NB: Divide bytes by 4 to get the wchar_t count.  */
 	sarl	$2, %ecx
 	addq	%rcx, %rdx
@@ -254,15 +281,28 @@ L(cross_page_continue):
 	subq	%rdi, %rdx
 #  endif
 # else
 -	addq	$VEC_SIZE, %rdi
 +	addq	$(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
 	andq	$-(4 * VEC_SIZE), %rdi
 # endif
 -
 +# ifdef USE_IN_RTM
 	vpxorq	%XMMZERO, %XMMZERO, %XMMZERO
 +# else
 +	/* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
 +	   encodable with EVEX registers (ymm16-ymm31).  */
 +	vmovdqa64 %YMMMATCH, %ymm0
 +# endif
 	/* Compare 4 * VEC at a time forward.  */
 	.p2align 4
 L(loop_4x_vec):
 +	/* Two versions of the loop. One that does not require
 +	   vzeroupper by not using ymm0-ymm15 and another that does require
 +	   vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
 +	   is used at all is because there is no EVEX encoding of vpcmpeq and
 +	   with vpcmpeq this loop can be performed more efficiently. The
 +	   non-vzeroupper version is safe for RTM while the vzeroupper
 +	   version should be preferred if RTM is not supported.  */
 +# ifdef USE_IN_RTM
 	/* It would be possible to save some instructions using 4x VPCMP
 	   but bottleneck on port 5 makes it not woth it.  */
 	VPCMP	$4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
@@ -273,12 +313,55 @@ L(loop_4x_vec):
 	/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask.  */
 	VPMINU	%YMM2, %YMM3, %YMM3{%k1}{z}
 	VPCMP	$0, %YMM3, %YMMZERO, %k2
 +# else
 +	/* Since vptern can only take 3x vectors it is fastest to do 1
 +	   vec separately with EVEX vpcmp.  */
 +#  ifdef USE_AS_WMEMCHR
 +	/* vptern can only accept masks for epi32/epi64 so can only save
 +	   instruction using not equals mask on vptern with wmemchr.  */
 +	VPCMP	$4, (%rdi), %YMMMATCH, %k1
 +#  else
 +	VPCMP	$0, (%rdi), %YMMMATCH, %k1
 +#  endif
 +	/* Compare 3x with vpcmpeq and or them all together with vptern.
 +	 */
 +	VPCMPEQ	VEC_SIZE(%rdi), %ymm0, %ymm2
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
 +#  ifdef USE_AS_WMEMCHR
 +	/* This takes the not of or between ymm2, ymm3, ymm4 as well as
 +	   combines result from VEC0 with zero mask.  */
 +	vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
 +	vpmovmskb %ymm4, %ecx
 +#  else
 +	/* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4.  */
 +	vpternlogd $254, %ymm2, %ymm3, %ymm4
 +	vpmovmskb %ymm4, %ecx
 +	kmovd	%k1, %eax
 +#  endif
 +# endif
 +
 # ifdef USE_AS_RAWMEMCHR
 	subq	$-(VEC_SIZE * 4), %rdi
 +# endif
 +# ifdef USE_IN_RTM
 	kortestd %k2, %k3
 +# else
 +#  ifdef USE_AS_WMEMCHR
 +	/* ecx contains not of matches. All 1s means no matches. incl will
 +	   overflow and set zeroflag if that is the case.  */
 +	incl	%ecx
 +#  else
 +	/* Check if either VEC1 (eax) or VEC2-VEC4 (ecx) is not zero.
 +	   Adding to ecx is not an issue because if eax is non-zero it
 +	   will be used for returning the match. If it is zero the add
 +	   does nothing.  */
 +	addq	%rax, %rcx
 +#  endif
 +# endif
 +# ifdef USE_AS_RAWMEMCHR
 	jz	L(loop_4x_vec)
 # else
 -	kortestd %k2, %k3
 	jnz	L(loop_4x_vec_end)
 	subq	$-(VEC_SIZE * 4), %rdi
@@ -288,10 +371,11 @@ L(loop_4x_vec):
 	/* Fall through into less than 4 remaining vectors of length case.
 	 */
 -	VPCMP	$0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
 +	VPCMP	$0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
 +	addq	$(BASE_OFFSET - VEC_SIZE), %rdi
 	kmovd	%k0, %eax
 -	addq	$(VEC_SIZE * 3), %rdi
 -	.p2align 4
 +	VZEROUPPER
 +
 L(last_4x_vec_or_less):
 	/* Check if first VEC contained match.  */
 	testl	%eax, %eax
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
 	/* rawmemchr will fall through into this if match was found in
 	   loop.  */
 +# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
 	/* k1 has not of matches with VEC1.  */
 	kmovd	%k1, %eax
 -# ifdef USE_AS_WMEMCHR
 +#  ifdef USE_AS_WMEMCHR
 	subl	$((1 << CHAR_PER_VEC) - 1), %eax
 -# else
 +#  else
 	incl	%eax
 +#  endif
 +# else
 +	/* eax already has matches for VEC1.  */
 +	testl	%eax, %eax
 # endif
 	jnz	L(last_vec_x1_return)
 +# ifdef USE_IN_RTM
 	VPCMP	$0, %YMM2, %YMMZERO, %k0
 	kmovd	%k0, %eax
 +# else
 +	vpmovmskb %ymm2, %eax
 +# endif
 	testl	%eax, %eax
 	jnz	L(last_vec_x2_return)
 +# ifdef USE_IN_RTM
 	kmovd	%k2, %eax
 	testl	%eax, %eax
 	jnz	L(last_vec_x3_return)
 	kmovd	%k3, %eax
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_RAWMEMCHR
 -	leaq	(VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
 +	leaq	(VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
 # else
 -	leaq	(VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
 +	vpmovmskb %ymm3, %eax
 +	/* Combine matches in VEC3 (eax) with matches in VEC4 (ecx).  */
 +	salq	$VEC_SIZE, %rcx
 +	orq	%rcx, %rax
 +	tzcntq	%rax, %rax
 +	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
 +	VZEROUPPER
 # endif
 	ret
 	.p2align 4
 L(last_vec_x1_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_RAWMEMCHR
 -#  ifdef USE_AS_WMEMCHR
 +# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	(%rdi, %rax, CHAR_SIZE), %rax
 -#  else
 -	addq	%rdi, %rax
 -#  endif
 +	leaq	RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
 # else
 -	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	(VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
 +	addq	%rdi, %rax
 # endif
 +	VZEROUPPER
 	ret
 	.p2align 4
 L(last_vec_x2_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_RAWMEMCHR
 -	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
 -# else
 -	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	(VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
 -# endif
 +	/* NB: Multiply bytes by RET_SCALE to get the wchar_t count
 +	   if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
 +	   USE_IN_RTM are both defined. Otherwise RET_SCALE = 1.  */
 +	leaq	(VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
 +	VZEROUPPER
 	ret
 +# ifdef USE_IN_RTM
 	.p2align 4
 L(last_vec_x3_return):
 	tzcntl	%eax, %eax
 -# ifdef USE_AS_RAWMEMCHR
 -	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
 -# else
 	/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count.  */
 -	leaq	(VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
 -# endif
 +	leaq	(VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
 	ret
 -
 +# endif
 # ifndef USE_AS_RAWMEMCHR
 L(last_4x_vec_or_less_cmpeq):
 diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
 index 016f5784..f28aea77 100644
 --- a/sysdeps/x86_64/multiarch/memchr.c
 +++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -24,7 +24,7 @@
 # undef memchr
 # define SYMBOL_NAME memchr
 -# include "ifunc-avx2.h"
 +# include "ifunc-evex.h"
 libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
 strong_alias (memchr, __memchr)
 diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
 new file mode 100644
 index 00000000..deda1ca3
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
 +#define MEMCHR __rawmemchr_evex_rtm
 +#define USE_AS_RAWMEMCHR 1
 +#include "memchr-evex-rtm.S"
 diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
 index 8a0bc313..1f764f35 100644
 --- a/sysdeps/x86_64/multiarch/rawmemchr.c
 +++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -26,7 +26,7 @@
 # undef __rawmemchr
 # define SYMBOL_NAME rawmemchr
 -# include "ifunc-avx2.h"
 +# include "ifunc-evex.h"
 libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
 		       IFUNC_SELECTOR ());
 diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
 new file mode 100644
 index 00000000..a346cd35
 --- /dev/null
 +++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
 +#define MEMCHR __wmemchr_evex_rtm
 +#define USE_AS_WMEMCHR 1
 +#include "memchr-evex-rtm.S"
 diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
 index 6d833702..f9c91915 100644
 --- a/sysdeps/x86_64/multiarch/wmemchr.c
 +++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -26,7 +26,7 @@
 # undef __wmemchr
 # define SYMBOL_NAME wmemchr
 -# include "ifunc-avx2.h"
 +# include "ifunc-evex.h"
 libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
 weak_alias (__wmemchr, wmemchr)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-45.patch
+++ b/SOURCES/glibc-RHEL-15696-45.patch
@ -1,873 +0,0 @@
 From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 17 May 2021 13:56:52 -0400
 Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes memcmp-avx2.S. The optimizations include
 adding a new vec compare path for small sizes, reorganizing the entry
 control flow, and removing some unnecessary ALU instructions from the
 main loop. test-memcmp and test-wmemcmp are both passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/ifunc-impl-list.c   |   6 +
 sysdeps/x86_64/multiarch/ifunc-memcmp.h      |   1 +
 sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
 3 files changed, 402 insertions(+), 281 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 index ac097e8d..8be0d78a 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memcmp,
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __memcmp_avx2_movbe_rtm)
 	      IFUNC_IMPL_ADD (array, i, memcmp,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __memcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, wmemcmp,
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_avx2_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX2)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)
 			       && CPU_FEATURE_USABLE (RTM)),
 			      __wmemcmp_avx2_movbe_rtm)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp,
 			      (CPU_FEATURE_USABLE (AVX512VL)
 			       && CPU_FEATURE_USABLE (AVX512BW)
 +			       && CPU_FEATURE_USABLE (BMI2)
 			       && CPU_FEATURE_USABLE (MOVBE)),
 			      __wmemcmp_evex_movbe)
 	      IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
 diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 index 8043c635..690dffe8 100644
 --- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
 +++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
       && CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
 +      && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
       && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
     {
       if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 index 9d5c9c72..16fc673e 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -19,17 +19,23 @@
 #if IS_IN (libc)
 /* memcmp/wmemcmp is implemented as:
 -   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
 -      to avoid branches.
 -   2. Use overlapping compare to avoid branch.
 -   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
 -      bytes for wmemcmp.
 -   4. If size is 8 * VEC_SIZE or less, unroll the loop.
 -   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
 +   1. Use ymm vector compares when possible. The only case where
 +      vector compares is not possible for when size < VEC_SIZE
 +      and loading from either s1 or s2 would cause a page cross.
 +   2. For size from 2 to 7 bytes on page cross, load as big endian
 +      with movbe and bswap to avoid branches.
 +   3. Use xmm vector compare when size >= 4 bytes for memcmp or
 +      size >= 8 bytes for wmemcmp.
 +   4. Optimistically compare up to first 4 * VEC_SIZE one at a
 +      to check for early mismatches. Only do this if its guranteed the
 +      work is not wasted.
 +   5. If size is 8 * VEC_SIZE or less, unroll the loop.
 +   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
 -   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 -   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 -   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 +   7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 +   8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 +   9. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 +
 # include <sysdep.h>
@@ -38,8 +44,10 @@
 # endif
 # ifdef USE_AS_WMEMCMP
 +#  define CHAR_SIZE	4
 #  define VPCMPEQ	vpcmpeqd
 # else
 +#  define CHAR_SIZE	1
 #  define VPCMPEQ	vpcmpeqb
 # endif
@@ -52,7 +60,7 @@
 # endif
 # define VEC_SIZE 32
 -# define VEC_MASK ((1 << VEC_SIZE) - 1)
 +# define PAGE_SIZE	4096
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
 	jb	L(less_vec)
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +	vmovdqu	(%rsi), %ymm1
 +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	/* NB: eax must be destination register if going to
 +	   L(return_vec_[0,2]). For L(return_vec_3 destination register
 +	   must be ecx.  */
 +	incl	%eax
 +	jnz	L(return_vec_0)
 	cmpq	$(VEC_SIZE * 2), %rdx
 -	jbe	L(last_vec)
 -
 -	VPCMPEQ	%ymm0, %ymm0, %ymm0
 -	/* More than 2 * VEC.  */
 -	cmpq	$(VEC_SIZE * 8), %rdx
 -	ja	L(more_8x_vec)
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	jb	L(last_4x_vec)
 -
 -	/* From 4 * VEC to 8 * VEC, inclusively. */
 -	vmovdqu	(%rsi), %ymm1
 -	VPCMPEQ (%rdi), %ymm1, %ymm1
 +	jbe	L(last_1x_vec)
 +	/* Check second VEC no matter what.  */
 	vmovdqu	VEC_SIZE(%rsi), %ymm2
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 +	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
 +	vpmovmskb %ymm2, %eax
 +	/* If all 4 VEC where equal eax will be all 1s so incl will
 +	   overflow and set zero flag.  */
 +	incl	%eax
 +	jnz	L(return_vec_1)
 -	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 +	/* Less than 4 * VEC.  */
 +	cmpq	$(VEC_SIZE * 4), %rdx
 +	jbe	L(last_2x_vec)
 +	/* Check third and fourth VEC no matter what.  */
 +	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 +	vpmovmskb %ymm3, %eax
 +	incl	%eax
 +	jnz	L(return_vec_2)
 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 +	vpmovmskb %ymm4, %ecx
 +	incl	%ecx
 +	jnz	L(return_vec_3)
 -	vpand	%ymm1, %ymm2, %ymm5
 -	vpand	%ymm3, %ymm4, %ymm6
 -	vpand	%ymm5, %ymm6, %ymm5
 +	/* Go to 4x VEC loop.  */
 +	cmpq	$(VEC_SIZE * 8), %rdx
 +	ja	L(more_8x_vec)
 -	vptest	%ymm0, %ymm5
 -	jnc	L(4x_vec_end)
 +	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 +	   branches.  */
 +	/* Load first two VEC from s2 before adjusting addresses.  */
 +	vmovdqu	-(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
 +	vmovdqu	-(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
 	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
 	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 -	vmovdqu	(%rsi), %ymm1
 -	VPCMPEQ (%rdi), %ymm1, %ymm1
 -	vmovdqu	VEC_SIZE(%rsi), %ymm2
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 -	vpand	%ymm2, %ymm1, %ymm5
 +	/* Wait to load from s1 until addressed adjust due to
 +	   unlamination of microfusion with complex address mode.  */
 +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 +	VPCMPEQ	(VEC_SIZE)(%rdi), %ymm2, %ymm2
 	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 -	vpand	%ymm3, %ymm5, %ymm5
 -
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 -	vpand	%ymm4, %ymm5, %ymm5
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 -	vptest	%ymm0, %ymm5
 -	jnc	L(4x_vec_end)
 -	xorl	%eax, %eax
 +	/* Reduce VEC0 - VEC4.  */
 +	vpand	%ymm1, %ymm2, %ymm5
 +	vpand	%ymm3, %ymm4, %ymm6
 +	vpand	%ymm5, %ymm6, %ymm7
 +	vpmovmskb %ymm7, %ecx
 +	incl	%ecx
 +	jnz	L(return_vec_0_1_2_3)
 +	/* NB: eax must be zero to reach here.  */
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(return_vec_0):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	(%rdi, %rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi, %rax), %ecx
 +	/* NB: no partial register stall here because xorl zero idiom
 +	   above.  */
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(%rsi, %rax), %ecx
 +	movzbl	(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 L(return_vzeroupper):
 	ZERO_UPPER_VEC_REGISTERS_RETURN
 	.p2align 4
 -L(last_2x_vec):
 -	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +L(return_vec_1):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	VEC_SIZE(%rdi, %rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	VEC_SIZE(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	VEC_SIZE(%rsi, %rax), %ecx
 +	movzbl	VEC_SIZE(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(return_vec_2):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	(VEC_SIZE * 2)(%rdi, %rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	VZEROUPPER_RETURN
 +
 +	/* NB: p2align 5 here to ensure 4x loop is 32 byte aligned.  */
 +	.p2align 5
 +L(8x_return_vec_0_1_2_3):
 +	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 +	addq	%rdi, %rsi
 +L(return_vec_0_1_2_3):
 +	vpmovmskb %ymm1, %eax
 +	incl	%eax
 +	jnz	L(return_vec_0)
 -L(last_vec):
 -	/* Use overlapping loads to avoid branches.  */
 -	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
 -	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +	incl	%eax
 +	jnz	L(return_vec_1)
 +
 +	vpmovmskb %ymm3, %eax
 +	incl	%eax
 +	jnz	L(return_vec_2)
 +L(return_vec_3):
 +	tzcntl	%ecx, %ecx
 +# ifdef USE_AS_WMEMCMP
 +	movl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %eax
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 +	subl	%ecx, %eax
 +# endif
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(more_8x_vec):
 +	/* Set end of s1 in rdx.  */
 +	leaq	-(VEC_SIZE * 4)(%rdi, %rdx), %rdx
 +	/* rsi stores s2 - s1. This allows loop to only update one
 +	   pointer.  */
 +	subq	%rdi, %rsi
 +	/* Align s1 pointer.  */
 +	andq	$-VEC_SIZE, %rdi
 +	/* Adjust because first 4x vec where check already.  */
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	.p2align 4
 +L(loop_4x_vec):
 +	/* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
 +	 */
 +	vmovdqu	(%rsi, %rdi), %ymm1
 +	VPCMPEQ	(%rdi), %ymm1, %ymm1
 +
 +	vmovdqu	VEC_SIZE(%rsi, %rdi), %ymm2
 +	VPCMPEQ	VEC_SIZE(%rdi), %ymm2, %ymm2
 +
 +	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdi), %ymm3
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 +
 +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdi), %ymm4
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 +
 +	vpand	%ymm1, %ymm2, %ymm5
 +	vpand	%ymm3, %ymm4, %ymm6
 +	vpand	%ymm5, %ymm6, %ymm7
 +	vpmovmskb %ymm7, %ecx
 +	incl	%ecx
 +	jnz	L(8x_return_vec_0_1_2_3)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	/* Check if s1 pointer at end.  */
 +	cmpq	%rdx, %rdi
 +	jb	L(loop_4x_vec)
 +
 +	subq	%rdx, %rdi
 +	/* rdi has 4 * VEC_SIZE - remaining length.  */
 +	cmpl	$(VEC_SIZE * 3), %edi
 +	jae	L(8x_last_1x_vec)
 +	/* Load regardless of branch.  */
 +	vmovdqu	(VEC_SIZE * 2)(%rsi, %rdx), %ymm3
 +	cmpl	$(VEC_SIZE * 2), %edi
 +	jae	L(8x_last_2x_vec)
 +
 +	/* Check last 4 VEC.  */
 +	vmovdqu	(%rsi, %rdx), %ymm1
 +	VPCMPEQ	(%rdx), %ymm1, %ymm1
 +
 +	vmovdqu	VEC_SIZE(%rsi, %rdx), %ymm2
 +	VPCMPEQ	VEC_SIZE(%rdx), %ymm2, %ymm2
 +
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 +
 +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 +
 +	vpand	%ymm1, %ymm2, %ymm5
 +	vpand	%ymm3, %ymm4, %ymm6
 +	vpand	%ymm5, %ymm6, %ymm7
 +	vpmovmskb %ymm7, %ecx
 +	/* Restore s1 pointer to rdi.  */
 +	movq	%rdx, %rdi
 +	incl	%ecx
 +	jnz	L(8x_return_vec_0_1_2_3)
 +	/* NB: eax must be zero to reach here.  */
 +	VZEROUPPER_RETURN
 +
 +	/* Only entry is from L(more_8x_vec).  */
 +	.p2align 4
 +L(8x_last_2x_vec):
 +	/* Check second to last VEC. rdx store end pointer of s1 and
 +	   ymm3 has already been loaded with second to last VEC from s2.
 +	 */
 +	VPCMPEQ	(VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
 +	vpmovmskb %ymm3, %eax
 +	incl	%eax
 +	jnz	L(8x_return_vec_2)
 +	/* Check last VEC.  */
 +	.p2align 4
 +L(8x_last_1x_vec):
 +	vmovdqu	(VEC_SIZE * 3)(%rsi, %rdx), %ymm4
 +	VPCMPEQ	(VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
 +	vpmovmskb %ymm4, %eax
 +	incl	%eax
 +	jnz	L(8x_return_vec_3)
 	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec):
 -	/* A byte or int32 is different within 16 or 32 bytes.  */
 -	tzcntl	%eax, %ecx
 +L(last_2x_vec):
 +	/* Check second to last VEC.  */
 +	vmovdqu	-(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
 +	VPCMPEQ	-(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	incl	%eax
 +	jnz	L(return_vec_1_end)
 +	/* Check last VEC.  */
 +L(last_1x_vec):
 +	vmovdqu	-(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
 +	VPCMPEQ	-(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
 +	vpmovmskb %ymm1, %eax
 +	incl	%eax
 +	jnz	L(return_vec_0_end)
 +	VZEROUPPER_RETURN
 +
 +	.p2align 4
 +L(8x_return_vec_2):
 +	subq	$VEC_SIZE, %rdx
 +L(8x_return_vec_3):
 +	tzcntl	%eax, %eax
 +	addq	%rdx, %rax
 # ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(%rdi, %rcx), %edx
 -	cmpl	(%rsi, %rcx), %edx
 -L(wmemcmp_return):
 -	setl	%al
 -	negl	%eax
 -	orl	$1, %eax
 +	movl	(VEC_SIZE * 3)(%rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 # else
 -	movzbl	(%rdi, %rcx), %eax
 -	movzbl	(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 +	subl	%ecx, %eax
 # endif
 	VZEROUPPER_RETURN
 -# ifdef USE_AS_WMEMCMP
 	.p2align 4
 -L(4):
 -	xorl	%eax, %eax
 -	movl	(%rdi), %edx
 -	cmpl	(%rsi), %edx
 -	jne	L(wmemcmp_return)
 -	ret
 +L(return_vec_1_end):
 +	tzcntl	%eax, %eax
 +	addl	%edx, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	-(VEC_SIZE * 2)(%rdi, %rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 # else
 +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	VZEROUPPER_RETURN
 +
 	.p2align 4
 -L(between_4_7):
 -	/* Load as big endian with overlapping movbe to avoid branches.  */
 -	movbe	(%rdi), %eax
 -	movbe	(%rsi), %ecx
 -	shlq	$32, %rax
 -	shlq	$32, %rcx
 -	movbe	-4(%rdi, %rdx), %edi
 -	movbe	-4(%rsi, %rdx), %esi
 -	orq	%rdi, %rax
 -	orq	%rsi, %rcx
 -	subq	%rcx, %rax
 -	je	L(exit)
 -	sbbl	%eax, %eax
 -	orl	$1, %eax
 -	ret
 +L(return_vec_0_end):
 +	tzcntl	%eax, %eax
 +	addl	%edx, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	-VEC_SIZE(%rdi, %rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	-VEC_SIZE(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
 +	movzbl	-VEC_SIZE(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	VZEROUPPER_RETURN
 	.p2align 4
 -L(exit):
 -	ret
 +L(less_vec):
 +	/* Check if one or less CHAR. This is necessary for size = 0 but
 +	   is also faster for size = CHAR_SIZE.  */
 +	cmpl	$CHAR_SIZE, %edx
 +	jbe	L(one_or_less)
 +
 +	/* Check if loading one VEC from either s1 or s2 could cause a
 +	   page cross. This can have false positives but is by far the
 +	   fastest method.  */
 +	movl	%edi, %eax
 +	orl	%esi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	jg	L(page_cross_less_vec)
 +
 +	/* No page cross possible.  */
 +	vmovdqu	(%rsi), %ymm2
 +	VPCMPEQ	(%rdi), %ymm2, %ymm2
 +	vpmovmskb %ymm2, %eax
 +	incl	%eax
 +	/* Result will be zero if s1 and s2 match. Otherwise first set
 +	   bit will be first mismatch.  */
 +	bzhil	%edx, %eax, %edx
 +	jnz	L(return_vec_0)
 +	xorl	%eax, %eax
 +	VZEROUPPER_RETURN
 	.p2align 4
 -L(between_2_3):
 +L(page_cross_less_vec):
 +	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 +	   bytes.  */
 +	cmpl	$16, %edx
 +	jae	L(between_16_31)
 +# ifndef USE_AS_WMEMCMP
 +	cmpl	$8, %edx
 +	jae	L(between_8_15)
 +	cmpl	$4, %edx
 +	jae	L(between_4_7)
 +
 	/* Load as big endian to avoid branches.  */
 	movzwl	(%rdi), %eax
 	movzwl	(%rsi), %ecx
@@ -208,223 +439,106 @@ L(between_2_3):
 	shll	$8, %ecx
 	bswap	%eax
 	bswap	%ecx
 -	movb	-1(%rdi, %rdx), %al
 -	movb	-1(%rsi, %rdx), %cl
 +	movzbl	-1(%rdi, %rdx), %edi
 +	movzbl	-1(%rsi, %rdx), %esi
 +	orl	%edi, %eax
 +	orl	%esi, %ecx
 	/* Subtraction is okay because the upper 8 bits are zero.  */
 	subl	%ecx, %eax
 +	/* No ymm register was touched.  */
 	ret
 	.p2align 4
 -L(1):
 -	movzbl	(%rdi), %eax
 +L(one_or_less):
 +	jb	L(zero)
 	movzbl	(%rsi), %ecx
 +	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
 -	ret
 -# endif
 -
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 +	/* No ymm register was touched.  */
 	ret
 	.p2align 4
 -L(less_vec):
 -# ifdef USE_AS_WMEMCMP
 -	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
 -	cmpb	$4, %dl
 -	je	L(4)
 -	jb	L(zero)
 -# else
 -	cmpb	$1, %dl
 -	je	L(1)
 -	jb	L(zero)
 -	cmpb	$4, %dl
 -	jb	L(between_2_3)
 -	cmpb	$8, %dl
 -	jb	L(between_4_7)
 +L(between_8_15):
 # endif
 -	cmpb	$16, %dl
 -	jae	L(between_16_31)
 -	/* It is between 8 and 15 bytes.  */
 +	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
 -	VPCMPEQ %xmm1, %xmm2, %xmm2
 +	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 -	subl    $0xffff, %eax
 -	jnz	L(first_vec)
 +	subl	$0xffff, %eax
 +	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
 	leaq	-8(%rdi, %rdx), %rdi
 	leaq	-8(%rsi, %rdx), %rsi
 	vmovq	(%rdi), %xmm1
 	vmovq	(%rsi), %xmm2
 -	VPCMPEQ %xmm1, %xmm2, %xmm2
 +	VPCMPEQ	%xmm1, %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 -	subl    $0xffff, %eax
 -	jnz	L(first_vec)
 +	subl	$0xffff, %eax
 +	jnz	L(return_vec_0)
 +	/* No ymm register was touched.  */
 +	ret
 +
 +	.p2align 4
 +L(zero):
 +	xorl	%eax, %eax
 	ret
 	.p2align 4
 L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	vmovdqu	(%rsi), %xmm2
 -	VPCMPEQ (%rdi), %xmm2, %xmm2
 +	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 -	subl    $0xffff, %eax
 -	jnz	L(first_vec)
 +	subl	$0xffff, %eax
 +	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
 +
 +	vmovdqu	-16(%rsi, %rdx), %xmm2
 	leaq	-16(%rdi, %rdx), %rdi
 	leaq	-16(%rsi, %rdx), %rsi
 -	vmovdqu	(%rsi), %xmm2
 -	VPCMPEQ (%rdi), %xmm2, %xmm2
 +	VPCMPEQ	(%rdi), %xmm2, %xmm2
 	vpmovmskb %xmm2, %eax
 -	subl    $0xffff, %eax
 -	jnz	L(first_vec)
 +	subl	$0xffff, %eax
 +	jnz	L(return_vec_0)
 +	/* No ymm register was touched.  */
 	ret
 -	.p2align 4
 -L(more_8x_vec):
 -	/* More than 8 * VEC.  Check the first VEC.  */
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	/* Align the first memory area for aligned loads in the loop.
 -	   Compute how much the first memory area is misaligned.  */
 -	movq	%rdi, %rcx
 -	andl	$(VEC_SIZE - 1), %ecx
 -	/* Get the negative of offset for alignment.  */
 -	subq	$VEC_SIZE, %rcx
 -	/* Adjust the second memory area.  */
 -	subq	%rcx, %rsi
 -	/* Adjust the first memory area which should be aligned now.  */
 -	subq	%rcx, %rdi
 -	/* Adjust length.  */
 -	addq	%rcx, %rdx
 -
 -L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	vmovdqu	(%rsi), %ymm1
 -	VPCMPEQ (%rdi), %ymm1, %ymm1
 -
 -	vmovdqu	VEC_SIZE(%rsi), %ymm2
 -	VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
 -	vpand	%ymm2, %ymm1, %ymm5
 -
 -	vmovdqu	(VEC_SIZE * 2)(%rsi), %ymm3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
 -	vpand	%ymm3, %ymm5, %ymm5
 -
 -	vmovdqu	(VEC_SIZE * 3)(%rsi), %ymm4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
 -	vpand	%ymm4, %ymm5, %ymm5
 -
 -	vptest	%ymm0, %ymm5
 -	jnc	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -	addq	$(VEC_SIZE * 4), %rsi
 -
 -	subq	$(VEC_SIZE * 4), %rdx
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	jae	L(loop_4x_vec)
 -
 -	/* Less than 4 * VEC.  */
 -	cmpq	$VEC_SIZE, %rdx
 -	jbe	L(last_vec)
 -	cmpq	$(VEC_SIZE * 2), %rdx
 -	jbe	L(last_2x_vec)
 -
 -L(last_4x_vec):
 -	/* From 2 * VEC to 4 * VEC. */
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	addq	$VEC_SIZE, %rdi
 -	addq	$VEC_SIZE, %rsi
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	/* Use overlapping loads to avoid branches.  */
 -	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
 -	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	addq	$VEC_SIZE, %rdi
 -	addq	$VEC_SIZE, %rsi
 -	vmovdqu	(%rsi), %ymm2
 -	VPCMPEQ (%rdi), %ymm2, %ymm2
 -	vpmovmskb %ymm2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -	VZEROUPPER_RETURN
 -
 -	.p2align 4
 -L(4x_vec_end):
 -	vpmovmskb %ymm1, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec)
 -	vpmovmskb %ymm2, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec_x1)
 -	vpmovmskb %ymm3, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec_x2)
 -	vpmovmskb %ymm4, %eax
 -	subl	$VEC_MASK, %eax
 -	tzcntl	%eax, %ecx
 # ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(VEC_SIZE * 3)(%rdi, %rcx), %edx
 -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 -	jmp	L(wmemcmp_return)
 -# else
 -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 -# endif
 -	VZEROUPPER_RETURN
 -
 	.p2align 4
 -L(first_vec_x1):
 -	tzcntl	%eax, %ecx
 -# ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	VEC_SIZE(%rdi, %rcx), %edx
 -	cmpl	VEC_SIZE(%rsi, %rcx), %edx
 -	jmp	L(wmemcmp_return)
 +L(one_or_less):
 +	jb	L(zero)
 +	movl	(%rdi), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi), %ecx
 +	je	L(zero)
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +	/* No ymm register was touched.  */
 +	ret
 # else
 -	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 -	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 -# endif
 -	VZEROUPPER_RETURN
 	.p2align 4
 -L(first_vec_x2):
 -	tzcntl	%eax, %ecx
 -# ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(VEC_SIZE * 2)(%rdi, %rcx), %edx
 -	cmpl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 -	jmp	L(wmemcmp_return)
 -# else
 -	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 -	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 +L(between_4_7):
 +	/* Load as big endian with overlapping movbe to avoid branches.
 +	 */
 +	movbe	(%rdi), %eax
 +	movbe	(%rsi), %ecx
 +	shlq	$32, %rax
 +	shlq	$32, %rcx
 +	movbe	-4(%rdi, %rdx), %edi
 +	movbe	-4(%rsi, %rdx), %esi
 +	orq	%rdi, %rax
 +	orq	%rsi, %rcx
 +	subq	%rcx, %rax
 +	jz	L(zero_4_7)
 +	sbbl	%eax, %eax
 +	orl	$1, %eax
 +L(zero_4_7):
 +	/* No ymm register was touched.  */
 +	ret
 # endif
 -	VZEROUPPER_RETURN
 +
 END (MEMCMP)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-46.patch
+++ b/SOURCES/glibc-RHEL-15696-46.patch
@ -1,851 +0,0 @@
 From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 17 May 2021 13:57:24 -0400
 Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit optimizes memcmp-evex.S. The optimizations include
 adding a new vec compare path for small sizes, reorganizing the entry
 control flow, removing some unnecessary ALU instructions from the main
 loop, and most importantly replacing the heavy use of vpcmp + kand
 logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
 passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
 1 file changed, 408 insertions(+), 302 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 index 9c093972..654dc7ac 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -19,17 +19,22 @@
 #if IS_IN (libc)
 /* memcmp/wmemcmp is implemented as:
 -   1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
 -      to avoid branches.
 -   2. Use overlapping compare to avoid branch.
 -   3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
 -      bytes for wmemcmp.
 -   4. If size is 8 * VEC_SIZE or less, unroll the loop.
 -   5. Compare 4 * VEC_SIZE at a time with the aligned first memory
 +   1. Use ymm vector compares when possible. The only case where
 +      vector compares is not possible for when size < CHAR_PER_VEC
 +      and loading from either s1 or s2 would cause a page cross.
 +   2. For size from 2 to 7 bytes on page cross, load as big endian
 +      with movbe and bswap to avoid branches.
 +   3. Use xmm vector compare when size >= 4 bytes for memcmp or
 +      size >= 8 bytes for wmemcmp.
 +   4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
 +      to check for early mismatches. Only do this if its guranteed the
 +      work is not wasted.
 +   5. If size is 8 * VEC_SIZE or less, unroll the loop.
 +   6. Compare 4 * VEC_SIZE at a time with the aligned first memory
       area.
 -   6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
 -   7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
 -   8. Use 8 vector compares when size is 8 * VEC_SIZE or less.  */
 +   7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
 +   8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
 +   9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.  */
 # include <sysdep.h>
@@ -40,11 +45,21 @@
 # define VMOVU		vmovdqu64
 # ifdef USE_AS_WMEMCMP
 -#  define VPCMPEQ	vpcmpeqd
 +#  define CHAR_SIZE	4
 +#  define VPCMP	vpcmpd
 # else
 -#  define VPCMPEQ	vpcmpeqb
 +#  define CHAR_SIZE	1
 +#  define VPCMP	vpcmpub
 # endif
 +# define VEC_SIZE	32
 +# define PAGE_SIZE	4096
 +# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
 +
 +# define XMM0		xmm16
 +# define XMM1		xmm17
 +# define XMM2		xmm18
 +# define YMM0		ymm16
 # define XMM1		xmm17
 # define XMM2		xmm18
 # define YMM1		ymm17
@@ -54,15 +69,6 @@
 # define YMM5		ymm21
 # define YMM6		ymm22
 -# define VEC_SIZE 32
 -# ifdef USE_AS_WMEMCMP
 -#  define VEC_MASK 0xff
 -#  define XMM_MASK 0xf
 -# else
 -#  define VEC_MASK 0xffffffff
 -#  define XMM_MASK 0xffff
 -# endif
 -
 /* Warning!
            wmemcmp has to use SIGNED comparison for elements.
            memcmp has to use UNSIGNED comparison for elemnts.
@@ -70,145 +76,370 @@
 	.section .text.evex,"ax",@progbits
 ENTRY (MEMCMP)
 -# ifdef USE_AS_WMEMCMP
 -	shl	$2, %RDX_LP
 -# elif defined __ILP32__
 +# ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 	movl	%edx, %edx
 # endif
 -	cmp	$VEC_SIZE, %RDX_LP
 +	cmp	$CHAR_PER_VEC, %RDX_LP
 	jb	L(less_vec)
 	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k1
 +	VMOVU	(%rsi), %YMM1
 +	/* Use compare not equals to directly check for mismatch.  */
 +	VPCMP	$4, (%rdi), %YMM1, %k1
 	kmovd	%k1, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	cmpq	$(VEC_SIZE * 2), %rdx
 -	jbe	L(last_vec)
 -
 -	/* More than 2 * VEC.  */
 -	cmpq	$(VEC_SIZE * 8), %rdx
 -	ja	L(more_8x_vec)
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	jb	L(last_4x_vec)
 +	/* NB: eax must be destination register if going to
 +	   L(return_vec_[0,2]). For L(return_vec_3) the destination
 +	   register must be ecx.  */
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 -	/* From 4 * VEC to 8 * VEC, inclusively. */
 -	VMOVU	(%rsi), %YMM1
 -	VPCMPEQ (%rdi), %YMM1, %k1
 +	cmpq	$(CHAR_PER_VEC * 2), %rdx
 +	jbe	L(last_1x_vec)
 +	/* Check second VEC no matter what.  */
 	VMOVU	VEC_SIZE(%rsi), %YMM2
 -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 +	VPCMP	$4, VEC_SIZE(%rdi), %YMM2, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_1)
 +
 +	/* Less than 4 * VEC.  */
 +	cmpq	$(CHAR_PER_VEC * 4), %rdx
 +	jbe	L(last_2x_vec)
 +	/* Check third and fourth VEC no matter what.  */
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 +	VPCMP	$4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 +	VPCMP	$4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
 +	kmovd	%k1, %ecx
 +	testl	%ecx, %ecx
 +	jnz	L(return_vec_3)
 -	kandd	%k1, %k2, %k5
 -	kandd	%k3, %k4, %k6
 -	kandd	%k5, %k6, %k6
 +	/* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
 +	   compare with zero to get a mask is needed.  */
 +	vpxorq	%XMM0, %XMM0, %XMM0
 -	kmovd	%k6, %eax
 -	cmpl	$VEC_MASK, %eax
 -	jne	L(4x_vec_end)
 +	/* Go to 4x VEC loop.  */
 +	cmpq	$(CHAR_PER_VEC * 8), %rdx
 +	ja	L(more_8x_vec)
 -	leaq	-(4 * VEC_SIZE)(%rdi, %rdx), %rdi
 -	leaq	-(4 * VEC_SIZE)(%rsi, %rdx), %rsi
 -	VMOVU	(%rsi), %YMM1
 -	VPCMPEQ (%rdi), %YMM1, %k1
 +	/* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
 +	   branches.  */
 -	VMOVU	VEC_SIZE(%rsi), %YMM2
 -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 -	kandd	%k1, %k2, %k5
 +	/* Load first two VEC from s2 before adjusting addresses.  */
 +	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
 +	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
 +	leaq	-(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
 +	leaq	-(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
 +
 +	/* Wait to load from s1 until the addresses are adjusted, due to
 +	   unlamination of microfusion with complex address mode.  */
 +
 +	/* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
 +	   will have some 1s.  */
 +	vpxorq	(%rdi), %YMM1, %YMM1
 +	vpxorq	(VEC_SIZE)(%rdi), %YMM2, %YMM2
 	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 -	kandd	%k3, %k5, %k5
 +	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 +	/* Or together YMM1, YMM2, and YMM3 into YMM3.  */
 +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 -	kandd	%k4, %k5, %k5
 +	/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
 +	   oring with YMM3. Result is stored in YMM4.  */
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 +	/* Compare YMM4 with 0. If any 1s s1 and s2 don't match.  */
 +	VPCMP	$4, %YMM4, %YMM0, %k1
 +	kmovd	%k1, %ecx
 +	testl	%ecx, %ecx
 +	jnz	L(return_vec_0_1_2_3)
 +	/* NB: eax must be zero to reach here.  */
 +	ret
 -	kmovd	%k5, %eax
 -	cmpl	$VEC_MASK, %eax
 -	jne	L(4x_vec_end)
 -	xorl	%eax, %eax
 +	/* NB: aligning 32 here allows for the rest of the jump targets
 +	   to be tuned for 32 byte alignment. Most important this ensures
 +	   the L(more_8x_vec) loop is 32 byte aligned.  */
 +	.p2align 5
 +L(less_vec):
 +	/* Check if one or less CHAR. This is necessary for size = 0 but
 +	   is also faster for size = CHAR_SIZE.  */
 +	cmpl	$1, %edx
 +	jbe	L(one_or_less)
 +
 +	/* Check if loading one VEC from either s1 or s2 could cause a
 +	   page cross. This can have false positives but is by far the
 +	   fastest method.  */
 +	movl	%edi, %eax
 +	orl	%esi, %eax
 +	andl	$(PAGE_SIZE - 1), %eax
 +	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
 +	jg	L(page_cross_less_vec)
 +
 +	/* No page cross possible.  */
 +	VMOVU	(%rsi), %YMM2
 +	VPCMP	$4, (%rdi), %YMM2, %k1
 +	kmovd	%k1, %eax
 +	/* Clear mismatch bits in eax at or past index edx (the length).  */
 +	bzhil	%edx, %eax, %eax
 +	jnz	L(return_vec_0)
 	ret
 	.p2align 4
 -L(last_2x_vec):
 -	/* From VEC to 2 * VEC.  No branch when size == VEC_SIZE.  */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +L(return_vec_0):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi, %rax, CHAR_SIZE), %ecx
 +	/* NB: no partial register stall here because xorl zero idiom
 +	   above.  */
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(%rsi, %rax), %ecx
 +	movzbl	(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	ret
 -L(last_vec):
 -	/* Use overlapping loads to avoid branches.  */
 -	leaq	-VEC_SIZE(%rdi, %rdx), %rdi
 -	leaq	-VEC_SIZE(%rsi, %rdx), %rsi
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +	/* NB: No p2align necessary. Alignment  % 16 is naturally 1
 +	   which is good enough for a target not in a loop.  */
 +L(return_vec_1):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	VEC_SIZE(%rsi, %rax), %ecx
 +	movzbl	VEC_SIZE(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 	ret
 -	.p2align 4
 -L(first_vec):
 -	/* A byte or int32 is different within 16 or 32 bytes.  */
 -	tzcntl	%eax, %ecx
 +	/* NB: No p2align necessary. Alignment  % 16 is naturally 2
 +	   which is good enough for a target not in a loop.  */
 +L(return_vec_2):
 +	tzcntl	%eax, %eax
 # ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(%rdi, %rcx, 4), %edx
 -	cmpl	(%rsi, %rcx, 4), %edx
 -L(wmemcmp_return):
 -	setl	%al
 -	negl	%eax
 -	orl	$1, %eax
 +	movl	(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 # else
 -	movzbl	(%rdi, %rcx), %eax
 -	movzbl	(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 +	movzbl	(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	movzbl	(VEC_SIZE * 2)(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 # endif
 	ret
 +	.p2align 4
 +L(8x_return_vec_0_1_2_3):
 +	/* Returning from L(more_8x_vec) requires restoring rsi.  */
 +	addq	%rdi, %rsi
 +L(return_vec_0_1_2_3):
 +	VPCMP	$4, %YMM1, %YMM0, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 +
 +	VPCMP	$4, %YMM2, %YMM0, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_1)
 +
 +	VPCMP	$4, %YMM3, %YMM0, %k0
 +	kmovd	%k0, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_2)
 +L(return_vec_3):
 +	tzcntl	%ecx, %ecx
 # ifdef USE_AS_WMEMCMP
 +	movl	(VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %ecx
 +	subl	%ecx, %eax
 +# endif
 +	ret
 +
 	.p2align 4
 -L(4):
 -	xorl	%eax, %eax
 -	movl	(%rdi), %edx
 -	cmpl	(%rsi), %edx
 -	jne	L(wmemcmp_return)
 +L(more_8x_vec):
 +	/* Set end of s1 in rdx.  */
 +	leaq	-(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
 +	/* rsi stores s2 - s1. This allows loop to only update one
 +	   pointer.  */
 +	subq	%rdi, %rsi
 +	/* Align s1 pointer.  */
 +	andq	$-VEC_SIZE, %rdi
 +	/* Adjust because the first 4x VEC were checked already.  */
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	.p2align 4
 +L(loop_4x_vec):
 +	VMOVU	(%rsi, %rdi), %YMM1
 +	vpxorq	(%rdi), %YMM1, %YMM1
 +
 +	VMOVU	VEC_SIZE(%rsi, %rdi), %YMM2
 +	vpxorq	VEC_SIZE(%rdi), %YMM2, %YMM2
 +
 +	VMOVU	(VEC_SIZE * 2)(%rsi, %rdi), %YMM3
 +	vpxorq	(VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
 +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 +
 +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdi), %YMM4
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
 +	VPCMP	$4, %YMM4, %YMM0, %k1
 +	kmovd	%k1, %ecx
 +	testl	%ecx, %ecx
 +	jnz	L(8x_return_vec_0_1_2_3)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	cmpq	%rdx, %rdi
 +	jb	L(loop_4x_vec)
 +
 +	subq	%rdx, %rdi
 +	/* rdi has 4 * VEC_SIZE - remaining length.  */
 +	cmpl	$(VEC_SIZE * 3), %edi
 +	jae	L(8x_last_1x_vec)
 +	/* Load regardless of branch.  */
 +	VMOVU	(VEC_SIZE * 2)(%rsi, %rdx), %YMM3
 +	cmpl	$(VEC_SIZE * 2), %edi
 +	jae	L(8x_last_2x_vec)
 +
 +	VMOVU	(%rsi, %rdx), %YMM1
 +	vpxorq	(%rdx), %YMM1, %YMM1
 +
 +	VMOVU	VEC_SIZE(%rsi, %rdx), %YMM2
 +	vpxorq	VEC_SIZE(%rdx), %YMM2, %YMM2
 +
 +	vpxorq	(VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
 +	vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
 +
 +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM4
 +	vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
 +	VPCMP	$4, %YMM4, %YMM0, %k1
 +	kmovd	%k1, %ecx
 +	/* Restore s1 pointer to rdi.  */
 +	movq	%rdx, %rdi
 +	testl	%ecx, %ecx
 +	jnz	L(8x_return_vec_0_1_2_3)
 +	/* NB: eax must be zero to reach here.  */
 +	ret
 +
 +	/* Only entry is from L(more_8x_vec).  */
 +	.p2align 4
 +L(8x_last_2x_vec):
 +	VPCMP	$4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(8x_return_vec_2)
 +	/* Naturally aligned to 16 bytes.  */
 +L(8x_last_1x_vec):
 +	VMOVU	(VEC_SIZE * 3)(%rsi, %rdx), %YMM1
 +	VPCMP	$4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(8x_return_vec_3)
 +	ret
 +
 +	.p2align 4
 +L(last_2x_vec):
 +	/* Check second to last VEC.  */
 +	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
 +	VPCMP	$4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_1_end)
 +
 +	/* Check last VEC.  */
 +	.p2align 4
 +L(last_1x_vec):
 +	VMOVU	-(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
 +	VPCMP	$4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0_end)
 	ret
 +
 +	.p2align 4
 +L(8x_return_vec_2):
 +	subq	$VEC_SIZE, %rdx
 +L(8x_return_vec_3):
 +	tzcntl	%eax, %eax
 +# ifdef USE_AS_WMEMCMP
 +	leaq	(%rdx, %rax, CHAR_SIZE), %rax
 +	movl	(VEC_SIZE * 3)(%rax), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 # else
 +	addq	%rdx, %rax
 +	movzbl	(VEC_SIZE * 3)(%rsi, %rax), %ecx
 +	movzbl	(VEC_SIZE * 3)(%rax), %eax
 +	subl	%ecx, %eax
 +# endif
 +	ret
 +
 	.p2align 4
 -L(between_4_7):
 -	/* Load as big endian with overlapping movbe to avoid branches.  */
 -	movbe	(%rdi), %eax
 -	movbe	(%rsi), %ecx
 -	shlq	$32, %rax
 -	shlq	$32, %rcx
 -	movbe	-4(%rdi, %rdx), %edi
 -	movbe	-4(%rsi, %rdx), %esi
 -	orq	%rdi, %rax
 -	orq	%rsi, %rcx
 -	subq	%rcx, %rax
 -	je	L(exit)
 -	sbbl	%eax, %eax
 -	orl	$1, %eax
 +L(return_vec_0_end):
 +	tzcntl	%eax, %eax
 +	addl	%edx, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	-VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	-VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	-VEC_SIZE(%rsi, %rax), %ecx
 +	movzbl	-VEC_SIZE(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 	ret
 	.p2align 4
 -L(exit):
 +L(return_vec_1_end):
 +	tzcntl	%eax, %eax
 +	addl	%edx, %eax
 +# ifdef USE_AS_WMEMCMP
 +	movl	-(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
 +	xorl	%edx, %edx
 +	cmpl	-(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 +# else
 +	movzbl	-(VEC_SIZE * 2)(%rsi, %rax), %ecx
 +	movzbl	-(VEC_SIZE * 2)(%rdi, %rax), %eax
 +	subl	%ecx, %eax
 +# endif
 	ret
 +
 	.p2align 4
 +L(page_cross_less_vec):
 +	/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
 +	   bytes.  */
 +	cmpl	$(16 / CHAR_SIZE), %edx
 +	jae	L(between_16_31)
 +# ifndef USE_AS_WMEMCMP
 +	cmpl	$8, %edx
 +	jae	L(between_8_15)
 +	cmpl	$4, %edx
 +	jae	L(between_4_7)
 L(between_2_3):
 	/* Load as big endian to avoid branches.  */
 	movzwl	(%rdi), %eax
@@ -217,224 +448,99 @@ L(between_2_3):
 	shll	$8, %ecx
 	bswap	%eax
 	bswap	%ecx
 -	movb	-1(%rdi, %rdx), %al
 -	movb	-1(%rsi, %rdx), %cl
 +	movzbl	-1(%rdi, %rdx), %edi
 +	movzbl	-1(%rsi, %rdx), %esi
 +	orl	%edi, %eax
 +	orl	%esi, %ecx
 	/* Subtraction is okay because the upper 8 bits are zero.  */
 	subl	%ecx, %eax
 	ret
 -
 	.p2align 4
 -L(1):
 -	movzbl	(%rdi), %eax
 +L(one_or_less):
 +	jb	L(zero)
 	movzbl	(%rsi), %ecx
 +	movzbl	(%rdi), %eax
 	subl	%ecx, %eax
 	ret
 -# endif
 -
 -	.p2align 4
 -L(zero):
 -	xorl	%eax, %eax
 -	ret
 	.p2align 4
 -L(less_vec):
 -# ifdef USE_AS_WMEMCMP
 -	/* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes.  */
 -	cmpb	$4, %dl
 -	je	L(4)
 -	jb	L(zero)
 -# else
 -	cmpb	$1, %dl
 -	je	L(1)
 -	jb	L(zero)
 -	cmpb	$4, %dl
 -	jb	L(between_2_3)
 -	cmpb	$8, %dl
 -	jb	L(between_4_7)
 +L(between_8_15):
 # endif
 -	cmpb	$16, %dl
 -	jae	L(between_16_31)
 -	/* It is between 8 and 15 bytes.  */
 +	/* If USE_AS_WMEMCMP fall through into 8-15 byte case.  */
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
 -	VPCMPEQ %XMM1, %XMM2, %k2
 -	kmovw	%k2, %eax
 -	subl    $XMM_MASK, %eax
 -	jnz	L(first_vec)
 +	VPCMP	$4, %XMM1, %XMM2, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
 -	leaq	-8(%rdi, %rdx), %rdi
 -	leaq	-8(%rsi, %rdx), %rsi
 +	leaq	-8(%rdi, %rdx, CHAR_SIZE), %rdi
 +	leaq	-8(%rsi, %rdx, CHAR_SIZE), %rsi
 	vmovq	(%rdi), %XMM1
 	vmovq	(%rsi), %XMM2
 -	VPCMPEQ %XMM1, %XMM2, %k2
 -	kmovw	%k2, %eax
 -	subl    $XMM_MASK, %eax
 -	jnz	L(first_vec)
 +	VPCMP	$4, %XMM1, %XMM2, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 	ret
 	.p2align 4
 -L(between_16_31):
 -	/* From 16 to 31 bytes.  No branch when size == 16.  */
 -	VMOVU	(%rsi), %XMM2
 -	VPCMPEQ (%rdi), %XMM2, %k2
 -	kmovw	%k2, %eax
 -	subl    $XMM_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	/* Use overlapping loads to avoid branches.  */
 -	leaq	-16(%rdi, %rdx), %rdi
 -	leaq	-16(%rsi, %rdx), %rsi
 -	VMOVU	(%rsi), %XMM2
 -	VPCMPEQ (%rdi), %XMM2, %k2
 -	kmovw	%k2, %eax
 -	subl    $XMM_MASK, %eax
 -	jnz	L(first_vec)
 +L(zero):
 +	xorl	%eax, %eax
 	ret
 	.p2align 4
 -L(more_8x_vec):
 -	/* More than 8 * VEC.  Check the first VEC.  */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	/* Align the first memory area for aligned loads in the loop.
 -	   Compute how much the first memory area is misaligned.  */
 -	movq	%rdi, %rcx
 -	andl	$(VEC_SIZE - 1), %ecx
 -	/* Get the negative of offset for alignment.  */
 -	subq	$VEC_SIZE, %rcx
 -	/* Adjust the second memory area.  */
 -	subq	%rcx, %rsi
 -	/* Adjust the first memory area which should be aligned now.  */
 -	subq	%rcx, %rdi
 -	/* Adjust length.  */
 -	addq	%rcx, %rdx
 -
 -L(loop_4x_vec):
 -	/* Compare 4 * VEC at a time forward.  */
 -	VMOVU	(%rsi), %YMM1
 -	VPCMPEQ (%rdi), %YMM1, %k1
 -
 -	VMOVU	VEC_SIZE(%rsi), %YMM2
 -	VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
 -	kandd	%k2, %k1, %k5
 -
 -	VMOVU	(VEC_SIZE * 2)(%rsi), %YMM3
 -	VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
 -	kandd	%k3, %k5, %k5
 -
 -	VMOVU	(VEC_SIZE * 3)(%rsi), %YMM4
 -	VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
 -	kandd	%k4, %k5, %k5
 -
 -	kmovd	%k5, %eax
 -	cmpl	$VEC_MASK, %eax
 -	jne	L(4x_vec_end)
 -
 -	addq	$(VEC_SIZE * 4), %rdi
 -	addq	$(VEC_SIZE * 4), %rsi
 -
 -	subq	$(VEC_SIZE * 4), %rdx
 -	cmpq	$(VEC_SIZE * 4), %rdx
 -	jae	L(loop_4x_vec)
 -
 -	/* Less than 4 * VEC.  */
 -	cmpq	$VEC_SIZE, %rdx
 -	jbe	L(last_vec)
 -	cmpq	$(VEC_SIZE * 2), %rdx
 -	jbe	L(last_2x_vec)
 -
 -L(last_4x_vec):
 -	/* From 2 * VEC to 4 * VEC. */
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -
 -	addq	$VEC_SIZE, %rdi
 -	addq	$VEC_SIZE, %rsi
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 +L(between_16_31):
 +	/* From 16 to 31 bytes.  No branch when size == 16.  */
 +	VMOVU	(%rsi), %XMM2
 +	VPCMP	$4, (%rdi), %XMM2, %k1
 +	kmovd	%k1, %eax
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 	/* Use overlapping loads to avoid branches.  */
 -	leaq	-(3 * VEC_SIZE)(%rdi, %rdx), %rdi
 -	leaq	-(3 * VEC_SIZE)(%rsi, %rdx), %rsi
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -	addq	$VEC_SIZE, %rdi
 -	addq	$VEC_SIZE, %rsi
 -	VMOVU	(%rsi), %YMM2
 -	VPCMPEQ (%rdi), %YMM2, %k2
 -	kmovd	%k2, %eax
 -	subl    $VEC_MASK, %eax
 -	jnz	L(first_vec)
 -	ret
 -
 -	.p2align 4
 -L(4x_vec_end):
 +	VMOVU	-16(%rsi, %rdx, CHAR_SIZE), %XMM2
 +	leaq	-16(%rdi, %rdx, CHAR_SIZE), %rdi
 +	leaq	-16(%rsi, %rdx, CHAR_SIZE), %rsi
 +	VPCMP	$4, (%rdi), %XMM2, %k1
 	kmovd	%k1, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec)
 -	kmovd	%k2, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec_x1)
 -	kmovd	%k3, %eax
 -	subl	$VEC_MASK, %eax
 -	jnz	L(first_vec_x2)
 -	kmovd	%k4, %eax
 -	subl	$VEC_MASK, %eax
 -	tzcntl	%eax, %ecx
 -# ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
 -	cmpl	(VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
 -	jmp	L(wmemcmp_return)
 -# else
 -	movzbl	(VEC_SIZE * 3)(%rdi, %rcx), %eax
 -	movzbl	(VEC_SIZE * 3)(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 -# endif
 +	testl	%eax, %eax
 +	jnz	L(return_vec_0)
 	ret
 -	.p2align 4
 -L(first_vec_x1):
 -	tzcntl	%eax, %ecx
 # ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	VEC_SIZE(%rdi, %rcx, 4), %edx
 -	cmpl	VEC_SIZE(%rsi, %rcx, 4), %edx
 -	jmp	L(wmemcmp_return)
 -# else
 -	movzbl	VEC_SIZE(%rdi, %rcx), %eax
 -	movzbl	VEC_SIZE(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 -# endif
 +	.p2align 4
 +L(one_or_less):
 +	jb	L(zero)
 +	movl	(%rdi), %ecx
 +	xorl	%edx, %edx
 +	cmpl	(%rsi), %ecx
 +	je	L(zero)
 +	setg	%dl
 +	leal	-1(%rdx, %rdx), %eax
 	ret
 +# else
 	.p2align 4
 -L(first_vec_x2):
 -	tzcntl	%eax, %ecx
 -# ifdef USE_AS_WMEMCMP
 -	xorl	%eax, %eax
 -	movl	(VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
 -	cmpl	(VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
 -	jmp	L(wmemcmp_return)
 -# else
 -	movzbl	(VEC_SIZE * 2)(%rdi, %rcx), %eax
 -	movzbl	(VEC_SIZE * 2)(%rsi, %rcx), %edx
 -	sub	%edx, %eax
 -# endif
 +L(between_4_7):
 +	/* Load as big endian with overlapping movbe to avoid branches.
 +	 */
 +	movbe	(%rdi), %eax
 +	movbe	(%rsi), %ecx
 +	shlq	$32, %rax
 +	shlq	$32, %rcx
 +	movbe	-4(%rdi, %rdx), %edi
 +	movbe	-4(%rsi, %rdx), %esi
 +	orq	%rdi, %rax
 +	orq	%rsi, %rcx
 +	subq	%rcx, %rax
 +	jz	L(zero_4_7)
 +	sbbl	%eax, %eax
 +	orl	$1, %eax
 +L(zero_4_7):
 	ret
 +# endif
 +
 END (MEMCMP)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-47.patch
+++ b/SOURCES/glibc-RHEL-15696-47.patch
@ -1,104 +0,0 @@
 From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Thu, 20 May 2021 13:13:51 -0400
 Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
 Content-type: text/plain; charset=UTF-8
 No bug. This commit makes a few small improvements to
 memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
 instead of 128. Either alignment will perform equally well in a loop
 and 128 just increases the odds of having to do an extra iteration
 which can be significant overhead for small values. 2) Align some
 targets and the loop. 3) Remove an ALU from the alignment process. 4)
 Reorder the last 4x VEC so that they are stored after the loop. 5)
 Move the condition for leq 8x VEC to before the alignment
 process. test-memset and test-wmemset are both passing.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 .../multiarch/memset-vec-unaligned-erms.S     | 50 +++++++++++--------
 1 file changed, 28 insertions(+), 22 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index f877ac9d..909c33f6 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	VMOVU	%VEC(0), (%rdi)
 	VZEROUPPER_RETURN
 +	.p2align 4
 L(stosb_more_2x_vec):
 	cmp	__x86_rep_stosb_threshold(%rip), %RDX_LP
 	ja	L(stosb)
 +#else
 +	.p2align 4
 #endif
 L(more_2x_vec):
 -	cmpq  $(VEC_SIZE * 4), %rdx
 -	ja	L(loop_start)
 +	/* Stores to first 2x VEC before cmp as any path forward will
 +	   require it.  */
 	VMOVU	%VEC(0), (%rdi)
 	VMOVU	%VEC(0), VEC_SIZE(%rdi)
 -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 +	cmpq	$(VEC_SIZE * 4), %rdx
 +	ja	L(loop_start)
 	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 +	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 L(return):
 #if VEC_SIZE > 16
 	ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
 #endif
 L(loop_start):
 -	leaq	(VEC_SIZE * 4)(%rdi), %rcx
 -	VMOVU	%VEC(0), (%rdi)
 -	andq	$-(VEC_SIZE * 4), %rcx
 -	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 -	VMOVU	%VEC(0), VEC_SIZE(%rdi)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 2)(%rdi)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
 	VMOVU	%VEC(0), (VEC_SIZE * 3)(%rdi)
 -	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
 -	addq	%rdi, %rdx
 -	andq	$-(VEC_SIZE * 4), %rdx
 -	cmpq	%rdx, %rcx
 -	je	L(return)
 +	cmpq	$(VEC_SIZE * 8), %rdx
 +	jbe	L(loop_end)
 +	andq	$-(VEC_SIZE * 2), %rdi
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	leaq	-(VEC_SIZE * 4)(%rax, %rdx), %rcx
 +	.p2align 4
 L(loop):
 -	VMOVA	%VEC(0), (%rcx)
 -	VMOVA	%VEC(0), VEC_SIZE(%rcx)
 -	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rcx)
 -	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rcx)
 -	addq	$(VEC_SIZE * 4), %rcx
 -	cmpq	%rcx, %rdx
 -	jne	L(loop)
 +	VMOVA	%VEC(0), (%rdi)
 +	VMOVA	%VEC(0), VEC_SIZE(%rdi)
 +	VMOVA	%VEC(0), (VEC_SIZE * 2)(%rdi)
 +	VMOVA	%VEC(0), (VEC_SIZE * 3)(%rdi)
 +	subq	$-(VEC_SIZE * 4), %rdi
 +	cmpq	%rcx, %rdi
 +	jb	L(loop)
 +L(loop_end):
 +	/* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
 +	       rdx as length is also unchanged.  */
 +	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
 +	VMOVU	%VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
 +	VMOVU	%VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
 +	VMOVU	%VEC(0), -VEC_SIZE(%rax, %rdx)
 	VZEROUPPER_SHORT_RETURN
 	.p2align 4
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-48.patch
+++ b/SOURCES/glibc-RHEL-15696-48.patch
@ -1,84 +0,0 @@
 From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Sun, 23 May 2021 19:43:24 -0400
 Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
 Content-type: text/plain; charset=UTF-8
 This patch changes the condition for copy 4x VEC so that if length is
 exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
 8x VEC case.
 Results For Skylake memcpy-avx2-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 0   , 9.137   , 6.873   , New , 75.22
 128 , 7   , 0   , 12.933  , 7.732   , New , 59.79
 128 , 0   , 7   , 11.852  , 6.76    , New , 57.04
 128 , 7   , 7   , 12.587  , 6.808   , New , 54.09
 Results For Icelake memcpy-evex-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 0   , 9.963   , 5.416   , New , 54.36
 128 , 7   , 0   , 16.467  , 8.061   , New , 48.95
 128 , 0   , 7   , 14.388  , 7.644   , New , 53.13
 128 , 7   , 7   , 14.546  , 7.642   , New , 52.54
 Results For Tigerlake memcpy-evex-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 0   , 8.979   , 4.95    , New , 55.13
 128 , 7   , 0   , 14.245  , 7.122   , New , 50.0
 128 , 0   , 7   , 12.668  , 6.675   , New , 52.69
 128 , 7   , 7   , 13.042  , 6.802   , New , 52.15
 Results For Skylake memmove-avx2-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 32  , 6.181   , 5.691   , New , 92.07
 128 , 32  , 0   , 6.165   , 5.752   , New , 93.3
 128 , 0   , 7   , 13.923  , 9.37    , New , 67.3
 128 , 7   , 0   , 12.049  , 10.182  , New , 84.5
 Results For Icelake memmove-evex-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 32  , 5.479   , 4.889   , New , 89.23
 128 , 32  , 0   , 5.127   , 4.911   , New , 95.79
 128 , 0   , 7   , 18.885  , 13.547  , New , 71.73
 128 , 7   , 0   , 15.565  , 14.436  , New , 92.75
 Results For Tigerlake memmove-evex-erms
 size, al1 , al2 , Cur T   , New T   , Win , New / Cur
 128 , 0   , 32  , 5.275   , 4.815   , New , 91.28
 128 , 32  , 0   , 5.376   , 4.565   , New , 84.91
 128 , 0   , 7   , 19.426  , 14.273  , New , 73.47
 128 , 7   , 0   , 15.924  , 14.951  , New , 93.89
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 index 3e2dd6bc..572cef04 100644
 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -417,8 +417,8 @@ L(more_2x_vec):
 	cmpq	$(VEC_SIZE * 8), %rdx
 	ja	L(more_8x_vec)
 	cmpq	$(VEC_SIZE * 4), %rdx
 -	jb	L(last_4x_vec)
 -	/* Copy from 4 * VEC to 8 * VEC, inclusively. */
 +	jbe	L(last_4x_vec)
 +	/* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
@@ -437,7 +437,7 @@ L(more_2x_vec):
 	VMOVU	%VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
 	VZEROUPPER_RETURN
 L(last_4x_vec):
 -	/* Copy from 2 * VEC to 4 * VEC. */
 +	/* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	-VEC_SIZE(%rsi,%rdx), %VEC(2)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-49.patch
+++ b/SOURCES/glibc-RHEL-15696-49.patch
@ -1,55 +0,0 @@
 From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Wed, 23 Jun 2021 19:19:34 -0400
 Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
 Content-type: text/plain; charset=UTF-8
 No bug. wcsnlen checks whether it is near the end of maxlen with
 the following macro:
 	mov	%r11, %rsi;	\
 	subq	%rax, %rsi;	\
 	andq	$-64, %rax;	\
 	testq	$-64, %rsi;	\
 	je	L(strnlen_ret)
 Which works independently of s + maxlen overflowing. So the
 second overflow check is unnecessary for correctness and is
 just extra overhead in the common no-overflow case.
 test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
 all passing
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
 1 file changed, 7 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
 index 439e486a..b7657282 100644
 --- a/sysdeps/x86_64/multiarch/strlen-vec.S
 +++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -71,19 +71,12 @@ L(n_nonzero):
    suffice.  */
 	mov	%RSI_LP, %R10_LP
 	sar	$62, %R10_LP
 -	test	%R10_LP, %R10_LP
 	jnz	__wcslen_sse4_1
 	sal	$2, %RSI_LP
 # endif
 -
 /* Initialize long lived registers.  */
 -
 	add	%RDI_LP, %RSI_LP
 -# ifdef AS_WCSLEN
 -/* Check for overflow again from s + maxlen * sizeof(wchar_t).  */
 -	jbe	__wcslen_sse4_1
 -# endif
 	mov	%RSI_LP, %R10_LP
 	and	$-64, %R10_LP
 	mov	%RSI_LP, %R11_LP
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-5.patch
+++ b/SOURCES/glibc-RHEL-15696-5.patch
@ -1,290 +0,0 @@
 From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:32:24 -0800
 Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
 [BZ# 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes memset/wmemset for x32.  Tested on x86-64 and x32.  On
 x86-64, libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
 	RDX_LP for length.  Clear the upper 32 bits of RDX register.
 	* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
 	* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
 	* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
 ---
 .../multiarch/memset-avx512-no-vzeroupper.S   |  6 +-
 .../multiarch/memset-vec-unaligned-erms.S     | 34 +++++----
 sysdeps/x86_64/x32/Makefile                   |  4 +-
 sysdeps/x86_64/x32/tst-size_t-memset.c        | 73 +++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wmemset.c       | 20 +++++
 5 files changed, 121 insertions(+), 16 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
 index 689cc119..99e25519 100644
 --- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
 +++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -29,12 +29,16 @@
 	.section .text.avx512,"ax",@progbits
 #if defined PIC
 ENTRY (MEMSET_CHK)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (MEMSET_CHK)
 #endif
 ENTRY (MEMSET)
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 +# endif
 	vpxor	%xmm0, %xmm0, %xmm0
 	vmovd	%esi, %xmm1
 	lea	(%rdi, %rdx), %rsi
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index 270a1d49..9a0fd818 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -65,8 +65,8 @@
 	.section SECTION(.text),"ax",@progbits
 #if VEC_SIZE == 16 && IS_IN (libc)
 ENTRY (__bzero)
 -	movq	%rdi, %rax /* Set return value.  */
 -	movq	%rsi, %rdx /* Set n.  */
 +	mov	%RDI_LP, %RAX_LP /* Set return value.  */
 +	mov	%RSI_LP, %RDX_LP /* Set n.  */
 	pxor	%xmm0, %xmm0
 	jmp	L(entry_from_bzero)
 END (__bzero)
@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
 #if IS_IN (libc)
 # if defined SHARED
 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 # endif
 ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 -	shlq	$2, %rdx
 +	shl	$2, %RDX_LP
 	WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 	jmp	L(entry_from_bzero)
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
 #endif
 ENTRY (MEMSET_SYMBOL (__memset, unaligned))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 +# endif
 L(entry_from_bzero):
 	cmpq	$VEC_SIZE, %rdx
 	jb	L(less_vec)
@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
 # if VEC_SIZE == 16
 ENTRY (__memset_chk_erms)
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END (__memset_chk_erms)
 /* Only used to measure performance of REP STOSB.  */
 ENTRY (__memset_erms)
 	/* Skip zero length.  */
 -	testq	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	jnz	 L(stosb)
 	movq	%rdi, %rax
 	ret
@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
 L(stosb):
 	/* Issue vzeroupper before rep stosb.  */
 	VZEROUPPER
 -	movq	%rdx, %rcx
 +	mov	%RDX_LP, %RCX_LP
 	movzbl	%sil, %eax
 -	movq	%rdi, %rdx
 +	mov	%RDI_LP, %RDX_LP
 	rep stosb
 -	movq	%rdx, %rax
 +	mov	%RDX_LP, %RAX_LP
 	ret
 # if VEC_SIZE == 16
 END (__memset_erms)
@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 -	cmpq	%rdx, %rcx
 +	cmp	%RDX_LP, %RCX_LP
 	jb	HIDDEN_JUMPTARGET (__chk_fail)
 END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
 # endif
 ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
 	MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
 -	cmpq	$VEC_SIZE, %rdx
 +# ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	mov	%edx, %edx
 +# endif
 +	cmp	$VEC_SIZE, %RDX_LP
 	jb	L(less_vec)
 -	cmpq	$(VEC_SIZE * 2), %rdx
 +	cmp	$(VEC_SIZE * 2), %RDX_LP
 	ja	L(stosb_more_2x_vec)
 	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
 	VMOVU	%VEC(0), -VEC_SIZE(%rdi,%rdx)
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index e99dbd7c..98bd9ae9 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -7,9 +7,9 @@ endif
 ifeq ($(subdir),string)
 tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 -	 tst-size_t-memrchr
 +	 tst-size_t-memrchr tst-size_t-memset
 endif
 ifeq ($(subdir),wcsmbs)
 -tests += tst-size_t-wmemchr tst-size_t-wmemcmp
 +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
 endif
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
 new file mode 100644
 index 00000000..2c367af6
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
@@ -0,0 +1,73 @@
 +/* Test memset with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#ifdef WIDE
 +# define TEST_NAME "wmemset"
 +#else
 +# define TEST_NAME "memset"
 +#endif /* WIDE */
 +
 +#include "test-size_t.h"
 +
 +#ifdef WIDE
 +# include <wchar.h>
 +# define MEMSET wmemset
 +# define CHAR wchar_t
 +#else
 +# define MEMSET memset
 +# define CHAR char
 +#endif /* WIDE */
 +
 +IMPL (MEMSET, 1)
 +
 +typedef CHAR *(*proto_t) (CHAR *, int, size_t);
 +
 +static void *
 +__attribute__ ((noinline, noclone))
 +do_memset (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, (uintptr_t) b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  CHAR ch = 0x23;
 +  parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
 +  parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      c.fn = impl->fn;
 +      CHAR *p = (CHAR *) do_memset (src, c);
 +      size_t i;
 +      for (i = 0; i < src.len; i++)
 +	if (p[i] != ch)
 +	  {
 +	    error (0, 0, "Wrong result in function %s", impl->name);
 +	    ret = 1;
 +	  }
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
 new file mode 100644
 index 00000000..955eb488
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
@@ -0,0 +1,20 @@
 +/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define WIDE 1
 +#include "tst-size_t-memset.c"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-50.patch
+++ b/SOURCES/glibc-RHEL-15696-50.patch
@ -1,43 +0,0 @@
 From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
 Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>  2021-05-23 21:43:10
 Committer: H.J. Lu <hjl.tools@gmail.com>  2021-06-27 10:56:57
 Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
 Child:  1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
 Branches: master, remotes/origin/master and many more (41)
 Follows: glibc-2.33.9000
 Precedes: glibc-2.34
    math: redirect roundeven function
    This patch redirects the roundeven function for further changes.
    Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 Conflicts:
 	*
 	(rewritten for older branch)
 diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
 index 7bbbb2dc..8728d0f2 100644
 --- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
 +++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-double.h>
@@ -67,5 +68,6 @@ __roundeven (double x)
   INSERT_WORDS64 (x, ix);
   return x;
 }
 -hidden_def (__roundeven)
 +#ifndef __roundeven
 libm_alias_double (__roundeven, roundeven)
 +#endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-51.patch
+++ b/SOURCES/glibc-RHEL-15696-51.patch
@ -1,118 +0,0 @@
 From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
 From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
 Date: Mon, 24 May 2021 09:43:10 +0800
 Subject: [PATCH] math: redirect roundeven function
 Content-type: text/plain; charset=UTF-8
 This patch redirects the roundeven function for further changes.
 Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 include/math.h                             | 3 ++-
 sysdeps/ieee754/dbl-64/s_roundeven.c       | 4 +++-
 sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
 sysdeps/ieee754/flt-32/s_roundevenf.c      | 3 +++
 sysdeps/ieee754/ldbl-128/s_roundevenl.c    | 1 +
 sysdeps/ieee754/ldbl-96/s_roundevenl.c     | 1 +
 6 files changed, 11 insertions(+), 2 deletions(-)
 Conflicts:
 	include/math.h
 	(missing MATH_REDIRECT macros)
 diff --git a/include/math.h b/include/math.h
 index e21d34b8..1f9f9a54 100644
 --- a/include/math.h
 +++ b/include/math.h
@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
 libm_hidden_proto (__issignalingf)
 libm_hidden_proto (__exp)
 libm_hidden_proto (__expf)
 -libm_hidden_proto (__roundeven)
 # ifndef __NO_LONG_DOUBLE_MATH
 libm_hidden_proto (__fpclassifyl)
@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
 # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
 #  ifndef NO_MATH_REDIRECT
 +float (roundevenf) (float) asm ("__roundevenf");
 +double (roundeven) (double) asm ("__roundeven");
 /* Declare sqrt for use within GLIBC.  Compilers typically inline sqrt as a
    single instruction.  Use an asm to avoid use of PLTs if it doesn't.  */
 float (sqrtf) (float) asm ("__ieee754_sqrtf");
 diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
 index 1438e81d..61962184 100644
 --- a/sysdeps/ieee754/dbl-64/s_roundeven.c
 +++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-double.h>
@@ -101,5 +102,6 @@ __roundeven (double x)
   INSERT_WORDS (x, hx, lx);
   return x;
 }
 -hidden_def (__roundeven)
 +#ifndef __roundeven
 libm_alias_double (__roundeven, roundeven)
 +#endif
 diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
 index 5a9b3f39..e0faf727 100644
 --- a/sysdeps/ieee754/float128/s_roundevenf128.c
 +++ b/sysdeps/ieee754/float128/s_roundevenf128.c
@@ -1,2 +1,3 @@
 +#define NO_MATH_REDIRECT
 #include <float128_private.h>
 #include "../ldbl-128/s_roundevenl.c"
 diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
 index 90f991d5..a661875e 100644
 --- a/sysdeps/ieee754/flt-32/s_roundevenf.c
 +++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-float.h>
@@ -67,4 +68,6 @@ __roundevenf (float x)
   SET_FLOAT_WORD (x, ix);
   return x;
 }
 +#ifndef __roundevenf
 libm_alias_float (__roundeven, roundeven)
 +#endif
 diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
 index 5fc59af4..b9375b6c 100644
 --- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
 +++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-ldouble.h>
 diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
 index be2e4fa4..65031ab7 100644
 --- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
 +++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
@@ -17,6 +17,7 @@
    License along with the GNU C Library; if not, see
    <http://www.gnu.org/licenses/>.  */
 +#define NO_MATH_REDIRECT
 #include <math.h>
 #include <math_private.h>
 #include <libm-alias-ldouble.h>
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-52.patch
+++ b/SOURCES/glibc-RHEL-15696-52.patch
@ -1,242 +0,0 @@
 From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
 From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
 Date: Mon, 24 May 2021 09:43:11 +0800
 Subject: [PATCH] x86_64: roundeven with sse4.1 support
 Content-type: text/plain; charset=UTF-8
 This patch adds support for the sse4.1 hardware floating point
 roundeven.
 Here is some benchmark results on my systems:
 =AMD Ryzen 9 3900X 12-Core Processor=
 * benchmark result before this commit
 |            |    roundeven |   roundevenf |
 |------------|--------------|--------------|
 | duration   |  3.75587e+09 |  3.75114e+09 |
 | iterations |  3.93053e+08 |  4.35402e+08 |
 | max        | 52.592       | 58.71        |
 | min        |  7.98        |  7.22        |
 | mean       |  9.55563     |  8.61535     |
 * benchmark result after this commit
 |            |     roundeven |   roundevenf |
 |------------|---------------|--------------|
 | duration   |   3.73815e+09 |  3.73738e+09 |
 | iterations |   5.82692e+08 |  5.91498e+08 |
 | max        |  56.468       | 51.642       |
 | min        |   6.27        |  6.156       |
 | mean       |   6.41532     |  6.3185      |
 =Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
 * benchmark result before this commit
 |            |    roundeven |   roundevenf |
 |------------|--------------|--------------|
 | duration   |  2.18208e+09 |  2.18258e+09 |
 | iterations |  2.39932e+08 |  2.46924e+08 |
 | max        | 96.378       | 98.035       |
 | min        |  6.776       |  5.94        |
 | mean       |  9.09456     |  8.83907     |
 * benchmark result after this commit
 |            |    roundeven |   roundevenf |
 |------------|--------------|--------------|
 | duration   |  2.17415e+09 |  2.17005e+09 |
 | iterations |  3.56193e+08 |  4.09824e+08 |
 | max        | 51.693       | 97.192       |
 | min        |  5.926       |  5.093       |
 | mean       |  6.10385     |  5.29507     |
 Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/fpu/multiarch/Makefile         |  5 +--
 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c  |  2 ++
 .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
 sysdeps/x86_64/fpu/multiarch/s_roundeven.c    | 31 +++++++++++++++++++
 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c |  3 ++
 .../fpu/multiarch/s_roundevenf-sse4_1.S       | 24 ++++++++++++++
 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c   | 31 +++++++++++++++++++
 7 files changed, 118 insertions(+), 2 deletions(-)
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
 diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
 index 9f387248..6ddd1c01 100644
 --- a/sysdeps/x86_64/fpu/multiarch/Makefile
 +++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,11 +1,12 @@
 ifeq ($(subdir),math)
 libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
 			s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
 -			s_trunc-c s_truncf-c
 +			s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
 libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
 			s_floorf-sse4_1 s_nearbyint-sse4_1 \
 -			s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
 +			s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
 +			s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
 			s_trunc-sse4_1 s_truncf-sse4_1
 libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
 new file mode 100644
 index 00000000..c7be43cb
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
@@ -0,0 +1,2 @@
 +#define __roundeven __roundeven_c
 +#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
 new file mode 100644
 index 00000000..6ae8f6b1
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
@@ -0,0 +1,24 @@
 +/* Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +	.section .text.sse4.1,"ax",@progbits
 +ENTRY(__roundeven_sse41)
 +	roundsd	$8, %xmm0, %xmm0
 +	ret
 +END(__roundeven_sse41)
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
 new file mode 100644
 index 00000000..d92eda65
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
@@ -0,0 +1,31 @@
 +/* Multiple versions of __roundeven.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <libm-alias-double.h>
 +
 +#define roundeven __redirect_roundeven
 +#define __roundeven __redirect___roundeven
 +#include <math.h>
 +#undef roundeven
 +#undef __roundeven
 +
 +#define SYMBOL_NAME roundeven
 +#include "ifunc-sse4_1.h"
 +
 +libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
 +libm_alias_double (__roundeven, roundeven)
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
 new file mode 100644
 index 00000000..72a6e7d1
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
@@ -0,0 +1,3 @@
 +#undef __roundevenf
 +#define __roundevenf __roundevenf_c
 +#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
 new file mode 100644
 index 00000000..a76e1080
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
@@ -0,0 +1,24 @@
 +/* Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +	.section .text.sse4.1,"ax",@progbits
 +ENTRY(__roundevenf_sse41)
 +	roundss	$8, %xmm0, %xmm0
 +	ret
 +END(__roundevenf_sse41)
 diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
 new file mode 100644
 index 00000000..2ee196e6
 --- /dev/null
 +++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
@@ -0,0 +1,31 @@
 +/* Multiple versions of __roundevenf.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <libm-alias-float.h>
 +
 +#define roundevenf __redirect_roundevenf
 +#define __roundevenf __redirect___roundevenf
 +#include <math.h>
 +#undef roundevenf
 +#undef __roundevenf
 +
 +#define SYMBOL_NAME roundevenf
 +#include "ifunc-sse4_1.h"
 +
 +libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
 +libm_alias_float (__roundeven, roundeven)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-53.patch
+++ b/SOURCES/glibc-RHEL-15696-53.patch
@ -1,41 +0,0 @@
 From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Sun, 9 Jan 2022 16:02:28 -0600
 Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
 Content-type: text/plain; charset=UTF-8
 Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
 __wcscmp_evex. For x86_64 this covers the entire address range so any
 length larger could not possibly be used to bound `s1` or `s2`.
 test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
 Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
 1 file changed, 10 insertions(+)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 index 459eeed0..d5aa6daa 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -97,6 +97,16 @@ ENTRY (STRCMP)
 	je	L(char0)
 	jb	L(zero)
 #  ifdef USE_AS_WCSCMP
 +#  ifndef __ILP32__
 +	movq	%rdx, %rcx
 +	/* Check if length could overflow when multiplied by
 +	   sizeof(wchar_t). Checking top 8 bits will cover all potential
 +	   overflow cases as well as redirect cases where it's impossible for
 +	   length to bound a valid memory region. In these cases just use
 +	   'wcscmp'.  */
 +	shrq	$56, %rcx
 +	jnz	__wcscmp_evex
 +#  endif
 	/* Convert units: from wide to byte char.  */
 	shl	$2, %RDX_LP
 #  endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-58.patch
+++ b/SOURCES/glibc-RHEL-15696-58.patch
@ -1,45 +0,0 @@
 From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Sat, 23 Oct 2021 01:26:47 -0400
 Subject: [PATCH] x86: Replace sse2 instructions with avx in
 memcmp-evex-movbe.S
 Content-type: text/plain; charset=UTF-8
 This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
 It could potentially be dangerous to use SSE2 if this function is ever
 called without using 'vzeroupper' beforehand. While compilers appear
 to use 'vzeroupper' before function calls if AVX2 has been used, using
 SSE2 here is more brittle. Since it is not absolutely necessary it
 should be avoided.
 It costs 2-extra bytes but the extra bytes should only eat into
 alignment padding.
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 index 2761b54f..640f6757 100644
 --- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
 +++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
 	/* From 16 to 31 bytes.  No branch when size == 16.  */
 	/* Use movups to save code size.  */
 -	movups	(%rsi), %xmm2
 +	vmovdqu	(%rsi), %xmm2
 	VPCMP	$4, (%rdi), %xmm2, %k1
 	kmovd	%k1, %eax
 	testl	%eax, %eax
 	jnz	L(return_vec_0_lv)
 	/* Use overlapping loads to avoid branches.  */
 -	movups	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 +	vmovdqu	-16(%rsi, %rdx, CHAR_SIZE), %xmm2
 	VPCMP	$4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
 	addl	$(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
 	kmovd	%k1, %eax
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-6.patch
+++ b/SOURCES/glibc-RHEL-15696-6.patch
@ -1,300 +0,0 @@
 From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:33:52 -0800
 Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
 [BZ# 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes the strncmp family for x32.  Tested on x86-64 and x32.
 On x86-64, libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
 	* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
 	* sysdeps/x86_64/strcmp.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
 	tst-size_t-strncmp and tst-size_t-wcsncmp.
 	* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
 	* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
 	* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
 ---
 sysdeps/x86_64/multiarch/strcmp-avx2.S      |  6 +-
 sysdeps/x86_64/multiarch/strcmp-sse42.S     |  6 +-
 sysdeps/x86_64/strcmp.S                     |  6 +-
 sysdeps/x86_64/x32/Makefile                 |  6 +-
 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-strncmp.c     | 78 +++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c     | 20 ++++++
 7 files changed, 170 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 index 327e3d87..156c1949 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -79,15 +79,15 @@
 ENTRY (STRCMP)
 # ifdef USE_AS_STRNCMP
 	/* Check for simple cases (0 or 1) in offset.  */
 -	cmp	$1, %rdx
 +	cmp	$1, %RDX_LP
 	je	L(char0)
 	jb	L(zero)
 #  ifdef USE_AS_WCSCMP
 	/* Convert units: from wide to byte char.  */
 -	shl	$2, %rdx
 +	shl	$2, %RDX_LP
 #  endif
 	/* Register %r11 tracks the maximum offset.  */
 -	movq	%rdx, %r11
 +	mov	%RDX_LP, %R11_LP
 # endif
 	movl	%edi, %eax
 	xorl	%edx, %edx
 diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
 index d3c07bd2..a1ebea46 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -156,11 +156,11 @@ STRCMP_SSE42:
 #endif
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 -	test	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	je	LABEL(strcmp_exitz)
 -	cmp	$1, %rdx
 +	cmp	$1, %RDX_LP
 	je	LABEL(Byte0)
 -	mov	%rdx, %r11
 +	mov	%RDX_LP, %R11_LP
 #endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
 index e16945b9..f47c8ad4 100644
 --- a/sysdeps/x86_64/strcmp.S
 +++ b/sysdeps/x86_64/strcmp.S
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
  * This implementation uses SSE to compare up to 16 bytes at a time.
  */
 #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
 -	test	%rdx, %rdx
 +	test	%RDX_LP, %RDX_LP
 	je	LABEL(strcmp_exitz)
 -	cmp	$1, %rdx
 +	cmp	$1, %RDX_LP
 	je	LABEL(Byte0)
 -	mov	%rdx, %r11
 +	mov	%RDX_LP, %R11_LP
 #endif
 	mov	%esi, %ecx
 	mov	%edi, %eax
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index 98bd9ae9..db302839 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -7,9 +7,11 @@ endif
 ifeq ($(subdir),string)
 tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 -	 tst-size_t-memrchr tst-size_t-memset
 +	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
 +	 tst-size_t-strncmp
 endif
 ifeq ($(subdir),wcsmbs)
 -tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
 +tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
 +	 tst-size_t-wcsncmp
 endif
 diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
 new file mode 100644
 index 00000000..86233593
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
@@ -0,0 +1,59 @@
 +/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_NAME "strncasecmp"
 +#include "test-size_t.h"
 +
 +IMPL (strncasecmp, 1)
 +
 +typedef int (*proto_t) (const char *, const char *, size_t);
 +
 +static int
 +__attribute__ ((noinline, noclone))
 +do_strncasecmp (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t dest = { { page_size }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  strncpy ((char *) buf1, (const char *) buf2, page_size);
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      int res = do_strncasecmp (dest, src);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i != 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
 new file mode 100644
 index 00000000..54e6bd83
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
@@ -0,0 +1,78 @@
 +/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#ifdef WIDE
 +# define TEST_NAME "wcsncmp"
 +#else
 +# define TEST_NAME "strncmp"
 +#endif
 +
 +#include "test-size_t.h"
 +
 +#ifdef WIDE
 +# include <wchar.h>
 +
 +# define STRNCMP wcsncmp
 +# define STRNCPY wcsncpy
 +# define CHAR wchar_t
 +#else
 +# define STRNCMP strncmp
 +# define STRNCPY strncpy
 +# define CHAR char
 +#endif
 +
 +IMPL (STRNCMP, 1)
 +
 +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 +
 +
 +static int
 +__attribute__ ((noinline, noclone))
 +do_strncmp (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  size_t size = page_size / sizeof (CHAR);
 +  parameter_t dest = { { size }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      int res = do_strncmp (dest, src);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i != 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
 new file mode 100644
 index 00000000..4829647c
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
@@ -0,0 +1,20 @@
 +/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define WIDE 1
 +#include "tst-size_t-strncmp.c"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-61.patch
+++ b/SOURCES/glibc-RHEL-15696-61.patch
@ -1,56 +0,0 @@
 From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 30 Apr 2021 05:58:59 -0700
 Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
 Content-type: text/plain; charset=UTF-8
 The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
 that REP MOVSB became faster after 2112 bytes:
                                      Vector Move       REP MOVSB
 length=2112, align1=0, align2=0:        24.20             24.40
 length=2112, align1=1, align2=0:        26.07             23.13
 length=2112, align1=0, align2=1:        27.18             28.13
 length=2112, align1=1, align2=1:        26.23             25.16
 length=2176, align1=0, align2=0:        23.18             22.52
 length=2176, align1=2, align2=0:        25.45             22.52
 length=2176, align1=0, align2=2:        27.14             27.82
 length=2176, align1=2, align2=2:        22.73             25.56
 length=2240, align1=0, align2=0:        24.62             24.25
 length=2240, align1=3, align2=0:        29.77             27.15
 length=2240, align1=0, align2=3:        35.55             29.93
 length=2240, align1=3, align2=3:        34.49             25.15
 length=2304, align1=0, align2=0:        34.75             26.64
 length=2304, align1=4, align2=0:        32.09             22.63
 length=2304, align1=0, align2=4:        28.43             31.24
 Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
 fast short REP MOVSB (FSRM).
 	* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
 	rep_movsb_threshold to 2112 on processors with fast short REP
 	MOVSB (FSRM).
 ---
 sysdeps/x86/cacheinfo.h | 6 ++++++
 1 file changed, 6 insertions(+)
 diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
 index f72f634a..cc3941d3 100644
 --- a/sysdeps/x86/cacheinfo.h
 +++ b/sysdeps/x86/cacheinfo.h
@@ -430,6 +430,12 @@ init_cacheinfo (void)
       rep_movsb_threshold = 2048 * (16 / 16);
       minimum_rep_movsb_threshold = 16 * 8;
     }
 +
 +  /* NB: The default REP MOVSB threshold is 2112 on processors with fast
 +     short REP MOVSB (FSRM).  */
 +  if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
 +    rep_movsb_threshold = 2112;
 +
   if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
     __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
   else
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-66.patch
+++ b/SOURCES/glibc-RHEL-15696-66.patch
@ -1,51 +0,0 @@
 From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Tue, 2 Nov 2021 18:33:07 -0700
 Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
 Content-type: text/plain; charset=UTF-8
 CAS instruction is expensive.  From the x86 CPU's point of view, getting
 a cache line for writing is more expensive than reading.  See Appendix
 A.2 Spinlock in:
 https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
 The full compare and swap will grab the cache line exclusive and cause
 excessive cache line bouncing.
 Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
 loop if compare may fail to reduce cache line bouncing on contended locks.
 Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
 ---
 nptl/pthread_mutex_lock.c | 7 +++++++
 1 file changed, 7 insertions(+)
 diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 index 60ada70d..eb4d8baa 100644
 --- a/nptl/pthread_mutex_lock.c
 +++ b/nptl/pthread_mutex_lock.c
@@ -56,6 +56,11 @@
 #define FORCE_ELISION(m, s)
 #endif
 +#ifndef LLL_MUTEX_READ_LOCK
 +# define LLL_MUTEX_READ_LOCK(mutex) \
 +  atomic_load_relaxed (&(mutex)->__data.__lock)
 +#endif
 +
 static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
      __attribute_noinline__;
@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 		  break;
 		}
 	      atomic_spin_nop ();
 +	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
 +		continue;
 	    }
 	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-67.patch
+++ b/SOURCES/glibc-RHEL-15696-67.patch
@ -1,71 +0,0 @@
 From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 12 Nov 2021 11:47:42 -0800
 Subject: [PATCH] Move assignment out of the CAS condition
 Content-type: text/plain; charset=UTF-8
 Update
 commit 49302b8fdf9103b6fc0a398678668a22fa19574c
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Thu Nov 11 06:54:01 2021 -0800
    Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
    Replace boolean CAS with value CAS to avoid the extra load.
 and
 commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Thu Nov 11 06:31:51 2021 -0800
    Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
    Replace boolean CAS with value CAS to avoid the extra load.
 by moving assignment out of the CAS condition.
 ---
 nptl/pthread_mutex_lock.c      | 7 +++----
 nptl/pthread_mutex_timedlock.c | 7 +++----
 2 files changed, 6 insertions(+), 8 deletions(-)
 diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 index eb4d8baa..a633d95e 100644
 --- a/nptl/pthread_mutex_lock.c
 +++ b/nptl/pthread_mutex_lock.c
@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
 	     meantime.  */
 	  if ((oldval & FUTEX_WAITERS) == 0)
 	    {
 -	      int val;
 -	      if ((val = atomic_compare_and_exchange_val_acq
 -		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 -		    oldval)) != oldval)
 +	      int val = atomic_compare_and_exchange_val_acq
 +		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
 +	      if (val != oldval)
 		{
 		  oldval = val;
 		  continue;
 diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
 index c4627ef6..a76c30b7 100644
 --- a/nptl/pthread_mutex_timedlock.c
 +++ b/nptl/pthread_mutex_timedlock.c
@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
 	     meantime.  */
 	  if ((oldval & FUTEX_WAITERS) == 0)
 	    {
 -	      int val;
 -	      if ((val = atomic_compare_and_exchange_val_acq
 -		   (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
 -		    oldval)) != oldval)
 +	      int val = atomic_compare_and_exchange_val_acq
 +		(&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
 +	      if (val != oldval)
 		{
 		  oldval = val;
 		  continue;
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-68.patch
+++ b/SOURCES/glibc-RHEL-15696-68.patch
@ -1,60 +0,0 @@
 From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Fri, 3 Dec 2021 15:29:25 -0800
 Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
 Content-type: text/plain; charset=UTF-8
 Must use notl %edi here as lower bits are for CHAR comparisons
 potentially out of range thus can be 0 without indicating mismatch.
 This fixes BZ #28646.
 Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)
 Conflicts:
 	string/test-strcmp.c
 	(new check omitted)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 index 82f12ac8..6f5c4bf9 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
 	   in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10).  */
 	VPCMP	$0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
 	kmovd	%k3, %edi
 +    /* Must use notl %edi here as lower bits are for CHAR
 +	   comparisons potentially out of range thus can be 0 without
 +	   indicating mismatch.  */
 +	notl	%edi
 # ifdef USE_AS_WCSCMP
 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 -	notl	%edi
 	andl	$0xff, %edi
 -# else
 -	incl	%edi
 # endif
 # ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
 	   in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10).  */
 	VPCMP	$0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
 	kmovd	%k3, %edi
 +	/* Must use notl %edi here as lower bits are for CHAR
 +	   comparisons potentially out of range thus can be 0 without
 +	   indicating mismatch.  */
 +	notl	%edi
 # ifdef USE_AS_WCSCMP
 	/* Don't use subl since it is the upper 8 bits of EDI below.  */
 -	notl	%edi
 	andl	$0xff, %edi
 -# else
 -	incl	%edi
 # endif
 # ifdef USE_AS_WCSCMP
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-7.patch
+++ b/SOURCES/glibc-RHEL-15696-7.patch
@ -1,153 +0,0 @@
 From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:35:18 -0800
 Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes strncpy for x32.  Tested on x86-64 and x32.  On x86-64,
 libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
 	* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
 	* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
 	* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
 ---
 .../x86_64/multiarch/strcpy-sse2-unaligned.S  |  4 +-
 sysdeps/x86_64/multiarch/strcpy-ssse3.S       |  6 +-
 sysdeps/x86_64/x32/Makefile                   |  2 +-
 sysdeps/x86_64/x32/tst-size_t-strncpy.c       | 58 +++++++++++++++++++
 4 files changed, 64 insertions(+), 6 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
 Conflicts:
 	ChangeLog
 	(removed)
 	sysdeps/x86_64/multiarch/strcpy-avx2.S
 	(skipped, only needed for x32 arch)
 diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
 index 72bf7e85..50aca22d 100644
 --- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
 +++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -40,8 +40,8 @@
 .text
 ENTRY (STRCPY)
 #  ifdef USE_AS_STRNCPY
 -	mov	%rdx, %r8
 -	test	%r8, %r8
 +	mov	%RDX_LP, %R8_LP
 +	test	%R8_LP, %R8_LP
 	jz	L(ExitZero)
 #  endif
 	mov	%rsi, %rcx
 diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
 index 9858d0c4..0a62814a 100644
 --- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
 +++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -31,13 +31,13 @@ ENTRY (STRCPY)
 	mov	%rsi, %rcx
 #  ifdef USE_AS_STRNCPY
 -	mov	%rdx, %r8
 +	mov	%RDX_LP, %R8_LP
 #  endif
 	mov	%rdi, %rdx
 #  ifdef USE_AS_STRNCPY
 -	test	%r8, %r8
 +	test	%R8_LP, %R8_LP
 	jz	L(Exit0)
 -	cmp	$8, %r8
 +	cmp	$8, %R8_LP
 	jbe	L(StrncpyExit8Bytes)
 # endif
 	cmpb	$0, (%rcx)
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index db302839..2a9e20a9 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -8,7 +8,7 @@ endif
 ifeq ($(subdir),string)
 tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
 -	 tst-size_t-strncmp
 +	 tst-size_t-strncmp tst-size_t-strncpy
 endif
 ifeq ($(subdir),wcsmbs)
 diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
 new file mode 100644
 index 00000000..4dec71e6
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
@@ -0,0 +1,58 @@
 +/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_NAME "strncpy"
 +#include "test-size_t.h"
 +
 +IMPL (strncpy, 1)
 +
 +typedef char *(*proto_t) (char *, const char*, size_t);
 +
 +static void *
 +__attribute__ ((noinline, noclone))
 +do_strncpy (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t dest = { { page_size }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      do_strncpy (dest, src);
 +      int res = strncmp (dest.p, src.p, dest.len);
 +      if (res)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i != 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-71.patch
+++ b/SOURCES/glibc-RHEL-15696-71.patch
@ -1,43 +0,0 @@
 From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
 From: Jangwoong Kim <6812skiii@gmail.com>
 Date: Tue, 14 Dec 2021 21:30:51 +0900
 Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
 Content-type: text/plain; charset=UTF-8
 The commit:
 "Add LLL_MUTEX_READ_LOCK [BZ #28537]"
 SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
 introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
 if atomic load fails. But, "continue" inside of do-while loop
 does not skip the evaluation of escape expression, thus CAS
 is not skipped.
 Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
 LLL_MUTEX_READ_LOCK fails.
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 nptl/pthread_mutex_lock.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)
 diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
 index a633d95e..d96a9933 100644
 --- a/nptl/pthread_mutex_lock.c
 +++ b/nptl/pthread_mutex_lock.c
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
 		  break;
 		}
 	      atomic_spin_nop ();
 -	      if (LLL_MUTEX_READ_LOCK (mutex) != 0)
 -		continue;
 	    }
 -	  while (LLL_MUTEX_TRYLOCK (mutex) != 0);
 +	  while (LLL_MUTEX_READ_LOCK (mutex) != 0
 +		 || LLL_MUTEX_TRYLOCK (mutex) != 0);
 	  mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
 	}
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-73.patch
+++ b/SOURCES/glibc-RHEL-15696-73.patch
@ -1,37 +0,0 @@
 From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Fri, 18 Feb 2022 17:00:25 -0600
 Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
 Content-type: text/plain; charset=UTF-8
 Previously TEST_NAME was passing a function pointer. This didn't fail
 because of the -Wno-error flag (to allow for overflow sizes passed
 to strncmp/wcsncmp)
 Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
 ---
 sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)
 diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
 index 4e9f094f..aef9866c 100644
 --- a/sysdeps/x86/tst-strncmp-rtm.c
 +++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -23,12 +23,12 @@
 # define CHAR wchar_t
 # define MEMSET wmemset
 # define STRNCMP wcsncmp
 -# define TEST_NAME wcsncmp
 +# define TEST_NAME "wcsncmp"
 #else /* !WIDE */
 # define CHAR char
 # define MEMSET memset
 # define STRNCMP strncmp
 -# define TEST_NAME strncmp
 +# define TEST_NAME "strncmp"
 #endif /* !WIDE */
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-76.patch
+++ b/SOURCES/glibc-RHEL-15696-76.patch
@ -1,33 +0,0 @@
 From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 4 Feb 2022 11:09:10 -0800
 Subject: [PATCH] x86-64: Fix strcmp-avx2.S
 Content-type: text/plain; charset=UTF-8
 Change "movl %edx, %rdx" to "movl %edx, %edx" in:
 commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Mon Jan 10 15:35:38 2022 -0600
    x86: Optimize strcmp-avx2.S
 ---
 sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
 index 554ffe4c..04675aa4 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 -	movl	%edx, %rdx
 +	movl	%edx, %edx
 #  endif
 	cmp	$1, %RDX_LP
 	/* Signed comparison intentional. We use this branch to also
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-77.patch
+++ b/SOURCES/glibc-RHEL-15696-77.patch
@ -1,33 +0,0 @@
 From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Fri, 4 Feb 2022 11:11:08 -0800
 Subject: [PATCH] x86-64: Fix strcmp-evex.S
 Content-type: text/plain; charset=UTF-8
 Change "movl %edx, %rdx" to "movl %edx, %edx" in:
 commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Mon Jan 10 15:35:39 2022 -0600
    x86: Optimize strcmp-evex.S
 ---
 sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
 index 99d8409a..ed56af8e 100644
 --- a/sysdeps/x86_64/multiarch/strcmp-evex.S
 +++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
 # ifdef USE_AS_STRNCMP
 #  ifdef __ILP32__
 	/* Clear the upper 32 bits.  */
 -	movl	%edx, %rdx
 +	movl	%edx, %edx
 #  endif
 	cmp	$1, %RDX_LP
 	/* Signed comparison intentional. We use this branch to also
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-79.patch
+++ b/SOURCES/glibc-RHEL-15696-79.patch
@ -1,40 +0,0 @@
 From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Mon, 7 Feb 2022 00:32:23 -0600
 Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
 Only)
 Content-type: text/plain; charset=UTF-8
 commit b62ace2740a106222e124cc86956448fa07abf4d
 Author: Noah Goldstein <goldstein.w.n@gmail.com>
 Date:   Sun Feb 6 00:54:18 2022 -0600
    x86: Improve vec generation in memset-vec-unaligned-erms.S
 Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
 instruction and memset.S is restricted to only SSE2 instructions.
 ---
 sysdeps/x86_64/memset.S | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)
 diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
 index 27debd2b..4cb4aa71 100644
 --- a/sysdeps/x86_64/memset.S
 +++ b/sysdeps/x86_64/memset.S
@@ -30,9 +30,10 @@
 # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
 -  pxor %xmm1, %xmm1; \
 -  pshufb %xmm1, %xmm0; \
 -  movq r, %rax
 +  movq r, %rax; \
 +  punpcklbw %xmm0, %xmm0; \
 +  punpcklwd %xmm0, %xmm0; \
 +  pshufd $0, %xmm0, %xmm0
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-8.patch
+++ b/SOURCES/glibc-RHEL-15696-8.patch
@ -1,218 +0,0 @@
 From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 21 Jan 2019 11:36:36 -0800
 Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
 [BZ# 24097]
 Content-type: text/plain; charset=UTF-8
 On x32, the size_t parameter may be passed in the lower 32 bits of a
 64-bit register with the non-zero upper 32 bits.  The string/memory
 functions written in assembly can only use the lower 32 bits of a
 64-bit register as length or must clear the upper 32 bits before using
 the full 64-bit register for length.
 This patch fixes strnlen/wcsnlen for x32.  Tested on x86-64 and x32.  On
 x86-64, libc.so is the same with and without the fix.
 	[BZ# 24097]
 	CVE-2019-6488
 	* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
 	Clear the upper 32 bits of RSI register.
 	* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
 	and tst-size_t-wcsnlen.
 	* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
 	* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
 ---
 sysdeps/x86_64/multiarch/strlen-avx2.S  |  9 ++--
 sysdeps/x86_64/strlen.S                 | 12 ++---
 sysdeps/x86_64/x32/Makefile             |  4 +-
 sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
 5 files changed, 106 insertions(+), 11 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
 Conflicts:
 	ChangeLog
 	(removed)
 diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
 index fb2418cd..645e0446 100644
 --- a/sysdeps/x86_64/multiarch/strlen-avx2.S
 +++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -42,12 +42,15 @@
 ENTRY (STRLEN)
 # ifdef USE_AS_STRNLEN
 	/* Check for zero length.  */
 -	testq	%rsi, %rsi
 +	test	%RSI_LP, %RSI_LP
 	jz	L(zero)
 #  ifdef USE_AS_WCSLEN
 -	shl	$2, %rsi
 +	shl	$2, %RSI_LP
 +#  elif defined __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%esi, %esi
 #  endif
 -	movq	%rsi, %r8
 +	mov	%RSI_LP, %R8_LP
 # endif
 	movl	%edi, %ecx
 	movq	%rdi, %rdx
 diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
 index 01cb5fa8..f845f3d4 100644
 --- a/sysdeps/x86_64/strlen.S
 +++ b/sysdeps/x86_64/strlen.S
@@ -59,21 +59,21 @@ ENTRY(strlen)
 #ifdef AS_STRNLEN
 /* Do not read anything when n==0.  */
 -	test	%rsi, %rsi
 +	test	%RSI_LP, %RSI_LP
 	jne	L(n_nonzero)
 	xor	%rax, %rax
 	ret
 L(n_nonzero):
 # ifdef AS_WCSLEN
 -	shlq	$2, %rsi
 +	shl	$2, %RSI_LP
 # endif
 /* Initialize long lived registers.  */
 -	add	%rdi, %rsi
 -	mov	%rsi, %r10
 -	and	$-64, %r10
 -	mov	%rsi, %r11
 +	add	%RDI_LP, %RSI_LP
 +	mov	%RSI_LP, %R10_LP
 +	and	$-64, %R10_LP
 +	mov	%RSI_LP, %R11_LP
 #endif
 	pxor	%xmm0, %xmm0
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index 2a9e20a9..1557724b 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -8,10 +8,10 @@ endif
 ifeq ($(subdir),string)
 tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
 -	 tst-size_t-strncmp tst-size_t-strncpy
 +	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
 endif
 ifeq ($(subdir),wcsmbs)
 tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
 -	 tst-size_t-wcsncmp
 +	 tst-size_t-wcsncmp tst-size_t-wcsnlen
 endif
 diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
 new file mode 100644
 index 00000000..690a4a8a
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
@@ -0,0 +1,72 @@
 +/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#ifdef WIDE
 +# define TEST_NAME "wcsnlen"
 +#else
 +# define TEST_NAME "strnlen"
 +#endif /* WIDE */
 +
 +#include "test-size_t.h"
 +
 +#ifdef WIDE
 +# include <wchar.h>
 +# define STRNLEN wcsnlen
 +# define CHAR wchar_t
 +#else
 +# define STRNLEN strnlen
 +# define CHAR char
 +#endif /* WIDE */
 +
 +IMPL (STRNLEN, 1)
 +
 +typedef size_t (*proto_t) (const CHAR *, size_t);
 +
 +static size_t
 +__attribute__ ((noinline, noclone))
 +do_strnlen (parameter_t a, parameter_t b)
 +{
 +  return CALL (&a, a.p, b.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  size_t size = page_size / sizeof (CHAR);
 +  parameter_t src = { { 0 }, buf2 };
 +  parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      size_t res = do_strnlen (src, c);
 +      if (res != size)
 +	{
 +	  error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
 +		 impl->name, res, size);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
 new file mode 100644
 index 00000000..093b4bbe
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
@@ -0,0 +1,20 @@
 +/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define WIDE 1
 +#include "tst-size_t-strnlen.c"
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-81.patch
+++ b/SOURCES/glibc-RHEL-15696-81.patch
@ -1,33 +0,0 @@
 From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
 From: Noah Goldstein <goldstein.w.n@gmail.com>
 Date: Sat, 12 Feb 2022 00:45:00 -0600
 Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
 Content-type: text/plain; charset=UTF-8
 commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Mon Feb 7 05:55:15 2022 -0800
    x86-64: Optimize bzero
 removed setting the .text section for the code. This commit
 adds that back.
 ---
 sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
 1 file changed, 1 insertion(+)
 diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 index 06f5f5d7..4fb475c0 100644
 --- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -114,6 +114,7 @@
 # error SECTION is not defined!
 #endif
 +	.section SECTION(.text), "ax", @progbits
 #if IS_IN (libc)
 # if defined SHARED
 ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-86.patch
+++ b/SOURCES/glibc-RHEL-15696-86.patch
@ -1,36 +0,0 @@
 From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Thu, 10 Feb 2022 11:52:50 -0800
 Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset
 Content-type: text/plain; charset=UTF-8
 commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
 Author: H.J. Lu <hjl.tools@gmail.com>
 Date:   Mon Feb 7 05:55:15 2022 -0800
    x86-64: Optimize bzero
 added the optimized bzero.  Remove bzero weak alias in SSE2 memset to
 avoid undefined __bzero in memset-sse2-unaligned-erms.
 ---
 sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)
 diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
 index 8f579ad6..af51362b 100644
 --- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
 +++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -31,9 +31,7 @@
 # endif
 # undef weak_alias
 -# define weak_alias(original, alias) \
 -	.weak bzero; bzero = __bzero
 -
 +# define weak_alias(original, alias)
 # undef strong_alias
 # define strong_alias(ignored1, ignored2)
 #endif
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-87.patch
+++ b/SOURCES/glibc-RHEL-15696-87.patch
@ -1,29 +0,0 @@
 From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001
 From: Adhemerval Zanella <adhemerval.zanella@linaro.org>
 Date: Thu, 10 Feb 2022 11:23:24 -0300
 Subject: [PATCH] x86_64: Remove bcopy optimizations
 Content-type: text/plain; charset=UTF-8
 The symbol is not present in the current POSIX specification, and the
 compiler already generates a memmove call.
 ---
 sysdeps/x86_64/multiarch/bcopy.S | 7 -------
 1 file changed, 7 deletions(-)
 delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S
 diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S
 deleted file mode 100644
 index 639f02bd..00000000
 --- a/sysdeps/x86_64/multiarch/bcopy.S
 +++ /dev/null
@@ -1,7 +0,0 @@
 -#include <sysdep.h>
 -
 -	.text
 -ENTRY(bcopy)
 -	xchg	%rdi, %rsi
 -	jmp	__libc_memmove	/* Branch to IFUNC memmove.  */
 -END(bcopy)
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-15696-9.patch
+++ b/SOURCES/glibc-RHEL-15696-9.patch
@ -1,206 +0,0 @@
 From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001
 From: "H.J. Lu" <hjl.tools@gmail.com>
 Date: Mon, 4 Feb 2019 06:31:01 -0800
 Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ
 #24155]
 Content-type: text/plain; charset=UTF-8
 Since the size argument is unsigned, we should use unsigned Jcc
 instructions, instead of signed, to check size.
 Tested on x86-64 and x32, with and without --disable-multi-arch.
 	[BZ #24155]
 	CVE-2019-7309
 	* NEWS: Updated for CVE-2019-7309.
 	* sysdeps/x86_64/memcmp.S: Use RDX_LP for size.  Clear the
 	upper 32 bits of RDX register for x32.  Use unsigned Jcc
 	instructions, instead of signed.
 	* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2.
 	* sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test.
 ---
 sysdeps/x86_64/memcmp.S                  | 20 +++---
 sysdeps/x86_64/x32/Makefile              |  3 +-
 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++
 3 files changed, 93 insertions(+), 9 deletions(-)
 create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
 Conflicts:
 	ChangeLog
 	(removed)
 	NEWS
 	(removed)
 diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S
 index bcb4a2e8..45918d37 100644
 --- a/sysdeps/x86_64/memcmp.S
 +++ b/sysdeps/x86_64/memcmp.S
@@ -21,14 +21,18 @@
 	.text
 ENTRY (memcmp)
 -	test	%rdx, %rdx
 +#ifdef __ILP32__
 +	/* Clear the upper 32 bits.  */
 +	movl	%edx, %edx
 +#endif
 +	test	%RDX_LP, %RDX_LP
 	jz	L(finz)
 	cmpq	$1, %rdx
 -	jle	L(finr1b)
 +	jbe	L(finr1b)
 	subq	%rdi, %rsi
 	movq	%rdx, %r10
 	cmpq	$32, %r10
 -	jge	L(gt32)
 +	jae	L(gt32)
 	/* Handle small chunks and last block of less than 32 bytes.  */
 L(small):
 	testq	$1, %r10
@@ -156,7 +160,7 @@ L(A32):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
 -        jge	L(mt16)
 +        jae	L(mt16)
 	/* Pre-unroll to be ready for unrolled 64B loop.  */
 	testq	$32, %rdi
 	jz	L(A64)
@@ -178,7 +182,7 @@ L(A64):
 	movq	%r11, %r10
 	andq	$-64, %r10
 	cmpq	%r10, %rdi
 -        jge	L(mt32)
 +        jae	L(mt32)
 L(A64main):
 	movdqu    (%rdi,%rsi), %xmm0
@@ -216,7 +220,7 @@ L(mt32):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
 -        jge	L(mt16)
 +        jae	L(mt16)
 L(A32main):
 	movdqu    (%rdi,%rsi), %xmm0
@@ -254,7 +258,7 @@ L(ATR):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
 -        jge	L(mt16)
 +        jae	L(mt16)
 	testq	$16, %rdi
 	jz	L(ATR32)
@@ -325,7 +329,7 @@ L(ATR64main):
 	movq	%r11, %r10
 	andq	$-32, %r10
 	cmpq	%r10, %rdi
 -        jge	L(mt16)
 +        jae	L(mt16)
 L(ATR32res):
 	movdqa    (%rdi,%rsi), %xmm0
 diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
 index 1557724b..87489565 100644
 --- a/sysdeps/x86_64/x32/Makefile
 +++ b/sysdeps/x86_64/x32/Makefile
@@ -8,7 +8,8 @@ endif
 ifeq ($(subdir),string)
 tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
 	 tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
 -	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
 +	 tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \
 +	 tst-size_t-memcmp-2
 endif
 ifeq ($(subdir),wcsmbs)
 diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
 new file mode 100644
 index 00000000..d8ae1a08
 --- /dev/null
 +++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c
@@ -0,0 +1,79 @@
 +/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
 +   Copyright (C) 2019 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#define TEST_MAIN
 +#ifdef WIDE
 +# define TEST_NAME "wmemcmp"
 +#else
 +# define TEST_NAME "memcmp"
 +#endif
 +
 +#include "test-size_t.h"
 +
 +#ifdef WIDE
 +# include <inttypes.h>
 +# include <wchar.h>
 +
 +# define MEMCMP wmemcmp
 +# define CHAR wchar_t
 +#else
 +# define MEMCMP memcmp
 +# define CHAR char
 +#endif
 +
 +IMPL (MEMCMP, 1)
 +
 +typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
 +
 +static int
 +__attribute__ ((noinline, noclone))
 +do_memcmp (parameter_t a, parameter_t b)
 +{
 +  return CALL (&b, a.p, b.p, a.len);
 +}
 +
 +static int
 +test_main (void)
 +{
 +  test_init ();
 +
 +  parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
 +  parameter_t src = { { 0 }, buf2 };
 +
 +  memcpy (buf1, buf2, page_size);
 +
 +  CHAR *p = (CHAR *) buf1;
 +  p[page_size / sizeof (CHAR) - 1] = (CHAR) 1;
 +
 +  int ret = 0;
 +  FOR_EACH_IMPL (impl, 0)
 +    {
 +      src.fn = impl->fn;
 +      int res = do_memcmp (dest, src);
 +      if (res >= 0)
 +	{
 +	  error (0, 0, "Wrong result in function %s: %i >= 0",
 +		 impl->name, res);
 +	  ret = 1;
 +	}
 +    }
 +
 +  return ret ? EXIT_FAILURE : EXIT_SUCCESS;
 +}
 +
 +#include <support/test-driver.c>
 -- 
 GitLab
--- a/SOURCES/glibc-RHEL-16016-1.patch
+++ b/SOURCES/glibc-RHEL-16016-1.patch
@ -1,26 +1,26 @@
-commit b67339d0bbc07911859ca8c488e1923441cd3c33
+commit 919b9bfaa969c9517fe86c753c001b96ee4ea840
 Author: Joseph Myers <joseph@codesourcery.com>
-Date:   Mon Jun 15 22:58:22 2020 +0000
+Date:   Wed Oct 5 14:33:14 2022 +0000
-    Update syscall-names.list for Linux 5.7.
+    Update syscall lists for Linux 6.0
-    Linux 5.7 has no new syscalls.  Update the version number in
+    Linux 6.0 has no new syscalls.  Update the version number in
-    syscall-names.list to reflect that it is still current for 5.7.
+    syscall-names.list to reflect that it is still current for 6.0.
    Tested with build-many-glibcs.py.
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
-index 21a62a06f4..15dec5b98f 100644
+index 028ad3107a..4a78258646 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
-# The list of system calls is current as of Linux 5.6.
+-# The list of system calls is current as of Linux 5.19.
-kernel 5.6
+-kernel 5.19
-+# The list of system calls is current as of Linux 5.7.
+# The list of system calls is current as of Linux 6.0.
-+kernel 5.7
+kernel 6.0
 FAST_atomic_update
 FAST_cmpxchg
--- a/SOURCES/glibc-RHEL-16016-2.patch
+++ b/SOURCES/glibc-RHEL-16016-2.patch
@ -0,0 +1,24 @@
 commit 5ab9b2c92411eb52f7b7a8e6074f0740d9bd727b
 Author: Joseph Myers <joseph@codesourcery.com>
 Date:   Tue Dec 20 15:24:29 2022 +0000
    Update syscall lists for Linux 6.1
    Linux 6.1 has no new syscalls.  Update the version number in
    syscall-names.list to reflect that it is still current for 6.1.
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index 4a78258646..1274d9cd4a 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.0.
 -kernel 6.0
 +# The list of system calls is current as of Linux 6.1.
 +kernel 6.1
 FAST_atomic_update
 FAST_cmpxchg
--- a/SOURCES/glibc-RHEL-16016-3.patch
+++ b/SOURCES/glibc-RHEL-16016-3.patch
@ -0,0 +1,26 @@
 commit f8e8effa2629c74769a3552aba33175746b710bb
 Author: Joseph Myers <joseph@codesourcery.com>
 Date:   Thu Feb 23 22:53:17 2023 +0000
    Update syscall lists for Linux 6.2
    Linux 6.2 has no new syscalls.  Update the version number in
    syscall-names.list to reflect that it is still current for 6.2.
    Tested with build-many-glibcs.py.
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index 822498d3e3..5d27b5279c 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.1.
 -kernel 6.1
 +# The list of system calls is current as of Linux 6.2.
 +kernel 6.2
 FAST_atomic_update
 FAST_cmpxchg
--- a/SOURCES/glibc-RHEL-16016-4.patch
+++ b/SOURCES/glibc-RHEL-16016-4.patch
@ -0,0 +1,26 @@
 commit eeef96f56ce399f2c3fc1d93c0ba1dde34f3ae41
 Author: Joseph Myers <joseph@codesourcery.com>
 Date:   Mon May 15 22:26:56 2023 +0000
    Update syscall lists for Linux 6.3
    Linux 6.3 has no new syscalls.  Update the version number in
    syscall-names.list to reflect that it is still current for 6.3.
    Tested with build-many-glibcs.py.
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index 5d27b5279c..72fe1d5efe 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.2.
 -kernel 6.2
 +# The list of system calls is current as of Linux 6.3.
 +kernel 6.3
 FAST_atomic_update
 FAST_cmpxchg
--- a/SOURCES/glibc-RHEL-16016-5.patch
+++ b/SOURCES/glibc-RHEL-16016-5.patch
@ -0,0 +1,83 @@
 commit 1a21693e16a3f3d10f41c486b97fbecb53dd2087
 Author: Joseph Myers <joseph@codesourcery.com>
 Date:   Wed Jun 28 21:22:14 2023 +0000
    Update syscall lists for Linux 6.4
    Linux 6.4 adds the riscv_hwprobe syscall on riscv and enables
    memfd_secret on s390.  Update syscall-names.list and regenerate the
    arch-syscall.h headers with build-many-glibcs.py update-syscalls.
    Tested with build-many-glibcs.py.
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 index 202520ee25..2416e041c8 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
@@ -198,6 +198,7 @@
 #define __NR_request_key 218
 #define __NR_restart_syscall 128
 #define __NR_riscv_flush_icache 259
 +#define __NR_riscv_hwprobe 258
 #define __NR_rseq 293
 #define __NR_rt_sigaction 134
 #define __NR_rt_sigpending 136
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 index 4e65f337d4..a32bc82f60 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
@@ -205,6 +205,7 @@
 #define __NR_request_key 218
 #define __NR_restart_syscall 128
 #define __NR_riscv_flush_icache 259
 +#define __NR_riscv_hwprobe 258
 #define __NR_rseq 293
 #define __NR_rt_sigaction 134
 #define __NR_rt_sigpending 136
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 index 57025107e8..2288f20e45 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
@@ -178,6 +178,7 @@
 #define __NR_mbind 268
 #define __NR_membarrier 356
 #define __NR_memfd_create 350
 +#define __NR_memfd_secret 447
 #define __NR_migrate_pages 287
 #define __NR_mincore 218
 #define __NR_mkdir 39
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 index 72e19c6d56..05e6d8428e 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
@@ -152,6 +152,7 @@
 #define __NR_mbind 268
 #define __NR_membarrier 356
 #define __NR_memfd_create 350
 +#define __NR_memfd_secret 447
 #define __NR_migrate_pages 287
 #define __NR_mincore 218
 #define __NR_mkdir 39
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index 72fe1d5efe..5b69106434 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.3.
 -kernel 6.3
 +# The list of system calls is current as of Linux 6.4.
 +kernel 6.4
 FAST_atomic_update
 FAST_cmpxchg
@@ -477,6 +477,7 @@ renameat2
 request_key
 restart_syscall
 riscv_flush_icache
 +riscv_hwprobe
 rmdir
 rseq
 rt_sigaction
--- a/SOURCES/glibc-RHEL-16016-6.patch
+++ b/SOURCES/glibc-RHEL-16016-6.patch
@ -0,0 +1,338 @@
 commit 72511f539cc34681ec61c6a0dc2fe6d684760ffe
 Author: Joseph Myers <joseph@codesourcery.com>
 Date:   Tue Sep 12 14:08:53 2023 +0000
    Update syscall lists for Linux 6.5
    Linux 6.5 has one new syscall, cachestat, and also enables the
    cacheflush syscall for hppa.  Update syscall-names.list and regenerate
    the arch-syscall.h headers with build-many-glibcs.py update-syscalls.
    Tested with build-many-glibcs.py.
    Conflicts: Removed loongarch, or1k
 diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
 index 4fcb6da80a..8f21ee66a0 100644
 --- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
@@ -7,6 +7,7 @@
 #define __NR_bind 200
 #define __NR_bpf 280
 #define __NR_brk 214
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
 index 0cf74c1a96..c5802a5fec 100644
 --- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
@@ -11,6 +11,7 @@
 #define __NR_bind 104
 #define __NR_bpf 515
 #define __NR_brk 17
 +#define __NR_cachestat 561
 #define __NR_capget 368
 #define __NR_capset 369
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
 index c1207aaa12..f23f9e1154 100644
 --- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
@@ -11,6 +11,7 @@
 #define __NR_bpf 280
 #define __NR_brk 214
 #define __NR_cacheflush 244
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
 index e7ba04c106..7edf574899 100644
 --- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 386
 #define __NR_brk 45
 #define __NR_cacheflush 983042
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
 index dc9383758e..d74a06e063 100644
 --- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
@@ -8,6 +8,7 @@
 #define __NR_bpf 280
 #define __NR_brk 214
 #define __NR_cacheflush 245
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
 index 767f1287a3..5568b94cd3 100644
 --- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
@@ -13,6 +13,8 @@
 #define __NR_bind 22
 #define __NR_bpf 341
 #define __NR_brk 45
 +#define __NR_cacheflush 356
 +#define __NR_cachestat 451
 #define __NR_capget 106
 #define __NR_capset 107
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
 index 1998f0d76a..3af21a15cb 100644
 --- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 357
 #define __NR_break 17
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
 index b2eab1b93d..39b270e642 100644
 --- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
@@ -11,6 +11,7 @@
 #define __NR_bind 1191
 #define __NR_bpf 1341
 #define __NR_brk 1060
 +#define __NR_cachestat 1475
 #define __NR_capget 1185
 #define __NR_capset 1186
 #define __NR_chdir 1034
 diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
 index 5fc3723772..315e49cd33 100644
 --- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 354
 #define __NR_brk 45
 #define __NR_cacheflush 123
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
 index b6e9b007e4..54af12780c 100644
 --- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 387
 #define __NR_break 17
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
 index b3a3871f8a..a2aa1ffa1b 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
@@ -17,6 +17,7 @@
 #define __NR_brk 4045
 #define __NR_cachectl 4148
 #define __NR_cacheflush 4147
 +#define __NR_cachestat 4451
 #define __NR_capget 4204
 #define __NR_capset 4205
 #define __NR_chdir 4012
 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
 index b462182723..5bec858040 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
@@ -14,6 +14,7 @@
 #define __NR_brk 6012
 #define __NR_cachectl 6198
 #define __NR_cacheflush 6197
 +#define __NR_cachestat 6451
 #define __NR_capget 6123
 #define __NR_capset 6124
 #define __NR_chdir 6078
 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
 index a9d6b94572..0166371ee2 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
@@ -14,6 +14,7 @@
 #define __NR_brk 5012
 #define __NR_cachectl 5198
 #define __NR_cacheflush 5197
 +#define __NR_cachestat 5451
 #define __NR_capget 5123
 #define __NR_capset 5124
 #define __NR_chdir 5078
 diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
 index 809a219ef3..29a4cfa988 100644
 --- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
@@ -8,6 +8,7 @@
 #define __NR_bpf 280
 #define __NR_brk 214
 #define __NR_cacheflush 244
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
 index 627831ebae..3a212a0269 100644
 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 361
 #define __NR_break 17
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 183
 #define __NR_capset 184
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
 index bae597199d..1038ead227 100644
 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
@@ -15,6 +15,7 @@
 #define __NR_bpf 361
 #define __NR_break 17
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 183
 #define __NR_capset 184
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 index 2416e041c8..57b043ffb5 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
@@ -6,6 +6,7 @@
 #define __NR_bind 200
 #define __NR_bpf 280
 #define __NR_brk 214
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 index a32bc82f60..1041a0f8c9 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
@@ -7,6 +7,7 @@
 #define __NR_bind 200
 #define __NR_bpf 280
 #define __NR_brk 214
 +#define __NR_cachestat 451
 #define __NR_capget 90
 #define __NR_capset 91
 #define __NR_chdir 49
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 index 2288f20e45..70d4c6782e 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
@@ -13,6 +13,7 @@
 #define __NR_bind 361
 #define __NR_bpf 351
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 index 05e6d8428e..65a8a9e316 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
@@ -11,6 +11,7 @@
 #define __NR_bind 361
 #define __NR_bpf 351
 #define __NR_brk 45
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
 index d52b522d9c..94aad0f119 100644
 --- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
@@ -14,6 +14,7 @@
 #define __NR_bpf 375
 #define __NR_brk 45
 #define __NR_cacheflush 123
 +#define __NR_cachestat 451
 #define __NR_capget 184
 #define __NR_capset 185
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
 index d3f4d8aa3e..d630306c75 100644
 --- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
@@ -14,6 +14,7 @@
 #define __NR_bind 353
 #define __NR_bpf 349
 #define __NR_brk 17
 +#define __NR_cachestat 451
 #define __NR_capget 21
 #define __NR_capset 22
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
 index 2cc03d7a24..930f29b4d2 100644
 --- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
@@ -14,6 +14,7 @@
 #define __NR_bind 353
 #define __NR_bpf 349
 #define __NR_brk 17
 +#define __NR_cachestat 451
 #define __NR_capget 21
 #define __NR_capset 22
 #define __NR_chdir 12
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index 5b69106434..cf6f70ecd9 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.4.
 -kernel 6.4
 +# The list of system calls is current as of Linux 6.5.
 +kernel 6.5
 FAST_atomic_update
 FAST_cmpxchg
@@ -58,6 +58,7 @@ breakpoint
 brk
 cachectl
 cacheflush
 +cachestat
 capget
 capset
 chdir
 diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
 index b4ab892ec1..58646cf0bd 100644
 --- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
@@ -12,6 +12,7 @@
 #define __NR_bind 49
 #define __NR_bpf 321
 #define __NR_brk 12
 +#define __NR_cachestat 451
 #define __NR_capget 125
 #define __NR_capset 126
 #define __NR_chdir 80
 diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
 index 772559c87b..604bcdfa5b 100644
 --- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
@@ -11,6 +11,7 @@
 #define __NR_bind 1073741873
 #define __NR_bpf 1073742145
 #define __NR_brk 1073741836
 +#define __NR_cachestat 1073742275
 #define __NR_capget 1073741949
 #define __NR_capset 1073741950
 #define __NR_chdir 1073741904
--- a/SOURCES/glibc-RHEL-16016-7.patch
+++ b/SOURCES/glibc-RHEL-16016-7.patch
@ -0,0 +1,350 @@
 commit 582383b37d95b133c1ee6855ffaa2b1f5cb3d3b8
 Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
 Date:   Tue Oct 31 13:32:33 2023 -0300
    Update syscall lists for Linux 6.6
    Linux 6.6 has one new syscall for all architectures, fchmodat2, and
    the map_shadow_stack on x86_64.
    Conflicts: Removed loongarch, or1k
 diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
 index 8f21ee66a0..746991aa2f 100644
 --- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h
@@ -44,6 +44,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl 25
 diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
 index c5802a5fec..32efe51267 100644
 --- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h
@@ -56,6 +56,7 @@
 #define __NR_fchdir 13
 #define __NR_fchmod 124
 #define __NR_fchmodat 461
 +#define __NR_fchmodat2 562
 #define __NR_fchown 123
 #define __NR_fchownat 453
 #define __NR_fcntl 92
 diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
 index f23f9e1154..1d2879e877 100644
 --- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h
@@ -48,6 +48,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl64 25
 diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
 index 7edf574899..6711981e78 100644
 --- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h
@@ -64,6 +64,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 333
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 325
 diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
 index d74a06e063..92d9a703ea 100644
 --- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h
@@ -50,6 +50,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl64 25
 diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
 index 5568b94cd3..fbac124b70 100644
 --- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h
@@ -63,6 +63,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 286
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchownat 278
 #define __NR_fcntl 55
 diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
 index 3af21a15cb..8961788a96 100644
 --- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h
@@ -67,6 +67,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 306
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 298
 diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
 index 39b270e642..1ef762d693 100644
 --- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h
@@ -55,6 +55,7 @@
 #define __NR_fchdir 1035
 #define __NR_fchmod 1099
 #define __NR_fchmodat 1292
 +#define __NR_fchmodat2 1476
 #define __NR_fchown 1100
 #define __NR_fchownat 1284
 #define __NR_fcntl 1066
 diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
 index 315e49cd33..2053d5d392 100644
 --- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h
@@ -67,6 +67,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 299
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 291
 diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
 index 54af12780c..6865b1693c 100644
 --- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h
@@ -67,6 +67,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 306
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 298
 diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
 index a2aa1ffa1b..b13ace8e1c 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h
@@ -67,6 +67,7 @@
 #define __NR_fchdir 4133
 #define __NR_fchmod 4094
 #define __NR_fchmodat 4299
 +#define __NR_fchmodat2 4452
 #define __NR_fchown 4095
 #define __NR_fchownat 4291
 #define __NR_fcntl 4055
 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
 index 5bec858040..b7a7c0dfa7 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h
@@ -64,6 +64,7 @@
 #define __NR_fchdir 6079
 #define __NR_fchmod 6089
 #define __NR_fchmodat 6262
 +#define __NR_fchmodat2 6452
 #define __NR_fchown 6091
 #define __NR_fchownat 6254
 #define __NR_fcntl 6070
 diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
 index 0166371ee2..e5d7f91f48 100644
 --- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h
@@ -59,6 +59,7 @@
 #define __NR_fchdir 5079
 #define __NR_fchmod 5089
 #define __NR_fchmodat 5258
 +#define __NR_fchmodat2 5452
 #define __NR_fchown 5091
 #define __NR_fchownat 5250
 #define __NR_fcntl 5070
 diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
 index 29a4cfa988..89950cc33a 100644
 --- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h
@@ -49,6 +49,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl64 25
 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
 index 3a212a0269..64683bcb76 100644
 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h
@@ -66,6 +66,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 297
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchownat 289
 #define __NR_fcntl 55
 diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
 index 1038ead227..af1bbf32e8 100644
 --- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h
@@ -60,6 +60,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 297
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchownat 289
 #define __NR_fcntl 55
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 index 57b043ffb5..56e3088cbf 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h
@@ -43,6 +43,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl64 25
 diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 index 1041a0f8c9..508161b47a 100644
 --- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h
@@ -44,6 +44,7 @@
 #define __NR_fchdir 50
 #define __NR_fchmod 52
 #define __NR_fchmodat 53
 +#define __NR_fchmodat2 452
 #define __NR_fchown 55
 #define __NR_fchownat 54
 #define __NR_fcntl 25
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 index 70d4c6782e..1498ebf42e 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h
@@ -65,6 +65,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 299
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 291
 diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 index 65a8a9e316..624d71b56d 100644
 --- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h
@@ -56,6 +56,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 299
 +#define __NR_fchmodat2 452
 #define __NR_fchown 207
 #define __NR_fchownat 291
 #define __NR_fcntl 55
 diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
 index 94aad0f119..37211f5f8c 100644
 --- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h
@@ -64,6 +64,7 @@
 #define __NR_fchdir 133
 #define __NR_fchmod 94
 #define __NR_fchmodat 306
 +#define __NR_fchmodat2 452
 #define __NR_fchown 95
 #define __NR_fchown32 207
 #define __NR_fchownat 298
 diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
 index d630306c75..8093abcc9c 100644
 --- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h
@@ -66,6 +66,7 @@
 #define __NR_fchdir 176
 #define __NR_fchmod 124
 #define __NR_fchmodat 295
 +#define __NR_fchmodat2 452
 #define __NR_fchown 123
 #define __NR_fchown32 32
 #define __NR_fchownat 287
 diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
 index 930f29b4d2..d25ccfb571 100644
 --- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h
@@ -60,6 +60,7 @@
 #define __NR_fchdir 176
 #define __NR_fchmod 124
 #define __NR_fchmodat 295
 +#define __NR_fchmodat2 452
 #define __NR_fchown 123
 #define __NR_fchownat 287
 #define __NR_fcntl 92
 diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
 index cf6f70ecd9..c3627fcd7f 100644
 --- a/sysdeps/unix/sysv/linux/syscall-names.list
 +++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
 # This file can list all potential system calls.  The names are only
 # used if the installed kernel headers also provide them.
 -# The list of system calls is current as of Linux 6.5.
 -kernel 6.5
 +# The list of system calls is current as of Linux 6.6.
 +kernel 6.6
 FAST_atomic_update
 FAST_cmpxchg
@@ -117,6 +117,7 @@ fanotify_mark
 fchdir
 fchmod
 fchmodat
 +fchmodat2
 fchown
 fchown32
 fchownat
@@ -246,6 +247,7 @@ lsetxattr
 lstat
 lstat64
 madvise
 +map_shadow_stack
 mbind
 membarrier
 memfd_create
 diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
 index 58646cf0bd..5e4c9e901c 100644
 --- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h
@@ -59,6 +59,7 @@
 #define __NR_fchdir 81
 #define __NR_fchmod 91
 #define __NR_fchmodat 268
 +#define __NR_fchmodat2 452
 #define __NR_fchown 93
 #define __NR_fchownat 260
 #define __NR_fcntl 72
@@ -153,6 +154,7 @@
 #define __NR_lsetxattr 189
 #define __NR_lstat 6
 #define __NR_madvise 28
 +#define __NR_map_shadow_stack 453
 #define __NR_mbind 237
 #define __NR_membarrier 324
 #define __NR_memfd_create 319
 diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
 index 604bcdfa5b..dd5e196272 100644
 --- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
 +++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h
@@ -55,6 +55,7 @@
 #define __NR_fchdir 1073741905
 #define __NR_fchmod 1073741915
 #define __NR_fchmodat 1073742092
 +#define __NR_fchmodat2 1073742276
 #define __NR_fchown 1073741917
 #define __NR_fchownat 1073742084
 #define __NR_fcntl 1073741896
--- a/SOURCES/glibc-RHEL-16275.patch
+++ b/SOURCES/glibc-RHEL-16275.patch
@ -0,0 +1,16 @@
 Downstream-only patch to refer to /run instead of the legacy /var/run
 directory in the downstream nscd systemd socket file.
 diff --git a/nscd/nscd.socket b/nscd/nscd.socket
 index 7e512d5339fa1136..52a67608c7c55475 100644
 --- a/nscd/nscd.socket
 +++ b/nscd/nscd.socket
@@ -2,7 +2,7 @@
 Description=Name Service Cache Daemon Socket
 [Socket]
 -ListenDatagram=/var/run/nscd/socket
 +ListenDatagram=/run/nscd/socket
 [Install]
 WantedBy=sockets.target
--- a/SOURCES/glibc-RHEL-16643-1.patch
+++ b/SOURCES/glibc-RHEL-16643-1.patch
@ -0,0 +1,211 @@
 commit 06890c7ba553e82393413c59bb3131db5815a337
 Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
 Date:   Tue Jul 27 22:49:53 2021 +0530
    gaiconf_init: Refactor some bits for readability
    Split out line processing for `label`, `precedence` and `scopev4` into
    separate functions instead of the gotos.
    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
    Reviewed-by: DJ Delorie <dj@redhat.com>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index d6046a707f1d742a..3bf9a8bae16a5b02 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -1858,6 +1858,66 @@ scopecmp (const void *p1, const void *p2)
   return 1;
 }
 +static bool
 +add_prefixlist (struct prefixlist **listp, size_t *lenp, bool *nullbitsp,
 +		char *val1, char *val2, char **pos)
 +{
 +  struct in6_addr prefix;
 +  unsigned long int bits;
 +  unsigned long int val;
 +  char *endp;
 +
 +  bits = 128;
 +  __set_errno (0);
 +  char *cp = strchr (val1, '/');
 +  if (cp != NULL)
 +    *cp++ = '\0';
 +  *pos = cp;
 +  if (inet_pton (AF_INET6, val1, &prefix)
 +      && (cp == NULL
 +	  || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 +	  || errno != ERANGE)
 +      && *endp == '\0'
 +      && bits <= 128
 +      && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 +	  || errno != ERANGE)
 +      && *endp == '\0'
 +      && val <= INT_MAX)
 +    {
 +      struct prefixlist *newp = malloc (sizeof (*newp));
 +      if (newp == NULL)
 +	return false;
 +
 +      memcpy (&newp->entry.prefix, &prefix, sizeof (prefix));
 +      newp->entry.bits = bits;
 +      newp->entry.val = val;
 +      newp->next = *listp;
 +      *listp = newp;
 +      ++*lenp;
 +      *nullbitsp |= bits == 0;
 +    }
 +  return true;
 +}
 +
 +static bool
 +add_scopelist (struct scopelist **listp, size_t *lenp, bool *nullbitsp,
 +	       const struct in6_addr *prefixp, unsigned long int bits,
 +	       unsigned long int val)
 +{
 +  struct scopelist *newp = malloc (sizeof (*newp));
 +  if (newp == NULL)
 +    return false;
 +
 +  newp->entry.netmask = htonl (bits != 96 ? (0xffffffff << (128 - bits)) : 0);
 +  newp->entry.addr32 = (prefixp->s6_addr32[3] & newp->entry.netmask);
 +  newp->entry.scope = val;
 +  newp->next = *listp;
 +  *listp = newp;
 +  ++*lenp;
 +  *nullbitsp |= bits == 96;
 +
 +  return true;
 +}
 static void
 gaiconf_init (void)
@@ -1933,55 +1993,17 @@ gaiconf_init (void)
 	  /*  Ignore the rest of the line.  */
 	  *cp = '\0';
 -	  struct prefixlist **listp;
 -	  size_t *lenp;
 -	  bool *nullbitsp;
 	  switch (cmdlen)
 	    {
 	    case 5:
 	      if (strcmp (cmd, "label") == 0)
 		{
 -		  struct in6_addr prefix;
 -		  unsigned long int bits;
 -		  unsigned long int val;
 -		  char *endp;
 -
 -		  listp = &labellist;
 -		  lenp = &nlabellist;
 -		  nullbitsp = &labellist_nullbits;
 -
 -		new_elem:
 -		  bits = 128;
 -		  __set_errno (0);
 -		  cp = strchr (val1, '/');
 -		  if (cp != NULL)
 -		    *cp++ = '\0';
 -		  if (inet_pton (AF_INET6, val1, &prefix)
 -		      && (cp == NULL
 -			  || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 -			  || errno != ERANGE)
 -		      && *endp == '\0'
 -		      && bits <= 128
 -		      && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 -			  || errno != ERANGE)
 -		      && *endp == '\0'
 -		      && val <= INT_MAX)
 +		  if (!add_prefixlist (&labellist, &nlabellist,
 +				       &labellist_nullbits, val1, val2, &cp))
 		    {
 -		      struct prefixlist *newp = malloc (sizeof (*newp));
 -		      if (newp == NULL)
 -			{
 -			  free (line);
 -			  fclose (fp);
 -			  goto no_file;
 -			}
 -
 -		      memcpy (&newp->entry.prefix, &prefix, sizeof (prefix));
 -		      newp->entry.bits = bits;
 -		      newp->entry.val = val;
 -		      newp->next = *listp;
 -		      *listp = newp;
 -		      ++*lenp;
 -		      *nullbitsp |= bits == 0;
 +		      free (line);
 +		      fclose (fp);
 +		      goto no_file;
 		    }
 		}
 	      break;
@@ -2023,27 +2045,14 @@ gaiconf_init (void)
 			  && *endp == '\0'
 			  && val <= INT_MAX)
 			{
 -			  struct scopelist *newp;
 -			new_scope:
 -			  newp = malloc (sizeof (*newp));
 -			  if (newp == NULL)
 +			  if (!add_scopelist (&scopelist, &nscopelist,
 +					      &scopelist_nullbits, &prefix,
 +					      bits, val))
 			    {
 			      free (line);
 			      fclose (fp);
 			      goto no_file;
 			    }
 -
 -			  newp->entry.netmask = htonl (bits != 96
 -						       ? (0xffffffff
 -							  << (128 - bits))
 -						       : 0);
 -			  newp->entry.addr32 = (prefix.s6_addr32[3]
 -						& newp->entry.netmask);
 -			  newp->entry.scope = val;
 -			  newp->next = scopelist;
 -			  scopelist = newp;
 -			  ++nscopelist;
 -			  scopelist_nullbits |= bits == 96;
 			}
 		    }
 		  else if (inet_pton (AF_INET, val1, &prefix.s6_addr32[3])
@@ -2057,8 +2066,14 @@ gaiconf_init (void)
 			   && *endp == '\0'
 			   && val <= INT_MAX)
 		    {
 -		      bits += 96;
 -		      goto new_scope;
 +		      if (!add_scopelist (&scopelist, &nscopelist,
 +					  &scopelist_nullbits, &prefix,
 +					  bits + 96, val))
 +			{
 +			  free (line);
 +			  fclose (fp);
 +			  goto no_file;
 +			}
 		    }
 		}
 	      break;
@@ -2066,10 +2081,14 @@ gaiconf_init (void)
 	    case 10:
 	      if (strcmp (cmd, "precedence") == 0)
 		{
 -		  listp = &precedencelist;
 -		  lenp = &nprecedencelist;
 -		  nullbitsp = &precedencelist_nullbits;
 -		  goto new_elem;
 +		  if (!add_prefixlist (&precedencelist, &nprecedencelist,
 +				       &precedencelist_nullbits, val1, val2,
 +				       &cp))
 +		    {
 +		      free (line);
 +		      fclose (fp);
 +		      goto no_file;
 +		    }
 		}
 	      break;
 	    }
--- a/SOURCES/glibc-RHEL-16643-2.patch
+++ b/SOURCES/glibc-RHEL-16643-2.patch
@ -0,0 +1,584 @@
 commit bc0d18d873abf2cda6842ad8bb4df2a31dc0fbac
 Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
 Date:   Tue Aug 3 21:29:23 2021 +0530
    gai_init: Avoid jumping from if condition to its else counterpart
    Clean up another antipattern where code flows from an if condition to
    its else counterpart with a goto.
    Most of the change in this patch is whitespace-only; a `git diff -b`
    ought to show the actual logic changes.
    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
    Reviewed-by: DJ Delorie <dj@redhat.com>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index 3bf9a8bae16a5b02..1635a09837351068 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -1933,142 +1933,122 @@ gaiconf_init (void)
   bool scopelist_nullbits = false;
   FILE *fp = fopen (GAICONF_FNAME, "rce");
 -  if (fp != NULL)
 +  if (fp == NULL)
 +    goto no_file;
 +
 +  struct __stat64_t64 st;
 +  if (__fstat64_time64 (fileno (fp), &st) != 0)
     {
 -      struct __stat64_t64 st;
 -      if (__fstat64_time64 (fileno (fp), &st) != 0)
 -	{
 -	  fclose (fp);
 -	  goto no_file;
 -	}
 +      fclose (fp);
 +      goto no_file;
 +    }
 -      char *line = NULL;
 -      size_t linelen = 0;
 +  char *line = NULL;
 +  size_t linelen = 0;
 -      __fsetlocking (fp, FSETLOCKING_BYCALLER);
 +  __fsetlocking (fp, FSETLOCKING_BYCALLER);
 -      while (!feof_unlocked (fp))
 -	{
 -	  ssize_t n = __getline (&line, &linelen, fp);
 -	  if (n <= 0)
 -	    break;
 +  while (!feof_unlocked (fp))
 +    {
 +      ssize_t n = __getline (&line, &linelen, fp);
 +      if (n <= 0)
 +	break;
 -	  /* Handle comments.  No escaping possible so this is easy.  */
 -	  char *cp = strchr (line, '#');
 -	  if (cp != NULL)
 -	    *cp = '\0';
 +      /* Handle comments.  No escaping possible so this is easy.  */
 +      char *cp = strchr (line, '#');
 +      if (cp != NULL)
 +	*cp = '\0';
 -	  cp = line;
 -	  while (isspace (*cp))
 -	    ++cp;
 +      cp = line;
 +      while (isspace (*cp))
 +	++cp;
 -	  char *cmd = cp;
 -	  while (*cp != '\0' && !isspace (*cp))
 -	    ++cp;
 -	  size_t cmdlen = cp - cmd;
 +      char *cmd = cp;
 +      while (*cp != '\0' && !isspace (*cp))
 +	++cp;
 +      size_t cmdlen = cp - cmd;
 -	  if (*cp != '\0')
 -	    *cp++ = '\0';
 -	  while (isspace (*cp))
 -	    ++cp;
 +      if (*cp != '\0')
 +	*cp++ = '\0';
 +      while (isspace (*cp))
 +	++cp;
 -	  char *val1 = cp;
 -	  while (*cp != '\0' && !isspace (*cp))
 -	    ++cp;
 -	  size_t val1len = cp - cmd;
 +      char *val1 = cp;
 +      while (*cp != '\0' && !isspace (*cp))
 +	++cp;
 +      size_t val1len = cp - cmd;
 -	  /* We always need at least two values.  */
 -	  if (val1len == 0)
 -	    continue;
 +      /* We always need at least two values.  */
 +      if (val1len == 0)
 +	continue;
 -	  if (*cp != '\0')
 -	    *cp++ = '\0';
 -	  while (isspace (*cp))
 -	    ++cp;
 +      if (*cp != '\0')
 +	*cp++ = '\0';
 +      while (isspace (*cp))
 +	++cp;
 -	  char *val2 = cp;
 -	  while (*cp != '\0' && !isspace (*cp))
 -	    ++cp;
 +      char *val2 = cp;
 +      while (*cp != '\0' && !isspace (*cp))
 +	++cp;
 -	  /*  Ignore the rest of the line.  */
 -	  *cp = '\0';
 +      /*  Ignore the rest of the line.  */
 +      *cp = '\0';
 -	  switch (cmdlen)
 +      switch (cmdlen)
 +	{
 +	case 5:
 +	  if (strcmp (cmd, "label") == 0)
 	    {
 -	    case 5:
 -	      if (strcmp (cmd, "label") == 0)
 +	      if (!add_prefixlist (&labellist, &nlabellist,
 +				   &labellist_nullbits, val1, val2, &cp))
 		{
 -		  if (!add_prefixlist (&labellist, &nlabellist,
 -				       &labellist_nullbits, val1, val2, &cp))
 -		    {
 -		      free (line);
 -		      fclose (fp);
 -		      goto no_file;
 -		    }
 +		  free (line);
 +		  fclose (fp);
 +		  goto no_file;
 		}
 -	      break;
 +	    }
 +	  break;
 -	    case 6:
 -	      if (strcmp (cmd, "reload") == 0)
 -		{
 -		  gaiconf_reload_flag = strcmp (val1, "yes") == 0;
 -		  if (gaiconf_reload_flag)
 -		    gaiconf_reload_flag_ever_set = 1;
 -		}
 -	      break;
 +	case 6:
 +	  if (strcmp (cmd, "reload") == 0)
 +	    {
 +	      gaiconf_reload_flag = strcmp (val1, "yes") == 0;
 +	      if (gaiconf_reload_flag)
 +		gaiconf_reload_flag_ever_set = 1;
 +	    }
 +	  break;
 -	    case 7:
 -	      if (strcmp (cmd, "scopev4") == 0)
 +	case 7:
 +	  if (strcmp (cmd, "scopev4") == 0)
 +	    {
 +	      struct in6_addr prefix;
 +	      unsigned long int bits;
 +	      unsigned long int val;
 +	      char *endp;
 +
 +	      bits = 32;
 +	      __set_errno (0);
 +	      cp = strchr (val1, '/');
 +	      if (cp != NULL)
 +		*cp++ = '\0';
 +	      if (inet_pton (AF_INET6, val1, &prefix))
 		{
 -		  struct in6_addr prefix;
 -		  unsigned long int bits;
 -		  unsigned long int val;
 -		  char *endp;
 -
 -		  bits = 32;
 -		  __set_errno (0);
 -		  cp = strchr (val1, '/');
 -		  if (cp != NULL)
 -		    *cp++ = '\0';
 -		  if (inet_pton (AF_INET6, val1, &prefix))
 -		    {
 -		      bits = 128;
 -		      if (IN6_IS_ADDR_V4MAPPED (&prefix)
 -			  && (cp == NULL
 -			      || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 -			      || errno != ERANGE)
 -			  && *endp == '\0'
 -			  && bits >= 96
 -			  && bits <= 128
 -			  && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 -			      || errno != ERANGE)
 -			  && *endp == '\0'
 -			  && val <= INT_MAX)
 -			{
 -			  if (!add_scopelist (&scopelist, &nscopelist,
 -					      &scopelist_nullbits, &prefix,
 -					      bits, val))
 -			    {
 -			      free (line);
 -			      fclose (fp);
 -			      goto no_file;
 -			    }
 -			}
 -		    }
 -		  else if (inet_pton (AF_INET, val1, &prefix.s6_addr32[3])
 -			   && (cp == NULL
 -			       || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 -			       || errno != ERANGE)
 -			   && *endp == '\0'
 -			   && bits <= 32
 -			   && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 -			       || errno != ERANGE)
 -			   && *endp == '\0'
 -			   && val <= INT_MAX)
 +		  bits = 128;
 +		  if (IN6_IS_ADDR_V4MAPPED (&prefix)
 +		      && (cp == NULL
 +			  || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 +			  || errno != ERANGE)
 +		      && *endp == '\0'
 +		      && bits >= 96
 +		      && bits <= 128
 +		      && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 +			  || errno != ERANGE)
 +		      && *endp == '\0'
 +		      && val <= INT_MAX)
 		    {
 		      if (!add_scopelist (&scopelist, &nscopelist,
 					  &scopelist_nullbits, &prefix,
 -					  bits + 96, val))
 +					  bits, val))
 			{
 			  free (line);
 			  fclose (fp);
@@ -2076,173 +2056,191 @@ gaiconf_init (void)
 			}
 		    }
 		}
 -	      break;
 -
 -	    case 10:
 -	      if (strcmp (cmd, "precedence") == 0)
 +	      else if (inet_pton (AF_INET, val1, &prefix.s6_addr32[3])
 +		       && (cp == NULL
 +			   || (bits = strtoul (cp, &endp, 10)) != ULONG_MAX
 +			   || errno != ERANGE)
 +		       && *endp == '\0'
 +		       && bits <= 32
 +		       && ((val = strtoul (val2, &endp, 10)) != ULONG_MAX
 +			   || errno != ERANGE)
 +		       && *endp == '\0'
 +		       && val <= INT_MAX)
 		{
 -		  if (!add_prefixlist (&precedencelist, &nprecedencelist,
 -				       &precedencelist_nullbits, val1, val2,
 -				       &cp))
 +		  if (!add_scopelist (&scopelist, &nscopelist,
 +				      &scopelist_nullbits, &prefix,
 +				      bits + 96, val))
 		    {
 		      free (line);
 		      fclose (fp);
 		      goto no_file;
 		    }
 		}
 -	      break;
 -	    }
 -	}
 -
 -      free (line);
 -
 -      fclose (fp);
 -
 -      /* Create the array for the labels.  */
 -      struct prefixentry *new_labels;
 -      if (nlabellist > 0)
 -	{
 -	  if (!labellist_nullbits)
 -	    ++nlabellist;
 -	  new_labels = malloc (nlabellist * sizeof (*new_labels));
 -	  if (new_labels == NULL)
 -	    goto no_file;
 -
 -	  int i = nlabellist;
 -	  if (!labellist_nullbits)
 -	    {
 -	      --i;
 -	      memset (&new_labels[i].prefix, '\0', sizeof (struct in6_addr));
 -	      new_labels[i].bits = 0;
 -	      new_labels[i].val = 1;
 	    }
 +	  break;
 -	  struct prefixlist *l = labellist;
 -	  while (i-- > 0)
 +	case 10:
 +	  if (strcmp (cmd, "precedence") == 0)
 	    {
 -	      new_labels[i] = l->entry;
 -	      l = l->next;
 +	      if (!add_prefixlist (&precedencelist, &nprecedencelist,
 +				   &precedencelist_nullbits, val1, val2,
 +				   &cp))
 +		{
 +		  free (line);
 +		  fclose (fp);
 +		  goto no_file;
 +		}
 	    }
 -	  free_prefixlist (labellist);
 -	  labellist = NULL;
 -
 -	  /* Sort the entries so that the most specific ones are at
 -	     the beginning.  */
 -	  qsort (new_labels, nlabellist, sizeof (*new_labels), prefixcmp);
 +	  break;
 	}
 -      else
 -	new_labels = (struct prefixentry *) default_labels;
 -
 -      struct prefixentry *new_precedence;
 -      if (nprecedencelist > 0)
 -	{
 -	  if (!precedencelist_nullbits)
 -	    ++nprecedencelist;
 -	  new_precedence = malloc (nprecedencelist * sizeof (*new_precedence));
 -	  if (new_precedence == NULL)
 -	    {
 -	      if (new_labels != default_labels)
 -		free (new_labels);
 -	      goto no_file;
 -	    }
 +    }
 -	  int i = nprecedencelist;
 -	  if (!precedencelist_nullbits)
 -	    {
 -	      --i;
 -	      memset (&new_precedence[i].prefix, '\0',
 -		      sizeof (struct in6_addr));
 -	      new_precedence[i].bits = 0;
 -	      new_precedence[i].val = 40;
 -	    }
 +  free (line);
 -	  struct prefixlist *l = precedencelist;
 -	  while (i-- > 0)
 -	    {
 -	      new_precedence[i] = l->entry;
 -	      l = l->next;
 -	    }
 -	  free_prefixlist (precedencelist);
 -	  precedencelist = NULL;
 +  fclose (fp);
 -	  /* Sort the entries so that the most specific ones are at
 -	     the beginning.  */
 -	  qsort (new_precedence, nprecedencelist, sizeof (*new_precedence),
 -		 prefixcmp);
 +  /* Create the array for the labels.  */
 +  struct prefixentry *new_labels;
 +  if (nlabellist > 0)
 +    {
 +      if (!labellist_nullbits)
 +	++nlabellist;
 +      new_labels = malloc (nlabellist * sizeof (*new_labels));
 +      if (new_labels == NULL)
 +	goto no_file;
 +
 +      int i = nlabellist;
 +      if (!labellist_nullbits)
 +	{
 +	  --i;
 +	  memset (&new_labels[i].prefix, '\0', sizeof (struct in6_addr));
 +	  new_labels[i].bits = 0;
 +	  new_labels[i].val = 1;
 	}
 -      else
 -	new_precedence = (struct prefixentry *) default_precedence;
 -      struct scopeentry *new_scopes;
 -      if (nscopelist > 0)
 +      struct prefixlist *l = labellist;
 +      while (i-- > 0)
 	{
 -	  if (!scopelist_nullbits)
 -	    ++nscopelist;
 -	  new_scopes = malloc (nscopelist * sizeof (*new_scopes));
 -	  if (new_scopes == NULL)
 -	    {
 -	      if (new_labels != default_labels)
 -		free (new_labels);
 -	      if (new_precedence != default_precedence)
 -		free (new_precedence);
 -	      goto no_file;
 -	    }
 -
 -	  int i = nscopelist;
 -	  if (!scopelist_nullbits)
 -	    {
 -	      --i;
 -	      new_scopes[i].addr32 = 0;
 -	      new_scopes[i].netmask = 0;
 -	      new_scopes[i].scope = 14;
 -	    }
 +	  new_labels[i] = l->entry;
 +	  l = l->next;
 +	}
 +      free_prefixlist (labellist);
 +      labellist = NULL;
 -	  struct scopelist *l = scopelist;
 -	  while (i-- > 0)
 -	    {
 -	      new_scopes[i] = l->entry;
 -	      l = l->next;
 -	    }
 -	  free_scopelist (scopelist);
 +      /* Sort the entries so that the most specific ones are at
 +	 the beginning.  */
 +      qsort (new_labels, nlabellist, sizeof (*new_labels), prefixcmp);
 +    }
 +  else
 +    new_labels = (struct prefixentry *) default_labels;
 -	  /* Sort the entries so that the most specific ones are at
 -	     the beginning.  */
 -	  qsort (new_scopes, nscopelist, sizeof (*new_scopes),
 -		 scopecmp);
 +  struct prefixentry *new_precedence;
 +  if (nprecedencelist > 0)
 +    {
 +      if (!precedencelist_nullbits)
 +	++nprecedencelist;
 +      new_precedence = malloc (nprecedencelist * sizeof (*new_precedence));
 +      if (new_precedence == NULL)
 +	{
 +	  if (new_labels != default_labels)
 +	    free (new_labels);
 +	  goto no_file;
 	}
 -      else
 -	new_scopes = (struct scopeentry *) default_scopes;
 -
 -      /* Now we are ready to replace the values.  */
 -      const struct prefixentry *old = labels;
 -      labels = new_labels;
 -      if (old != default_labels)
 -	free ((void *) old);
 -      old = precedence;
 -      precedence = new_precedence;
 -      if (old != default_precedence)
 -	free ((void *) old);
 +      int i = nprecedencelist;
 +      if (!precedencelist_nullbits)
 +	{
 +	  --i;
 +	  memset (&new_precedence[i].prefix, '\0',
 +		  sizeof (struct in6_addr));
 +	  new_precedence[i].bits = 0;
 +	  new_precedence[i].val = 40;
 +	}
 -      const struct scopeentry *oldscope = scopes;
 -      scopes = new_scopes;
 -      if (oldscope != default_scopes)
 -	free ((void *) oldscope);
 +      struct prefixlist *l = precedencelist;
 +      while (i-- > 0)
 +	{
 +	  new_precedence[i] = l->entry;
 +	  l = l->next;
 +	}
 +      free_prefixlist (precedencelist);
 +      precedencelist = NULL;
 -      save_gaiconf_mtime (&st);
 +      /* Sort the entries so that the most specific ones are at
 +	 the beginning.  */
 +      qsort (new_precedence, nprecedencelist, sizeof (*new_precedence),
 +	     prefixcmp);
     }
   else
 +    new_precedence = (struct prefixentry *) default_precedence;
 +
 +  struct scopeentry *new_scopes;
 +  if (nscopelist > 0)
     {
 -    no_file:
 -      free_prefixlist (labellist);
 -      free_prefixlist (precedencelist);
 +      if (!scopelist_nullbits)
 +	++nscopelist;
 +      new_scopes = malloc (nscopelist * sizeof (*new_scopes));
 +      if (new_scopes == NULL)
 +	{
 +	  if (new_labels != default_labels)
 +	    free (new_labels);
 +	  if (new_precedence != default_precedence)
 +	    free (new_precedence);
 +	  goto no_file;
 +	}
 +
 +      int i = nscopelist;
 +      if (!scopelist_nullbits)
 +	{
 +	  --i;
 +	  new_scopes[i].addr32 = 0;
 +	  new_scopes[i].netmask = 0;
 +	  new_scopes[i].scope = 14;
 +	}
 +
 +      struct scopelist *l = scopelist;
 +      while (i-- > 0)
 +	{
 +	  new_scopes[i] = l->entry;
 +	  l = l->next;
 +	}
       free_scopelist (scopelist);
 -      /* If we previously read the file but it is gone now, free the
 -	 old data and use the builtin one.  Leave the reload flag
 -	 alone.  */
 -      fini ();
 +      /* Sort the entries so that the most specific ones are at
 +	 the beginning.  */
 +      qsort (new_scopes, nscopelist, sizeof (*new_scopes),
 +	     scopecmp);
     }
 +  else
 +    new_scopes = (struct scopeentry *) default_scopes;
 +
 +  /* Now we are ready to replace the values.  */
 +  const struct prefixentry *old = labels;
 +  labels = new_labels;
 +  if (old != default_labels)
 +    free ((void *) old);
 +
 +  old = precedence;
 +  precedence = new_precedence;
 +  if (old != default_precedence)
 +    free ((void *) old);
 +
 +  const struct scopeentry *oldscope = scopes;
 +  scopes = new_scopes;
 +  if (oldscope != default_scopes)
 +    free ((void *) oldscope);
 +
 +  save_gaiconf_mtime (&st);
 +  return;
 +
 +no_file:
 +  free_prefixlist (labellist);
 +  free_prefixlist (precedencelist);
 +  free_scopelist (scopelist);
 +
 +  /* If we previously read the file but it is gone now, free the old data and
 +     use the builtin one.  Leave the reload flag alone.  */
 +  fini ();
 }
--- a/SOURCES/glibc-RHEL-16643-3.patch
+++ b/SOURCES/glibc-RHEL-16643-3.patch
@ -0,0 +1,90 @@
 commit d3f2c2c8b57bdf9d963db8fa2372d6c1b86a337e
 Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
 Date:   Tue Mar 22 22:40:05 2022 +0530
    getaddrinfo: Refactor code for readability
    The close_retry goto jump is confusing and clumsy to read, so refactor
    the code a bit to make it easier to follow.
    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
    Reviewed-by: DJ Delorie <dj@redhat.com>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index 1635a09837351068..5e9bd17eb949974c 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -2253,6 +2253,36 @@ gaiconf_reload (void)
     gaiconf_init ();
 }
 +static bool
 +try_connect (int *fdp, int *afp, struct sockaddr_in6 *source_addrp,
 +	     const struct sockaddr *addr, socklen_t addrlen, int family)
 +{
 +  int fd = *fdp;
 +  int af = *afp;
 +  socklen_t sl = sizeof (*source_addrp);
 +
 +  while (true)
 +    {
 +      if (fd != -1 && __connect (fd, addr, addrlen) == 0
 +	  && __getsockname (fd, (struct sockaddr *) source_addrp, &sl) == 0)
 +	return true;
 +
 +      if (errno == EAFNOSUPPORT && af == AF_INET6 && family == AF_INET)
 +	{
 +	  /* This could mean IPv6 sockets are IPv6-only.  */
 +	  if (fd != -1)
 +	    __close_nocancel_nostatus (fd);
 +	  *afp = af = AF_INET;
 +	  *fdp = fd = __socket (AF_INET, SOCK_DGRAM | SOCK_CLOEXEC,
 +				IPPROTO_IP);
 +	  continue;
 +	}
 +
 +      return false;
 +    }
 +
 +  __builtin_unreachable ();
 +}
 int
 getaddrinfo (const char *name, const char *service,
@@ -2443,7 +2473,6 @@ getaddrinfo (const char *name, const char *service,
 	      if (fd == -1 || (af == AF_INET && q->ai_family == AF_INET6))
 		{
 		  if (fd != -1)
 -		  close_retry:
 		    __close_nocancel_nostatus (fd);
 		  af = q->ai_family;
 		  fd = __socket (af, SOCK_DGRAM | SOCK_CLOEXEC, IPPROTO_IP);
@@ -2455,14 +2484,10 @@ getaddrinfo (const char *name, const char *service,
 		  __connect (fd, &sa, sizeof (sa));
 		}
 -	      socklen_t sl = sizeof (results[i].source_addr);
 -	      if (fd != -1
 -		  && __connect (fd, q->ai_addr, q->ai_addrlen) == 0
 -		  && __getsockname (fd,
 -				    (struct sockaddr *) &results[i].source_addr,
 -				    &sl) == 0)
 +	      if (try_connect (&fd, &af, &results[i].source_addr, q->ai_addr,
 +			       q->ai_addrlen, q->ai_family))
 		{
 -		  results[i].source_addr_len = sl;
 +		  results[i].source_addr_len = sizeof (results[i].source_addr);
 		  results[i].got_source_addr = true;
 		  if (in6ai != NULL)
@@ -2527,10 +2552,6 @@ getaddrinfo (const char *name, const char *service,
 		      results[i].source_addr_len = sizeof (struct sockaddr_in);
 		    }
 		}
 -	      else if (errno == EAFNOSUPPORT && af == AF_INET6
 -		       && q->ai_family == AF_INET)
 -		/* This could mean IPv6 sockets are IPv6-only.  */
 -		goto close_retry;
 	      else
 		/* Just make sure that if we have to process the same
 		   address again we do not copy any memory.  */
--- a/SOURCES/glibc-RHEL-16643-4.patch
+++ b/SOURCES/glibc-RHEL-16643-4.patch
@ -0,0 +1,32 @@
 commit c9226c03da0276593a0918eaa9a14835183343e8
 Author: Jörg Sonnenberger <joerg@bec.de>
 Date:   Mon Sep 26 13:59:16 2022 -0400
    get_nscd_addresses: Fix subscript typos [BZ #29605]
    Fix the subscript on air->family, which was accidentally set to COUNT
    when it should have remained as I.
    Resolves: BZ #29605
    Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index 5e9bd17eb949974c..40a32a3de30cb294 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -549,11 +549,11 @@ get_nscd_addresses (const char *name, const struct addrinfo *req,
 	  at[count].addr[2] = htonl (0xffff);
 	}
       else if (req->ai_family == AF_UNSPEC
 -	       || air->family[count] == req->ai_family)
 +	       || air->family[i] == req->ai_family)
 	{
 -	  at[count].family = air->family[count];
 +	  at[count].family = air->family[i];
 	  memcpy (at[count].addr, addrs, size);
 -	  if (air->family[count] == AF_INET6)
 +	  if (air->family[i] == AF_INET6)
 	    res->got_ipv6 = true;
 	}
       at[count].next = at + count + 1;
--- a/SOURCES/glibc-RHEL-16643-5.patch
+++ b/SOURCES/glibc-RHEL-16643-5.patch
@ -0,0 +1,25 @@
 commit 3bf7bab88b0da01d4f5ef20afbbb45203185501e
 Author: Siddhesh Poyarekar <siddhesh@sourceware.org>
 Date:   Tue Sep 5 17:04:05 2023 -0400
    getcanonname: Fix a typo
    This code is generally unused in practice since there don't seem to be
    any NSS modules that only implement _nss_MOD_gethostbyname2_r and not
    _nss_MOD_gethostbyname3_r.
    Signed-off-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index 40a32a3de30cb294..e9f47aea358a3351 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -346,7 +346,7 @@ getcanonname (nss_action_list nip, const char *hname, const char *name)
 	   string.  */
 	s = (char *) name;
     }
 -  return __strdup (name);
 +  return __strdup (s);
 }
 /* Process looked up canonical name and if necessary, decode to IDNA.  Result
--- a/SOURCES/glibc-RHEL-16643-6.patch
+++ b/SOURCES/glibc-RHEL-16643-6.patch
@ -0,0 +1,23 @@
 commit 61bac1a9d2ab80ebcbc51484722e6ea43414bec7
 Author: Florian Weimer <fweimer@redhat.com>
 Date:   Wed Dec 20 16:14:33 2023 +0100
    nss: Remove unused allocation from get_nscd_addresses in getaddrinfo
    No bug because this is not visible if glibc is built with
    optimization.  Otherwise this would be a critical resource leak.
    Reviewed-by: Carlos O'Donell <carlos@redhat.com>
 diff --git a/sysdeps/posix/getaddrinfo.c b/sysdeps/posix/getaddrinfo.c
 index e9f47aea358a3351..321a6679d46494a3 100644
 --- a/sysdeps/posix/getaddrinfo.c
 +++ b/sysdeps/posix/getaddrinfo.c
@@ -514,7 +514,6 @@ get_nscd_addresses (const char *name, const struct addrinfo *req,
   int result = 0;
   char *addrs = air->addrs;
 -  struct gaih_addrtuple *addrfree = calloc (air->naddrs, sizeof (*addrfree));
   struct gaih_addrtuple *at = calloc (air->naddrs, sizeof (*at));
   if (at == NULL)
     {
--- a/Show More
+++ b/Show More
`@ -1 +1,2 @@`
	`SOURCES/glibc-2.28.tar.xz`	`SOURCES/glibc-2.34.tar.xz`
		`SOURCES/glibc-upstream-2.34-373.patch`
`@ -1 +1,2 @@`
	`ccb5dc9e51a9884df8488f86982439d47b283b2a SOURCES/glibc-2.28.tar.xz`	`7c3b8890a6346793b6334cc5f2fea5d437d307b8 SOURCES/glibc-2.34.tar.xz`
		`6022f103e5596ad229f22bc966327d71208f7016 SOURCES/glibc-upstream-2.34-373.patch`