From 8c4d8a0e5b46a83e073539e45f7c52ca70f66095 Mon Sep 17 00:00:00 2001 From: DJ Delorie Date: Wed, 26 Jul 2023 22:45:32 -0400 Subject: [PATCH] Fix temporal threshold calculations (#2213907) Resolves: #221390 --- glibc-rh2213907-1.patch | 45 ++++ glibc-rh2213907-2.patch | 223 +++++++++++++++++++ glibc-rh2213907-3.patch | 475 ++++++++++++++++++++++++++++++++++++++++ glibc-rh2213907-4.patch | 178 +++++++++++++++ glibc-rh2213907-5.patch | 49 +++++ glibc-rh2213907-6.patch | 55 +++++ glibc.spec | 11 +- 7 files changed, 1035 insertions(+), 1 deletion(-) create mode 100644 glibc-rh2213907-1.patch create mode 100644 glibc-rh2213907-2.patch create mode 100644 glibc-rh2213907-3.patch create mode 100644 glibc-rh2213907-4.patch create mode 100644 glibc-rh2213907-5.patch create mode 100644 glibc-rh2213907-6.patch diff --git a/glibc-rh2213907-1.patch b/glibc-rh2213907-1.patch new file mode 100644 index 0000000..790ea66 --- /dev/null +++ b/glibc-rh2213907-1.patch @@ -0,0 +1,45 @@ +From ed2f9dc9420c4c61436328778a70459d0a35556a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 8 May 2023 22:10:20 -0500 +Subject: [PATCH] x86: Use 64MB as nt-store threshold if no cacheinfo [BZ + #30429] +Content-type: text/plain; charset=UTF-8 + +If `non_temporal_threshold` is below `minimum_non_temporal_threshold`, +it almost certainly means we failed to read the systems cache info. + +In this case, rather than defaulting the minimum correct value, we +should default to a value that gets at least reasonable +performance. 64MB is chosen conservatively to be at the very high +end. This should never cause non-temporal stores when, if we had read +cache info, we wouldn't have otherwise. +Reviewed-by: Florian Weimer +--- + sysdeps/x86/dl-cacheinfo.h | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index ec88945b39..877e73d700 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -749,8 +749,16 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + reflected in the manual. */ + unsigned long int maximum_non_temporal_threshold = SIZE_MAX >> 4; + unsigned long int minimum_non_temporal_threshold = 0x4040; ++ ++ /* If `non_temporal_threshold` less than `minimum_non_temporal_threshold` ++ it most likely means we failed to detect the cache info. We don't want ++ to default to `minimum_non_temporal_threshold` as such a small value, ++ while correct, has bad performance. We default to 64MB as reasonable ++ default bound. 64MB is likely conservative in that most/all systems would ++ choose a lower value so it should never forcing non-temporal stores when ++ they otherwise wouldn't be used. */ + if (non_temporal_threshold < minimum_non_temporal_threshold) +- non_temporal_threshold = minimum_non_temporal_threshold; ++ non_temporal_threshold = 64 * 1024 * 1024; + else if (non_temporal_threshold > maximum_non_temporal_threshold) + non_temporal_threshold = maximum_non_temporal_threshold; + +-- +2.39.3 + diff --git a/glibc-rh2213907-2.patch b/glibc-rh2213907-2.patch new file mode 100644 index 0000000..cf2aaab --- /dev/null +++ b/glibc-rh2213907-2.patch @@ -0,0 +1,223 @@ +From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 7 Jun 2023 13:18:01 -0500 +Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 + / 4` +Content-type: text/plain; charset=UTF-8 + +Current `non_temporal_threshold` set to roughly '3/4 * sizeof_L3 / +ncores_per_socket'. 
This patch updates that value to roughly +'sizeof_L3 / 4` + +The original value (specifically dividing the `ncores_per_socket`) was +done to limit the amount of other threads' data a `memcpy`/`memset` +could evict. + +Dividing by 'ncores_per_socket', however leads to exceedingly low +non-temporal thresholds and leads to using non-temporal stores in +cases where REP MOVSB is multiple times faster. + +Furthermore, non-temporal stores are written directly to main memory +so using it at a size much smaller than L3 can place soon to be +accessed data much further away than it otherwise could be. As well, +modern machines are able to detect streaming patterns (especially if +REP MOVSB is used) and provide LRU hints to the memory subsystem. This +in affect caps the total amount of eviction at 1/cache_associativity, +far below meaningfully thrashing the entire cache. + +As best I can tell, the benchmarks that lead this small threshold +where done comparing non-temporal stores versus standard cacheable +stores. A better comparison (linked below) is to be REP MOVSB which, +on the measure systems, is nearly 2x faster than non-temporal stores +at the low-end of the previous threshold, and within 10% for over +100MB copies (well past even the current threshold). In cases with a +low number of threads competing for bandwidth, REP MOVSB is ~2x faster +up to `sizeof_L3`. + +The divisor of `4` is a somewhat arbitrary value. From benchmarks it +seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs +such as Broadwell prefer something closer to `8`. This patch is meant +to be followed up by another one to make the divisor cpu-specific, but +in the meantime (and for easier backporting), this patch settles on +`4` as a middle-ground. + +Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable +stores where done using: +https://github.com/goldsteinn/memcpy-nt-benchmarks + +Sheets results (also available in pdf on the github): +https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml +Reviewed-by: DJ Delorie +Reviewed-by: Carlos O'Donell +--- + sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++--------------- + 1 file changed, 43 insertions(+), 27 deletions(-) + + +[diff rebased by DJ] +diff -rup a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +--- a/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:39.386831871 -0400 ++++ b/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:40.372870369 -0400 +@@ -408,7 +408,7 @@ handle_zhaoxin (int name) + } + + static void +-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr, ++get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr, + long int core) + { + unsigned int eax; +@@ -427,6 +427,7 @@ get_common_cache_info (long int *shared_ + unsigned int family = cpu_features->basic.family; + unsigned int model = cpu_features->basic.model; + long int shared = *shared_ptr; ++ long int shared_per_thread = *shared_per_thread_ptr; + unsigned int threads = *threads_ptr; + bool inclusive_cache = true; + bool support_count_mask = true; +@@ -442,6 +443,7 @@ get_common_cache_info (long int *shared_ + /* Try L2 otherwise. */ + level = 2; + shared = core; ++ shared_per_thread = core; + threads_l2 = 0; + threads_l3 = -1; + } +@@ -598,29 +600,28 @@ get_common_cache_info (long int *shared_ + } + else + { +-intel_bug_no_cache_info: +- /* Assume that all logical threads share the highest cache +- level. 
*/ +- threads +- = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16) +- & 0xff); +- } +- +- /* Cap usage of highest cache level to the number of supported +- threads. */ +- if (shared > 0 && threads > 0) +- shared /= threads; ++ intel_bug_no_cache_info: ++ /* Assume that all logical threads share the highest cache ++ level. */ ++ threads = ((cpu_features->features[CPUID_INDEX_1].cpuid.ebx >> 16) ++ & 0xff); ++ ++ /* Get per-thread size of highest level cache. */ ++ if (shared_per_thread > 0 && threads > 0) ++ shared_per_thread /= threads; ++ } + } + + /* Account for non-inclusive L2 and L3 caches. */ + if (!inclusive_cache) + { + if (threads_l2 > 0) +- core /= threads_l2; ++ shared_per_thread += core / threads_l2; + shared += core; + } + + *shared_ptr = shared; ++ *shared_per_thread_ptr = shared_per_thread; + *threads_ptr = threads; + } + +@@ -630,6 +631,7 @@ dl_init_cacheinfo (struct cpu_features * + /* Find out what brand of processor. */ + long int data = -1; + long int shared = -1; ++ long int shared_per_thread = -1; + long int core = -1; + unsigned int threads = 0; + unsigned long int level1_icache_size = -1; +@@ -650,6 +652,7 @@ dl_init_cacheinfo (struct cpu_features * + data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); + core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); + shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); ++ shared_per_thread = shared; + + level1_icache_size + = handle_intel (_SC_LEVEL1_ICACHE_SIZE, cpu_features); +@@ -673,13 +676,14 @@ dl_init_cacheinfo (struct cpu_features * + level4_cache_size + = handle_intel (_SC_LEVEL4_CACHE_SIZE, cpu_features); + +- get_common_cache_info (&shared, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, core); + } + else if (cpu_features->basic.kind == arch_kind_zhaoxin) + { + data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); + core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); + shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); ++ shared_per_thread = shared; + + level1_icache_size = handle_zhaoxin (_SC_LEVEL1_ICACHE_SIZE); + level1_icache_linesize = handle_zhaoxin (_SC_LEVEL1_ICACHE_LINESIZE); +@@ -693,13 +697,14 @@ dl_init_cacheinfo (struct cpu_features * + level3_cache_assoc = handle_zhaoxin (_SC_LEVEL3_CACHE_ASSOC); + level3_cache_linesize = handle_zhaoxin (_SC_LEVEL3_CACHE_LINESIZE); + +- get_common_cache_info (&shared, &threads, core); ++ get_common_cache_info (&shared, &shared_per_thread, &threads, core); + } + else if (cpu_features->basic.kind == arch_kind_amd) + { + data = handle_amd (_SC_LEVEL1_DCACHE_SIZE, cpu_features); + core = handle_amd (_SC_LEVEL2_CACHE_SIZE, cpu_features); + shared = handle_amd (_SC_LEVEL3_CACHE_SIZE, cpu_features); ++ shared_per_thread = shared; + + level1_icache_size = handle_amd (_SC_LEVEL1_ICACHE_SIZE, cpu_features); + level1_icache_linesize +@@ -721,6 +726,9 @@ dl_init_cacheinfo (struct cpu_features * + if (shared <= 0) + /* No shared L3 cache. All we have is the L2 cache. */ + shared = core; ++ ++ if (shared_per_thread <= 0) ++ shared_per_thread = shared; + } + + cpu_features->level1_icache_size = level1_icache_size; +@@ -736,17 +744,25 @@ dl_init_cacheinfo (struct cpu_features * + cpu_features->level3_cache_linesize = level3_cache_linesize; + cpu_features->level4_cache_size = level4_cache_size; + +- /* The default setting for the non_temporal threshold is 3/4 of one +- thread's share of the chip's cache. 
For most Intel and AMD processors +- with an initial release date between 2017 and 2020, a thread's typical +- share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4 +- threshold leaves 125 KBytes to 500 KBytes of the thread's data +- in cache after a maximum temporal copy, which will maintain +- in cache a reasonable portion of the thread's stack and other +- active data. If the threshold is set higher than one thread's +- share of the cache, it has a substantial risk of negatively +- impacting the performance of other threads running on the chip. */ +- unsigned long int non_temporal_threshold = shared * 3 / 4; ++ /* The default setting for the non_temporal threshold is 1/4 of size ++ of the chip's cache. For most Intel and AMD processors with an ++ initial release date between 2017 and 2023, a thread's typical ++ share of the cache is from 18-64MB. Using the 1/4 L3 is meant to ++ estimate the point where non-temporal stores begin out-competing ++ REP MOVSB. As well the point where the fact that non-temporal ++ stores are forced back to main memory would already occurred to the ++ majority of the lines in the copy. Note, concerns about the ++ entire L3 cache being evicted by the copy are mostly alleviated ++ by the fact that modern HW detects streaming patterns and ++ provides proper LRU hints so that the maximum thrashing ++ capped at 1/associativity. */ ++ unsigned long int non_temporal_threshold = shared / 4; ++ /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run ++ a higher risk of actually thrashing the cache as they don't have a HW LRU ++ hint. As well, their performance in highly parallel situations is ++ noticeably worse. */ ++ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ non_temporal_threshold = shared_per_thread * 3 / 4; + /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of + 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best + if that operation cannot overflow. Minimum of 0x4040 (16448) because the + diff --git a/glibc-rh2213907-3.patch b/glibc-rh2213907-3.patch new file mode 100644 index 0000000..5132c37 --- /dev/null +++ b/glibc-rh2213907-3.patch @@ -0,0 +1,475 @@ +From f193ea20eddc6cef84cba54cf1a647204ee6a86b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 7 Jun 2023 13:18:02 -0500 +Subject: [PATCH] x86: Refactor Intel `init_cpu_features` +Content-type: text/plain; charset=UTF-8 + +This patch should have no affect on existing functionality. + +The current code, which has a single switch for model detection and +setting prefered features, is difficult to follow/extend. The cases +use magic numbers and many microarchitectures are missing. This makes +it difficult to reason about what is implemented so far and/or +how/where to add support for new features. + +This patch splits the model detection and preference setting stages so +that CPU preferences can be set based on a complete list of available +microarchitectures, rather than based on model magic numbers. 
+Reviewed-by: DJ Delorie +--- + sysdeps/x86/cpu-features.c | 390 +++++++++++++++++++++++++++++-------- + 1 file changed, 309 insertions(+), 81 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 0a99efdb28..d52a718e92 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -417,6 +417,216 @@ _Static_assert (((index_arch_Fast_Unaligned_Load + == index_arch_Fast_Copy_Backward)), + "Incorrect index_arch_Fast_Unaligned_Load"); + ++ ++/* Intel Family-6 microarch list. */ ++enum ++{ ++ /* Atom processors. */ ++ INTEL_ATOM_BONNELL, ++ INTEL_ATOM_SILVERMONT, ++ INTEL_ATOM_AIRMONT, ++ INTEL_ATOM_GOLDMONT, ++ INTEL_ATOM_GOLDMONT_PLUS, ++ INTEL_ATOM_SIERRAFOREST, ++ INTEL_ATOM_GRANDRIDGE, ++ INTEL_ATOM_TREMONT, ++ ++ /* Bigcore processors. */ ++ INTEL_BIGCORE_MEROM, ++ INTEL_BIGCORE_PENRYN, ++ INTEL_BIGCORE_DUNNINGTON, ++ INTEL_BIGCORE_NEHALEM, ++ INTEL_BIGCORE_WESTMERE, ++ INTEL_BIGCORE_SANDYBRIDGE, ++ INTEL_BIGCORE_IVYBRIDGE, ++ INTEL_BIGCORE_HASWELL, ++ INTEL_BIGCORE_BROADWELL, ++ INTEL_BIGCORE_SKYLAKE, ++ INTEL_BIGCORE_KABYLAKE, ++ INTEL_BIGCORE_COMETLAKE, ++ INTEL_BIGCORE_SKYLAKE_AVX512, ++ INTEL_BIGCORE_CANNONLAKE, ++ INTEL_BIGCORE_ICELAKE, ++ INTEL_BIGCORE_TIGERLAKE, ++ INTEL_BIGCORE_ROCKETLAKE, ++ INTEL_BIGCORE_SAPPHIRERAPIDS, ++ INTEL_BIGCORE_RAPTORLAKE, ++ INTEL_BIGCORE_EMERALDRAPIDS, ++ INTEL_BIGCORE_METEORLAKE, ++ INTEL_BIGCORE_LUNARLAKE, ++ INTEL_BIGCORE_ARROWLAKE, ++ INTEL_BIGCORE_GRANITERAPIDS, ++ ++ /* Mixed (bigcore + atom SOC). */ ++ INTEL_MIXED_LAKEFIELD, ++ INTEL_MIXED_ALDERLAKE, ++ ++ /* KNL. */ ++ INTEL_KNIGHTS_MILL, ++ INTEL_KNIGHTS_LANDING, ++ ++ /* Unknown. */ ++ INTEL_UNKNOWN, ++}; ++ ++static unsigned int ++intel_get_fam6_microarch (unsigned int model, ++ __attribute__ ((unused)) unsigned int stepping) ++{ ++ switch (model) ++ { ++ case 0x1C: ++ case 0x26: ++ return INTEL_ATOM_BONNELL; ++ case 0x27: ++ case 0x35: ++ case 0x36: ++ /* Really Saltwell, but Saltwell is just a die shrink of Bonnell ++ (microarchitecturally identical). */ ++ return INTEL_ATOM_BONNELL; ++ case 0x37: ++ case 0x4A: ++ case 0x4D: ++ case 0x5D: ++ return INTEL_ATOM_SILVERMONT; ++ case 0x4C: ++ case 0x5A: ++ case 0x75: ++ return INTEL_ATOM_AIRMONT; ++ case 0x5C: ++ case 0x5F: ++ return INTEL_ATOM_GOLDMONT; ++ case 0x7A: ++ return INTEL_ATOM_GOLDMONT_PLUS; ++ case 0xAF: ++ return INTEL_ATOM_SIERRAFOREST; ++ case 0xB6: ++ return INTEL_ATOM_GRANDRIDGE; ++ case 0x86: ++ case 0x96: ++ case 0x9C: ++ return INTEL_ATOM_TREMONT; ++ case 0x0F: ++ case 0x16: ++ return INTEL_BIGCORE_MEROM; ++ case 0x17: ++ return INTEL_BIGCORE_PENRYN; ++ case 0x1D: ++ return INTEL_BIGCORE_DUNNINGTON; ++ case 0x1A: ++ case 0x1E: ++ case 0x1F: ++ case 0x2E: ++ return INTEL_BIGCORE_NEHALEM; ++ case 0x25: ++ case 0x2C: ++ case 0x2F: ++ return INTEL_BIGCORE_WESTMERE; ++ case 0x2A: ++ case 0x2D: ++ return INTEL_BIGCORE_SANDYBRIDGE; ++ case 0x3A: ++ case 0x3E: ++ return INTEL_BIGCORE_IVYBRIDGE; ++ case 0x3C: ++ case 0x3F: ++ case 0x45: ++ case 0x46: ++ return INTEL_BIGCORE_HASWELL; ++ case 0x3D: ++ case 0x47: ++ case 0x4F: ++ case 0x56: ++ return INTEL_BIGCORE_BROADWELL; ++ case 0x4E: ++ case 0x5E: ++ return INTEL_BIGCORE_SKYLAKE; ++ case 0x8E: ++ /* ++ Stepping = {9} ++ -> Amberlake ++ Stepping = {10} ++ -> Coffeelake ++ Stepping = {11, 12} ++ -> Whiskeylake ++ else ++ -> Kabylake ++ ++ All of these are derivatives of Kabylake (Skylake client). 
++ */ ++ return INTEL_BIGCORE_KABYLAKE; ++ case 0x9E: ++ /* ++ Stepping = {10, 11, 12, 13} ++ -> Coffeelake ++ else ++ -> Kabylake ++ ++ Coffeelake is a derivatives of Kabylake (Skylake client). ++ */ ++ return INTEL_BIGCORE_KABYLAKE; ++ case 0xA5: ++ case 0xA6: ++ return INTEL_BIGCORE_COMETLAKE; ++ case 0x66: ++ return INTEL_BIGCORE_CANNONLAKE; ++ case 0x55: ++ /* ++ Stepping = {6, 7} ++ -> Cascadelake ++ Stepping = {11} ++ -> Cooperlake ++ else ++ -> Skylake-avx512 ++ ++ These are all microarchitecturally indentical, so use ++ Skylake-avx512 for all of them. ++ */ ++ return INTEL_BIGCORE_SKYLAKE_AVX512; ++ case 0x6A: ++ case 0x6C: ++ case 0x7D: ++ case 0x7E: ++ case 0x9D: ++ return INTEL_BIGCORE_ICELAKE; ++ case 0x8C: ++ case 0x8D: ++ return INTEL_BIGCORE_TIGERLAKE; ++ case 0xA7: ++ return INTEL_BIGCORE_ROCKETLAKE; ++ case 0x8F: ++ return INTEL_BIGCORE_SAPPHIRERAPIDS; ++ case 0xB7: ++ case 0xBA: ++ case 0xBF: ++ return INTEL_BIGCORE_RAPTORLAKE; ++ case 0xCF: ++ return INTEL_BIGCORE_EMERALDRAPIDS; ++ case 0xAA: ++ case 0xAC: ++ return INTEL_BIGCORE_METEORLAKE; ++ case 0xbd: ++ return INTEL_BIGCORE_LUNARLAKE; ++ case 0xc6: ++ return INTEL_BIGCORE_ARROWLAKE; ++ case 0xAD: ++ case 0xAE: ++ return INTEL_BIGCORE_GRANITERAPIDS; ++ case 0x8A: ++ return INTEL_MIXED_LAKEFIELD; ++ case 0x97: ++ case 0x9A: ++ case 0xBE: ++ return INTEL_MIXED_ALDERLAKE; ++ case 0x85: ++ return INTEL_KNIGHTS_MILL; ++ case 0x57: ++ return INTEL_KNIGHTS_LANDING; ++ default: ++ return INTEL_UNKNOWN; ++ } ++} ++ + static inline void + init_cpu_features (struct cpu_features *cpu_features) + { +@@ -453,129 +663,147 @@ init_cpu_features (struct cpu_features *cpu_features) + if (family == 0x06) + { + model += extended_model; +- switch (model) ++ unsigned int microarch ++ = intel_get_fam6_microarch (model, stepping); ++ ++ switch (microarch) + { +- case 0x1c: +- case 0x26: +- /* BSF is slow on Atom. */ ++ /* Atom / KNL tuning. */ ++ case INTEL_ATOM_BONNELL: ++ /* BSF is slow on Bonnell. */ + cpu_features->preferred[index_arch_Slow_BSF] +- |= bit_arch_Slow_BSF; ++ |= bit_arch_Slow_BSF; + break; + +- case 0x57: +- /* Knights Landing. Enable Silvermont optimizations. */ +- +- case 0x7a: +- /* Unaligned load versions are faster than SSSE3 +- on Goldmont Plus. */ +- +- case 0x5c: +- case 0x5f: + /* Unaligned load versions are faster than SSSE3 +- on Goldmont. */ ++ on Airmont, Silvermont, Goldmont, and Goldmont Plus. */ ++ case INTEL_ATOM_AIRMONT: ++ case INTEL_ATOM_SILVERMONT: ++ case INTEL_ATOM_GOLDMONT: ++ case INTEL_ATOM_GOLDMONT_PLUS: + +- case 0x4c: +- case 0x5a: +- case 0x75: +- /* Airmont is a die shrink of Silvermont. */ ++ /* Knights Landing. Enable Silvermont optimizations. */ ++ case INTEL_KNIGHTS_LANDING: + +- case 0x37: +- case 0x4a: +- case 0x4d: +- case 0x5d: +- /* Unaligned load versions are faster than SSSE3 +- on Silvermont. */ + cpu_features->preferred[index_arch_Fast_Unaligned_Load] +- |= (bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); ++ |= (bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); + break; + +- case 0x86: +- case 0x96: +- case 0x9c: ++ case INTEL_ATOM_TREMONT: + /* Enable rep string instructions, unaligned load, unaligned +- copy, pminub and avoid SSE 4.2 on Tremont. */ ++ copy, pminub and avoid SSE 4.2 on Tremont. 
*/ + cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop +- | bit_arch_Slow_SSE4_2); ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop ++ | bit_arch_Slow_SSE4_2); + break; + ++ /* ++ Default tuned Knights microarch. ++ case INTEL_KNIGHTS_MILL: ++ */ ++ ++ /* ++ Default tuned atom microarch. ++ case INTEL_ATOM_SIERRAFOREST: ++ case INTEL_ATOM_GRANDRIDGE: ++ */ ++ ++ /* Bigcore/Default Tuning. */ + default: + /* Unknown family 0x06 processors. Assuming this is one + of Core i3/i5/i7 processors if AVX is available. */ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; + /* Fall through. */ +- +- case 0x1a: +- case 0x1e: +- case 0x1f: +- case 0x25: +- case 0x2c: +- case 0x2e: +- case 0x2f: ++ case INTEL_BIGCORE_NEHALEM: ++ case INTEL_BIGCORE_WESTMERE: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. */ + cpu_features->preferred[index_arch_Fast_Rep_String] +- |= (bit_arch_Fast_Rep_String +- | bit_arch_Fast_Unaligned_Load +- | bit_arch_Fast_Unaligned_Copy +- | bit_arch_Prefer_PMINUB_for_stringop); ++ |= (bit_arch_Fast_Rep_String ++ | bit_arch_Fast_Unaligned_Load ++ | bit_arch_Fast_Unaligned_Copy ++ | bit_arch_Prefer_PMINUB_for_stringop); + break; ++ ++ /* ++ Default tuned Bigcore microarch. ++ case INTEL_BIGCORE_SANDYBRIDGE: ++ case INTEL_BIGCORE_IVYBRIDGE: ++ case INTEL_BIGCORE_HASWELL: ++ case INTEL_BIGCORE_BROADWELL: ++ case INTEL_BIGCORE_SKYLAKE: ++ case INTEL_BIGCORE_KABYLAKE: ++ case INTEL_BIGCORE_COMETLAKE: ++ case INTEL_BIGCORE_SKYLAKE_AVX512: ++ case INTEL_BIGCORE_CANNONLAKE: ++ case INTEL_BIGCORE_ICELAKE: ++ case INTEL_BIGCORE_TIGERLAKE: ++ case INTEL_BIGCORE_ROCKETLAKE: ++ case INTEL_BIGCORE_RAPTORLAKE: ++ case INTEL_BIGCORE_METEORLAKE: ++ case INTEL_BIGCORE_LUNARLAKE: ++ case INTEL_BIGCORE_ARROWLAKE: ++ case INTEL_BIGCORE_SAPPHIRERAPIDS: ++ case INTEL_BIGCORE_EMERALDRAPIDS: ++ case INTEL_BIGCORE_GRANITERAPIDS: ++ */ ++ ++ /* ++ Default tuned Mixed (bigcore + atom SOC). ++ case INTEL_MIXED_LAKEFIELD: ++ case INTEL_MIXED_ALDERLAKE: ++ */ + } + +- /* Disable TSX on some processors to avoid TSX on kernels that +- weren't updated with the latest microcode package (which +- disables broken feature by default). */ +- switch (model) ++ /* Disable TSX on some processors to avoid TSX on kernels that ++ weren't updated with the latest microcode package (which ++ disables broken feature by default). */ ++ switch (microarch) + { +- case 0x55: ++ case INTEL_BIGCORE_SKYLAKE_AVX512: ++ /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */ + if (stepping <= 5) + goto disable_tsx; + break; +- case 0x8e: +- /* NB: Although the errata documents that for model == 0x8e, +- only 0xb stepping or lower are impacted, the intention of +- the errata was to disable TSX on all client processors on +- all steppings. Include 0xc stepping which is an Intel +- Core i7-8665U, a client mobile processor. */ +- case 0x9e: ++ ++ case INTEL_BIGCORE_KABYLAKE: ++ /* NB: Although the errata documents that for model == 0x8e ++ (kabylake skylake client), only 0xb stepping or lower are ++ impacted, the intention of the errata was to disable TSX on ++ all client processors on all steppings. Include 0xc ++ stepping which is an Intel Core i7-8665U, a client mobile ++ processor. */ + if (stepping > 0xc) + break; + /* Fall through. 
*/ +- case 0x4e: +- case 0x5e: +- { ++ case INTEL_BIGCORE_SKYLAKE: + /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for + processors listed in: + + https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + */ +-disable_tsx: ++ disable_tsx: + CPU_FEATURE_UNSET (cpu_features, HLE); + CPU_FEATURE_UNSET (cpu_features, RTM); + CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT); +- } +- break; +- case 0x3f: +- /* Xeon E7 v3 with stepping >= 4 has working TSX. */ +- if (stepping >= 4) + break; +- /* Fall through. */ +- case 0x3c: +- case 0x45: +- case 0x46: +- /* Disable Intel TSX on Haswell processors (except Xeon E7 v3 +- with stepping >= 4) to avoid TSX on kernels that weren't +- updated with the latest microcode package (which disables +- broken feature by default). */ +- CPU_FEATURE_UNSET (cpu_features, RTM); +- break; ++ ++ case INTEL_BIGCORE_HASWELL: ++ /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working ++ TSX. Haswell also include other model numbers that have ++ working TSX. */ ++ if (model == 0x3f && stepping >= 4) ++ break; ++ ++ CPU_FEATURE_UNSET (cpu_features, RTM); ++ break; + } + } + +-- +2.39.3 + diff --git a/glibc-rh2213907-4.patch b/glibc-rh2213907-4.patch new file mode 100644 index 0000000..d2d4293 --- /dev/null +++ b/glibc-rh2213907-4.patch @@ -0,0 +1,178 @@ +From 180897c161a171d8ef0faee1c6c9fd6b57d8b13b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 7 Jun 2023 13:18:03 -0500 +Subject: [PATCH] x86: Make the divisor in setting `non_temporal_threshold` cpu + specific +Content-type: text/plain; charset=UTF-8 + +Different systems prefer a different divisors. + +From benchmarks[1] so far the following divisors have been found: + ICX : 2 + SKX : 2 + BWD : 8 + +For Intel, we are generalizing that BWD and older prefers 8 as a +divisor, and SKL and newer prefers 2. This number can be further tuned +as benchmarks are run. + +[1]: https://github.com/goldsteinn/memcpy-nt-benchmarks +Reviewed-by: DJ Delorie +--- + sysdeps/x86/cpu-features.c | 31 ++++++++++++++++++++--------- + sysdeps/x86/dl-cacheinfo.h | 32 ++++++++++++++++++------------ + sysdeps/x86/dl-diagnostics-cpu.c | 11 ++++++---- + sysdeps/x86/include/cpu-features.h | 3 +++ + 4 files changed, 51 insertions(+), 26 deletions(-) + +[DJ - edited for ABI compatibility] + +diff -rup a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +--- a/sysdeps/x86/cpu-features.c 2023-07-26 17:56:19.679300711 -0400 ++++ b/sysdeps/x86/cpu-features.c 2023-07-28 15:27:00.336324265 -0400 +@@ -35,6 +35,9 @@ extern void TUNABLE_CALLBACK (set_x86_sh + # endif + #endif + ++unsigned long int __rtld_global_ro_cachesize_non_temporal_divisor ++ attribute_hidden; ++ + #if CET_ENABLED + # include + #endif +@@ -614,6 +617,7 @@ init_cpu_features (struct cpu_features * + unsigned int stepping = 0; + enum cpu_features_kind kind; + ++ __rtld_global_ro_cachesize_non_temporal_divisor = 4; + #if !HAS_CPUID + if (__get_cpuid_max (0, 0) == 0) + { +@@ -694,13 +698,13 @@ init_cpu_features (struct cpu_features * + + /* Bigcore/Default Tuning. */ + default: ++ default_tuning: + /* Unknown family 0x06 processors. Assuming this is one + of Core i3/i5/i7 processors if AVX is available. */ + if (!CPU_FEATURES_CPU_P (cpu_features, AVX)) + break; +- /* Fall through. */ +- case INTEL_BIGCORE_NEHALEM: +- case INTEL_BIGCORE_WESTMERE: ++ ++ enable_modern_features: + /* Rep string instructions, unaligned load, unaligned copy, + and pminub are fast on Intel Core i3, i5 and i7. 
*/ + cpu_features->preferred[index_arch_Fast_Rep_String] +@@ -710,12 +714,23 @@ init_cpu_features (struct cpu_features * + | bit_arch_Prefer_PMINUB_for_stringop); + break; + +- /* +- Default tuned Bigcore microarch. ++ case INTEL_BIGCORE_NEHALEM: ++ case INTEL_BIGCORE_WESTMERE: ++ /* Older CPUs prefer non-temporal stores at lower threshold. */ ++ __rtld_global_ro_cachesize_non_temporal_divisor = 8; ++ goto enable_modern_features; ++ ++ /* Older Bigcore microarch (smaller non-temporal store ++ threshold). */ + case INTEL_BIGCORE_SANDYBRIDGE: + case INTEL_BIGCORE_IVYBRIDGE: + case INTEL_BIGCORE_HASWELL: + case INTEL_BIGCORE_BROADWELL: ++ __rtld_global_ro_cachesize_non_temporal_divisor = 8; ++ goto default_tuning; ++ ++ /* Newer Bigcore microarch (larger non-temporal store ++ threshold). */ + case INTEL_BIGCORE_SKYLAKE: + case INTEL_BIGCORE_KABYLAKE: + case INTEL_BIGCORE_COMETLAKE: +@@ -731,13 +746,14 @@ init_cpu_features (struct cpu_features * + case INTEL_BIGCORE_SAPPHIRERAPIDS: + case INTEL_BIGCORE_EMERALDRAPIDS: + case INTEL_BIGCORE_GRANITERAPIDS: +- */ ++ __rtld_global_ro_cachesize_non_temporal_divisor = 2; ++ goto default_tuning; + +- /* +- Default tuned Mixed (bigcore + atom SOC). ++ /* Default tuned Mixed (bigcore + atom SOC). */ + case INTEL_MIXED_LAKEFIELD: + case INTEL_MIXED_ALDERLAKE: +- */ ++ __rtld_global_ro_cachesize_non_temporal_divisor = 2; ++ goto default_tuning; + } + + /* Disable TSX on some processors to avoid TSX on kernels that +diff -rup a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +--- a/sysdeps/x86/dl-cacheinfo.h 2023-07-26 17:56:18.662261475 -0400 ++++ b/sysdeps/x86/dl-cacheinfo.h 2023-07-26 17:56:20.756342261 -0400 +@@ -744,19 +744,25 @@ dl_init_cacheinfo (struct cpu_features * + cpu_features->level3_cache_linesize = level3_cache_linesize; + cpu_features->level4_cache_size = level4_cache_size; + +- /* The default setting for the non_temporal threshold is 1/4 of size +- of the chip's cache. For most Intel and AMD processors with an +- initial release date between 2017 and 2023, a thread's typical +- share of the cache is from 18-64MB. Using the 1/4 L3 is meant to +- estimate the point where non-temporal stores begin out-competing +- REP MOVSB. As well the point where the fact that non-temporal +- stores are forced back to main memory would already occurred to the +- majority of the lines in the copy. Note, concerns about the +- entire L3 cache being evicted by the copy are mostly alleviated +- by the fact that modern HW detects streaming patterns and +- provides proper LRU hints so that the maximum thrashing +- capped at 1/associativity. */ +- unsigned long int non_temporal_threshold = shared / 4; ++ unsigned long int cachesize_non_temporal_divisor ++ = __rtld_global_ro_cachesize_non_temporal_divisor; ++ if (cachesize_non_temporal_divisor <= 0) ++ cachesize_non_temporal_divisor = 4; ++ ++ /* The default setting for the non_temporal threshold is [1/8, 1/2] of size ++ of the chip's cache (depending on `cachesize_non_temporal_divisor` which ++ is microarch specific. The defeault is 1/4). For most Intel and AMD ++ processors with an initial release date between 2017 and 2023, a thread's ++ typical share of the cache is from 18-64MB. Using a reasonable size ++ fraction of L3 is meant to estimate the point where non-temporal stores ++ begin out-competing REP MOVSB. As well the point where the fact that ++ non-temporal stores are forced back to main memory would already occurred ++ to the majority of the lines in the copy. 
Note, concerns about the entire ++ L3 cache being evicted by the copy are mostly alleviated by the fact that ++ modern HW detects streaming patterns and provides proper LRU hints so that ++ the maximum thrashing capped at 1/associativity. */ ++ unsigned long int non_temporal_threshold ++ = shared / cachesize_non_temporal_divisor; + /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run + a higher risk of actually thrashing the cache as they don't have a HW LRU + hint. As well, their performance in highly parallel situations is +diff -rup a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c +--- a/sysdeps/x86/dl-diagnostics-cpu.c 2021-08-01 21:33:43.000000000 -0400 ++++ b/sysdeps/x86/dl-diagnostics-cpu.c 2023-07-26 17:56:20.761342454 -0400 +@@ -117,4 +117,6 @@ _dl_diagnostics_cpu (void) + + sizeof (cpu_features->level4_cache_size) + == sizeof (*cpu_features), + "last cpu_features field has been printed"); ++ print_cpu_features_value ("cachesize_non_temporal_divisor", ++ __rtld_global_ro_cachesize_non_temporal_divisor); + } +diff -rup a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h +--- a/sysdeps/x86/include/cpu-features.h 2021-08-01 21:33:43.000000000 -0400 ++++ b/sysdeps/x86/include/cpu-features.h 2023-07-27 13:51:52.081494751 -0400 +@@ -919,6 +919,10 @@ struct cpu_features + unsigned long int level4_cache_size; + }; + ++/* When no user non_temporal_threshold is specified. We default to ++ cachesize / cachesize_non_temporal_divisor. */ ++extern unsigned long int __rtld_global_ro_cachesize_non_temporal_divisor; ++ + /* Get a pointer to the CPU features structure. */ + extern const struct cpu_features *_dl_x86_get_cpu_features (void) + __attribute__ ((pure)); diff --git a/glibc-rh2213907-5.patch b/glibc-rh2213907-5.patch new file mode 100644 index 0000000..0e563ec --- /dev/null +++ b/glibc-rh2213907-5.patch @@ -0,0 +1,49 @@ +From 47f747217811db35854ea06741be3685e8bbd44d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 Jul 2023 23:14:33 -0500 +Subject: [PATCH] x86: Fix slight bug in `shared_per_thread` cache size + calculation. +Content-type: text/plain; charset=UTF-8 + +After: +``` + commit af992e7abdc9049714da76cae1e5e18bc4838fb8 + Author: Noah Goldstein + Date: Wed Jun 7 13:18:01 2023 -0500 + + x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4` +``` + +Split `shared` (cumulative cache size) from `shared_per_thread` (cache +size per socket), the `shared_per_thread` *can* be slightly off from +the previous calculation. + +Previously we added `core` even if `threads_l2` was invalid, and only +used `threads_l2` to divide `core` if it was present. The changed +version only included `core` if `threads_l2` was valid. + +This change restores the old behavior if `threads_l2` is invalid by +adding the entire value of `core`. +Reviewed-by: DJ Delorie +--- + sysdeps/x86/dl-cacheinfo.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index c98fa57a7b..43be2c1229 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -614,8 +614,8 @@ get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, u + /* Account for non-inclusive L2 and L3 caches. */ + if (!inclusive_cache) + { +- if (threads_l2 > 0) +- shared_per_thread += core / threads_l2; ++ long int core_per_thread = threads_l2 > 0 ? 
(core / threads_l2) : core; ++ shared_per_thread += core_per_thread; + shared += core; + } + +-- +2.39.3 + diff --git a/glibc-rh2213907-6.patch b/glibc-rh2213907-6.patch new file mode 100644 index 0000000..e7a36cf --- /dev/null +++ b/glibc-rh2213907-6.patch @@ -0,0 +1,55 @@ +From 8b9a0af8ca012217bf90d1dc0694f85b49ae09da Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 18 Jul 2023 10:27:59 -0500 +Subject: [PATCH] [PATCH v1] x86: Use `3/4*sizeof(per-thread-L3)` as low bound + for NT threshold. +Content-type: text/plain; charset=UTF-8 + +On some machines we end up with incomplete cache information. This can +make the new calculation of `sizeof(total-L3)/custom-divisor` end up +lower than intended (and lower than the prior value). So reintroduce +the old bound as a lower bound to avoid potentially regressing code +where we don't have complete information to make the decision. +Reviewed-by: DJ Delorie +--- + sysdeps/x86/dl-cacheinfo.h | 15 ++++++++++++--- + 1 file changed, 12 insertions(+), 3 deletions(-) + +[diff rebased by DJ] +diff -rup a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +--- a/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:43.343986368 -0400 ++++ b/sysdeps/x86/dl-cacheinfo.h 2023-07-25 00:38:44.336025100 -0400 +@@ -751,8 +751,8 @@ dl_init_cacheinfo (struct cpu_features * + + /* The default setting for the non_temporal threshold is [1/8, 1/2] of size + of the chip's cache (depending on `cachesize_non_temporal_divisor` which +- is microarch specific. The defeault is 1/4). For most Intel and AMD +- processors with an initial release date between 2017 and 2023, a thread's ++ is microarch specific. The default is 1/4). For most Intel processors ++ with an initial release date between 2017 and 2023, a thread's + typical share of the cache is from 18-64MB. Using a reasonable size + fraction of L3 is meant to estimate the point where non-temporal stores + begin out-competing REP MOVSB. As well the point where the fact that +@@ -763,12 +763,21 @@ dl_init_cacheinfo (struct cpu_features * + the maximum thrashing capped at 1/associativity. */ + unsigned long int non_temporal_threshold + = shared / cachesize_non_temporal_divisor; ++ ++ /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most ++ likely have incorrect/incomplete cache info in which case, default to ++ 3/4 * per-thread L3 to avoid regressions. */ ++ unsigned long int non_temporal_threshold_lowbound ++ = shared_per_thread * 3 / 4; ++ if (non_temporal_threshold < non_temporal_threshold_lowbound) ++ non_temporal_threshold = non_temporal_threshold_lowbound; ++ + /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run + a higher risk of actually thrashing the cache as they don't have a HW LRU + hint. As well, their performance in highly parallel situations is + noticeably worse. */ + if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- non_temporal_threshold = shared_per_thread * 3 / 4; ++ non_temporal_threshold = non_temporal_threshold_lowbound; + /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of + 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best + if that operation cannot overflow. 
Minimum of 0x4040 (16448) because the diff --git a/glibc.spec b/glibc.spec index 9b875bd..53e832a 100644 --- a/glibc.spec +++ b/glibc.spec @@ -155,7 +155,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 78%{?dist} +Release: 79%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -737,6 +737,12 @@ Patch496: glibc-rh2224349.patch Patch497: glibc-rh2224289-3.patch Patch498: glibc-rh2224504-1.patch Patch499: glibc-rh2224504-2.patch +Patch500: glibc-rh2213907-1.patch +Patch501: glibc-rh2213907-2.patch +Patch502: glibc-rh2213907-3.patch +Patch503: glibc-rh2213907-4.patch +Patch504: glibc-rh2213907-5.patch +Patch505: glibc-rh2213907-6.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2894,6 +2900,9 @@ update_gconv_modules_cache () %endif %changelog +* Tue Aug 8 2023 DJ Delorie - 2.34-79 +- Fix temporal threshold calculations (#2213907) + * Fri Aug 4 2023 Florian Weimer - 2.34-78 - Ignore symbolic link change on /etc/nsswitch.conf (#2229156)
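
For reference, the six patches above combine into one threshold-selection sequence in dl_init_cacheinfo. The sketch below is a standalone illustration, not glibc source: the function name compute_nt_threshold, its parameters, and the cache sizes in main are hypothetical stand-ins for the values dl-cacheinfo.h derives from CPUID, and anything outside the hunks shown above is left out.

/* Standalone sketch of the combined logic (hypothetical names and
   example inputs; not glibc source).  */
#include <stdint.h>
#include <stdio.h>

static unsigned long int
compute_nt_threshold (unsigned long int shared,            /* total L3 size */
                      unsigned long int shared_per_thread, /* per-thread L3 share */
                      unsigned long int divisor,           /* 2, 4 or 8 per microarch */
                      int has_erms)                        /* ERMS usable?  */
{
  if (divisor == 0)
    divisor = 4;                /* fallback default divisor */

  /* Patches 2 and 4: default to a microarch-specific fraction of L3.  */
  unsigned long int threshold = shared / divisor;

  /* Patch 6: never drop below 3/4 of the per-thread share, which also
     guards against incomplete cache information.  */
  unsigned long int lowbound = shared_per_thread * 3 / 4;
  if (threshold < lowbound)
    threshold = lowbound;

  /* Without ERMS, keep the per-thread chunking introduced in patch 2.  */
  if (!has_erms)
    threshold = lowbound;

  /* Existing bounds: the maximum keeps the later right shift by
     LOG_4X_MEMCPY_THRESH (4) from overflowing; a result below 0x4040
     means cache detection failed, so patch 1 substitutes 64 MB.  */
  unsigned long int maximum = SIZE_MAX >> 4;
  unsigned long int minimum = 0x4040;
  if (threshold < minimum)
    threshold = 64 * 1024 * 1024;
  else if (threshold > maximum)
    threshold = maximum;

  return threshold;
}

int
main (void)
{
  /* Example: 32 MB L3 shared by 16 threads on a Skylake-class part
     (divisor 2) with ERMS available.  */
  unsigned long int t
    = compute_nt_threshold (32UL * 1024 * 1024, 2UL * 1024 * 1024, 2, 1);
  printf ("non_temporal_threshold = %lu bytes\n", t);  /* 16777216 */
  return 0;
}

In this form it is easy to see that the 64 MB fallback from patch 1 only triggers when both the L3-based estimate and the 3/4 per-thread lower bound are tiny, i.e. when cache detection failed outright.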