Fix temporal threshold calculations (#2180462)

Resolves: #2180462
DJ Delorie 2023-08-08 16:22:45 -04:00
parent 1afa752bd9
commit f089a914cf
4 changed files with 314 additions and 1 deletion

glibc-rh2180462-1.patch (new file)

@@ -0,0 +1,216 @@
From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 7 Jun 2023 13:18:01 -0500
Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3
/ 4`
Content-type: text/plain; charset=UTF-8
Currently `non_temporal_threshold` is set to roughly `3/4 * sizeof_L3 /
ncores_per_socket`. This patch updates that value to roughly
`sizeof_L3 / 4`.
The original value (specifically dividing by `ncores_per_socket`) was
chosen to limit the amount of other threads' data a `memcpy`/`memset`
could evict.
Dividing by `ncores_per_socket`, however, leads to exceedingly low
non-temporal thresholds and to using non-temporal stores in
cases where REP MOVSB is multiple times faster.
Furthermore, non-temporal stores are written directly to main memory,
so using them at a size much smaller than the L3 can place
soon-to-be-accessed data much further away than it otherwise would
be. As well, modern machines are able to detect streaming patterns
(especially if REP MOVSB is used) and provide LRU hints to the memory
subsystem. This in effect caps the total amount of eviction at
1/cache_associativity, far below what would meaningfully thrash the
entire cache.
As best I can tell, the benchmarks that led to this small threshold
were done comparing non-temporal stores versus standard cacheable
stores. A better comparison (linked below) is against REP MOVSB,
which, on the measured systems, is nearly 2x faster than non-temporal
stores at the low end of the previous threshold, and within 10% for
copies over 100MB (well past even the current threshold). In cases
with a low number of threads competing for bandwidth, REP MOVSB is
~2x faster up to `sizeof_L3`.
The divisor of `4` is a somewhat arbitrary value. From benchmarks it
seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs
such as Broadwell prefer something closer to `8`. This patch is meant
to be followed up by another one to make the divisor cpu-specific, but
in the meantime (and for easier backporting), this patch settles on
`4` as a middle-ground.
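For concreteness, here is a standalone sketch (not part of the patch)
contrasting the two formulas on a hypothetical machine with a 32 MiB
shared L3 and 16 cores per socket; all numbers are purely illustrative:
```c
/* Illustrative only: contrast the old and new non-temporal thresholds
   for a hypothetical 32 MiB L3 shared by 16 cores per socket.  */
#include <stdio.h>

int
main (void)
{
  long int sizeof_L3 = 32L * 1024 * 1024; /* hypothetical L3 size */
  long int ncores_per_socket = 16;        /* hypothetical core count */

  long int old_threshold = sizeof_L3 * 3 / 4 / ncores_per_socket;
  long int new_threshold = sizeof_L3 / 4;

  /* Prints: old = 1572864 (1.5 MiB), new = 8388608 (8 MiB).  */
  printf ("old = %ld, new = %ld\n", old_threshold, new_threshold);
  return 0;
}
```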
Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
stores were done using:
https://github.com/goldsteinn/memcpy-nt-benchmarks
Sheets results (also available as a PDF on the GitHub repository):
https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
Reviewed-by: DJ Delorie <dj@redhat.com>
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
---
sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
1 file changed, 43 insertions(+), 27 deletions(-)
[DJ - ported to C8S]
diff -rup a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
--- a/sysdeps/x86/cacheinfo.h 2023-08-08 11:54:09.969791421 -0400
+++ b/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
@@ -46,7 +46,7 @@ long int __x86_rep_movsb_threshold attri
long int __x86_rep_stosb_threshold attribute_hidden = 2048;
static void
-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
+get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
long int core)
{
unsigned int eax;
@@ -65,6 +65,7 @@ get_common_cache_info (long int *shared_
unsigned int family = cpu_features->basic.family;
unsigned int model = cpu_features->basic.model;
long int shared = *shared_ptr;
+ long int shared_per_thread = *shared_per_thread_ptr;
unsigned int threads = *threads_ptr;
bool inclusive_cache = true;
bool support_count_mask = true;
@@ -80,6 +81,7 @@ get_common_cache_info (long int *shared_
/* Try L2 otherwise. */
level = 2;
shared = core;
+ shared_per_thread = core;
threads_l2 = 0;
threads_l3 = -1;
}
@@ -236,29 +238,28 @@ get_common_cache_info (long int *shared_
}
else
{
-intel_bug_no_cache_info:
- /* Assume that all logical threads share the highest cache
- level. */
- threads
- = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
- >> 16) & 0xff);
- }
-
- /* Cap usage of highest cache level to the number of supported
- threads. */
- if (shared > 0 && threads > 0)
- shared /= threads;
+ intel_bug_no_cache_info:
+ /* Assume that all logical threads share the highest cache
+ level. */
+ threads = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx >> 16)
+ & 0xff);
+
+ /* Get per-thread size of highest level cache. */
+ if (shared_per_thread > 0 && threads > 0)
+ shared_per_thread /= threads;
+ }
}
/* Account for non-inclusive L2 and L3 caches. */
if (!inclusive_cache)
{
if (threads_l2 > 0)
- core /= threads_l2;
+ shared_per_thread += core / threads_l2;
shared += core;
}
*shared_ptr = shared;
+ *shared_per_thread_ptr = shared_per_thread;
*threads_ptr = threads;
}
@@ -272,6 +273,7 @@ init_cacheinfo (void)
int max_cpuid_ex;
long int data = -1;
long int shared = -1;
+ long int shared_per_thread = -1;
long int core;
unsigned int threads = 0;
const struct cpu_features *cpu_features = __get_cpu_features ();
@@ -287,22 +289,25 @@ init_cacheinfo (void)
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
+ shared_per_thread = shared;
- get_common_cache_info (&shared, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
}
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
{
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
- get_common_cache_info (&shared, &threads, core);
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
}
else if (cpu_features->basic.kind == arch_kind_amd)
{
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
+ shared_per_thread = shared;
/* Get maximum extended function. */
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
@@ -352,6 +357,9 @@ init_cacheinfo (void)
shared += core;
}
}
+
+ if (shared_per_thread <= 0)
+ shared_per_thread = shared;
}
if (cpu_features->data_cache_size != 0)
@@ -380,20 +388,30 @@ init_cacheinfo (void)
__x86_shared_cache_size = shared;
}
- /* The default setting for the non_temporal threshold is 3/4 of one
- thread's share of the chip's cache. For most Intel and AMD processors
- with an initial release date between 2017 and 2020, a thread's typical
- share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
- threshold leaves 125 KBytes to 500 KBytes of the thread's data
- in cache after a maximum temporal copy, which will maintain
- in cache a reasonable portion of the thread's stack and other
- active data. If the threshold is set higher than one thread's
- share of the cache, it has a substantial risk of negatively
- impacting the performance of other threads running on the chip. */
+ /* The default setting for the non_temporal threshold is 1/4 of size
+ of the chip's cache. For most Intel and AMD processors with an
+ initial release date between 2017 and 2023, a thread's typical
+ share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
+ estimate the point where non-temporal stores begin out-competing
+ REP MOVSB. As well the point where the fact that non-temporal
+ stores are forced back to main memory would already occurred to the
+ majority of the lines in the copy. Note, concerns about the
+ entire L3 cache being evicted by the copy are mostly alleviated
+ by the fact that modern HW detects streaming patterns and
+ provides proper LRU hints so that the maximum thrashing
+ capped at 1/associativity. */
+ unsigned long int non_temporal_threshold = shared / 4;
+ /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
+ a higher risk of actually thrashing the cache as they don't have a HW LRU
+ hint. As well, their performance in highly parallel situations is
+ noticeably worse. */
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ non_temporal_threshold = shared_per_thread * 3 / 4;
+
__x86_shared_non_temporal_threshold
= (cpu_features->non_temporal_threshold != 0
? cpu_features->non_temporal_threshold
- : __x86_shared_cache_size * 3 / 4);
+ : non_temporal_threshold);
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
unsigned int minimum_rep_movsb_threshold;
Only in b/sysdeps/x86: cacheinfo.h~
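Condensed into a helper, the selection logic this patch installs in
init_cacheinfo looks roughly like the sketch below. The function name
and the `have_erms` flag are illustrative; the real code tests
CPU_FEATURE_USABLE_P (cpu_features, ERMS) inline as shown in the hunk
above and still honors a preset cpu_features->non_temporal_threshold.
```c
/* Rough sketch of the threshold choice made above; `shared` is the
   total L3 size and `shared_per_thread` one thread's share of it.  */
static unsigned long int
pick_non_temporal_threshold (long int shared, long int shared_per_thread,
                             int have_erms)
{
  /* With ERMS, REP MOVSB stays competitive up to roughly 1/4 of the
     whole L3, so cut over to non-temporal stores there.  */
  unsigned long int threshold = shared / 4;

  /* Without ERMS, ordinary cacheable stores get no streaming/LRU hint
     and are more likely to thrash the shared cache, so keep the older,
     per-thread 3/4 bound.  */
  if (!have_erms)
    threshold = shared_per_thread * 3 / 4;

  return threshold;
}
```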

glibc-rh2180462-2.patch (new file)

@@ -0,0 +1,47 @@
From 47f747217811db35854ea06741be3685e8bbd44d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 Jul 2023 23:14:33 -0500
Subject: [PATCH] x86: Fix slight bug in `shared_per_thread` cache size
calculation.
Content-type: text/plain; charset=UTF-8
After:
```
commit af992e7abdc9049714da76cae1e5e18bc4838fb8
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 7 13:18:01 2023 -0500
x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4`
```
which split `shared` (cumulative cache size) from `shared_per_thread`
(per-thread share of the cache), `shared_per_thread` *can* be slightly
off from the previous calculation.
Previously we added `core` even if `threads_l2` was invalid, and only
used `threads_l2` to divide `core` if it was present. The changed
version only included `core` at all if `threads_l2` was valid.
This change restores the old behavior when `threads_l2` is invalid by
adding the entire value of `core`.
Reviewed-by: DJ Delorie <dj@redhat.com>
---
sysdeps/x86/dl-cacheinfo.h | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
[DJ - ported to C8S]
diff -rup b1/sysdeps/x86/cacheinfo.h b2/sysdeps/x86/cacheinfo.h
--- b1/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
+++ b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
@@ -253,8 +253,8 @@ get_common_cache_info (long int *shared_
/* Account for non-inclusive L2 and L3 caches. */
if (!inclusive_cache)
{
- if (threads_l2 > 0)
- shared_per_thread += core / threads_l2;
+ long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
+ shared_per_thread += core_per_thread;
shared += core;
}
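A minimal standalone sketch of the corrected accounting follows; the
helper name is illustrative, since in the patch the expression lives
directly in get_common_cache_info:
```c
/* Per-thread L2 contribution for non-inclusive caches.  `core` is the
   L2 size; `threads_l2` is the number of threads sharing it, or 0 when
   CPUID did not report a sharing count.  */
static long int
l2_share_per_thread (long int core, unsigned int threads_l2)
{
  /* E.g. core = 512 KiB with threads_l2 = 0: the first patch added
     nothing here, while this fix adds the full 512 KiB, matching the
     pre-split calculation.  */
  return threads_l2 > 0 ? core / threads_l2 : core;
}
```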

glibc-rh2180462-3.patch (new file)

@@ -0,0 +1,44 @@
From 8b9a0af8ca012217bf90d1dc0694f85b49ae09da Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 18 Jul 2023 10:27:59 -0500
Subject: [PATCH] [PATCH v1] x86: Use `3/4*sizeof(per-thread-L3)` as low bound
for NT threshold.
Content-type: text/plain; charset=UTF-8
On some machines we end up with incomplete cache information. This can
make the new calculation of `sizeof(total-L3)/custom-divisor` end up
lower than intended (and lower than the prior value). So reintroduce
the old bound as a lower bound to avoid potentially regressing code
where we don't have complete information to make the decision.
Reviewed-by: DJ Delorie <dj@redhat.com>
---
sysdeps/x86/dl-cacheinfo.h | 15 ++++++++++++---
1 file changed, 12 insertions(+), 3 deletions(-)
[DJ - ported to C8S]
diff -rup b2/sysdeps/x86/cacheinfo.h b3/sysdeps/x86/cacheinfo.h
--- b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
+++ b3/sysdeps/x86/cacheinfo.h 2023-08-08 13:59:14.507988958 -0400
@@ -401,12 +401,20 @@ init_cacheinfo (void)
provides proper LRU hints so that the maximum thrashing
capped at 1/associativity. */
unsigned long int non_temporal_threshold = shared / 4;
+ /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most
+ likely have incorrect/incomplete cache info in which case, default to
+ 3/4 * per-thread L3 to avoid regressions. */
+ unsigned long int non_temporal_threshold_lowbound
+ = shared_per_thread * 3 / 4;
+ if (non_temporal_threshold < non_temporal_threshold_lowbound)
+ non_temporal_threshold = non_temporal_threshold_lowbound;
+
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
a higher risk of actually thrashing the cache as they don't have a HW LRU
hint. As well, their performance in highly parallel situations is
noticeably worse. */
if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- non_temporal_threshold = shared_per_thread * 3 / 4;
+ non_temporal_threshold = non_temporal_threshold_lowbound;
__x86_shared_non_temporal_threshold
= (cpu_features->non_temporal_threshold != 0
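Taking the three patches together, the resulting choice can be
sketched as below; the helper and its `have_erms` argument are
illustrative, and the tunable override via
cpu_features->non_temporal_threshold is omitted:
```c
/* Sketch of the combined logic: shared/4 as the estimate, clamped so
   it never drops below the old 3/4-of-per-thread-L3 bound, which also
   covers machines with incomplete cache information.  */
static unsigned long int
final_non_temporal_threshold (long int shared, long int shared_per_thread,
                              int have_erms)
{
  unsigned long int threshold = shared / 4;
  unsigned long int lowbound = shared_per_thread * 3 / 4;

  if (threshold < lowbound)
    threshold = lowbound;

  /* Without ERMS, stay at the conservative per-thread bound.  */
  if (!have_erms)
    threshold = lowbound;

  return threshold;
}
```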

glibc.spec

@@ -1,6 +1,6 @@
%define glibcsrcdir glibc-2.28
%define glibcversion 2.28
-%define glibcrelease 234%{?dist}
+%define glibcrelease 235%{?dist}
# Pre-release tarballs are pulled in from git using a command that is
# effectively:
#
@@ -1043,6 +1043,9 @@ Patch850: glibc-rh2176707-2.patch
Patch851: glibc-rh2186781.patch
Patch852: glibc-rh2224348.patch
Patch853: glibc-rh2176707-3.patch
+Patch854: glibc-rh2180462-1.patch
+Patch855: glibc-rh2180462-2.patch
+Patch856: glibc-rh2180462-3.patch
##############################################################################
# Continued list of core "glibc" package information:
@@ -2873,6 +2876,9 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
%changelog
+* Tue Aug 8 2023 DJ Delorie <dj@redhat.com> - 2.28-235
+- Fix temporal threshold calculations (#2180462)
+
* Mon Aug 7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234
- Ignore symbolic link change on /etc/nsswitch.conf (#2229709)