diff --git a/glibc-rh2180462-1.patch b/glibc-rh2180462-1.patch
new file mode 100644
index 0000000..3a5d685
--- /dev/null
+++ b/glibc-rh2180462-1.patch
@@ -0,0 +1,216 @@
+From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Wed, 7 Jun 2023 13:18:01 -0500
+Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3
+ / 4`
+Content-type: text/plain; charset=UTF-8
+
+Currently `non_temporal_threshold` is set to roughly '3/4 * sizeof_L3 /
+ncores_per_socket'. This patch updates that value to roughly
+'sizeof_L3 / 4'.
+
+The original value (specifically the division by `ncores_per_socket`)
+was meant to limit the amount of other threads' data a `memcpy`/`memset`
+could evict.
+
+Dividing by `ncores_per_socket`, however, leads to exceedingly low
+non-temporal thresholds and to using non-temporal stores in
+cases where REP MOVSB is multiple times faster.
+
+Furthermore, non-temporal stores are written directly to main memory,
+so using them at a size much smaller than L3 can place soon-to-be-
+accessed data much further away than it otherwise would be. As well,
+modern machines are able to detect streaming patterns (especially if
+REP MOVSB is used) and provide LRU hints to the memory subsystem. This
+in effect caps the total amount of eviction at 1/cache_associativity,
+far below meaningfully thrashing the entire cache.
+
+As best I can tell, the benchmarks that led to this small threshold
+were done comparing non-temporal stores against standard cacheable
+stores. A better comparison (linked below) is against REP MOVSB, which,
+on the measured systems, is nearly 2x faster than non-temporal stores
+at the low end of the previous threshold, and within 10% for copies
+over 100MB (well past even the current threshold). In cases with a
+low number of threads competing for bandwidth, REP MOVSB is ~2x faster
+up to `sizeof_L3`.
+
+The divisor of `4` is a somewhat arbitrary value. From benchmarks it
+seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs
+such as Broadwell prefer something closer to `8`. This patch is meant
+to be followed up by another one making the divisor cpu-specific, but
+in the meantime (and for easier backporting), this patch settles on
+`4` as a middle ground.
+
+Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
+stores were done using:
+https://github.com/goldsteinn/memcpy-nt-benchmarks
+
+Sheets results (also available as a PDF in the GitHub repo):
+https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
+Reviewed-by: DJ Delorie <dj@redhat.com>
+Reviewed-by: Carlos O'Donell <carlos@redhat.com>
+---
+ sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
+ 1 file changed, 43 insertions(+), 27 deletions(-)
+
+[DJ - ported to C8S]
+
+diff -rup a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
+--- a/sysdeps/x86/cacheinfo.h 2023-08-08 11:54:09.969791421 -0400
++++ b/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
+@@ -46,7 +46,7 @@ long int __x86_rep_movsb_threshold attri
+ long int __x86_rep_stosb_threshold attribute_hidden = 2048;
+ 
+ static void
+-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
++get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
+                        long int core)
+ {
+   unsigned int eax;
+@@ -65,6 +65,7 @@ get_common_cache_info (long int *shared_
+   unsigned int family = cpu_features->basic.family;
+   unsigned int model = cpu_features->basic.model;
+   long int shared = *shared_ptr;
++  long int shared_per_thread = *shared_per_thread_ptr;
+   unsigned int threads = *threads_ptr;
+   bool inclusive_cache = true;
+   bool support_count_mask = true;
+@@ -80,6 +81,7 @@ get_common_cache_info (long int *shared_
+           /* Try L2 otherwise.  */
+           level = 2;
+           shared = core;
++          shared_per_thread = core;
+           threads_l2 = 0;
+           threads_l3 = -1;
+         }
+@@ -236,29 +238,28 @@ get_common_cache_info (long int *shared_
+         }
+       else
+         {
+-intel_bug_no_cache_info:
+-          /* Assume that all logical threads share the highest cache
+-             level.  */
+-          threads
+-            = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
+-                >> 16) & 0xff);
+-        }
+-
+-      /* Cap usage of highest cache level to the number of supported
+-         threads.  */
+-      if (shared > 0 && threads > 0)
+-        shared /= threads;
++        intel_bug_no_cache_info:
++          /* Assume that all logical threads share the highest cache
++             level.  */
++          threads = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx >> 16)
++                     & 0xff);
++
++          /* Get per-thread size of highest level cache.  */
++          if (shared_per_thread > 0 && threads > 0)
++            shared_per_thread /= threads;
++        }
+     }
+ 
+   /* Account for non-inclusive L2 and L3 caches.  */
+   if (!inclusive_cache)
+     {
+       if (threads_l2 > 0)
+-        core /= threads_l2;
++        shared_per_thread += core / threads_l2;
+       shared += core;
+     }
+ 
+   *shared_ptr = shared;
++  *shared_per_thread_ptr = shared_per_thread;
+   *threads_ptr = threads;
+ }
+ 
+@@ -272,6 +273,7 @@ init_cacheinfo (void)
+   int max_cpuid_ex;
+   long int data = -1;
+   long int shared = -1;
++  long int shared_per_thread = -1;
+   long int core;
+   unsigned int threads = 0;
+   const struct cpu_features *cpu_features = __get_cpu_features ();
+@@ -287,22 +289,25 @@ init_cacheinfo (void)
+       data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
+       core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
+       shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
++      shared_per_thread = shared;
+ 
+-      get_common_cache_info (&shared, &threads, core);
++      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+     }
+   else if (cpu_features->basic.kind == arch_kind_zhaoxin)
+     {
+       data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
+       core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
+       shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
++      shared_per_thread = shared;
+ 
+-      get_common_cache_info (&shared, &threads, core);
++      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
+     }
+   else if (cpu_features->basic.kind == arch_kind_amd)
+     {
+       data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
+       long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
+       shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
++      shared_per_thread = shared;
+ 
+       /* Get maximum extended function. */
+       __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
+@@ -352,6 +357,9 @@ init_cacheinfo (void)
+           shared += core;
+         }
+     }
++
++      if (shared_per_thread <= 0)
++        shared_per_thread = shared;
+     }
+ 
+   if (cpu_features->data_cache_size != 0)
+@@ -380,20 +388,30 @@ init_cacheinfo (void)
+       __x86_shared_cache_size = shared;
+     }
+ 
+-  /* The default setting for the non_temporal threshold is 3/4 of one
+-     thread's share of the chip's cache. For most Intel and AMD processors
+-     with an initial release date between 2017 and 2020, a thread's typical
+-     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
+-     threshold leaves 125 KBytes to 500 KBytes of the thread's data
+-     in cache after a maximum temporal copy, which will maintain
+-     in cache a reasonable portion of the thread's stack and other
+-     active data. If the threshold is set higher than one thread's
+-     share of the cache, it has a substantial risk of negatively
+-     impacting the performance of other threads running on the chip. */
++  /* The default setting for the non_temporal threshold is 1/4 of size
++     of the chip's cache. For most Intel and AMD processors with an
++     initial release date between 2017 and 2023, a thread's typical
++     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
++     estimate the point where non-temporal stores begin out-competing
++     REP MOVSB. As well the point where the fact that non-temporal
++     stores are forced back to main memory would already occurred to the
++     majority of the lines in the copy. Note, concerns about the
++     entire L3 cache being evicted by the copy are mostly alleviated
++     by the fact that modern HW detects streaming patterns and
++     provides proper LRU hints so that the maximum thrashing
++     capped at 1/associativity. */
++  unsigned long int non_temporal_threshold = shared / 4;
++  /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
++     a higher risk of actually thrashing the cache as they don't have a HW LRU
++     hint. As well, their performance in highly parallel situations is
++     noticeably worse. */
++  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
++    non_temporal_threshold = shared_per_thread * 3 / 4;
++
+   __x86_shared_non_temporal_threshold
+     = (cpu_features->non_temporal_threshold != 0
+        ? cpu_features->non_temporal_threshold
+-       : __x86_shared_cache_size * 3 / 4);
++       : non_temporal_threshold);
+ 
+   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
+   unsigned int minimum_rep_movsb_threshold;
+Only in b/sysdeps/x86: cacheinfo.h~
diff --git a/glibc-rh2180462-2.patch b/glibc-rh2180462-2.patch
new file mode 100644
index 0000000..35d246b
--- /dev/null
+++ b/glibc-rh2180462-2.patch
@@ -0,0 +1,47 @@
+From 47f747217811db35854ea06741be3685e8bbd44d Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Mon, 17 Jul 2023 23:14:33 -0500
+Subject: [PATCH] x86: Fix slight bug in `shared_per_thread` cache size
+ calculation.
+Content-type: text/plain; charset=UTF-8
+
+After:
+```
+ commit af992e7abdc9049714da76cae1e5e18bc4838fb8
+ Author: Noah Goldstein <goldstein.w.n@gmail.com>
+ Date:   Wed Jun 7 13:18:01 2023 -0500
+
+     x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4`
+```
+
+which split `shared` (cumulative cache size) from `shared_per_thread`
+(cache size per socket), `shared_per_thread` *can* be slightly off from
+the previous calculation.
+
+Previously we added `core` even if `threads_l2` was invalid, and only
+used `threads_l2` to divide `core` if it was present. The changed
+version only included `core` if `threads_l2` was valid.
+
+This change restores the old behavior if `threads_l2` is invalid by
+adding the entire value of `core`.
+Reviewed-by: DJ Delorie <dj@redhat.com>
+---
+ sysdeps/x86/dl-cacheinfo.h | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+[DJ - ported to C8S]
+
+diff -rup b1/sysdeps/x86/cacheinfo.h b2/sysdeps/x86/cacheinfo.h
+--- b1/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
++++ b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
+@@ -253,8 +253,8 @@ get_common_cache_info (long int *shared_
+   /* Account for non-inclusive L2 and L3 caches.  */
+   if (!inclusive_cache)
+     {
+-      if (threads_l2 > 0)
+-        shared_per_thread += core / threads_l2;
++      long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
++      shared_per_thread += core_per_thread;
+       shared += core;
+     }
+ 
diff --git a/glibc-rh2180462-3.patch b/glibc-rh2180462-3.patch
new file mode 100644
index 0000000..fd001d3
--- /dev/null
+++ b/glibc-rh2180462-3.patch
@@ -0,0 +1,44 @@
+From 8b9a0af8ca012217bf90d1dc0694f85b49ae09da Mon Sep 17 00:00:00 2001
+From: Noah Goldstein <goldstein.w.n@gmail.com>
+Date: Tue, 18 Jul 2023 10:27:59 -0500
+Subject: [PATCH] [PATCH v1] x86: Use `3/4*sizeof(per-thread-L3)` as low bound
+ for NT threshold.
+Content-type: text/plain; charset=UTF-8
+
+On some machines we end up with incomplete cache information. This can
+make the new calculation of `sizeof(total-L3)/custom-divisor` end up
+lower than intended (and lower than the prior value). So reintroduce
+the old bound as a lower bound to avoid potential regressions where we
+don't have complete information to make the decision.
+Reviewed-by: DJ Delorie <dj@redhat.com>
+---
+ sysdeps/x86/dl-cacheinfo.h | 15 ++++++++++++---
+ 1 file changed, 12 insertions(+), 3 deletions(-)
+
+[DJ - ported to C8S]
+
+diff -rup b2/sysdeps/x86/cacheinfo.h b3/sysdeps/x86/cacheinfo.h
+--- b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
++++ b3/sysdeps/x86/cacheinfo.h 2023-08-08 13:59:14.507988958 -0400
+@@ -401,12 +401,20 @@ init_cacheinfo (void)
+      provides proper LRU hints so that the maximum thrashing
+      capped at 1/associativity. */
+   unsigned long int non_temporal_threshold = shared / 4;
++  /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most
++     likely have incorrect/incomplete cache info in which case, default to
++     3/4 * per-thread L3 to avoid regressions.  */
++  unsigned long int non_temporal_threshold_lowbound
++      = shared_per_thread * 3 / 4;
++  if (non_temporal_threshold < non_temporal_threshold_lowbound)
++    non_temporal_threshold = non_temporal_threshold_lowbound;
++
+   /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
+      a higher risk of actually thrashing the cache as they don't have a HW LRU
+      hint. As well, their performance in highly parallel situations is
+      noticeably worse. */
+   if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+-    non_temporal_threshold = shared_per_thread * 3 / 4;
++    non_temporal_threshold = non_temporal_threshold_lowbound;
+ 
+   __x86_shared_non_temporal_threshold
+     = (cpu_features->non_temporal_threshold != 0
diff --git a/glibc.spec b/glibc.spec
index c806310..1b151be 100644
--- a/glibc.spec
+++ b/glibc.spec
@@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 234%{?dist}
+%define glibcrelease 235%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@@ -1043,6 +1043,9 @@ Patch850: glibc-rh2176707-2.patch
 Patch851: glibc-rh2186781.patch
 Patch852: glibc-rh2224348.patch
 Patch853: glibc-rh2176707-3.patch
+Patch854: glibc-rh2180462-1.patch
+Patch855: glibc-rh2180462-2.patch
+Patch856: glibc-rh2180462-3.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@@ -2873,6 +2876,9 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Tue Aug 8 2023 DJ Delorie <dj@redhat.com> - 2.28-235
+- Fix temporal threshold calculations (#2180462)
+
 * Mon Aug 7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234
 - Ignore symbolic link change on /etc/nsswitch.conf (#2229709)
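
Taken together, the three patches reduce the non-temporal threshold selection to a small decision procedure: default to `sizeof_L3 / 4`, never go below `3/4 * sizeof(per-thread-L3)`, and fall back to that lower bound outright when ERMS is unavailable. The sketch below restates that logic as a standalone C program; it is illustrative only, not glibc source — the function name and the example cache sizes are invented here, and the real code additionally honors a tunable override and reads the sizes from CPUID.

```
#include <stdio.h>
#include <stdbool.h>

/* Standalone sketch (not glibc source) of the threshold selection the
   three patches converge on.  `shared` is the total L3 size and
   `shared_per_thread` is one thread's share of it; both are
   illustrative inputs here rather than values derived from CPUID.  */
static unsigned long int
pick_non_temporal_threshold (long int shared, long int shared_per_thread,
                             bool have_erms)
{
  /* Patch 1: default to 1/4 of the total L3.  */
  unsigned long int non_temporal_threshold = shared / 4;

  /* Patch 3: clamp to the old 3/4-of-per-thread-L3 value as a lower
     bound, guarding against incomplete cache information.  */
  unsigned long int lowbound = shared_per_thread * 3 / 4;
  if (non_temporal_threshold < lowbound)
    non_temporal_threshold = lowbound;

  /* Without ERMS (REP MOVSB), keep the old per-thread bound outright,
     since plain cacheable stores lack the HW LRU streaming hint.  */
  if (!have_erms)
    non_temporal_threshold = lowbound;

  return non_temporal_threshold;
}

int
main (void)
{
  /* Example: a 32 MiB L3 shared by 16 threads, i.e. 2 MiB per thread.  */
  long int shared = 32L * 1024 * 1024;
  long int shared_per_thread = 2L * 1024 * 1024;

  printf ("with ERMS:    %lu bytes\n",
          pick_non_temporal_threshold (shared, shared_per_thread, true));
  printf ("without ERMS: %lu bytes\n",
          pick_non_temporal_threshold (shared, shared_per_thread, false));
  return 0;
}
```

For these example sizes the ERMS path picks 8 MiB (shared / 4) while the non-ERMS path stays at 1.5 MiB (3/4 of the per-thread share), matching the behavior the commit messages describe.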