forked from rpms/glibc
parent
1afa752bd9
commit
f089a914cf
216
glibc-rh2180462-1.patch
Normal file
216
glibc-rh2180462-1.patch
Normal file
@ -0,0 +1,216 @@
|
|||||||
|
From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Wed, 7 Jun 2023 13:18:01 -0500
|
||||||
|
Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3
|
||||||
|
/ 4`
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
The current `non_temporal_threshold` is set to roughly '3/4 * sizeof_L3 /
|
||||||
|
ncores_per_socket'. This patch updates that value to roughly
|
||||||
|
`sizeof_L3 / 4`.
|
||||||
|
|
||||||
|
The original value (specifically dividing by `ncores_per_socket`) was
|
||||||
|
done to limit the amount of other threads' data a `memcpy`/`memset`
|
||||||
|
could evict.
|
||||||
|
|
||||||
|
Dividing by 'ncores_per_socket', however, leads to exceedingly low
|
||||||
|
non-temporal thresholds and leads to using non-temporal stores in
|
||||||
|
cases where REP MOVSB is multiple times faster.
|
||||||
|
|
||||||
|
Furthermore, non-temporal stores are written directly to main memory
|
||||||
|
so using it at a size much smaller than L3 can place soon to be
|
||||||
|
accessed data much further away than it otherwise could be. As well,
|
||||||
|
modern machines are able to detect streaming patterns (especially if
|
||||||
|
REP MOVSB is used) and provide LRU hints to the memory subsystem. This
|
||||||
|
in effect caps the total amount of eviction at 1/cache_associativity,
|
||||||
|
far below meaningfully thrashing the entire cache.
|
||||||
|
|
||||||
|
As best I can tell, the benchmarks that led to this small threshold
|
||||||
|
were done comparing non-temporal stores versus standard cacheable
|
||||||
|
stores. A better comparison (linked below) is to REP MOVSB which,
|
||||||
|
on the measured systems, is nearly 2x faster than non-temporal stores
|
||||||
|
at the low-end of the previous threshold, and within 10% for over
|
||||||
|
100MB copies (well past even the current threshold). In cases with a
|
||||||
|
low number of threads competing for bandwidth, REP MOVSB is ~2x faster
|
||||||
|
up to `sizeof_L3`.
|
||||||
|
|
||||||
|
The divisor of `4` is a somewhat arbitrary value. From benchmarks it
|
||||||
|
seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs
|
||||||
|
such as Broadwell prefer something closer to `8`. This patch is meant
|
||||||
|
to be followed up by another one to make the divisor cpu-specific, but
|
||||||
|
in the meantime (and for easier backporting), this patch settles on
|
||||||
|
`4` as a middle-ground.
|
||||||
|
|
||||||
|
Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable
|
||||||
|
stores were done using:
|
||||||
|
https://github.com/goldsteinn/memcpy-nt-benchmarks
|
||||||
|
|
||||||
|
Sheets results (also available in pdf on the github):
|
||||||
|
https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
Reviewed-by: Carlos O'Donell <carlos@redhat.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++---------------
|
||||||
|
1 file changed, 43 insertions(+), 27 deletions(-)
|
||||||
|
|
||||||
|
[DJ - ported to C8S]
|
||||||
|
|
||||||
|
diff -rup a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
||||||
|
--- a/sysdeps/x86/cacheinfo.h 2023-08-08 11:54:09.969791421 -0400
|
||||||
|
+++ b/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
|
||||||
|
@@ -46,7 +46,7 @@ long int __x86_rep_movsb_threshold attri
|
||||||
|
long int __x86_rep_stosb_threshold attribute_hidden = 2048;
|
||||||
|
|
||||||
|
static void
|
||||||
|
-get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
|
||||||
|
+get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
|
||||||
|
long int core)
|
||||||
|
{
|
||||||
|
unsigned int eax;
|
||||||
|
@@ -65,6 +65,7 @@ get_common_cache_info (long int *shared_
|
||||||
|
unsigned int family = cpu_features->basic.family;
|
||||||
|
unsigned int model = cpu_features->basic.model;
|
||||||
|
long int shared = *shared_ptr;
|
||||||
|
+ long int shared_per_thread = *shared_per_thread_ptr;
|
||||||
|
unsigned int threads = *threads_ptr;
|
||||||
|
bool inclusive_cache = true;
|
||||||
|
bool support_count_mask = true;
|
||||||
|
@@ -80,6 +81,7 @@ get_common_cache_info (long int *shared_
|
||||||
|
/* Try L2 otherwise. */
|
||||||
|
level = 2;
|
||||||
|
shared = core;
|
||||||
|
+ shared_per_thread = core;
|
||||||
|
threads_l2 = 0;
|
||||||
|
threads_l3 = -1;
|
||||||
|
}
|
||||||
|
@@ -236,29 +238,28 @@ get_common_cache_info (long int *shared_
|
||||||
|
}
|
||||||
|
else
|
||||||
|
{
|
||||||
|
-intel_bug_no_cache_info:
|
||||||
|
- /* Assume that all logical threads share the highest cache
|
||||||
|
- level. */
|
||||||
|
- threads
|
||||||
|
- = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
|
||||||
|
- >> 16) & 0xff);
|
||||||
|
- }
|
||||||
|
-
|
||||||
|
- /* Cap usage of highest cache level to the number of supported
|
||||||
|
- threads. */
|
||||||
|
- if (shared > 0 && threads > 0)
|
||||||
|
- shared /= threads;
|
||||||
|
+ intel_bug_no_cache_info:
|
||||||
|
+ /* Assume that all logical threads share the highest cache
|
||||||
|
+ level. */
|
||||||
|
+ threads = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx >> 16)
|
||||||
|
+ & 0xff);
|
||||||
|
+
|
||||||
|
+ /* Get per-thread size of highest level cache. */
|
||||||
|
+ if (shared_per_thread > 0 && threads > 0)
|
||||||
|
+ shared_per_thread /= threads;
|
||||||
|
+ }
|
||||||
|
}
|
||||||
|
|
||||||
|
/* Account for non-inclusive L2 and L3 caches. */
|
||||||
|
if (!inclusive_cache)
|
||||||
|
{
|
||||||
|
if (threads_l2 > 0)
|
||||||
|
- core /= threads_l2;
|
||||||
|
+ shared_per_thread += core / threads_l2;
|
||||||
|
shared += core;
|
||||||
|
}
|
||||||
|
|
||||||
|
*shared_ptr = shared;
|
||||||
|
+ *shared_per_thread_ptr = shared_per_thread;
|
||||||
|
*threads_ptr = threads;
|
||||||
|
}
|
||||||
|
|
||||||
|
@@ -272,6 +273,7 @@ init_cacheinfo (void)
|
||||||
|
int max_cpuid_ex;
|
||||||
|
long int data = -1;
|
||||||
|
long int shared = -1;
|
||||||
|
+ long int shared_per_thread = -1;
|
||||||
|
long int core;
|
||||||
|
unsigned int threads = 0;
|
||||||
|
const struct cpu_features *cpu_features = __get_cpu_features ();
|
||||||
|
@@ -287,22 +289,25 @@ init_cacheinfo (void)
|
||||||
|
data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features);
|
||||||
|
core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features);
|
||||||
|
shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features);
|
||||||
|
+ shared_per_thread = shared;
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_zhaoxin)
|
||||||
|
{
|
||||||
|
data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
+ shared_per_thread = shared;
|
||||||
|
|
||||||
|
- get_common_cache_info (&shared, &threads, core);
|
||||||
|
+ get_common_cache_info (&shared, &shared_per_thread, &threads, core);
|
||||||
|
}
|
||||||
|
else if (cpu_features->basic.kind == arch_kind_amd)
|
||||||
|
{
|
||||||
|
data = handle_amd (_SC_LEVEL1_DCACHE_SIZE);
|
||||||
|
long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE);
|
||||||
|
shared = handle_amd (_SC_LEVEL3_CACHE_SIZE);
|
||||||
|
+ shared_per_thread = shared;
|
||||||
|
|
||||||
|
/* Get maximum extended function. */
|
||||||
|
__cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx);
|
||||||
|
@@ -352,6 +357,9 @@ init_cacheinfo (void)
|
||||||
|
shared += core;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
+
|
||||||
|
+ if (shared_per_thread <= 0)
|
||||||
|
+ shared_per_thread = shared;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (cpu_features->data_cache_size != 0)
|
||||||
|
@@ -380,20 +388,30 @@ init_cacheinfo (void)
|
||||||
|
__x86_shared_cache_size = shared;
|
||||||
|
}
|
||||||
|
|
||||||
|
- /* The default setting for the non_temporal threshold is 3/4 of one
|
||||||
|
- thread's share of the chip's cache. For most Intel and AMD processors
|
||||||
|
- with an initial release date between 2017 and 2020, a thread's typical
|
||||||
|
- share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
|
||||||
|
- threshold leaves 125 KBytes to 500 KBytes of the thread's data
|
||||||
|
- in cache after a maximum temporal copy, which will maintain
|
||||||
|
- in cache a reasonable portion of the thread's stack and other
|
||||||
|
- active data. If the threshold is set higher than one thread's
|
||||||
|
- share of the cache, it has a substantial risk of negatively
|
||||||
|
- impacting the performance of other threads running on the chip. */
|
||||||
|
+ /* The default setting for the non_temporal threshold is 1/4 of size
|
||||||
|
+ of the chip's cache. For most Intel and AMD processors with an
|
||||||
|
+ initial release date between 2017 and 2023, a thread's typical
|
||||||
|
+ share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
|
||||||
|
+ estimate the point where non-temporal stores begin out-competing
|
||||||
|
+ REP MOVSB. As well the point where the fact that non-temporal
|
||||||
|
+ stores are forced back to main memory would already occurred to the
|
||||||
|
+ majority of the lines in the copy. Note, concerns about the
|
||||||
|
+ entire L3 cache being evicted by the copy are mostly alleviated
|
||||||
|
+ by the fact that modern HW detects streaming patterns and
|
||||||
|
+ provides proper LRU hints so that the maximum thrashing
|
||||||
|
+ capped at 1/associativity. */
|
||||||
|
+ unsigned long int non_temporal_threshold = shared / 4;
|
||||||
|
+ /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
|
||||||
|
+ a higher risk of actually thrashing the cache as they don't have a HW LRU
|
||||||
|
+ hint. As well, their performance in highly parallel situations is
|
||||||
|
+ noticeably worse. */
|
||||||
|
+ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
+ non_temporal_threshold = shared_per_thread * 3 / 4;
|
||||||
|
+
|
||||||
|
__x86_shared_non_temporal_threshold
|
||||||
|
= (cpu_features->non_temporal_threshold != 0
|
||||||
|
? cpu_features->non_temporal_threshold
|
||||||
|
- : __x86_shared_cache_size * 3 / 4);
|
||||||
|
+ : non_temporal_threshold);
|
||||||
|
|
||||||
|
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
|
||||||
|
unsigned int minimum_rep_movsb_threshold;
|
||||||
|
Only in b/sysdeps/x86: cacheinfo.h~
|
47
glibc-rh2180462-2.patch
Normal file
47
glibc-rh2180462-2.patch
Normal file
@ -0,0 +1,47 @@
|
|||||||
|
From 47f747217811db35854ea06741be3685e8bbd44d Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Mon, 17 Jul 2023 23:14:33 -0500
|
||||||
|
Subject: [PATCH] x86: Fix slight bug in `shared_per_thread` cache size
|
||||||
|
calculation.
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
After:
|
||||||
|
```
|
||||||
|
commit af992e7abdc9049714da76cae1e5e18bc4838fb8
|
||||||
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Wed Jun 7 13:18:01 2023 -0500
|
||||||
|
|
||||||
|
x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4`
|
||||||
|
```
|
||||||
|
|
||||||
|
Split `shared` (cumulative cache size) from `shared_per_thread` (cache
|
||||||
|
size per socket), the `shared_per_thread` *can* be slightly off from
|
||||||
|
the previous calculation.
|
||||||
|
|
||||||
|
Previously we added `core` even if `threads_l2` was invalid, and only
|
||||||
|
used `threads_l2` to divide `core` if it was present. The changed
|
||||||
|
version only included `core` if `threads_l2` was valid.
|
||||||
|
|
||||||
|
This change restores the old behavior if `threads_l2` is invalid by
|
||||||
|
adding the entire value of `core`.
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 4 ++--
|
||||||
|
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||||
|
|
||||||
|
[DJ - ported to C8S]
|
||||||
|
|
||||||
|
diff -rup b1/sysdeps/x86/cacheinfo.h b2/sysdeps/x86/cacheinfo.h
|
||||||
|
--- b1/sysdeps/x86/cacheinfo.h 2023-08-08 13:44:55.185333601 -0400
|
||||||
|
+++ b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
|
||||||
|
@@ -253,8 +253,8 @@ get_common_cache_info (long int *shared_
|
||||||
|
/* Account for non-inclusive L2 and L3 caches. */
|
||||||
|
if (!inclusive_cache)
|
||||||
|
{
|
||||||
|
- if (threads_l2 > 0)
|
||||||
|
- shared_per_thread += core / threads_l2;
|
||||||
|
+ long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
|
||||||
|
+ shared_per_thread += core_per_thread;
|
||||||
|
shared += core;
|
||||||
|
}
|
||||||
|
|
44
glibc-rh2180462-3.patch
Normal file
44
glibc-rh2180462-3.patch
Normal file
@ -0,0 +1,44 @@
|
|||||||
|
From 8b9a0af8ca012217bf90d1dc0694f85b49ae09da Mon Sep 17 00:00:00 2001
|
||||||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||||
|
Date: Tue, 18 Jul 2023 10:27:59 -0500
|
||||||
|
Subject: [PATCH] [PATCH v1] x86: Use `3/4*sizeof(per-thread-L3)` as low bound
|
||||||
|
for NT threshold.
|
||||||
|
Content-type: text/plain; charset=UTF-8
|
||||||
|
|
||||||
|
On some machines we end up with incomplete cache information. This can
|
||||||
|
make the new calculation of `sizeof(total-L3)/custom-divisor` end up
|
||||||
|
lower than intended (and lower than the prior value). So reintroduce
|
||||||
|
the old bound as a lower bound to avoid potentially regressing code
|
||||||
|
where we don't have complete information to make the decision.
|
||||||
|
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||||
|
---
|
||||||
|
sysdeps/x86/dl-cacheinfo.h | 15 ++++++++++++---
|
||||||
|
1 file changed, 12 insertions(+), 3 deletions(-)
|
||||||
|
|
||||||
|
[DJ - ported to C8S]
|
||||||
|
|
||||||
|
diff -rup b2/sysdeps/x86/cacheinfo.h b3/sysdeps/x86/cacheinfo.h
|
||||||
|
--- b2/sysdeps/x86/cacheinfo.h 2023-08-08 13:55:16.474680016 -0400
|
||||||
|
+++ b3/sysdeps/x86/cacheinfo.h 2023-08-08 13:59:14.507988958 -0400
|
||||||
|
@@ -401,12 +401,20 @@ init_cacheinfo (void)
|
||||||
|
provides proper LRU hints so that the maximum thrashing
|
||||||
|
capped at 1/associativity. */
|
||||||
|
unsigned long int non_temporal_threshold = shared / 4;
|
||||||
|
+ /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most
|
||||||
|
+ likely have incorrect/incomplete cache info in which case, default to
|
||||||
|
+ 3/4 * per-thread L3 to avoid regressions. */
|
||||||
|
+ unsigned long int non_temporal_threshold_lowbound
|
||||||
|
+ = shared_per_thread * 3 / 4;
|
||||||
|
+ if (non_temporal_threshold < non_temporal_threshold_lowbound)
|
||||||
|
+ non_temporal_threshold = non_temporal_threshold_lowbound;
|
||||||
|
+
|
||||||
|
/* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
|
||||||
|
a higher risk of actually thrashing the cache as they don't have a HW LRU
|
||||||
|
hint. As well, their performance in highly parallel situations is
|
||||||
|
noticeably worse. */
|
||||||
|
if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||||
|
- non_temporal_threshold = shared_per_thread * 3 / 4;
|
||||||
|
+ non_temporal_threshold = non_temporal_threshold_lowbound;
|
||||||
|
|
||||||
|
__x86_shared_non_temporal_threshold
|
||||||
|
= (cpu_features->non_temporal_threshold != 0
|
@ -1,6 +1,6 @@
|
|||||||
%define glibcsrcdir glibc-2.28
|
%define glibcsrcdir glibc-2.28
|
||||||
%define glibcversion 2.28
|
%define glibcversion 2.28
|
||||||
%define glibcrelease 234%{?dist}
|
%define glibcrelease 235%{?dist}
|
||||||
# Pre-release tarballs are pulled in from git using a command that is
|
# Pre-release tarballs are pulled in from git using a command that is
|
||||||
# effectively:
|
# effectively:
|
||||||
#
|
#
|
||||||
@ -1043,6 +1043,9 @@ Patch850: glibc-rh2176707-2.patch
|
|||||||
Patch851: glibc-rh2186781.patch
|
Patch851: glibc-rh2186781.patch
|
||||||
Patch852: glibc-rh2224348.patch
|
Patch852: glibc-rh2224348.patch
|
||||||
Patch853: glibc-rh2176707-3.patch
|
Patch853: glibc-rh2176707-3.patch
|
||||||
|
Patch854: glibc-rh2180462-1.patch
|
||||||
|
Patch855: glibc-rh2180462-2.patch
|
||||||
|
Patch856: glibc-rh2180462-3.patch
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Continued list of core "glibc" package information:
|
# Continued list of core "glibc" package information:
|
||||||
@ -2873,6 +2876,9 @@ fi
|
|||||||
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Aug 8 2023 DJ Delorie <dj@redhat.com> - 2.28-235
|
||||||
|
- Fix non-temporal threshold calculations (#2180462)
|
||||||
|
|
||||||
* Mon Aug 7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234
|
* Mon Aug 7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234
|
||||||
- Ignore symbolic link change on /etc/nsswitch.conf (#2229709)
|
- Ignore symbolic link change on /etc/nsswitch.conf (#2229709)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user