forked from rpms/glibc
		
	
							parent
							
								
									1afa752bd9
								
							
						
					
					
						commit
						f089a914cf
					
				
							
								
								
									
										216
									
								
								glibc-rh2180462-1.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										216
									
								
								glibc-rh2180462-1.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,216 @@ | |||||||
|  | From af992e7abdc9049714da76cae1e5e18bc4838fb8 Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Wed, 7 Jun 2023 13:18:01 -0500 | ||||||
|  | Subject: [PATCH] x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 | ||||||
|  |  / 4` | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | The current `non_temporal_threshold` is set to roughly `3/4 * sizeof_L3 / | ||||||
|  | ncores_per_socket`. This patch updates that value to roughly | ||||||
|  | `sizeof_L3 / 4`. | ||||||
|  | 
 | ||||||
|  | The original value (specifically dividing by `ncores_per_socket`) was | ||||||
|  | chosen to limit the amount of other threads' data a `memcpy`/`memset` | ||||||
|  | could evict. | ||||||
|  | 
 | ||||||
|  | Dividing by `ncores_per_socket`, however, leads to exceedingly low | ||||||
|  | non-temporal thresholds and to using non-temporal stores in | ||||||
|  | cases where REP MOVSB is multiple times faster. | ||||||
|  | 
 | ||||||
|  | Furthermore, non-temporal stores are written directly to main memory | ||||||
|  | so using them at a size much smaller than L3 can place soon to be | ||||||
|  | accessed data much further away than it otherwise could be. As well, | ||||||
|  | modern machines are able to detect streaming patterns (especially if | ||||||
|  | REP MOVSB is used) and provide LRU hints to the memory subsystem. This | ||||||
|  | in effect caps the total amount of eviction at 1/cache_associativity, | ||||||
|  | far below meaningfully thrashing the entire cache. | ||||||
|  | 
 | ||||||
|  | As best I can tell, the benchmarks that led to this small threshold | ||||||
|  | were done comparing non-temporal stores versus standard cacheable | ||||||
|  | stores. A better comparison (linked below) is to REP MOVSB which, | ||||||
|  | on the measured systems, is nearly 2x faster than non-temporal stores | ||||||
|  | at the low-end of the previous threshold, and within 10% for over | ||||||
|  | 100MB copies (well past even the current threshold). In cases with a | ||||||
|  | low number of threads competing for bandwidth, REP MOVSB is ~2x faster | ||||||
|  | up to `sizeof_L3`. | ||||||
|  | 
 | ||||||
|  | The divisor of `4` is a somewhat arbitrary value. From benchmarks it | ||||||
|  | seems Skylake and Icelake both prefer a divisor of `2`, but older CPUs | ||||||
|  | such as Broadwell prefer something closer to `8`. This patch is meant | ||||||
|  | to be followed up by another one to make the divisor cpu-specific, but | ||||||
|  | in the meantime (and for easier backporting), this patch settles on | ||||||
|  | `4` as a middle-ground. | ||||||
|  | 
 | ||||||
|  | Benchmarks comparing non-temporal stores, REP MOVSB, and cacheable | ||||||
|  | stores were done using: | ||||||
|  | https://github.com/goldsteinn/memcpy-nt-benchmarks | ||||||
|  | 
 | ||||||
|  | Sheets results (also available in pdf on the github): | ||||||
|  | https://docs.google.com/spreadsheets/d/e/2PACX-1vS183r0rW_jRX6tG_E90m9qVuFiMbRIJvi5VAE8yYOvEOIEEc3aSNuEsrFbuXw5c3nGboxMmrupZD7K/pubhtml | ||||||
|  | Reviewed-by: DJ Delorie <dj@redhat.com> | ||||||
|  | Reviewed-by: Carlos O'Donell <carlos@redhat.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/dl-cacheinfo.h | 70 +++++++++++++++++++++++--------------- | ||||||
|  |  1 file changed, 43 insertions(+), 27 deletions(-) | ||||||
|  | 
 | ||||||
|  | [DJ - ported to C8S] | ||||||
|  | 
 | ||||||
|  | diff -rup a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
 | ||||||
|  | --- a/sysdeps/x86/cacheinfo.h	2023-08-08 11:54:09.969791421 -0400
 | ||||||
|  | +++ b/sysdeps/x86/cacheinfo.h	2023-08-08 13:44:55.185333601 -0400
 | ||||||
|  | @@ -46,7 +46,7 @@ long int __x86_rep_movsb_threshold attri
 | ||||||
|  |  long int __x86_rep_stosb_threshold attribute_hidden = 2048; | ||||||
|  |   | ||||||
|  |  static void | ||||||
|  | -get_common_cache_info (long int *shared_ptr, unsigned int *threads_ptr,
 | ||||||
|  | +get_common_cache_info (long int *shared_ptr, long int * shared_per_thread_ptr, unsigned int *threads_ptr,
 | ||||||
|  |  		       long int core) | ||||||
|  |  { | ||||||
|  |    unsigned int eax; | ||||||
|  | @@ -65,6 +65,7 @@ get_common_cache_info (long int *shared_
 | ||||||
|  |    unsigned int family = cpu_features->basic.family; | ||||||
|  |    unsigned int model = cpu_features->basic.model; | ||||||
|  |    long int shared = *shared_ptr; | ||||||
|  | +  long int shared_per_thread = *shared_per_thread_ptr;
 | ||||||
|  |    unsigned int threads = *threads_ptr; | ||||||
|  |    bool inclusive_cache = true; | ||||||
|  |    bool support_count_mask = true; | ||||||
|  | @@ -80,6 +81,7 @@ get_common_cache_info (long int *shared_
 | ||||||
|  |        /* Try L2 otherwise.  */ | ||||||
|  |        level  = 2; | ||||||
|  |        shared = core; | ||||||
|  | +      shared_per_thread = core;
 | ||||||
|  |        threads_l2 = 0; | ||||||
|  |        threads_l3 = -1; | ||||||
|  |      } | ||||||
|  | @@ -236,29 +238,28 @@ get_common_cache_info (long int *shared_
 | ||||||
|  |          } | ||||||
|  |        else | ||||||
|  |          { | ||||||
|  | -intel_bug_no_cache_info:
 | ||||||
|  | -          /* Assume that all logical threads share the highest cache
 | ||||||
|  | -             level.  */
 | ||||||
|  | -          threads
 | ||||||
|  | -            = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx
 | ||||||
|  | -                >> 16) & 0xff);
 | ||||||
|  | -        }
 | ||||||
|  | -
 | ||||||
|  | -        /* Cap usage of highest cache level to the number of supported
 | ||||||
|  | -           threads.  */
 | ||||||
|  | -        if (shared > 0 && threads > 0)
 | ||||||
|  | -          shared /= threads;
 | ||||||
|  | +	intel_bug_no_cache_info:
 | ||||||
|  | +	  /* Assume that all logical threads share the highest cache
 | ||||||
|  | +	     level.  */
 | ||||||
|  | +	  threads = ((cpu_features->features[COMMON_CPUID_INDEX_1].cpuid.ebx >> 16)
 | ||||||
|  | +		     & 0xff);
 | ||||||
|  | +
 | ||||||
|  | +	  /* Get per-thread size of highest level cache.  */
 | ||||||
|  | +	  if (shared_per_thread > 0 && threads > 0)
 | ||||||
|  | +	    shared_per_thread /= threads;
 | ||||||
|  | +	}
 | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    /* Account for non-inclusive L2 and L3 caches.  */ | ||||||
|  |    if (!inclusive_cache) | ||||||
|  |      { | ||||||
|  |        if (threads_l2 > 0) | ||||||
|  | -        core /= threads_l2;
 | ||||||
|  | +	shared_per_thread += core / threads_l2;
 | ||||||
|  |        shared += core; | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    *shared_ptr = shared; | ||||||
|  | +  *shared_per_thread_ptr = shared_per_thread;
 | ||||||
|  |    *threads_ptr = threads; | ||||||
|  |  } | ||||||
|  |   | ||||||
|  | @@ -272,6 +273,7 @@ init_cacheinfo (void)
 | ||||||
|  |    int max_cpuid_ex; | ||||||
|  |    long int data = -1; | ||||||
|  |    long int shared = -1; | ||||||
|  | +  long int shared_per_thread = -1;
 | ||||||
|  |    long int core; | ||||||
|  |    unsigned int threads = 0; | ||||||
|  |    const struct cpu_features *cpu_features = __get_cpu_features (); | ||||||
|  | @@ -287,22 +289,25 @@ init_cacheinfo (void)
 | ||||||
|  |        data = handle_intel (_SC_LEVEL1_DCACHE_SIZE, cpu_features); | ||||||
|  |        core = handle_intel (_SC_LEVEL2_CACHE_SIZE, cpu_features); | ||||||
|  |        shared = handle_intel (_SC_LEVEL3_CACHE_SIZE, cpu_features); | ||||||
|  | +      shared_per_thread = shared;
 | ||||||
|  |   | ||||||
|  | -      get_common_cache_info (&shared, &threads, core);
 | ||||||
|  | +      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
 | ||||||
|  |      } | ||||||
|  |    else if (cpu_features->basic.kind == arch_kind_zhaoxin) | ||||||
|  |      { | ||||||
|  |        data = handle_zhaoxin (_SC_LEVEL1_DCACHE_SIZE); | ||||||
|  |        core = handle_zhaoxin (_SC_LEVEL2_CACHE_SIZE); | ||||||
|  |        shared = handle_zhaoxin (_SC_LEVEL3_CACHE_SIZE); | ||||||
|  | +      shared_per_thread = shared;
 | ||||||
|  |   | ||||||
|  | -      get_common_cache_info (&shared, &threads, core);
 | ||||||
|  | +      get_common_cache_info (&shared, &shared_per_thread, &threads, core);
 | ||||||
|  |      } | ||||||
|  |    else if (cpu_features->basic.kind == arch_kind_amd) | ||||||
|  |      { | ||||||
|  |        data   = handle_amd (_SC_LEVEL1_DCACHE_SIZE); | ||||||
|  |        long int core = handle_amd (_SC_LEVEL2_CACHE_SIZE); | ||||||
|  |        shared = handle_amd (_SC_LEVEL3_CACHE_SIZE); | ||||||
|  | +      shared_per_thread = shared;
 | ||||||
|  |   | ||||||
|  |        /* Get maximum extended function. */ | ||||||
|  |        __cpuid (0x80000000, max_cpuid_ex, ebx, ecx, edx); | ||||||
|  | @@ -352,6 +357,9 @@ init_cacheinfo (void)
 | ||||||
|  |  	      shared += core; | ||||||
|  |              } | ||||||
|  |  	} | ||||||
|  | +
 | ||||||
|  | +      if (shared_per_thread <= 0)
 | ||||||
|  | +	shared_per_thread = shared;
 | ||||||
|  |      } | ||||||
|  |   | ||||||
|  |    if (cpu_features->data_cache_size != 0) | ||||||
|  | @@ -380,20 +388,30 @@ init_cacheinfo (void)
 | ||||||
|  |        __x86_shared_cache_size = shared; | ||||||
|  |      } | ||||||
|  |   | ||||||
|  | -  /* The default setting for the non_temporal threshold is 3/4 of one
 | ||||||
|  | -     thread's share of the chip's cache. For most Intel and AMD processors
 | ||||||
|  | -     with an initial release date between 2017 and 2020, a thread's typical
 | ||||||
|  | -     share of the cache is from 500 KBytes to 2 MBytes. Using the 3/4
 | ||||||
|  | -     threshold leaves 125 KBytes to 500 KBytes of the thread's data
 | ||||||
|  | -     in cache after a maximum temporal copy, which will maintain
 | ||||||
|  | -     in cache a reasonable portion of the thread's stack and other
 | ||||||
|  | -     active data. If the threshold is set higher than one thread's
 | ||||||
|  | -     share of the cache, it has a substantial risk of negatively
 | ||||||
|  | -     impacting the performance of other threads running on the chip. */
 | ||||||
|  | +  /* The default setting for the non_temporal threshold is 1/4 of size
 | ||||||
|  | +     of the chip's cache. For most Intel and AMD processors with an
 | ||||||
|  | +     initial release date between 2017 and 2023, a thread's typical
 | ||||||
|  | +     share of the cache is from 18-64MB. Using the 1/4 L3 is meant to
 | ||||||
|  | +     estimate the point where non-temporal stores begin out-competing
 | ||||||
|  | +     REP MOVSB. As well the point where the fact that non-temporal
 | ||||||
|  | +     stores are forced back to main memory would already occurred to the
 | ||||||
|  | +     majority of the lines in the copy. Note, concerns about the
 | ||||||
|  | +     entire L3 cache being evicted by the copy are mostly alleviated
 | ||||||
|  | +     by the fact that modern HW detects streaming patterns and
 | ||||||
|  | +     provides proper LRU hints so that the maximum thrashing
 | ||||||
|  | +     capped at 1/associativity. */
 | ||||||
|  | +  unsigned long int non_temporal_threshold = shared / 4;
 | ||||||
|  | +  /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run
 | ||||||
|  | +     a higher risk of actually thrashing the cache as they don't have a HW LRU
 | ||||||
|  | +     hint. As well, their performance in highly parallel situations is
 | ||||||
|  | +     noticeably worse.  */
 | ||||||
|  | +  if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS))
 | ||||||
|  | +    non_temporal_threshold = shared_per_thread * 3 / 4;
 | ||||||
|  | +
 | ||||||
|  |    __x86_shared_non_temporal_threshold | ||||||
|  |      = (cpu_features->non_temporal_threshold != 0 | ||||||
|  |         ? cpu_features->non_temporal_threshold | ||||||
|  | -       : __x86_shared_cache_size * 3 / 4);
 | ||||||
|  | +       : non_temporal_threshold);
 | ||||||
|  |   | ||||||
|  |    /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */ | ||||||
|  |    unsigned int minimum_rep_movsb_threshold; | ||||||
|  | Only in b/sysdeps/x86: cacheinfo.h~ | ||||||
							
								
								
									
										47
									
								
								glibc-rh2180462-2.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										47
									
								
								glibc-rh2180462-2.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,47 @@ | |||||||
|  | From 47f747217811db35854ea06741be3685e8bbd44d Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Mon, 17 Jul 2023 23:14:33 -0500 | ||||||
|  | Subject: [PATCH] x86: Fix slight bug in `shared_per_thread` cache size | ||||||
|  |  calculation. | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | After: | ||||||
|  | ``` | ||||||
|  |     commit af992e7abdc9049714da76cae1e5e18bc4838fb8 | ||||||
|  |     Author: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  |     Date:   Wed Jun 7 13:18:01 2023 -0500 | ||||||
|  | 
 | ||||||
|  |         x86: Increase `non_temporal_threshold` to roughly `sizeof_L3 / 4` | ||||||
|  | ``` | ||||||
|  | 
 | ||||||
|  | Split `shared` (cumulative cache size) from `shared_per_thread` (cache | ||||||
|  | size per thread), the `shared_per_thread` *can* be slightly off from | ||||||
|  | the previous calculation. | ||||||
|  | 
 | ||||||
|  | Previously we added `core` even if `threads_l2` was invalid, and only | ||||||
|  | used `threads_l2` to divide `core` if it was present. The changed | ||||||
|  | version only included `core` if `threads_l2` was valid. | ||||||
|  | 
 | ||||||
|  | This change restores the old behavior if `threads_l2` is invalid by | ||||||
|  | adding the entire value of `core`. | ||||||
|  | Reviewed-by: DJ Delorie <dj@redhat.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/dl-cacheinfo.h | 4 ++-- | ||||||
|  |  1 file changed, 2 insertions(+), 2 deletions(-) | ||||||
|  | 
 | ||||||
|  | [DJ - ported to C8S] | ||||||
|  | 
 | ||||||
|  | diff -rup b1/sysdeps/x86/cacheinfo.h b2/sysdeps/x86/cacheinfo.h
 | ||||||
|  | --- b1/sysdeps/x86/cacheinfo.h	2023-08-08 13:44:55.185333601 -0400
 | ||||||
|  | +++ b2/sysdeps/x86/cacheinfo.h	2023-08-08 13:55:16.474680016 -0400
 | ||||||
|  | @@ -253,8 +253,8 @@ get_common_cache_info (long int *shared_
 | ||||||
|  |    /* Account for non-inclusive L2 and L3 caches.  */ | ||||||
|  |    if (!inclusive_cache) | ||||||
|  |      { | ||||||
|  | -      if (threads_l2 > 0)
 | ||||||
|  | -	shared_per_thread += core / threads_l2;
 | ||||||
|  | +      long int core_per_thread = threads_l2 > 0 ? (core / threads_l2) : core;
 | ||||||
|  | +      shared_per_thread += core_per_thread;
 | ||||||
|  |        shared += core; | ||||||
|  |      } | ||||||
|  |   | ||||||
							
								
								
									
										44
									
								
								glibc-rh2180462-3.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										44
									
								
								glibc-rh2180462-3.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,44 @@ | |||||||
|  | From 8b9a0af8ca012217bf90d1dc0694f85b49ae09da Mon Sep 17 00:00:00 2001 | ||||||
|  | From: Noah Goldstein <goldstein.w.n@gmail.com> | ||||||
|  | Date: Tue, 18 Jul 2023 10:27:59 -0500 | ||||||
|  | Subject: [PATCH] [PATCH v1] x86: Use `3/4*sizeof(per-thread-L3)` as low bound | ||||||
|  |  for NT threshold. | ||||||
|  | Content-type: text/plain; charset=UTF-8 | ||||||
|  | 
 | ||||||
|  | On some machines we end up with incomplete cache information. This can | ||||||
|  | make the new calculation of `sizeof(total-L3)/custom-divisor` end up | ||||||
|  | lower than intended (and lower than the prior value). So reintroduce | ||||||
|  | the old bound as a lower bound to avoid potentially regressing code | ||||||
|  | where we don't have complete information to make the decision. | ||||||
|  | Reviewed-by: DJ Delorie <dj@redhat.com> | ||||||
|  | ---
 | ||||||
|  |  sysdeps/x86/dl-cacheinfo.h | 15 ++++++++++++--- | ||||||
|  |  1 file changed, 12 insertions(+), 3 deletions(-) | ||||||
|  | 
 | ||||||
|  | [DJ - ported to C8S] | ||||||
|  | 
 | ||||||
|  | diff -rup b2/sysdeps/x86/cacheinfo.h b3/sysdeps/x86/cacheinfo.h
 | ||||||
|  | --- b2/sysdeps/x86/cacheinfo.h	2023-08-08 13:55:16.474680016 -0400
 | ||||||
|  | +++ b3/sysdeps/x86/cacheinfo.h	2023-08-08 13:59:14.507988958 -0400
 | ||||||
|  | @@ -401,12 +401,20 @@ init_cacheinfo (void)
 | ||||||
|  |       provides proper LRU hints so that the maximum thrashing | ||||||
|  |       capped at 1/associativity. */ | ||||||
|  |    unsigned long int non_temporal_threshold = shared / 4; | ||||||
|  | +  /* If the computed non_temporal_threshold <= 3/4 * per-thread L3, we most
 | ||||||
|  | +     likely have incorrect/incomplete cache info in which case, default to
 | ||||||
|  | +     3/4 * per-thread L3 to avoid regressions.  */
 | ||||||
|  | +  unsigned long int non_temporal_threshold_lowbound
 | ||||||
|  | +      = shared_per_thread * 3 / 4;
 | ||||||
|  | +  if (non_temporal_threshold < non_temporal_threshold_lowbound)
 | ||||||
|  | +    non_temporal_threshold = non_temporal_threshold_lowbound;
 | ||||||
|  | +
 | ||||||
|  |    /* If no ERMS, we use the per-thread L3 chunking. Normal cacheable stores run | ||||||
|  |       a higher risk of actually thrashing the cache as they don't have a HW LRU | ||||||
|  |       hint. As well, their performance in highly parallel situations is | ||||||
|  |       noticeably worse.  */ | ||||||
|  |    if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) | ||||||
|  | -    non_temporal_threshold = shared_per_thread * 3 / 4;
 | ||||||
|  | +    non_temporal_threshold = non_temporal_threshold_lowbound;
 | ||||||
|  |   | ||||||
|  |    __x86_shared_non_temporal_threshold | ||||||
|  |      = (cpu_features->non_temporal_threshold != 0 | ||||||
| @ -1,6 +1,6 @@ | |||||||
| %define glibcsrcdir glibc-2.28 | %define glibcsrcdir glibc-2.28 | ||||||
| %define glibcversion 2.28 | %define glibcversion 2.28 | ||||||
| %define glibcrelease 234%{?dist} | %define glibcrelease 235%{?dist} | ||||||
| # Pre-release tarballs are pulled in from git using a command that is | # Pre-release tarballs are pulled in from git using a command that is | ||||||
| # effectively: | # effectively: | ||||||
| # | # | ||||||
| @ -1043,6 +1043,9 @@ Patch850: glibc-rh2176707-2.patch | |||||||
| Patch851: glibc-rh2186781.patch | Patch851: glibc-rh2186781.patch | ||||||
| Patch852: glibc-rh2224348.patch | Patch852: glibc-rh2224348.patch | ||||||
| Patch853: glibc-rh2176707-3.patch | Patch853: glibc-rh2176707-3.patch | ||||||
|  | Patch854: glibc-rh2180462-1.patch | ||||||
|  | Patch855: glibc-rh2180462-2.patch | ||||||
|  | Patch856: glibc-rh2180462-3.patch | ||||||
| 
 | 
 | ||||||
| ############################################################################## | ############################################################################## | ||||||
| # Continued list of core "glibc" package information: | # Continued list of core "glibc" package information: | ||||||
| @ -2873,6 +2876,9 @@ fi | |||||||
| %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared | %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared | ||||||
| 
 | 
 | ||||||
| %changelog | %changelog | ||||||
|  | * Tue Aug  8 2023 DJ Delorie <dj@redhat.com> - 2.28-235 | ||||||
|  | - Fix non-temporal threshold calculations (#2180462) | ||||||
|  | 
 | ||||||
| * Mon Aug  7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234 | * Mon Aug  7 2023 Florian Weimer <fweimer@redhat.com> - 2.28-234 | ||||||
| - Ignore symbolic link change on /etc/nsswitch.conf (#2229709) | - Ignore symbolic link change on /etc/nsswitch.conf (#2229709) | ||||||
| 
 | 
 | ||||||
|  | |||||||
		Loading…
	
		Reference in New Issue
	
	Block a user