glibc/glibc-upstream-2.34-182.patch

commit cecbac52123456e2fbcff062a4165bf7b9174797
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Nov 1 00:49:52 2021 -0500

    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
    
    No bug.
    
    This patch doubles the rep_movsb_threshold when using ERMS. Based on
    benchmarks the vector copy loop, especially now that it handles 4k
    aliasing, is better for these medium-sized ranges.
    
    On Skylake with ERMS:
    
    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
    4096,   0,      0,      0,      0.975
    4096,   0,      0,      1,      0.953
    4096,   12,     0,      0,      0.969
    4096,   12,     0,      1,      0.872
    4096,   44,     0,      0,      0.979
    4096,   44,     0,      1,      0.83
    4096,   0,      12,     0,      1.006
    4096,   0,      12,     1,      0.989
    4096,   0,      44,     0,      0.739
    4096,   0,      44,     1,      0.942
    4096,   12,     12,     0,      1.009
    4096,   12,     12,     1,      0.973
    4096,   44,     44,     0,      0.791
    4096,   44,     44,     1,      0.961
    4096,   2048,   0,      0,      0.978
    4096,   2048,   0,      1,      0.951
    4096,   2060,   0,      0,      0.986
    4096,   2060,   0,      1,      0.963
    4096,   2048,   12,     0,      0.971
    4096,   2048,   12,     1,      0.941
    4096,   2060,   12,     0,      0.977
    4096,   2060,   12,     1,      0.949
    8192,   0,      0,      0,      0.85
    8192,   0,      0,      1,      0.845
    8192,   13,     0,      0,      0.937
    8192,   13,     0,      1,      0.939
    8192,   45,     0,      0,      0.932
    8192,   45,     0,      1,      0.927
    8192,   0,      13,     0,      0.621
    8192,   0,      13,     1,      0.62
    8192,   0,      45,     0,      0.53
    8192,   0,      45,     1,      0.516
    8192,   13,     13,     0,      0.664
    8192,   13,     13,     1,      0.659
    8192,   45,     45,     0,      0.593
    8192,   45,     45,     1,      0.575
    8192,   2048,   0,      0,      0.854
    8192,   2048,   0,      1,      0.834
    8192,   2061,   0,      0,      0.863
    8192,   2061,   0,      1,      0.857
    8192,   2048,   13,     0,      0.63
    8192,   2048,   13,     1,      0.629
    8192,   2061,   13,     0,      0.627
    8192,   2061,   13,     1,      0.62
    
    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c9490d4f..419313804d49cf65 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
-      # shows that 2KB is the approximate value above which REP MOVSB
-      # becomes faster than SSE2 optimization on processors with Enhanced
-      # REP MOVSB.  Since larger register size can move more data with a
-      # single load and store, the threshold is higher with larger register
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
-      # times of vector size and the default value is 2048 * (vector size
-      # / 16), the default value and the minimum value must be updated at
-      # run-time.  NB: Don't set the default value since we can't tell if
-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {
Import glibc-2.34-32.fc35 from f35 * Thu Apr 28 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-32 - Sync with upstream branch release/2.34/master, commit c66c92181ddbd82306537a608e8c0282587131de: - posix/glob.c: update from gnulib (BZ#25659) - linux: Fix fchmodat with AT_SYMLINK_NOFOLLOW for 64 bit time_t (BZ#29097) * Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31 - Sync with upstream branch release/2.34/master, commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe: - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. 
[BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30 - Sync with upstream branch release/2.34/master, commit 71326f1f2fd09dafb9c34404765fb88129e94237: - nptl: Fix pthread_cancel cancelhandling atomic operations - mips: Fix mips64n32 64 bit time_t stat support (BZ#29069) - hurd: Fix arbitrary error code - nptl: Handle spurious EINTR when thread cancellation is disabled (BZ#29029) - S390: Add new s390 platform z16. - NEWS: Update fixed bug list for LD_AUDIT backports. - hppa: Fix bind-now audit (BZ #28857) - elf: Replace tst-audit24bmod2.so with tst-audit24bmod2 - Fix elf/tst-audit25a with default bind now toolchains - elf: Fix runtime linker auditing on aarch64 (BZ #26643) - elf: Issue la_symbind for bind-now (BZ #23734) - elf: Fix initial-exec TLS access on audit modules (BZ #28096) - elf: Add la_activity during application exit - elf: Do not fail for failed dlmopen on audit modules (BZ #28061) - elf: Issue audit la_objopen for vDSO - elf: Add audit tests for modules with TLSDESC - elf: Avoid unnecessary slowdown from profiling with audit (BZ#15533) - elf: Add _dl_audit_pltexit - elf: Add _dl_audit_pltenter - elf: Add _dl_audit_preinit - elf: Add _dl_audit_symbind_alt and _dl_audit_symbind - elf: Add _dl_audit_objclose - elf: Add _dl_audit_objsearch - elf: Add _dl_audit_activity_map and _dl_audit_activity_nsid - elf: Add _dl_audit_objopen - elf: Move la_activity (LA_ACT_ADD) after _dl_add_to_namespace_list() (BZ #28062) - elf: Move LAV_CURRENT to link_lavcurrent.h - elf: Fix elf_get_dynamic_info() for bootstrap - elf: Fix dynamic-link.h usage on rtld.c - elf: Fix elf_get_dynamic_info definition - elf: Avoid nested functions in the loader [BZ #27220] - powerpc: Delete unneeded ELF_MACHINE_BEFORE_RTLD_RELOC - hppa: Use END instead of PSEUDO_END in swapcontext.S - hppa: Implement swapcontext in assembler (bug 28960) Resolves: #2003291 Resolves: #2064181 Resolves: 
#2072328 Resolves: #2075713 Resolves: #2077838 2022-04-21 22:26:44 +00:00			`commit cecbac52123456e2fbcff062a4165bf7b9174797`
			`Author: Noah Goldstein <goldstein.w.n@gmail.com>`
			`Date: Mon Nov 1 00:49:52 2021 -0500`

			`x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h`

			`No bug.`

			`This patch doubles the rep_movsb_threshold when using ERMS. Based on`
			`benchmarks the vector copy loop, especially now that it handles 4k`
			`aliasing, is better for these medium-sized ranges.`

			`On Skylake with ERMS:`

			`Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)`
			`4096, 0, 0, 0, 0.975`
			`4096, 0, 0, 1, 0.953`
			`4096, 12, 0, 0, 0.969`
			`4096, 12, 0, 1, 0.872`
			`4096, 44, 0, 0, 0.979`
			`4096, 44, 0, 1, 0.83`
			`4096, 0, 12, 0, 1.006`
			`4096, 0, 12, 1, 0.989`
			`4096, 0, 44, 0, 0.739`
			`4096, 0, 44, 1, 0.942`
			`4096, 12, 12, 0, 1.009`
			`4096, 12, 12, 1, 0.973`
			`4096, 44, 44, 0, 0.791`
			`4096, 44, 44, 1, 0.961`
			`4096, 2048, 0, 0, 0.978`
			`4096, 2048, 0, 1, 0.951`
			`4096, 2060, 0, 0, 0.986`
			`4096, 2060, 0, 1, 0.963`
			`4096, 2048, 12, 0, 0.971`
			`4096, 2048, 12, 1, 0.941`
			`4096, 2060, 12, 0, 0.977`
			`4096, 2060, 12, 1, 0.949`
			`8192, 0, 0, 0, 0.85`
			`8192, 0, 0, 1, 0.845`
			`8192, 13, 0, 0, 0.937`
			`8192, 13, 0, 1, 0.939`
			`8192, 45, 0, 0, 0.932`
			`8192, 45, 0, 1, 0.927`
			`8192, 0, 13, 0, 0.621`
			`8192, 0, 13, 1, 0.62`
			`8192, 0, 45, 0, 0.53`
			`8192, 0, 45, 1, 0.516`
			`8192, 13, 13, 0, 0.664`
			`8192, 13, 13, 1, 0.659`
			`8192, 45, 45, 0, 0.593`
			`8192, 45, 45, 1, 0.575`
			`8192, 2048, 0, 0, 0.854`
			`8192, 2048, 0, 1, 0.834`
			`8192, 2061, 0, 0, 0.863`
			`8192, 2061, 0, 1, 0.857`
			`8192, 2048, 13, 0, 0.63`
			`8192, 2048, 13, 1, 0.629`
			`8192, 2061, 13, 0, 0.627`
			`8192, 2061, 13, 1, 0.62`

			`Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>`
			`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`
			`(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)`

			`diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h`
			`index e6c94dfd023a25dc..2e43e67e4f4037d3 100644`
			`--- a/sysdeps/x86/dl-cacheinfo.h`
			`+++ b/sysdeps/x86/dl-cacheinfo.h`
			`@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)`
			`/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */`
			`unsigned int minimum_rep_movsb_threshold;`
			`#endif`
			`- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */`
			`+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for`
			`+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB`
			`+ threshold is 2048 * (VEC_SIZE / 16). */`
			`unsigned int rep_movsb_threshold;`
			`if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)`
			`&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))`
			`{`
			`- rep_movsb_threshold = 2048 * (64 / 16);`
			`+ rep_movsb_threshold = 4096 * (64 / 16);`
			`#if HAVE_TUNABLES`
			`minimum_rep_movsb_threshold = 64 * 8;`
			`#endif`
			`@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)`
			`else if (CPU_FEATURE_PREFERRED_P (cpu_features,`
			`AVX_Fast_Unaligned_Load))`
			`{`
			`- rep_movsb_threshold = 2048 * (32 / 16);`
			`+ rep_movsb_threshold = 4096 * (32 / 16);`
			`#if HAVE_TUNABLES`
			`minimum_rep_movsb_threshold = 32 * 8;`
			`#endif`
			`diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list`
			`index dd6e1d65c9490d4f..419313804d49cf65 100644`
			`--- a/sysdeps/x86/dl-tunables.list`
			`+++ b/sysdeps/x86/dl-tunables.list`
			`@@ -32,17 +32,21 @@ glibc {`
			`}`
			`x86_rep_movsb_threshold {`
			`type: SIZE_T`
			`- # Since there is overhead to set up REP MOVSB operation, REP MOVSB`
			`- # isn't faster on short data. The memcpy micro benchmark in glibc`
			`- # shows that 2KB is the approximate value above which REP MOVSB`
			`- # becomes faster than SSE2 optimization on processors with Enhanced`
			`- # REP MOVSB. Since larger register size can move more data with a`
			`- # single load and store, the threshold is higher with larger register`
			`- # size. Note: Since the REP MOVSB threshold must be greater than 8`
			`- # times of vector size and the default value is 2048 * (vector size`
			`- # / 16), the default value and the minimum value must be updated at`
			`- # run-time. NB: Don't set the default value since we can't tell if`
			`- # the tunable value is set by user or not [BZ #27069].`
			`+ # Since there is overhead to set up REP MOVSB operation, REP`
			`+ # MOVSB isn't faster on short data. The memcpy micro benchmark`
			`+ # in glibc shows that 2KB is the approximate value above which`
			`+ # REP MOVSB becomes faster than SSE2 optimization on processors`
			`+ # with Enhanced REP MOVSB. Since larger register size can move`
			`+ # more data with a single load and store, the threshold is`
			`+ # higher with larger register size. Micro benchmarks show AVX`
			`+ # REP MOVSB becomes faster approximately at 8KB. The AVX512`
			`+ # threshold is extrapolated to 16KB. For machines with FSRM the`
			`+ # threshold is universally set at 2112 bytes. Note: Since the`
			`+ # REP MOVSB threshold must be greater than 8 times of vector`
			`+ # size and the default value is 4096 * (vector size / 16), the`
			`+ # default value and the minimum value must be updated at`
			`+ # run-time. NB: Don't set the default value since we can't tell`
			`+ # if the tunable value is set by user or not [BZ #27069].`
			`minval: 1`
			`}`
			`x86_rep_stosb_threshold {`