glibc/glibc-upstream-2.34-182.patch

commit cecbac52123456e2fbcff062a4165bf7b9174797
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Mon Nov 1 00:49:52 2021 -0500

    x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h

    No bug.

    This patch doubles the rep_movsb_threshold when using ERMS. Based on
    benchmarks, the vector copy loop, especially now that it handles 4k
    aliasing, is faster than rep movsb for these medium-sized copies.

    On Skylake with ERMS:

    Size,   Align1, Align2, dst>src,(rep movsb) / (vec copy)
    4096,   0,      0,      0,      0.975
    4096,   0,      0,      1,      0.953
    4096,   12,     0,      0,      0.969
    4096,   12,     0,      1,      0.872
    4096,   44,     0,      0,      0.979
    4096,   44,     0,      1,      0.83
    4096,   0,      12,     0,      1.006
    4096,   0,      12,     1,      0.989
    4096,   0,      44,     0,      0.739
    4096,   0,      44,     1,      0.942
    4096,   12,     12,     0,      1.009
    4096,   12,     12,     1,      0.973
    4096,   44,     44,     0,      0.791
    4096,   44,     44,     1,      0.961
    4096,   2048,   0,      0,      0.978
    4096,   2048,   0,      1,      0.951
    4096,   2060,   0,      0,      0.986
    4096,   2060,   0,      1,      0.963
    4096,   2048,   12,     0,      0.971
    4096,   2048,   12,     1,      0.941
    4096,   2060,   12,     0,      0.977
    4096,   2060,   12,     1,      0.949
    8192,   0,      0,      0,      0.85
    8192,   0,      0,      1,      0.845
    8192,   13,     0,      0,      0.937
    8192,   13,     0,      1,      0.939
    8192,   45,     0,      0,      0.932
    8192,   45,     0,      1,      0.927
    8192,   0,      13,     0,      0.621
    8192,   0,      13,     1,      0.62
    8192,   0,      45,     0,      0.53
    8192,   0,      45,     1,      0.516
    8192,   13,     13,     0,      0.664
    8192,   13,     13,     1,      0.659
    8192,   45,     45,     0,      0.593
    8192,   45,     45,     1,      0.575
    8192,   2048,   0,      0,      0.854
    8192,   2048,   0,      1,      0.834
    8192,   2061,   0,      0,      0.863
    8192,   2061,   0,      1,      0.857
    8192,   2048,   13,     0,      0.63
    8192,   2048,   13,     1,      0.629
    8192,   2061,   13,     0,      0.627
    8192,   2061,   13,     1,      0.62

    Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    (cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8.  */
   unsigned int minimum_rep_movsb_threshold;
 #endif
-  /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16).  */
+  /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+     VEC_SIZE == 64 or 32.  For VEC_SIZE == 16, the default REP MOVSB
+     threshold is 2048 * (VEC_SIZE / 16).  */
   unsigned int rep_movsb_threshold;
   if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
       && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
     {
-      rep_movsb_threshold = 2048 * (64 / 16);
+      rep_movsb_threshold = 4096 * (64 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 64 * 8;
 #endif
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
   else if (CPU_FEATURE_PREFERRED_P (cpu_features,
 				    AVX_Fast_Unaligned_Load))
     {
-      rep_movsb_threshold = 2048 * (32 / 16);
+      rep_movsb_threshold = 4096 * (32 / 16);
 #if HAVE_TUNABLES
       minimum_rep_movsb_threshold = 32 * 8;
 #endif
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index dd6e1d65c9490d4f..419313804d49cf65 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
     }
     x86_rep_movsb_threshold {
       type: SIZE_T
-      # Since there is overhead to set up REP MOVSB operation, REP MOVSB
-      # isn't faster on short data.  The memcpy micro benchmark in glibc
-      # shows that 2KB is the approximate value above which REP MOVSB
-      # becomes faster than SSE2 optimization on processors with Enhanced
-      # REP MOVSB.  Since larger register size can move more data with a
-      # single load and store, the threshold is higher with larger register
-      # size.  Note: Since the REP MOVSB threshold must be greater than 8
-      # times of vector size and the default value is 2048 * (vector size
-      # / 16), the default value and the minimum value must be updated at
-      # run-time.  NB: Don't set the default value since we can't tell if
-      # the tunable value is set by user or not [BZ #27069].
+      # Since there is overhead to set up REP MOVSB operation, REP
+      # MOVSB isn't faster on short data.  The memcpy micro benchmark
+      # in glibc shows that 2KB is the approximate value above which
+      # REP MOVSB becomes faster than SSE2 optimization on processors
+      # with Enhanced REP MOVSB.  Since larger register size can move
+      # more data with a single load and store, the threshold is
+      # higher with larger register size.  Micro benchmarks show AVX
+      # REP MOVSB becomes faster approximately at 8KB.  The AVX512
+      # threshold is extrapolated to 16KB.  For machines with FSRM the
+      # threshold is universally set at 2112 bytes.  Note: Since the
+      # REP MOVSB threshold must be greater than 8 times of vector
+      # size and the default value is 4096 * (vector size / 16), the
+      # default value and the minimum value must be updated at
+      # run-time.  NB: Don't set the default value since we can't tell
+      # if the tunable value is set by user or not [BZ #27069].
       minval: 1
     }
     x86_rep_stosb_threshold {