73667d0be6
* Thu Apr 28 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-32 - Sync with upstream branch release/2.34/master, commit c66c92181ddbd82306537a608e8c0282587131de: - posix/glob.c: update from gnulib (BZ#25659) - linux: Fix fchmodat with AT_SYMLINK_NOFOLLOW for 64 bit time_t (BZ#29097) * Wed Apr 27 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-31 - Sync with upstream branch release/2.34/master, commit 55640ed3fde48360a8e8083be4843bd2dc7cecfe: - i386: Regenerate ulps - linux: Fix missing internal 64 bit time_t stat usage - x86: Optimize L(less_vec) case in memcmp-evex-movbe.S - x86: Don't set Prefer_No_AVX512 for processors with AVX512 and AVX-VNNI - x86-64: Use notl in EVEX strcmp [BZ #28646] - x86: Shrink memcmp-sse4.S code size - x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h - x86: Optimize memmove-vec-unaligned-erms.S - x86-64: Replace movzx with movzbl - x86-64: Remove Prefer_AVX2_STRCMP - x86-64: Improve EVEX strcmp with masked load - x86: Replace sse2 instructions with avx in memcmp-evex-movbe.S - x86: Optimize memset-vec-unaligned-erms.S - x86: Optimize memcmp-evex-movbe.S for frontend behavior and size - x86: Modify ENTRY in sysdep.h so that p2align can be specified - x86-64: Optimize load of all bits set into ZMM register [BZ #28252] - scripts/glibcelf.py: Mark as UNSUPPORTED on Python 3.5 and earlier - dlfcn: Do not use rtld_active () to determine ld.so state (bug 29078) - INSTALL: Rephrase -with-default-link documentation - misc: Fix rare fortify crash on wchar funcs. 
[BZ 29030] - Default to --with-default-link=no (bug 25812) - scripts: Add glibcelf.py module * Thu Apr 21 2022 Carlos O'Donell <carlos@redhat.com> - 2.34-30 - Sync with upstream branch release/2.34/master, commit 71326f1f2fd09dafb9c34404765fb88129e94237: - nptl: Fix pthread_cancel cancelhandling atomic operations - mips: Fix mips64n32 64 bit time_t stat support (BZ#29069) - hurd: Fix arbitrary error code - nptl: Handle spurious EINTR when thread cancellation is disabled (BZ#29029) - S390: Add new s390 platform z16. - NEWS: Update fixed bug list for LD_AUDIT backports. - hppa: Fix bind-now audit (BZ #28857) - elf: Replace tst-audit24bmod2.so with tst-audit24bmod2 - Fix elf/tst-audit25a with default bind now toolchains - elf: Fix runtime linker auditing on aarch64 (BZ #26643) - elf: Issue la_symbind for bind-now (BZ #23734) - elf: Fix initial-exec TLS access on audit modules (BZ #28096) - elf: Add la_activity during application exit - elf: Do not fail for failed dlmopen on audit modules (BZ #28061) - elf: Issue audit la_objopen for vDSO - elf: Add audit tests for modules with TLSDESC - elf: Avoid unnecessary slowdown from profiling with audit (BZ#15533) - elf: Add _dl_audit_pltexit - elf: Add _dl_audit_pltenter - elf: Add _dl_audit_preinit - elf: Add _dl_audit_symbind_alt and _dl_audit_symbind - elf: Add _dl_audit_objclose - elf: Add _dl_audit_objsearch - elf: Add _dl_audit_activity_map and _dl_audit_activity_nsid - elf: Add _dl_audit_objopen - elf: Move la_activity (LA_ACT_ADD) after _dl_add_to_namespace_list() (BZ #28062) - elf: Move LAV_CURRENT to link_lavcurrent.h - elf: Fix elf_get_dynamic_info() for bootstrap - elf: Fix dynamic-link.h usage on rtld.c - elf: Fix elf_get_dynamic_info definition - elf: Avoid nested functions in the loader [BZ #27220] - powerpc: Delete unneeded ELF_MACHINE_BEFORE_RTLD_RELOC - hppa: Use END instead of PSEUDO_END in swapcontext.S - hppa: Implement swapcontext in assembler (bug 28960) Resolves: #2003291 Resolves: #2064181 Resolves: 
#2072328 Resolves: #2075713 Resolves: #2077838
132 lines
5.9 KiB
Diff
132 lines
5.9 KiB
Diff
commit cecbac52123456e2fbcff062a4165bf7b9174797
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Mon Nov 1 00:49:52 2021 -0500
|
|
|
|
x86: Double size of ERMS rep_movsb_threshold in dl-cacheinfo.h
|
|
|
|
No bug.
|
|
|
|
This patch doubles the rep_movsb_threshold when using ERMS. Based on
|
|
benchmarks the vector copy loop, especially now that it handles 4k
|
|
aliasing, is better for these medium-range sizes.
|
|
|
|
On Skylake with ERMS:
|
|
|
|
Size, Align1, Align2, dst>src, (rep movsb) / (vec copy)
|
|
4096, 0, 0, 0, 0.975
|
|
4096, 0, 0, 1, 0.953
|
|
4096, 12, 0, 0, 0.969
|
|
4096, 12, 0, 1, 0.872
|
|
4096, 44, 0, 0, 0.979
|
|
4096, 44, 0, 1, 0.83
|
|
4096, 0, 12, 0, 1.006
|
|
4096, 0, 12, 1, 0.989
|
|
4096, 0, 44, 0, 0.739
|
|
4096, 0, 44, 1, 0.942
|
|
4096, 12, 12, 0, 1.009
|
|
4096, 12, 12, 1, 0.973
|
|
4096, 44, 44, 0, 0.791
|
|
4096, 44, 44, 1, 0.961
|
|
4096, 2048, 0, 0, 0.978
|
|
4096, 2048, 0, 1, 0.951
|
|
4096, 2060, 0, 0, 0.986
|
|
4096, 2060, 0, 1, 0.963
|
|
4096, 2048, 12, 0, 0.971
|
|
4096, 2048, 12, 1, 0.941
|
|
4096, 2060, 12, 0, 0.977
|
|
4096, 2060, 12, 1, 0.949
|
|
8192, 0, 0, 0, 0.85
|
|
8192, 0, 0, 1, 0.845
|
|
8192, 13, 0, 0, 0.937
|
|
8192, 13, 0, 1, 0.939
|
|
8192, 45, 0, 0, 0.932
|
|
8192, 45, 0, 1, 0.927
|
|
8192, 0, 13, 0, 0.621
|
|
8192, 0, 13, 1, 0.62
|
|
8192, 0, 45, 0, 0.53
|
|
8192, 0, 45, 1, 0.516
|
|
8192, 13, 13, 0, 0.664
|
|
8192, 13, 13, 1, 0.659
|
|
8192, 45, 45, 0, 0.593
|
|
8192, 45, 45, 1, 0.575
|
|
8192, 2048, 0, 0, 0.854
|
|
8192, 2048, 0, 1, 0.834
|
|
8192, 2061, 0, 0, 0.863
|
|
8192, 2061, 0, 1, 0.857
|
|
8192, 2048, 13, 0, 0.63
|
|
8192, 2048, 13, 1, 0.629
|
|
8192, 2061, 13, 0, 0.627
|
|
8192, 2061, 13, 1, 0.62
|
|
|
|
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
(cherry picked from commit 475b63702ef38b69558fc3d31a0b66776a70f1d3)
|
|
|
|
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
|
|
index e6c94dfd023a25dc..2e43e67e4f4037d3 100644
|
|
--- a/sysdeps/x86/dl-cacheinfo.h
|
|
+++ b/sysdeps/x86/dl-cacheinfo.h
|
|
@@ -866,12 +866,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
|
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
|
|
unsigned int minimum_rep_movsb_threshold;
|
|
#endif
|
|
- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
|
|
+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
|
|
+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
|
|
+ threshold is 2048 * (VEC_SIZE / 16). */
|
|
unsigned int rep_movsb_threshold;
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
|
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
|
|
{
|
|
- rep_movsb_threshold = 2048 * (64 / 16);
|
|
+ rep_movsb_threshold = 4096 * (64 / 16);
|
|
#if HAVE_TUNABLES
|
|
minimum_rep_movsb_threshold = 64 * 8;
|
|
#endif
|
|
@@ -879,7 +881,7 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
|
|
else if (CPU_FEATURE_PREFERRED_P (cpu_features,
|
|
AVX_Fast_Unaligned_Load))
|
|
{
|
|
- rep_movsb_threshold = 2048 * (32 / 16);
|
|
+ rep_movsb_threshold = 4096 * (32 / 16);
|
|
#if HAVE_TUNABLES
|
|
minimum_rep_movsb_threshold = 32 * 8;
|
|
#endif
|
|
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
|
|
index dd6e1d65c9490d4f..419313804d49cf65 100644
|
|
--- a/sysdeps/x86/dl-tunables.list
|
|
+++ b/sysdeps/x86/dl-tunables.list
|
|
@@ -32,17 +32,21 @@ glibc {
|
|
}
|
|
x86_rep_movsb_threshold {
|
|
type: SIZE_T
|
|
- # Since there is overhead to set up REP MOVSB operation, REP MOVSB
|
|
- # isn't faster on short data. The memcpy micro benchmark in glibc
|
|
- # shows that 2KB is the approximate value above which REP MOVSB
|
|
- # becomes faster than SSE2 optimization on processors with Enhanced
|
|
- # REP MOVSB. Since larger register size can move more data with a
|
|
- # single load and store, the threshold is higher with larger register
|
|
- # size. Note: Since the REP MOVSB threshold must be greater than 8
|
|
- # times of vector size and the default value is 2048 * (vector size
|
|
- # / 16), the default value and the minimum value must be updated at
|
|
- # run-time. NB: Don't set the default value since we can't tell if
|
|
- # the tunable value is set by user or not [BZ #27069].
|
|
+ # Since there is overhead to set up REP MOVSB operation, REP
|
|
+ # MOVSB isn't faster on short data. The memcpy micro benchmark
|
|
+ # in glibc shows that 2KB is the approximate value above which
|
|
+ # REP MOVSB becomes faster than SSE2 optimization on processors
|
|
+ # with Enhanced REP MOVSB. Since larger register size can move
|
|
+ # more data with a single load and store, the threshold is
|
|
+ # higher with larger register size. Micro benchmarks show AVX
|
|
+ # REP MOVSB becomes faster approximately at 8KB. The AVX512
|
|
+ # threshold is extrapolated to 16KB. For machines with FSRM the
|
|
+ # threshold is universally set at 2112 bytes. Note: Since the
|
|
+ # REP MOVSB threshold must be greater than 8 times of vector
|
|
+ # size and the default value is 4096 * (vector size / 16), the
|
|
+ # default value and the minimum value must be updated at
|
|
+ # run-time. NB: Don't set the default value since we can't tell
|
|
+ # if the tunable value is set by user or not [BZ #27069].
|
|
minval: 1
|
|
}
|
|
x86_rep_stosb_threshold {
|