Upstream commit: 808a84a8b81468b517a4d721fdc62069cb8c211f

- Fix underallocation of abort_msg_s struct (CVE-2025-0395)
- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212]
- x86: Improve large memset perf with non-temporal stores [RHEL-29312]
- x86: Avoid integer truncation with large cache sizes (bug 32470)
- math: Exclude internal math symbols for tests [BZ #32414]
- malloc: add indirection for malloc(-like) functions in tests [BZ #32366]
- Pass -nostdlib -nostartfiles together with -r [BZ #31753]
- nptl: initialize cpu_id_start prior to rseq registration
- nptl: initialize rseq area prior to registration

commit 61daaa76390e0ff73eade3a688d3626b7e7e0c20
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri May 24 12:38:50 2024 -0500

x86: Improve large memset perf with non-temporal stores [RHEL-29312]

Previously we used `rep stosb` for all medium/large memsets. This is
notably worse than non-temporal stores for large (above a few MBs)
memsets. See:
https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing
for data comparing different strategies for large memset on ICX and SKX.

Using non-temporal stores can be up to 3x faster on ICX and 2x faster
on SKX. Historically, these numbers would not have been so good
because of the zero-over-zero writeback optimization that `rep stosb`
is able to do. But the zero-over-zero writeback optimization has been
removed as a potential side-channel attack, so there is no longer any
good reason to only rely on `rep stosb` for large memsets. On the flip
side, non-temporal writes can avoid fetching data in their RFO
requests, saving memory bandwidth.

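For illustration only (not part of this patch): a minimal C sketch of
the non-temporal-store idea the assembly below implements, assuming
x86_64 with SSE2 intrinsics. The function name `nt_memset` is
hypothetical; glibc's real implementation is the hand-written
assembly in the diff.

    /* Illustrative sketch, not glibc code: fill memory with
       non-temporal streaming stores, which bypass the cache and do
       not issue an RFO (read-for-ownership) for the target lines.  */
    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>

    void nt_memset (void *dst, int c, size_t len)
    {
      __m128i v = _mm_set1_epi8 ((char) c);
      char *p = dst;

      /* Head: byte stores until p is 16-byte aligned.  */
      while (((uintptr_t) p & 15) != 0 && len != 0)
        {
          *p++ = (char) c;
          len--;
        }

      /* Body: 16-byte non-temporal stores.  */
      for (; len >= 16; p += 16, len -= 16)
        _mm_stream_si128 ((__m128i *) p, v);

      /* Streaming stores are weakly ordered; fence before returning,
         mirroring the `sfence` in the assembly below.  */
      _mm_sfence ();

      /* Tail: remaining bytes.  */
      while (len-- != 0)
        *p++ = (char) c;
    }

The real code instead covers the unaligned head and tail with plain
vector stores, as the `VMOVU` instructions around `L(nt_loop)` show.
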
All of the other changes to the file are to reorganize the code blocks
to maintain "good" alignment given the new code added in the
`L(stosb_local)` case.

The results from running the GLIBC memset benchmarks on TGL-client for
N=20 runs:

Geometric Mean across the suite New / Old EVEX256: 0.979
Geometric Mean across the suite New / Old EVEX512: 0.979
Geometric Mean across the suite New / Old AVX2   : 0.986
Geometric Mean across the suite New / Old SSE2   : 0.979

Most of the cases are essentially unchanged; this is mostly to show
that adding the non-temporal case didn't add any regressions to the
other cases.

The results on the memset-large benchmark suite on TGL-client for N=20
runs:

Geometric Mean across the suite New / Old EVEX256: 0.926
Geometric Mean across the suite New / Old EVEX512: 0.925
Geometric Mean across the suite New / Old AVX2   : 0.928
Geometric Mean across the suite New / Old SSE2   : 0.924

So roughly a 7.5% speedup. This is lower than what we see on servers
(likely because clients typically have faster single-core bandwidth so
saving bandwidth on RFOs is less impactful), but still advantageous.

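The suite-level figures above are geometric means of per-benchmark
New/Old time ratios, so a ratio of 0.925 corresponds to roughly a
7.5% reduction in runtime. A minimal sketch of that arithmetic, with
made-up ratios (the real per-benchmark values are not reproduced
here):

    /* Illustrative geometric-mean arithmetic; the ratios below are
       placeholders, not benchmark output.  Build with -lm.  */
    #include <math.h>
    #include <stdio.h>

    int main (void)
    {
      double ratios[] = { 0.91, 0.95, 0.92, 0.93 };  /* New/Old times */
      int n = sizeof (ratios) / sizeof (ratios[0]);
      double log_sum = 0.0;
      for (int i = 0; i < n; i++)
        log_sum += log (ratios[i]);
      double geomean = exp (log_sum / n);
      printf ("geomean %.3f -> %.1f%% speedup\n",
              geomean, (1.0 - geomean) * 100.0);
      return 0;
    }
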
Full test-suite passes on x86_64 w/ and w/o multiarch.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f)

diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 97839a22483b0613..637caadb406b2544 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -21,10 +21,13 @@
2. If size is less than VEC, use integer register stores.
3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores.
4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores.
- 5. On machines ERMS feature, if size is greater or equal than
- __x86_rep_stosb_threshold then REP STOSB will be used.
- 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with
- 4 VEC stores and store 4 * VEC at a time until done. */
+ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with
+ 4 VEC stores and store 4 * VEC at a time until done.
+ 6. On machines ERMS feature, if size is range
+ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold)
+ then REP STOSB will be used.
+ 7. If size >= __x86_shared_non_temporal_threshold, use a
+ non-temporal stores. */

#include <sysdep.h>

@@ -147,6 +150,41 @@ L(entry_from_wmemset):
VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx)
VMOVU %VMM(0), (%rdi)
VZEROUPPER_RETURN
+
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target. */
+#ifdef USE_LESS_VEC_MASK_STORE
+ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */
+ .p2align 6,, 47
+ .p2align 4
+L(less_vec):
+L(less_vec_from_wmemset):
+ /* Less than 1 VEC. */
+# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
+# error Unsupported VEC_SIZE!
+# endif
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. Note that we are using rax which is set in
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
+ andl $(PAGE_SIZE - 1), %edi
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault suppress. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
+ ja L(cross_page)
+# if VEC_SIZE > 32
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+ kmovq %rcx, %k1
+# else
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k1
+# endif
+ vmovdqu8 %VMM(0), (%rax){%k1}
+ VZEROUPPER_RETURN
+#endif
+
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

@@ -185,54 +223,6 @@ L(last_2x_vec):
#endif
VZEROUPPER_RETURN

- /* If have AVX512 mask instructions put L(less_vec) close to
- entry as it doesn't take much space and is likely a hot target.
- */
-#ifdef USE_LESS_VEC_MASK_STORE
- .p2align 4,, 10
-L(less_vec):
-L(less_vec_from_wmemset):
- /* Less than 1 VEC. */
-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
-# error Unsupported VEC_SIZE!
-# endif
- /* Clear high bits from edi. Only keeping bits relevant to page
- cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
- andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer
- serious performance degradation when it has to fault suppress.
- */
- cmpl $(PAGE_SIZE - VEC_SIZE), %edi
- /* This is generally considered a cold target. */
- ja L(cross_page)
-# if VEC_SIZE > 32
- movq $-1, %rcx
- bzhiq %rdx, %rcx, %rcx
- kmovq %rcx, %k1
-# else
- movl $-1, %ecx
- bzhil %edx, %ecx, %ecx
- kmovd %ecx, %k1
-# endif
- vmovdqu8 %VMM(0), (%rax){%k1}
- VZEROUPPER_RETURN
-
-# if defined USE_MULTIARCH && IS_IN (libc)
- /* Include L(stosb_local) here if including L(less_vec) between
- L(stosb_more_2x_vec) and ENTRY. This is to cache align the
- L(stosb_more_2x_vec) target. */
- .p2align 4,, 10
-L(stosb_local):
- movzbl %sil, %eax
- mov %RDX_LP, %RCX_LP
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# endif
-#endif
-
#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
L(stosb_more_2x_vec):
@@ -318,21 +308,33 @@ L(return_vzeroupper):
ret
#endif

- .p2align 4,, 10
-#ifndef USE_LESS_VEC_MASK_STORE
-# if defined USE_MULTIARCH && IS_IN (libc)
+#ifdef USE_WITH_AVX2
+ .p2align 4
+#else
+ .p2align 4,, 4
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
/* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
range for 2-byte jump encoding. */
L(stosb_local):
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ jae L(nt_memset)
movzbl %sil, %eax
mov %RDX_LP, %RCX_LP
mov %RDI_LP, %RDX_LP
rep stosb
+# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512)
+ /* Use xchg to save 1-byte (this helps align targets below). */
+ xchg %RDX_LP, %RAX_LP
+# else
mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
# endif
+ VZEROUPPER_RETURN
+#endif
+#ifndef USE_LESS_VEC_MASK_STORE
/* Define L(less_vec) only if not otherwise defined. */
- .p2align 4
+ .p2align 4,, 12
L(less_vec):
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
xmm). This is only does anything for AVX2. */
@@ -423,4 +425,35 @@ L(between_2_3):
movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
ret
-END (MEMSET_SYMBOL (__memset, unaligned_erms))
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+# ifdef USE_WITH_AVX512
+ /* Force align so the loop doesn't cross a cache-line. */
+ .p2align 4
+# endif
+ .p2align 4,, 7
+ /* Memset using non-temporal stores. */
+L(nt_memset):
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi)
+ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx
+ /* Align DST. */
+ orq $(VEC_SIZE * 1 - 1), %rdi
+ incq %rdi
+ .p2align 4,, 7
+L(nt_loop):
+ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi)
+ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi)
+ subq $(VEC_SIZE * -4), %rdi
+ cmpq %rdx, %rdi
+ jb L(nt_loop)
+ sfence
+ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx)
+ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx)
+ VZEROUPPER_RETURN
+#endif
+
+END(MEMSET_SYMBOL(__memset, unaligned_erms))
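
For reference, the size-based dispatch described in the updated header
comment (steps 5-7 above), sketched in C. All names and threshold
values here are illustrative stand-ins; the assembly consults the
glibc tunables __x86_rep_stosb_threshold and
__x86_shared_non_temporal_threshold.

    /* Illustrative sketch of the memset strategy selection; the
       threshold values are placeholders, not glibc's tuned defaults.  */
    #include <stddef.h>
    #include <stdio.h>

    #define VEC_SIZE 64  /* e.g. the EVEX512 build; 16 or 32 elsewhere */

    enum strategy { VEC_STORES, VEC_LOOP, REP_STOSB, NT_STORES };

    static const size_t rep_stosb_threshold = 2048;         /* placeholder */
    static const size_t non_temporal_threshold = 4u << 20;  /* placeholder */

    static enum strategy pick_strategy (size_t len)
    {
      if (len <= 4 * VEC_SIZE)          /* steps 1-4: a few VEC stores */
        return VEC_STORES;
      if (len < rep_stosb_threshold)    /* step 5: aligned 4x VEC loop */
        return VEC_LOOP;
      if (len < non_temporal_threshold) /* step 6: ERMS `rep stosb` */
        return REP_STOSB;
      return NT_STORES;                 /* step 7: non-temporal stores */
    }

    int main (void)
    {
      printf ("%d %d %d %d\n", pick_strategy (63), pick_strategy (1024),
              pick_strategy (64 * 1024), pick_strategy (32u << 20));
      return 0;
    }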