forked from rpms/glibc
844 lines
25 KiB
Diff
844 lines
25 KiB
Diff
commit a7392db2ff2b9dd906500941ac6361dbe2211b0d
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Mon Nov 1 00:49:51 2021 -0500
|
|
|
|
x86: Optimize memmove-vec-unaligned-erms.S
|
|
|
|
No bug.
|
|
|
|
The optimizations are as follows:
|
|
|
|
1) Always align entry to 64 bytes. This makes behavior more
|
|
predictable and makes other frontend optimizations easier.
|
|
|
|
2) Make the L(more_8x_vec) cases 4k aliasing aware. This can have
|
|
significant benefits in the case that:
|
|
0 < (dst - src) < [256, 512]
|
|
|
|
3) Align before `rep movsb`. For ERMS this is roughly a [0, 30%]
|
|
improvement and for FSRM [-10%, 25%].
|
|
|
|
In addition to these primary changes there is general cleanup
|
|
throughout to optimize the aligning routines and control flow logic.
|
|
|
|
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
(cherry picked from commit a6b7502ec0c2da89a7437f43171f160d713e39c6)
|
|
|
|
diff --git a/sysdeps/x86_64/memmove.S b/sysdeps/x86_64/memmove.S
|
|
index db106a7a1f23f268..b2b318084823dceb 100644
|
|
--- a/sysdeps/x86_64/memmove.S
|
|
+++ b/sysdeps/x86_64/memmove.S
|
|
@@ -25,7 +25,7 @@
|
|
/* Use movups and movaps for smaller code sizes. */
|
|
#define VMOVU movups
|
|
#define VMOVA movaps
|
|
-
|
|
+#define MOV_SIZE 3
|
|
#define SECTION(p) p
|
|
|
|
#ifdef USE_MULTIARCH
|
|
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
|
|
index 1ec1962e861dbf63..67a55f0c85af841c 100644
|
|
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
|
|
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S
|
|
@@ -4,7 +4,7 @@
|
|
# define VMOVNT vmovntdq
|
|
# define VMOVU vmovdqu
|
|
# define VMOVA vmovdqa
|
|
-
|
|
+# define MOV_SIZE 4
|
|
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
|
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
|
|
index e195e93f153c9512..975ae6c0515b83cb 100644
|
|
--- a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms.S
|
|
@@ -4,7 +4,7 @@
|
|
# define VMOVNT vmovntdq
|
|
# define VMOVU vmovdqu
|
|
# define VMOVA vmovdqa
|
|
-
|
|
+# define MOV_SIZE 4
|
|
# define SECTION(p) p##.avx
|
|
# define MEMMOVE_SYMBOL(p,s) p##_avx_##s
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
index 848848ab39ff9326..0fa7126830af7acb 100644
|
|
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
|
@@ -25,7 +25,7 @@
|
|
# define VMOVU vmovdqu64
|
|
# define VMOVA vmovdqa64
|
|
# define VZEROUPPER
|
|
-
|
|
+# define MOV_SIZE 6
|
|
# define SECTION(p) p##.evex512
|
|
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
index 0cbce8f944da51a0..88715441feaaccf5 100644
|
|
--- a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
|
@@ -25,7 +25,7 @@
|
|
# define VMOVU vmovdqu64
|
|
# define VMOVA vmovdqa64
|
|
# define VZEROUPPER
|
|
-
|
|
+# define MOV_SIZE 6
|
|
# define SECTION(p) p##.evex
|
|
# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
index abde8438d41f2320..7b27cbdda5fb99f7 100644
|
|
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
|
@@ -76,6 +76,25 @@
|
|
# endif
|
|
#endif
|
|
|
|
+/* Whether to align before movsb. Ultimately we want 64 byte
|
|
+ align and not worth it to load 4x VEC for VEC_SIZE == 16. */
|
|
+#define ALIGN_MOVSB (VEC_SIZE > 16)
|
|
+/* Number of bytes to align movsb to. */
|
|
+#define MOVSB_ALIGN_TO 64
|
|
+
|
|
+#define SMALL_MOV_SIZE (MOV_SIZE <= 4)
|
|
+#define LARGE_MOV_SIZE (MOV_SIZE > 4)
|
|
+
|
|
+#if SMALL_MOV_SIZE + LARGE_MOV_SIZE != 1
|
|
+# error MOV_SIZE Unknown
|
|
+#endif
|
|
+
|
|
+#if LARGE_MOV_SIZE
|
|
+# define SMALL_SIZE_OFFSET (4)
|
|
+#else
|
|
+# define SMALL_SIZE_OFFSET (0)
|
|
+#endif
|
|
+
|
|
#ifndef PAGE_SIZE
|
|
# define PAGE_SIZE 4096
|
|
#endif
|
|
@@ -199,25 +218,21 @@ L(start):
|
|
# endif
|
|
cmp $VEC_SIZE, %RDX_LP
|
|
jb L(less_vec)
|
|
+ /* Load regardless. */
|
|
+ VMOVU (%rsi), %VEC(0)
|
|
cmp $(VEC_SIZE * 2), %RDX_LP
|
|
ja L(more_2x_vec)
|
|
-#if !defined USE_MULTIARCH || !IS_IN (libc)
|
|
-L(last_2x_vec):
|
|
-#endif
|
|
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
- VMOVU (%rsi), %VEC(0)
|
|
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
|
VMOVU %VEC(0), (%rdi)
|
|
VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
|
-#if !defined USE_MULTIARCH || !IS_IN (libc)
|
|
-L(nop):
|
|
- ret
|
|
+#if !(defined USE_MULTIARCH && IS_IN (libc))
|
|
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
#else
|
|
VZEROUPPER_RETURN
|
|
#endif
|
|
#if defined USE_MULTIARCH && IS_IN (libc)
|
|
END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
|
-
|
|
# if VEC_SIZE == 16
|
|
ENTRY (__mempcpy_chk_erms)
|
|
cmp %RDX_LP, %RCX_LP
|
|
@@ -289,7 +304,7 @@ ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
|
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
|
# endif
|
|
|
|
-ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
|
+ENTRY_P2ALIGN (MEMMOVE_SYMBOL (__memmove, unaligned_erms), 6)
|
|
movq %rdi, %rax
|
|
L(start_erms):
|
|
# ifdef __ILP32__
|
|
@@ -298,310 +313,448 @@ L(start_erms):
|
|
# endif
|
|
cmp $VEC_SIZE, %RDX_LP
|
|
jb L(less_vec)
|
|
+ /* Load regardless. */
|
|
+ VMOVU (%rsi), %VEC(0)
|
|
cmp $(VEC_SIZE * 2), %RDX_LP
|
|
ja L(movsb_more_2x_vec)
|
|
-L(last_2x_vec):
|
|
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
- VMOVU (%rsi), %VEC(0)
|
|
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1)
|
|
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
|
+ */
|
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(1)
|
|
VMOVU %VEC(0), (%rdi)
|
|
- VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx)
|
|
+ VMOVU %VEC(1), -VEC_SIZE(%rdi, %rdx)
|
|
L(return):
|
|
-#if VEC_SIZE > 16
|
|
+# if VEC_SIZE > 16
|
|
ZERO_UPPER_VEC_REGISTERS_RETURN
|
|
-#else
|
|
+# else
|
|
ret
|
|
+# endif
|
|
#endif
|
|
|
|
-L(movsb):
|
|
- cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
|
|
- jae L(more_8x_vec)
|
|
- cmpq %rsi, %rdi
|
|
- jb 1f
|
|
- /* Source == destination is less common. */
|
|
- je L(nop)
|
|
- leaq (%rsi,%rdx), %r9
|
|
- cmpq %r9, %rdi
|
|
- /* Avoid slow backward REP MOVSB. */
|
|
- jb L(more_8x_vec_backward)
|
|
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
|
|
- jz 3f
|
|
- movq %rdi, %rcx
|
|
- subq %rsi, %rcx
|
|
- jmp 2f
|
|
-# endif
|
|
-1:
|
|
-# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
- testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
|
|
- jz 3f
|
|
- movq %rsi, %rcx
|
|
- subq %rdi, %rcx
|
|
-2:
|
|
-/* Avoid "rep movsb" if RCX, the distance between source and destination,
|
|
- is N*4GB + [1..63] with N >= 0. */
|
|
- cmpl $63, %ecx
|
|
- jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
|
|
-3:
|
|
-# endif
|
|
- mov %RDX_LP, %RCX_LP
|
|
- rep movsb
|
|
-L(nop):
|
|
+#if LARGE_MOV_SIZE
|
|
+ /* If LARGE_MOV_SIZE this fits in the aligning bytes between the
|
|
+ ENTRY block and L(less_vec). */
|
|
+ .p2align 4,, 8
|
|
+L(between_4_7):
|
|
+ /* From 4 to 7. No branch when size == 4. */
|
|
+ movl (%rsi), %ecx
|
|
+ movl (%rsi, %rdx), %esi
|
|
+ movl %ecx, (%rdi)
|
|
+ movl %esi, (%rdi, %rdx)
|
|
ret
|
|
#endif
|
|
|
|
+ .p2align 4
|
|
L(less_vec):
|
|
/* Less than 1 VEC. */
|
|
#if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
|
# error Unsupported VEC_SIZE!
|
|
#endif
|
|
#if VEC_SIZE > 32
|
|
- cmpb $32, %dl
|
|
+ cmpl $32, %edx
|
|
jae L(between_32_63)
|
|
#endif
|
|
#if VEC_SIZE > 16
|
|
- cmpb $16, %dl
|
|
+ cmpl $16, %edx
|
|
jae L(between_16_31)
|
|
#endif
|
|
- cmpb $8, %dl
|
|
+ cmpl $8, %edx
|
|
jae L(between_8_15)
|
|
- cmpb $4, %dl
|
|
+#if SMALL_MOV_SIZE
|
|
+ cmpl $4, %edx
|
|
+#else
|
|
+ subq $4, %rdx
|
|
+#endif
|
|
jae L(between_4_7)
|
|
- cmpb $1, %dl
|
|
- ja L(between_2_3)
|
|
- jb 1f
|
|
- movzbl (%rsi), %ecx
|
|
+ cmpl $(1 - SMALL_SIZE_OFFSET), %edx
|
|
+ jl L(copy_0)
|
|
+ movb (%rsi), %cl
|
|
+ je L(copy_1)
|
|
+ movzwl (-2 + SMALL_SIZE_OFFSET)(%rsi, %rdx), %esi
|
|
+ movw %si, (-2 + SMALL_SIZE_OFFSET)(%rdi, %rdx)
|
|
+L(copy_1):
|
|
movb %cl, (%rdi)
|
|
-1:
|
|
+L(copy_0):
|
|
ret
|
|
+
|
|
+#if SMALL_MOV_SIZE
|
|
+ .p2align 4,, 8
|
|
+L(between_4_7):
|
|
+ /* From 4 to 7. No branch when size == 4. */
|
|
+ movl -4(%rsi, %rdx), %ecx
|
|
+ movl (%rsi), %esi
|
|
+ movl %ecx, -4(%rdi, %rdx)
|
|
+ movl %esi, (%rdi)
|
|
+ ret
|
|
+#endif
|
|
+
|
|
+#if VEC_SIZE > 16
|
|
+ /* From 16 to 31. No branch when size == 16. */
|
|
+ .p2align 4,, 8
|
|
+L(between_16_31):
|
|
+ vmovdqu (%rsi), %xmm0
|
|
+ vmovdqu -16(%rsi, %rdx), %xmm1
|
|
+ vmovdqu %xmm0, (%rdi)
|
|
+ vmovdqu %xmm1, -16(%rdi, %rdx)
|
|
+ /* No ymm registers have been touched. */
|
|
+ ret
|
|
+#endif
|
|
+
|
|
#if VEC_SIZE > 32
|
|
+ .p2align 4,, 10
|
|
L(between_32_63):
|
|
/* From 32 to 63. No branch when size == 32. */
|
|
VMOVU (%rsi), %YMM0
|
|
- VMOVU -32(%rsi,%rdx), %YMM1
|
|
+ VMOVU -32(%rsi, %rdx), %YMM1
|
|
VMOVU %YMM0, (%rdi)
|
|
- VMOVU %YMM1, -32(%rdi,%rdx)
|
|
- VZEROUPPER_RETURN
|
|
-#endif
|
|
-#if VEC_SIZE > 16
|
|
- /* From 16 to 31. No branch when size == 16. */
|
|
-L(between_16_31):
|
|
- VMOVU (%rsi), %XMM0
|
|
- VMOVU -16(%rsi,%rdx), %XMM1
|
|
- VMOVU %XMM0, (%rdi)
|
|
- VMOVU %XMM1, -16(%rdi,%rdx)
|
|
+ VMOVU %YMM1, -32(%rdi, %rdx)
|
|
VZEROUPPER_RETURN
|
|
#endif
|
|
+
|
|
+ .p2align 4,, 10
|
|
L(between_8_15):
|
|
/* From 8 to 15. No branch when size == 8. */
|
|
- movq -8(%rsi,%rdx), %rcx
|
|
+ movq -8(%rsi, %rdx), %rcx
|
|
movq (%rsi), %rsi
|
|
- movq %rcx, -8(%rdi,%rdx)
|
|
movq %rsi, (%rdi)
|
|
+ movq %rcx, -8(%rdi, %rdx)
|
|
ret
|
|
-L(between_4_7):
|
|
- /* From 4 to 7. No branch when size == 4. */
|
|
- movl -4(%rsi,%rdx), %ecx
|
|
- movl (%rsi), %esi
|
|
- movl %ecx, -4(%rdi,%rdx)
|
|
- movl %esi, (%rdi)
|
|
- ret
|
|
-L(between_2_3):
|
|
- /* From 2 to 3. No branch when size == 2. */
|
|
- movzwl -2(%rsi,%rdx), %ecx
|
|
- movzwl (%rsi), %esi
|
|
- movw %cx, -2(%rdi,%rdx)
|
|
- movw %si, (%rdi)
|
|
- ret
|
|
|
|
+ .p2align 4,, 10
|
|
+L(last_4x_vec):
|
|
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
|
|
+
|
|
+ /* VEC(0) and VEC(1) have already been loaded. */
|
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(2)
|
|
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(3)
|
|
+ VMOVU %VEC(0), (%rdi)
|
|
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
|
|
+ VMOVU %VEC(2), -VEC_SIZE(%rdi, %rdx)
|
|
+ VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
+ VZEROUPPER_RETURN
|
|
+
|
|
+ .p2align 4
|
|
#if defined USE_MULTIARCH && IS_IN (libc)
|
|
L(movsb_more_2x_vec):
|
|
cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
|
|
ja L(movsb)
|
|
#endif
|
|
L(more_2x_vec):
|
|
- /* More than 2 * VEC and there may be overlap between destination
|
|
- and source. */
|
|
+ /* More than 2 * VEC and there may be overlap between
|
|
+ destination and source. */
|
|
cmpq $(VEC_SIZE * 8), %rdx
|
|
ja L(more_8x_vec)
|
|
+ /* Load VEC(1) regardless. VEC(0) has already been loaded. */
|
|
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
cmpq $(VEC_SIZE * 4), %rdx
|
|
jbe L(last_4x_vec)
|
|
- /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
|
|
- VMOVU (%rsi), %VEC(0)
|
|
- VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
|
|
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(4)
|
|
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(5)
|
|
- VMOVU -(VEC_SIZE * 3)(%rsi,%rdx), %VEC(6)
|
|
- VMOVU -(VEC_SIZE * 4)(%rsi,%rdx), %VEC(7)
|
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(4)
|
|
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(5)
|
|
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(6)
|
|
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(7)
|
|
VMOVU %VEC(0), (%rdi)
|
|
VMOVU %VEC(1), VEC_SIZE(%rdi)
|
|
VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
- VMOVU %VEC(4), -VEC_SIZE(%rdi,%rdx)
|
|
- VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
- VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx)
|
|
- VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
|
- VZEROUPPER_RETURN
|
|
-L(last_4x_vec):
|
|
- /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
|
|
- VMOVU (%rsi), %VEC(0)
|
|
- VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
|
- VMOVU -(VEC_SIZE * 2)(%rsi,%rdx), %VEC(3)
|
|
- VMOVU %VEC(0), (%rdi)
|
|
- VMOVU %VEC(1), VEC_SIZE(%rdi)
|
|
- VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx)
|
|
- VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx)
|
|
+ VMOVU %VEC(4), -VEC_SIZE(%rdi, %rdx)
|
|
+ VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi, %rdx)
|
|
+ VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi, %rdx)
|
|
+ VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi, %rdx)
|
|
VZEROUPPER_RETURN
|
|
|
|
+ .p2align 4,, 4
|
|
L(more_8x_vec):
|
|
+ movq %rdi, %rcx
|
|
+ subq %rsi, %rcx
|
|
+ /* Go to backwards temporal copy if overlap no matter what as
|
|
+ backward REP MOVSB is slow and we don't want to use NT stores if
|
|
+ there is overlap. */
|
|
+ cmpq %rdx, %rcx
|
|
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
|
|
+ jb L(more_8x_vec_backward_check_nop)
|
|
/* Check if non-temporal move candidate. */
|
|
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
/* Check non-temporal store threshold. */
|
|
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
ja L(large_memcpy_2x)
|
|
#endif
|
|
- /* Entry if rdx is greater than non-temporal threshold but there
|
|
- is overlap. */
|
|
+ /* To reach this point there cannot be overlap and dst > src. So
|
|
+ check for overlap and src > dst in which case correctness
|
|
+ requires forward copy. Otherwise decide between backward/forward
|
|
+ copy depending on address aliasing. */
|
|
+
|
|
+ /* Entry if rdx is greater than __x86_rep_movsb_stop_threshold
|
|
+ but less than __x86_shared_non_temporal_threshold. */
|
|
L(more_8x_vec_check):
|
|
- cmpq %rsi, %rdi
|
|
- ja L(more_8x_vec_backward)
|
|
- /* Source == destination is less common. */
|
|
- je L(nop)
|
|
- /* Load the first VEC and last 4 * VEC to support overlapping
|
|
- addresses. */
|
|
- VMOVU (%rsi), %VEC(4)
|
|
+ /* rcx contains dst - src. Add back length (rdx). */
|
|
+ leaq (%rcx, %rdx), %r8
|
|
+ /* If r8 has different sign than rcx then there is overlap so we
|
|
+ must do forward copy. */
|
|
+ xorq %rcx, %r8
|
|
+ /* Isolate just sign bit of r8. */
|
|
+ shrq $63, %r8
|
|
+ /* Get 4k difference dst - src. */
|
|
+ andl $(PAGE_SIZE - 256), %ecx
|
|
+ /* If r8 is non-zero must do forward for correctness. Otherwise
|
|
+ if ecx is zero there is 4k False Aliasing so do backward
|
|
+ copy. */
|
|
+ addl %r8d, %ecx
|
|
+ jz L(more_8x_vec_backward)
|
|
+
|
|
+ /* if rdx is greater than __x86_shared_non_temporal_threshold
|
|
+ but there is overlap, or from short distance movsb. */
|
|
+L(more_8x_vec_forward):
|
|
+ /* Load first and last 4 * VEC to support overlapping addresses.
|
|
+ */
|
|
+
|
|
+ /* First vec was already loaded into VEC(0). */
|
|
VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(5)
|
|
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(6)
|
|
+ /* Save beginning of dst. */
|
|
+ movq %rdi, %rcx
|
|
+ /* Align dst to VEC_SIZE - 1. */
|
|
+ orq $(VEC_SIZE - 1), %rdi
|
|
VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(7)
|
|
VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(8)
|
|
- /* Save start and stop of the destination buffer. */
|
|
- movq %rdi, %r11
|
|
- leaq -VEC_SIZE(%rdi, %rdx), %rcx
|
|
- /* Align destination for aligned stores in the loop. Compute
|
|
- how much destination is misaligned. */
|
|
- movq %rdi, %r8
|
|
- andq $(VEC_SIZE - 1), %r8
|
|
- /* Get the negative of offset for alignment. */
|
|
- subq $VEC_SIZE, %r8
|
|
- /* Adjust source. */
|
|
- subq %r8, %rsi
|
|
- /* Adjust destination which should be aligned now. */
|
|
- subq %r8, %rdi
|
|
- /* Adjust length. */
|
|
- addq %r8, %rdx
|
|
|
|
- .p2align 4
|
|
+ /* Subtract dst from src. Add back after dst aligned. */
|
|
+ subq %rcx, %rsi
|
|
+ /* Finish aligning dst. */
|
|
+ incq %rdi
|
|
+ /* Restore src adjusted with new value for aligned dst. */
|
|
+ addq %rdi, %rsi
|
|
+ /* Store end of buffer minus tail in rdx. */
|
|
+ leaq (VEC_SIZE * -4)(%rcx, %rdx), %rdx
|
|
+
|
|
+ /* Don't use multi-byte nop to align. */
|
|
+ .p2align 4,, 11
|
|
L(loop_4x_vec_forward):
|
|
/* Copy 4 * VEC a time forward. */
|
|
- VMOVU (%rsi), %VEC(0)
|
|
- VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
+ VMOVU (%rsi), %VEC(1)
|
|
+ VMOVU VEC_SIZE(%rsi), %VEC(2)
|
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(3)
|
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(4)
|
|
subq $-(VEC_SIZE * 4), %rsi
|
|
- addq $-(VEC_SIZE * 4), %rdx
|
|
- VMOVA %VEC(0), (%rdi)
|
|
- VMOVA %VEC(1), VEC_SIZE(%rdi)
|
|
- VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
- VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
+ VMOVA %VEC(1), (%rdi)
|
|
+ VMOVA %VEC(2), VEC_SIZE(%rdi)
|
|
+ VMOVA %VEC(3), (VEC_SIZE * 2)(%rdi)
|
|
+ VMOVA %VEC(4), (VEC_SIZE * 3)(%rdi)
|
|
subq $-(VEC_SIZE * 4), %rdi
|
|
- cmpq $(VEC_SIZE * 4), %rdx
|
|
+ cmpq %rdi, %rdx
|
|
ja L(loop_4x_vec_forward)
|
|
/* Store the last 4 * VEC. */
|
|
- VMOVU %VEC(5), (%rcx)
|
|
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
|
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
|
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
|
+ VMOVU %VEC(5), (VEC_SIZE * 3)(%rdx)
|
|
+ VMOVU %VEC(6), (VEC_SIZE * 2)(%rdx)
|
|
+ VMOVU %VEC(7), VEC_SIZE(%rdx)
|
|
+ VMOVU %VEC(8), (%rdx)
|
|
/* Store the first VEC. */
|
|
- VMOVU %VEC(4), (%r11)
|
|
+ VMOVU %VEC(0), (%rcx)
|
|
+ /* Keep L(nop_backward) target close to jmp for 2-byte encoding.
|
|
+ */
|
|
+L(nop_backward):
|
|
VZEROUPPER_RETURN
|
|
|
|
+ .p2align 4,, 8
|
|
+L(more_8x_vec_backward_check_nop):
|
|
+ /* rcx contains dst - src. Test for dst == src to skip all of
|
|
+ memmove. */
|
|
+ testq %rcx, %rcx
|
|
+ jz L(nop_backward)
|
|
L(more_8x_vec_backward):
|
|
/* Load the first 4 * VEC and last VEC to support overlapping
|
|
addresses. */
|
|
- VMOVU (%rsi), %VEC(4)
|
|
+
|
|
+ /* First vec was also loaded into VEC(0). */
|
|
VMOVU VEC_SIZE(%rsi), %VEC(5)
|
|
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(6)
|
|
+ /* Beginning of region for 4x backward copy stored in rcx. */
|
|
+ leaq (VEC_SIZE * -4 + -1)(%rdi, %rdx), %rcx
|
|
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(7)
|
|
- VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(8)
|
|
- /* Save stop of the destination buffer. */
|
|
- leaq -VEC_SIZE(%rdi, %rdx), %r11
|
|
- /* Align destination end for aligned stores in the loop. Compute
|
|
- how much destination end is misaligned. */
|
|
- leaq -VEC_SIZE(%rsi, %rdx), %rcx
|
|
- movq %r11, %r9
|
|
- movq %r11, %r8
|
|
- andq $(VEC_SIZE - 1), %r8
|
|
- /* Adjust source. */
|
|
- subq %r8, %rcx
|
|
- /* Adjust the end of destination which should be aligned now. */
|
|
- subq %r8, %r9
|
|
- /* Adjust length. */
|
|
- subq %r8, %rdx
|
|
-
|
|
- .p2align 4
|
|
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(8)
|
|
+ /* Subtract dst from src. Add back after dst aligned. */
|
|
+ subq %rdi, %rsi
|
|
+ /* Align dst. */
|
|
+ andq $-(VEC_SIZE), %rcx
|
|
+ /* Restore src. */
|
|
+ addq %rcx, %rsi
|
|
+
|
|
+ /* Don't use multi-byte nop to align. */
|
|
+ .p2align 4,, 11
|
|
L(loop_4x_vec_backward):
|
|
/* Copy 4 * VEC a time backward. */
|
|
- VMOVU (%rcx), %VEC(0)
|
|
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
|
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
|
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
|
- addq $-(VEC_SIZE * 4), %rcx
|
|
- addq $-(VEC_SIZE * 4), %rdx
|
|
- VMOVA %VEC(0), (%r9)
|
|
- VMOVA %VEC(1), -VEC_SIZE(%r9)
|
|
- VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
|
- VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
|
- addq $-(VEC_SIZE * 4), %r9
|
|
- cmpq $(VEC_SIZE * 4), %rdx
|
|
- ja L(loop_4x_vec_backward)
|
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(1)
|
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
+ VMOVU (VEC_SIZE * 1)(%rsi), %VEC(3)
|
|
+ VMOVU (VEC_SIZE * 0)(%rsi), %VEC(4)
|
|
+ addq $(VEC_SIZE * -4), %rsi
|
|
+ VMOVA %VEC(1), (VEC_SIZE * 3)(%rcx)
|
|
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rcx)
|
|
+ VMOVA %VEC(3), (VEC_SIZE * 1)(%rcx)
|
|
+ VMOVA %VEC(4), (VEC_SIZE * 0)(%rcx)
|
|
+ addq $(VEC_SIZE * -4), %rcx
|
|
+ cmpq %rcx, %rdi
|
|
+ jb L(loop_4x_vec_backward)
|
|
/* Store the first 4 * VEC. */
|
|
- VMOVU %VEC(4), (%rdi)
|
|
+ VMOVU %VEC(0), (%rdi)
|
|
VMOVU %VEC(5), VEC_SIZE(%rdi)
|
|
VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
|
VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
|
/* Store the last VEC. */
|
|
- VMOVU %VEC(8), (%r11)
|
|
+ VMOVU %VEC(8), -VEC_SIZE(%rdx, %rdi)
|
|
+ VZEROUPPER_RETURN
|
|
+
|
|
+#if defined USE_MULTIARCH && IS_IN (libc)
|
|
+ /* L(skip_short_movsb_check) is only used with ERMS. Not for
|
|
+ FSRM. */
|
|
+ .p2align 5,, 16
|
|
+# if ALIGN_MOVSB
|
|
+L(skip_short_movsb_check):
|
|
+# if MOVSB_ALIGN_TO > VEC_SIZE
|
|
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
+# endif
|
|
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
|
|
+# error Unsupported MOVSB_ALIGN_TO
|
|
+# endif
|
|
+ /* If CPU does not have FSRM two options for aligning. Align src
|
|
+ if dst and src 4k alias. Otherwise align dst. */
|
|
+ testl $(PAGE_SIZE - 512), %ecx
|
|
+ jnz L(movsb_align_dst)
|
|
+ /* Fall through. dst and src 4k alias. It's better to align src
|
|
+ here because the bottleneck will be loads due to the false
|
|
+ dependency on dst. */
|
|
+
|
|
+ /* rcx already has dst - src. */
|
|
+ movq %rcx, %r9
|
|
+ /* Add src to len. Subtract back after src aligned. -1 because
|
|
+ src is initially aligned to MOVSB_ALIGN_TO - 1. */
|
|
+ leaq -1(%rsi, %rdx), %rcx
|
|
+ /* Inclusively align src to MOVSB_ALIGN_TO - 1. */
|
|
+ orq $(MOVSB_ALIGN_TO - 1), %rsi
|
|
+ /* Restore dst and len adjusted with new values for aligned src.
|
|
+ */
|
|
+ leaq 1(%rsi, %r9), %rdi
|
|
+ subq %rsi, %rcx
|
|
+ /* Finish aligning src. */
|
|
+ incq %rsi
|
|
+
|
|
+ rep movsb
|
|
+
|
|
+ VMOVU %VEC(0), (%r8)
|
|
+# if MOVSB_ALIGN_TO > VEC_SIZE
|
|
+ VMOVU %VEC(1), VEC_SIZE(%r8)
|
|
+# endif
|
|
VZEROUPPER_RETURN
|
|
+# endif
|
|
+
|
|
+ .p2align 4,, 12
|
|
+L(movsb):
|
|
+ movq %rdi, %rcx
|
|
+ subq %rsi, %rcx
|
|
+ /* Go to backwards temporal copy if overlap no matter what as
|
|
+ backward REP MOVSB is slow and we don't want to use NT stores if
|
|
+ there is overlap. */
|
|
+ cmpq %rdx, %rcx
|
|
+ /* L(more_8x_vec_backward_check_nop) checks for src == dst. */
|
|
+ jb L(more_8x_vec_backward_check_nop)
|
|
+# if ALIGN_MOVSB
|
|
+ /* Save dest for storing aligning VECs later. */
|
|
+ movq %rdi, %r8
|
|
+# endif
|
|
+ /* If above __x86_rep_movsb_stop_threshold most likely is
|
|
+ candidate for NT moves as well. */
|
|
+ cmp __x86_rep_movsb_stop_threshold(%rip), %RDX_LP
|
|
+ jae L(large_memcpy_2x_check)
|
|
+# if AVOID_SHORT_DISTANCE_REP_MOVSB || ALIGN_MOVSB
|
|
+ /* Only avoid short movsb if CPU has FSRM. */
|
|
+ testl $X86_STRING_CONTROL_AVOID_SHORT_DISTANCE_REP_MOVSB, __x86_string_control(%rip)
|
|
+ jz L(skip_short_movsb_check)
|
|
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
|
+ /* Avoid "rep movsb" if RCX, the distance between source and
|
|
+ destination, is N*4GB + [1..63] with N >= 0. */
|
|
+
|
|
+ /* ecx contains dst - src. Early check for backward copy
|
|
+ conditions means only case of slow movsb with src = dst + [0,
|
|
+ 63] is ecx in [-63, 0]. Use unsigned comparison with -64 check
|
|
+ for that case. */
|
|
+ cmpl $-64, %ecx
|
|
+ ja L(more_8x_vec_forward)
|
|
+# endif
|
|
+# endif
|
|
+# if ALIGN_MOVSB
|
|
+# if MOVSB_ALIGN_TO > VEC_SIZE
|
|
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
+# endif
|
|
+# if MOVSB_ALIGN_TO > (VEC_SIZE * 2)
|
|
+# error Unsupported MOVSB_ALIGN_TO
|
|
+# endif
|
|
+ /* Fall through means cpu has FSRM. In that case exclusively
|
|
+ align destination. */
|
|
+L(movsb_align_dst):
|
|
+ /* Subtract dst from src. Add back after dst aligned. */
|
|
+ subq %rdi, %rsi
|
|
+ /* Exclusively align dst to MOVSB_ALIGN_TO (64). */
|
|
+ addq $(MOVSB_ALIGN_TO - 1), %rdi
|
|
+ /* Add dst to len. Subtract back after dst aligned. */
|
|
+ leaq (%r8, %rdx), %rcx
|
|
+ /* Finish aligning dst. */
|
|
+ andq $-(MOVSB_ALIGN_TO), %rdi
|
|
+ /* Restore src and len adjusted with new values for aligned dst.
|
|
+ */
|
|
+ addq %rdi, %rsi
|
|
+ subq %rdi, %rcx
|
|
+
|
|
+ rep movsb
|
|
+
|
|
+ /* Store VECs loaded for aligning. */
|
|
+ VMOVU %VEC(0), (%r8)
|
|
+# if MOVSB_ALIGN_TO > VEC_SIZE
|
|
+ VMOVU %VEC(1), VEC_SIZE(%r8)
|
|
+# endif
|
|
+ VZEROUPPER_RETURN
|
|
+# else /* !ALIGN_MOVSB. */
|
|
+L(skip_short_movsb_check):
|
|
+ mov %RDX_LP, %RCX_LP
|
|
+ rep movsb
|
|
+ ret
|
|
+# endif
|
|
+#endif
|
|
|
|
+ .p2align 4,, 10
|
|
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
|
- .p2align 4
|
|
+L(large_memcpy_2x_check):
|
|
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
|
+ jb L(more_8x_vec_check)
|
|
L(large_memcpy_2x):
|
|
- /* Compute absolute value of difference between source and
|
|
- destination. */
|
|
- movq %rdi, %r9
|
|
- subq %rsi, %r9
|
|
- movq %r9, %r8
|
|
- leaq -1(%r9), %rcx
|
|
- sarq $63, %r8
|
|
- xorq %r8, %r9
|
|
- subq %r8, %r9
|
|
- /* Don't use non-temporal store if there is overlap between
|
|
- destination and source since destination may be in cache when
|
|
- source is loaded. */
|
|
- cmpq %r9, %rdx
|
|
- ja L(more_8x_vec_check)
|
|
+ /* To reach this point it is impossible for dst > src and
|
|
+ overlap. Remaining to check is src > dst and overlap. rcx
|
|
+ already contains dst - src. Negate rcx to get src - dst. If
|
|
+ length > rcx then there is overlap and forward copy is best. */
|
|
+ negq %rcx
|
|
+ cmpq %rcx, %rdx
|
|
+ ja L(more_8x_vec_forward)
|
|
|
|
/* Cache align destination. First store the first 64 bytes then
|
|
adjust alignments. */
|
|
- VMOVU (%rsi), %VEC(8)
|
|
-#if VEC_SIZE < 64
|
|
- VMOVU VEC_SIZE(%rsi), %VEC(9)
|
|
-#if VEC_SIZE < 32
|
|
- VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
|
|
- VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
|
|
-#endif
|
|
-#endif
|
|
- VMOVU %VEC(8), (%rdi)
|
|
-#if VEC_SIZE < 64
|
|
- VMOVU %VEC(9), VEC_SIZE(%rdi)
|
|
-#if VEC_SIZE < 32
|
|
- VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
|
|
- VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
|
|
-#endif
|
|
-#endif
|
|
+
|
|
+ /* First vec was also loaded into VEC(0). */
|
|
+# if VEC_SIZE < 64
|
|
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
|
+# if VEC_SIZE < 32
|
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
|
+# endif
|
|
+# endif
|
|
+ VMOVU %VEC(0), (%rdi)
|
|
+# if VEC_SIZE < 64
|
|
+ VMOVU %VEC(1), VEC_SIZE(%rdi)
|
|
+# if VEC_SIZE < 32
|
|
+ VMOVU %VEC(2), (VEC_SIZE * 2)(%rdi)
|
|
+ VMOVU %VEC(3), (VEC_SIZE * 3)(%rdi)
|
|
+# endif
|
|
+# endif
|
|
+
|
|
/* Adjust source, destination, and size. */
|
|
movq %rdi, %r8
|
|
andq $63, %r8
|
|
@@ -614,9 +767,13 @@ L(large_memcpy_2x):
|
|
/* Adjust length. */
|
|
addq %r8, %rdx
|
|
|
|
- /* Test if source and destination addresses will alias. If they do
|
|
- the larger pipeline in large_memcpy_4x alleviated the
|
|
+ /* Test if source and destination addresses will alias. If they
|
|
+ do the larger pipeline in large_memcpy_4x alleviated the
|
|
performance drop. */
|
|
+
|
|
+ /* ecx contains -(dst - src). not ecx will return dst - src - 1
|
|
+ which works for testing aliasing. */
|
|
+ notl %ecx
|
|
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
|
|
jz L(large_memcpy_4x)
|
|
|
|
@@ -704,8 +861,8 @@ L(loop_large_memcpy_4x_outer):
|
|
/* ecx stores inner loop counter. */
|
|
movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
|
L(loop_large_memcpy_4x_inner):
|
|
- /* Only one prefetch set per page as doing 4 pages give more time
|
|
- for prefetcher to keep up. */
|
|
+ /* Only one prefetch set per page as doing 4 pages give more
|
|
+ time for prefetcher to keep up. */
|
|
PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
|
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
|
PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
|