601650f878
* Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35 - Sync with upstream branch release/2.34/master, commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: - Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose - x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] - string.h: fix __fortified_attr_access macro call [BZ #29162] - linux: Add a getauxval test [BZ #23293] - rtld: Use generic argv adjustment in ld.so [BZ #23293] - S390: Enable static PIE * Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34 - Sync with upstream branch release/2.34/master, commit ede8d94d154157d269b18f3601440ac576c1f96a: - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str|wcs}rchr-evex - x86: Optimize {str|wcs}rchr-avx2 - x86: Optimize {str|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S - x86: Optimize 
str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x86: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] * Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33 - Sync with upstream branch release/2.34/master, commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h Resolves: #2091541
452 lines
13 KiB
Diff
452 lines
13 KiB
Diff
commit ea19c490a3f5628d55ded271cbb753e66b2f05e8
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Sun Feb 6 00:54:18 2022 -0600
|
|
|
|
x86: Improve vec generation in memset-vec-unaligned-erms.S
|
|
|
|
No bug.
|
|
|
|
Split vec generation into multiple steps. This allows the
|
|
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
|
|
case. This saves an expensive lane-cross instruction and removes
|
|
the need for 'vzeroupper'.
|
|
|
|
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
|
|
byte broadcast.
|
|
|
|
Results for memset-avx2 small (geomean of N = 20 benchset runs).
|
|
|
|
size, New Time, Old Time, New / Old
|
|
0, 4.100, 3.831, 0.934
|
|
1, 5.074, 4.399, 0.867
|
|
2, 4.433, 4.411, 0.995
|
|
4, 4.487, 4.415, 0.984
|
|
8, 4.454, 4.396, 0.987
|
|
16, 4.502, 4.443, 0.987
|
|
|
|
All relevant string/wcsmbs tests are passing.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d)
|
|
|
|
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
|
index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644
|
|
--- a/sysdeps/x86_64/memset.S
|
|
+++ b/sysdeps/x86_64/memset.S
|
|
@@ -28,17 +28,22 @@
|
|
#define VMOVU movups
|
|
#define VMOVA movaps
|
|
|
|
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
movd d, %xmm0; \
|
|
- movq r, %rax; \
|
|
- punpcklbw %xmm0, %xmm0; \
|
|
- punpcklwd %xmm0, %xmm0; \
|
|
- pshufd $0, %xmm0, %xmm0
|
|
+ pxor %xmm1, %xmm1; \
|
|
+ pshufb %xmm1, %xmm0; \
|
|
+ movq r, %rax
|
|
|
|
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
movd d, %xmm0; \
|
|
- movq r, %rax; \
|
|
- pshufd $0, %xmm0, %xmm0
|
|
+ pshufd $0, %xmm0, %xmm0; \
|
|
+ movq r, %rax
|
|
+
|
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
+
|
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
|
#define SECTION(p) p
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
index 1af668af0aeda59e..c0bf2875d03d51ab 100644
|
|
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
|
@@ -10,15 +10,18 @@
|
|
# define VMOVU vmovdqu
|
|
# define VMOVA vmovdqa
|
|
|
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
vmovd d, %xmm0; \
|
|
- movq r, %rax; \
|
|
- vpbroadcastb %xmm0, %ymm0
|
|
+ movq r, %rax;
|
|
|
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
- vmovd d, %xmm0; \
|
|
- movq r, %rax; \
|
|
- vpbroadcastd %xmm0, %ymm0
|
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
|
+
|
|
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
|
|
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
|
|
+
|
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
|
|
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
|
|
|
|
# ifndef SECTION
|
|
# define SECTION(p) p##.avx
|
|
@@ -30,5 +33,6 @@
|
|
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
|
# endif
|
|
|
|
+# define USE_XMM_LESS_VEC
|
|
# include "memset-vec-unaligned-erms.S"
|
|
#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
index f14d6f8493c21a36..5241216a77bf72b7 100644
|
|
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
|
@@ -15,13 +15,19 @@
|
|
|
|
# define VZEROUPPER
|
|
|
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
- movq r, %rax; \
|
|
- vpbroadcastb d, %VEC0
|
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
+ vpbroadcastb d, %VEC0; \
|
|
+ movq r, %rax
|
|
|
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
- movq r, %rax; \
|
|
- vpbroadcastd d, %VEC0
|
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
+ vpbroadcastd d, %VEC0; \
|
|
+ movq r, %rax
|
|
+
|
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
+
|
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
|
# define SECTION(p) p##.evex512
|
|
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
|
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
index 64b09e77cc20cc42..637002150659123c 100644
|
|
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
|
@@ -15,13 +15,19 @@
|
|
|
|
# define VZEROUPPER
|
|
|
|
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
- movq r, %rax; \
|
|
- vpbroadcastb d, %VEC0
|
|
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
+ vpbroadcastb d, %VEC0; \
|
|
+ movq r, %rax
|
|
|
|
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
|
- movq r, %rax; \
|
|
- vpbroadcastd d, %VEC0
|
|
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
|
+ vpbroadcastd d, %VEC0; \
|
|
+ movq r, %rax
|
|
+
|
|
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
|
+
|
|
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
|
|
|
# define SECTION(p) p##.evex
|
|
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
|
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
index e723413a664c088f..c8db87dcbf69f0d8 100644
|
|
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
|
@@ -58,8 +58,10 @@
|
|
#ifndef MOVQ
|
|
# if VEC_SIZE > 16
|
|
# define MOVQ vmovq
|
|
+# define MOVD vmovd
|
|
# else
|
|
# define MOVQ movq
|
|
+# define MOVD movd
|
|
# endif
|
|
#endif
|
|
|
|
@@ -72,9 +74,17 @@
|
|
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
# define END_REG rcx
|
|
# define LOOP_REG rdi
|
|
+# define LESS_VEC_REG rax
|
|
#else
|
|
# define END_REG rdi
|
|
# define LOOP_REG rdx
|
|
+# define LESS_VEC_REG rdi
|
|
+#endif
|
|
+
|
|
+#ifdef USE_XMM_LESS_VEC
|
|
+# define XMM_SMALL 1
|
|
+#else
|
|
+# define XMM_SMALL 0
|
|
#endif
|
|
|
|
#define PAGE_SIZE 4096
|
|
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
|
|
|
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
shl $2, %RDX_LP
|
|
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
- jmp L(entry_from_bzero)
|
|
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
+ WMEMSET_VDUP_TO_VEC0_LOW()
|
|
+ cmpq $VEC_SIZE, %rdx
|
|
+ jb L(less_vec_no_vdup)
|
|
+ WMEMSET_VDUP_TO_VEC0_HIGH()
|
|
+ jmp L(entry_from_wmemset)
|
|
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
|
#endif
|
|
|
|
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
|
#endif
|
|
|
|
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
# ifdef __ILP32__
|
|
/* Clear the upper 32 bits. */
|
|
mov %edx, %edx
|
|
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
|
L(entry_from_bzero):
|
|
cmpq $VEC_SIZE, %rdx
|
|
jb L(less_vec)
|
|
+ MEMSET_VDUP_TO_VEC0_HIGH()
|
|
+L(entry_from_wmemset):
|
|
cmpq $(VEC_SIZE * 2), %rdx
|
|
ja L(more_2x_vec)
|
|
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
|
# endif
|
|
|
|
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
|
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
|
# ifdef __ILP32__
|
|
/* Clear the upper 32 bits. */
|
|
mov %edx, %edx
|
|
# endif
|
|
cmp $VEC_SIZE, %RDX_LP
|
|
jb L(less_vec)
|
|
+ MEMSET_VDUP_TO_VEC0_HIGH ()
|
|
cmp $(VEC_SIZE * 2), %RDX_LP
|
|
ja L(stosb_more_2x_vec)
|
|
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
|
- */
|
|
- VMOVU %VEC(0), (%rax)
|
|
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
|
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
|
+ VMOVU %VEC(0), (%rdi)
|
|
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
|
VZEROUPPER_RETURN
|
|
#endif
|
|
|
|
- .p2align 4,, 10
|
|
+ .p2align 4,, 4
|
|
L(last_2x_vec):
|
|
#ifdef USE_LESS_VEC_MASK_STORE
|
|
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
|
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
|
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
|
|
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
|
#else
|
|
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
|
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
|
@@ -212,6 +228,7 @@ L(last_2x_vec):
|
|
#ifdef USE_LESS_VEC_MASK_STORE
|
|
.p2align 4,, 10
|
|
L(less_vec):
|
|
+L(less_vec_no_vdup):
|
|
/* Less than 1 VEC. */
|
|
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
|
# error Unsupported VEC_SIZE!
|
|
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
|
|
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
|
and (4x, 8x] jump to target. */
|
|
L(more_2x_vec):
|
|
-
|
|
- /* Two different methods of setting up pointers / compare. The
|
|
- two methods are based on the fact that EVEX/AVX512 mov
|
|
- instructions take more bytes then AVX2/SSE2 mov instructions. As
|
|
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
|
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
|
- this saves code size and keeps a few targets in one fetch block.
|
|
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
|
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
|
- LOOP_4X_OFFSET) with LEA_BID. */
|
|
-
|
|
- /* END_REG is rcx for EVEX/AVX512. */
|
|
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
|
-#endif
|
|
-
|
|
- /* Stores to first 2x VEC before cmp as any path forward will
|
|
- require it. */
|
|
- VMOVU %VEC(0), (%rax)
|
|
- VMOVU %VEC(0), VEC_SIZE(%rax)
|
|
+ /* Store next 2x vec regardless. */
|
|
+ VMOVU %VEC(0), (%rdi)
|
|
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
|
|
|
|
|
|
+ /* Two different methods of setting up pointers / compare. The two
|
|
+ methods are based on the fact that EVEX/AVX512 mov instructions take
|
|
+ more bytes than AVX2/SSE2 mov instructions. As well that EVEX/AVX512
|
|
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
|
|
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
|
|
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
|
|
+ bottlenecks. */
|
|
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
|
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
|
addq %rdx, %END_REG
|
|
@@ -292,6 +299,15 @@ L(more_2x_vec):
|
|
cmpq $(VEC_SIZE * 4), %rdx
|
|
jbe L(last_2x_vec)
|
|
|
|
+
|
|
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
|
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
|
|
+ LEA_BID. */
|
|
+
|
|
+ /* END_REG is rcx for EVEX/AVX512. */
|
|
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
|
+#endif
|
|
+
|
|
/* Store next 2x vec regardless. */
|
|
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
|
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
|
@@ -355,65 +371,93 @@ L(stosb_local):
|
|
/* Define L(less_vec) only if not otherwise defined. */
|
|
.p2align 4
|
|
L(less_vec):
|
|
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
|
|
+ xmm). This only does anything for AVX2. */
|
|
+ MEMSET_VDUP_TO_VEC0_LOW ()
|
|
+L(less_vec_no_vdup):
|
|
#endif
|
|
L(cross_page):
|
|
#if VEC_SIZE > 32
|
|
cmpl $32, %edx
|
|
- jae L(between_32_63)
|
|
+ jge L(between_32_63)
|
|
#endif
|
|
#if VEC_SIZE > 16
|
|
cmpl $16, %edx
|
|
- jae L(between_16_31)
|
|
+ jge L(between_16_31)
|
|
+#endif
|
|
+#ifndef USE_XMM_LESS_VEC
|
|
+ MOVQ %XMM0, %rcx
|
|
#endif
|
|
- MOVQ %XMM0, %rdi
|
|
cmpl $8, %edx
|
|
- jae L(between_8_15)
|
|
+ jge L(between_8_15)
|
|
cmpl $4, %edx
|
|
- jae L(between_4_7)
|
|
+ jge L(between_4_7)
|
|
cmpl $1, %edx
|
|
- ja L(between_2_3)
|
|
- jb L(return)
|
|
- movb %sil, (%rax)
|
|
- VZEROUPPER_RETURN
|
|
+ jg L(between_2_3)
|
|
+ jl L(between_0_0)
|
|
+ movb %sil, (%LESS_VEC_REG)
|
|
+L(between_0_0):
|
|
+ ret
|
|
|
|
- /* Align small targets only if not doing so would cross a fetch
|
|
- line. */
|
|
+ /* Align small targets only if not doing so would cross a fetch line.
|
|
+ */
|
|
#if VEC_SIZE > 32
|
|
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
|
/* From 32 to 63. No branch when size == 32. */
|
|
L(between_32_63):
|
|
- VMOVU %YMM0, (%rax)
|
|
- VMOVU %YMM0, -32(%rax, %rdx)
|
|
+ VMOVU %YMM0, (%LESS_VEC_REG)
|
|
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
|
|
VZEROUPPER_RETURN
|
|
#endif
|
|
|
|
#if VEC_SIZE >= 32
|
|
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
|
|
L(between_16_31):
|
|
/* From 16 to 31. No branch when size == 16. */
|
|
- VMOVU %XMM0, (%rax)
|
|
- VMOVU %XMM0, -16(%rax, %rdx)
|
|
- VZEROUPPER_RETURN
|
|
+ VMOVU %XMM0, (%LESS_VEC_REG)
|
|
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
|
|
+ ret
|
|
#endif
|
|
|
|
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
|
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
|
+ */
|
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
|
|
L(between_8_15):
|
|
/* From 8 to 15. No branch when size == 8. */
|
|
- movq %rdi, (%rax)
|
|
- movq %rdi, -8(%rax, %rdx)
|
|
- VZEROUPPER_RETURN
|
|
+#ifdef USE_XMM_LESS_VEC
|
|
+ MOVQ %XMM0, (%rdi)
|
|
+ MOVQ %XMM0, -8(%rdi, %rdx)
|
|
+#else
|
|
+ movq %rcx, (%LESS_VEC_REG)
|
|
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
|
+#endif
|
|
+ ret
|
|
|
|
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
|
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
|
+ */
|
|
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
|
|
L(between_4_7):
|
|
/* From 4 to 7. No branch when size == 4. */
|
|
- movl %edi, (%rax)
|
|
- movl %edi, -4(%rax, %rdx)
|
|
- VZEROUPPER_RETURN
|
|
+#ifdef USE_XMM_LESS_VEC
|
|
+ MOVD %XMM0, (%rdi)
|
|
+ MOVD %XMM0, -4(%rdi, %rdx)
|
|
+#else
|
|
+ movl %ecx, (%LESS_VEC_REG)
|
|
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
|
+#endif
|
|
+ ret
|
|
|
|
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
|
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
|
|
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
|
|
L(between_2_3):
|
|
/* From 2 to 3. No branch when size == 2. */
|
|
- movw %di, (%rax)
|
|
- movb %dil, -1(%rax, %rdx)
|
|
- VZEROUPPER_RETURN
|
|
+#ifdef USE_XMM_LESS_VEC
|
|
+ movb %sil, (%rdi)
|
|
+ movb %sil, 1(%rdi)
|
|
+ movb %sil, -1(%rdi, %rdx)
|
|
+#else
|
|
+ movw %cx, (%LESS_VEC_REG)
|
|
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
|
|
+#endif
|
|
+ ret
|
|
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|