601650f878
* Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35 - Sync with upstream branch release/2.34/master, commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: - Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose - x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] - string.h: fix __fortified_attr_access macro call [BZ #29162] - linux: Add a getauxval test [BZ #23293] - rtld: Use generic argv adjustment in ld.so [BZ #23293] - S390: Enable static PIE * Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34 - Sync with upstream branch release/2.34/master, commit ede8d94d154157d269b18f3601440ac576c1f96a: - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str|wcs}rchr-evex - x86: Optimize {str|wcs}rchr-avx2 - x86: Optimize {str|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S - x86: Optimize 
str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x87: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] * Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33 - Sync with upstream branch release/2.34/master, commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h Resolves: #2091541
866 lines
17 KiB
Diff
866 lines
17 KiB
Diff
commit 0a11305416e287d85c64f04337cfd64b6b350e0c
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Thu Apr 21 20:52:28 2022 -0500
|
|
|
|
x86: Optimize {str|wcs}rchr-sse2
|
|
|
|
The new code unrolls the main loop slightly without adding too much
|
|
overhead and minimizes the comparisons for the search CHAR.
|
|
|
|
Geometric Mean of all benchmarks New / Old: 0.741
|
|
See email for all results.
|
|
|
|
Full xcheck passes on x86_64 with and without multiarch enabled.
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
|
index 67c30d0260cef8a3..a56300bc1830dedd 100644
|
|
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
|
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
|
@@ -17,7 +17,7 @@
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#if IS_IN (libc)
|
|
-# define strrchr __strrchr_sse2
|
|
+# define STRRCHR __strrchr_sse2
|
|
|
|
# undef weak_alias
|
|
# define weak_alias(strrchr, rindex)
|
|
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
|
index a36034b40afe8d3d..00f69f2be77a43a0 100644
|
|
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
|
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
|
@@ -17,7 +17,6 @@
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
#if IS_IN (libc)
|
|
-# define wcsrchr __wcsrchr_sse2
|
|
+# define STRRCHR __wcsrchr_sse2
|
|
#endif
|
|
-
|
|
#include "../wcsrchr.S"
|
|
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
|
|
index dfd09fe9508cb5bc..fc1598bb11417fd5 100644
|
|
--- a/sysdeps/x86_64/strrchr.S
|
|
+++ b/sysdeps/x86_64/strrchr.S
|
|
@@ -19,210 +19,360 @@
|
|
|
|
#include <sysdep.h>
|
|
|
|
+#ifndef STRRCHR
|
|
+# define STRRCHR strrchr
|
|
+#endif
|
|
+
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+# define PCMPEQ pcmpeqd
|
|
+# define CHAR_SIZE 4
|
|
+# define PMINU pminud
|
|
+#else
|
|
+# define PCMPEQ pcmpeqb
|
|
+# define CHAR_SIZE 1
|
|
+# define PMINU pminub
|
|
+#endif
|
|
+
|
|
+#define PAGE_SIZE 4096
|
|
+#define VEC_SIZE 16
|
|
+
|
|
.text
|
|
-ENTRY (strrchr)
|
|
- movd %esi, %xmm1
|
|
+ENTRY(STRRCHR)
|
|
+ movd %esi, %xmm0
|
|
movq %rdi, %rax
|
|
- andl $4095, %eax
|
|
- punpcklbw %xmm1, %xmm1
|
|
- cmpq $4032, %rax
|
|
- punpcklwd %xmm1, %xmm1
|
|
- pshufd $0, %xmm1, %xmm1
|
|
+ andl $(PAGE_SIZE - 1), %eax
|
|
+#ifndef USE_AS_WCSRCHR
|
|
+ punpcklbw %xmm0, %xmm0
|
|
+ punpcklwd %xmm0, %xmm0
|
|
+#endif
|
|
+ pshufd $0, %xmm0, %xmm0
|
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
ja L(cross_page)
|
|
- movdqu (%rdi), %xmm0
|
|
+
|
|
+L(cross_page_continue):
|
|
+ movups (%rdi), %xmm1
|
|
pxor %xmm2, %xmm2
|
|
- movdqa %xmm0, %xmm3
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pcmpeqb %xmm2, %xmm3
|
|
- pmovmskb %xmm0, %ecx
|
|
- pmovmskb %xmm3, %edx
|
|
- testq %rdx, %rdx
|
|
- je L(next_48_bytes)
|
|
- leaq -1(%rdx), %rax
|
|
- xorq %rdx, %rax
|
|
- andq %rcx, %rax
|
|
- je L(exit)
|
|
- bsrq %rax, %rax
|
|
+ PCMPEQ %xmm1, %xmm2
|
|
+ pmovmskb %xmm2, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jz L(aligned_more)
|
|
+
|
|
+ PCMPEQ %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+ leal -1(%rcx), %edx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(ret0)
|
|
+ bsrl %eax, %eax
|
|
addq %rdi, %rax
|
|
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
|
+ search CHAR is zero we are correct. Either way `andq
|
|
+ -CHAR_SIZE, %rax` gets the correct result. */
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+L(ret0):
|
|
ret
|
|
|
|
+ /* Returns for first vec x1/x2 have hard coded backward search
|
|
+ path for earlier matches. */
|
|
.p2align 4
|
|
-L(next_48_bytes):
|
|
- movdqu 16(%rdi), %xmm4
|
|
- movdqa %xmm4, %xmm5
|
|
- movdqu 32(%rdi), %xmm3
|
|
- pcmpeqb %xmm1, %xmm4
|
|
- pcmpeqb %xmm2, %xmm5
|
|
- movdqu 48(%rdi), %xmm0
|
|
- pmovmskb %xmm5, %edx
|
|
- movdqa %xmm3, %xmm5
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pcmpeqb %xmm2, %xmm5
|
|
- pcmpeqb %xmm0, %xmm2
|
|
- salq $16, %rdx
|
|
- pmovmskb %xmm3, %r8d
|
|
- pmovmskb %xmm5, %eax
|
|
- pmovmskb %xmm2, %esi
|
|
- salq $32, %r8
|
|
- salq $32, %rax
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- orq %rdx, %rax
|
|
- movq %rsi, %rdx
|
|
- pmovmskb %xmm4, %esi
|
|
- salq $48, %rdx
|
|
- salq $16, %rsi
|
|
- orq %r8, %rsi
|
|
- orq %rcx, %rsi
|
|
- pmovmskb %xmm0, %ecx
|
|
- salq $48, %rcx
|
|
- orq %rcx, %rsi
|
|
- orq %rdx, %rax
|
|
- je L(loop_header2)
|
|
- leaq -1(%rax), %rcx
|
|
- xorq %rax, %rcx
|
|
- andq %rcx, %rsi
|
|
- je L(exit)
|
|
- bsrq %rsi, %rsi
|
|
- leaq (%rdi,%rsi), %rax
|
|
+L(first_vec_x0_test):
|
|
+ PCMPEQ %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(ret0)
|
|
+ bsrl %eax, %eax
|
|
+ addq %r8, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
ret
|
|
|
|
.p2align 4
|
|
-L(loop_header2):
|
|
- testq %rsi, %rsi
|
|
- movq %rdi, %rcx
|
|
- je L(no_c_found)
|
|
-L(loop_header):
|
|
- addq $64, %rdi
|
|
- pxor %xmm7, %xmm7
|
|
- andq $-64, %rdi
|
|
- jmp L(loop_entry)
|
|
+L(first_vec_x1):
|
|
+ PCMPEQ %xmm0, %xmm2
|
|
+ pmovmskb %xmm2, %eax
|
|
+ leal -1(%rcx), %edx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(first_vec_x0_test)
|
|
+ bsrl %eax, %eax
|
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+ ret
|
|
|
|
.p2align 4
|
|
-L(loop64):
|
|
- testq %rdx, %rdx
|
|
- cmovne %rdx, %rsi
|
|
- cmovne %rdi, %rcx
|
|
- addq $64, %rdi
|
|
-L(loop_entry):
|
|
- movdqa 32(%rdi), %xmm3
|
|
- pxor %xmm6, %xmm6
|
|
- movdqa 48(%rdi), %xmm2
|
|
- movdqa %xmm3, %xmm0
|
|
- movdqa 16(%rdi), %xmm4
|
|
- pminub %xmm2, %xmm0
|
|
- movdqa (%rdi), %xmm5
|
|
- pminub %xmm4, %xmm0
|
|
- pminub %xmm5, %xmm0
|
|
- pcmpeqb %xmm7, %xmm0
|
|
- pmovmskb %xmm0, %eax
|
|
- movdqa %xmm5, %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %r9d
|
|
- movdqa %xmm4, %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- pmovmskb %xmm0, %edx
|
|
- movdqa %xmm3, %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- salq $16, %rdx
|
|
- pmovmskb %xmm0, %r10d
|
|
- movdqa %xmm2, %xmm0
|
|
- pcmpeqb %xmm1, %xmm0
|
|
- salq $32, %r10
|
|
- orq %r10, %rdx
|
|
- pmovmskb %xmm0, %r8d
|
|
- orq %r9, %rdx
|
|
- salq $48, %r8
|
|
- orq %r8, %rdx
|
|
+L(first_vec_x1_test):
|
|
+ PCMPEQ %xmm0, %xmm2
|
|
+ pmovmskb %xmm2, %eax
|
|
testl %eax, %eax
|
|
- je L(loop64)
|
|
- pcmpeqb %xmm6, %xmm4
|
|
- pcmpeqb %xmm6, %xmm3
|
|
- pcmpeqb %xmm6, %xmm5
|
|
- pmovmskb %xmm4, %eax
|
|
- pmovmskb %xmm3, %r10d
|
|
- pcmpeqb %xmm6, %xmm2
|
|
- pmovmskb %xmm5, %r9d
|
|
- salq $32, %r10
|
|
- salq $16, %rax
|
|
- pmovmskb %xmm2, %r8d
|
|
- orq %r10, %rax
|
|
- orq %r9, %rax
|
|
- salq $48, %r8
|
|
- orq %r8, %rax
|
|
- leaq -1(%rax), %r8
|
|
- xorq %rax, %r8
|
|
- andq %r8, %rdx
|
|
- cmovne %rdi, %rcx
|
|
- cmovne %rdx, %rsi
|
|
- bsrq %rsi, %rsi
|
|
- leaq (%rcx,%rsi), %rax
|
|
+ jz L(first_vec_x0_test)
|
|
+ bsrl %eax, %eax
|
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2):
|
|
+ PCMPEQ %xmm0, %xmm3
|
|
+ pmovmskb %xmm3, %eax
|
|
+ leal -1(%rcx), %edx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(first_vec_x1_test)
|
|
+ bsrl %eax, %eax
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(aligned_more):
|
|
+ /* Save original pointer if match was in VEC 0. */
|
|
+ movq %rdi, %r8
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+ movaps VEC_SIZE(%rdi), %xmm2
|
|
+ pxor %xmm3, %xmm3
|
|
+ PCMPEQ %xmm2, %xmm3
|
|
+ pmovmskb %xmm3, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
|
|
+ pxor %xmm4, %xmm4
|
|
+ PCMPEQ %xmm3, %xmm4
|
|
+ pmovmskb %xmm4, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jnz L(first_vec_x2)
|
|
+
|
|
+ addq $VEC_SIZE, %rdi
|
|
+ /* Save pointer again before realigning. */
|
|
+ movq %rdi, %rsi
|
|
+ andq $-(VEC_SIZE * 2), %rdi
|
|
+ .p2align 4
|
|
+L(first_loop):
|
|
+ /* Do 2x VEC at a time. */
|
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
|
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
|
+ detecting zero. Note if this is found to be a bottleneck it
|
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ movaps %xmm5, %xmm6
|
|
+ pxor %xmm8, %xmm8
|
|
+
|
|
+ PCMPEQ %xmm8, %xmm5
|
|
+ PCMPEQ %xmm4, %xmm8
|
|
+ por %xmm5, %xmm8
|
|
+#else
|
|
+ movaps %xmm5, %xmm6
|
|
+ PMINU %xmm4, %xmm5
|
|
+#endif
|
|
+
|
|
+ movaps %xmm4, %xmm9
|
|
+ PCMPEQ %xmm0, %xmm4
|
|
+ PCMPEQ %xmm0, %xmm6
|
|
+ movaps %xmm6, %xmm7
|
|
+ por %xmm4, %xmm6
|
|
+#ifndef USE_AS_WCSRCHR
|
|
+ pxor %xmm8, %xmm8
|
|
+ PCMPEQ %xmm5, %xmm8
|
|
+#endif
|
|
+ pmovmskb %xmm8, %ecx
|
|
+ pmovmskb %xmm6, %eax
|
|
+
|
|
+ addq $(VEC_SIZE * 2), %rdi
|
|
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
|
|
+ macro-fuse with `jz`. */
|
|
+ addl %ecx, %eax
|
|
+ jz L(first_loop)
|
|
+
|
|
+ /* Check if there is zero match. */
|
|
+ testl %ecx, %ecx
|
|
+ jz L(second_loop_match)
|
|
+
|
|
+ /* Check if there was a match in last iteration. */
|
|
+ subl %ecx, %eax
|
|
+ jnz L(new_match)
|
|
+
|
|
+L(first_loop_old_match):
|
|
+ PCMPEQ %xmm0, %xmm2
|
|
+ PCMPEQ %xmm0, %xmm3
|
|
+ pmovmskb %xmm2, %ecx
|
|
+ pmovmskb %xmm3, %eax
|
|
+ addl %eax, %ecx
|
|
+ jz L(first_vec_x0_test)
|
|
+ /* NB: We could move this shift to before the branch and save a
|
|
+ bit of code size / performance on the fall through. The
|
|
+ branch leads to the null case which generally seems hotter
|
|
+ than char in first 3x VEC. */
|
|
+ sall $16, %eax
|
|
+ orl %ecx, %eax
|
|
+
|
|
+ bsrl %eax, %eax
|
|
+ addq %rsi, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(new_match):
|
|
+ pxor %xmm6, %xmm6
|
|
+ PCMPEQ %xmm9, %xmm6
|
|
+ pmovmskb %xmm6, %eax
|
|
+ sall $16, %ecx
|
|
+ orl %eax, %ecx
|
|
+
|
|
+	/* We can't reuse either of the old comparisons since we mask
|
|
+	   off zeros after first zero (instead of using the full
|
|
+	   comparison) we can't guarantee no interference between match
|
|
+ after end of string and valid match. */
|
|
+ pmovmskb %xmm4, %eax
|
|
+ pmovmskb %xmm7, %edx
|
|
+ sall $16, %edx
|
|
+ orl %edx, %eax
|
|
+
|
|
+ leal -1(%ecx), %edx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(first_loop_old_match)
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
ret
|
|
|
|
+ /* Save minimum state for getting most recent match. We can
|
|
+ throw out all previous work. */
|
|
.p2align 4
|
|
-L(no_c_found):
|
|
- movl $1, %esi
|
|
- xorl %ecx, %ecx
|
|
- jmp L(loop_header)
|
|
+L(second_loop_match):
|
|
+ movq %rdi, %rsi
|
|
+ movaps %xmm4, %xmm2
|
|
+ movaps %xmm7, %xmm3
|
|
|
|
.p2align 4
|
|
-L(exit):
|
|
- xorl %eax, %eax
|
|
+L(second_loop):
|
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
|
+	/* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
|
+ detecting zero. Note if this is found to be a bottleneck it
|
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ movaps %xmm5, %xmm6
|
|
+ pxor %xmm8, %xmm8
|
|
+
|
|
+ PCMPEQ %xmm8, %xmm5
|
|
+ PCMPEQ %xmm4, %xmm8
|
|
+ por %xmm5, %xmm8
|
|
+#else
|
|
+ movaps %xmm5, %xmm6
|
|
+ PMINU %xmm4, %xmm5
|
|
+#endif
|
|
+
|
|
+ movaps %xmm4, %xmm9
|
|
+ PCMPEQ %xmm0, %xmm4
|
|
+ PCMPEQ %xmm0, %xmm6
|
|
+ movaps %xmm6, %xmm7
|
|
+ por %xmm4, %xmm6
|
|
+#ifndef USE_AS_WCSRCHR
|
|
+ pxor %xmm8, %xmm8
|
|
+ PCMPEQ %xmm5, %xmm8
|
|
+#endif
|
|
+
|
|
+ pmovmskb %xmm8, %ecx
|
|
+ pmovmskb %xmm6, %eax
|
|
+
|
|
+ addq $(VEC_SIZE * 2), %rdi
|
|
+	/* Either null term or new occurrence of CHAR.  */
|
|
+ addl %ecx, %eax
|
|
+ jz L(second_loop)
|
|
+
|
|
+	/* No null term so must be new occurrence of CHAR.  */
|
|
+ testl %ecx, %ecx
|
|
+ jz L(second_loop_match)
|
|
+
|
|
+
|
|
+ subl %ecx, %eax
|
|
+ jnz L(second_loop_new_match)
|
|
+
|
|
+L(second_loop_old_match):
|
|
+ pmovmskb %xmm2, %ecx
|
|
+ pmovmskb %xmm3, %eax
|
|
+ sall $16, %eax
|
|
+ orl %ecx, %eax
|
|
+ bsrl %eax, %eax
|
|
+ addq %rsi, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
ret
|
|
|
|
.p2align 4
|
|
+L(second_loop_new_match):
|
|
+ pxor %xmm6, %xmm6
|
|
+ PCMPEQ %xmm9, %xmm6
|
|
+ pmovmskb %xmm6, %eax
|
|
+ sall $16, %ecx
|
|
+ orl %eax, %ecx
|
|
+
|
|
+	/* We can't reuse either of the old comparisons since we mask
|
|
+	   off zeros after first zero (instead of using the full
|
|
+	   comparison) we can't guarantee no interference between match
|
|
+ after end of string and valid match. */
|
|
+ pmovmskb %xmm4, %eax
|
|
+ pmovmskb %xmm7, %edx
|
|
+ sall $16, %edx
|
|
+ orl %edx, %eax
|
|
+
|
|
+ leal -1(%ecx), %edx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(second_loop_old_match)
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4,, 4
|
|
L(cross_page):
|
|
- movq %rdi, %rax
|
|
- pxor %xmm0, %xmm0
|
|
- andq $-64, %rax
|
|
- movdqu (%rax), %xmm5
|
|
- movdqa %xmm5, %xmm6
|
|
- movdqu 16(%rax), %xmm4
|
|
- pcmpeqb %xmm1, %xmm5
|
|
- pcmpeqb %xmm0, %xmm6
|
|
- movdqu 32(%rax), %xmm3
|
|
- pmovmskb %xmm6, %esi
|
|
- movdqa %xmm4, %xmm6
|
|
- movdqu 48(%rax), %xmm2
|
|
- pcmpeqb %xmm1, %xmm4
|
|
- pcmpeqb %xmm0, %xmm6
|
|
- pmovmskb %xmm6, %edx
|
|
- movdqa %xmm3, %xmm6
|
|
- pcmpeqb %xmm1, %xmm3
|
|
- pcmpeqb %xmm0, %xmm6
|
|
- pcmpeqb %xmm2, %xmm0
|
|
- salq $16, %rdx
|
|
- pmovmskb %xmm3, %r9d
|
|
- pmovmskb %xmm6, %r8d
|
|
- pmovmskb %xmm0, %ecx
|
|
- salq $32, %r9
|
|
- salq $32, %r8
|
|
- pcmpeqb %xmm1, %xmm2
|
|
- orq %r8, %rdx
|
|
- salq $48, %rcx
|
|
- pmovmskb %xmm5, %r8d
|
|
- orq %rsi, %rdx
|
|
- pmovmskb %xmm4, %esi
|
|
- orq %rcx, %rdx
|
|
- pmovmskb %xmm2, %ecx
|
|
- salq $16, %rsi
|
|
- salq $48, %rcx
|
|
- orq %r9, %rsi
|
|
- orq %r8, %rsi
|
|
- orq %rcx, %rsi
|
|
+ movq %rdi, %rsi
|
|
+ andq $-VEC_SIZE, %rsi
|
|
+ movaps (%rsi), %xmm1
|
|
+ pxor %xmm2, %xmm2
|
|
+ PCMPEQ %xmm1, %xmm2
|
|
+ pmovmskb %xmm2, %edx
|
|
movl %edi, %ecx
|
|
- subl %eax, %ecx
|
|
- shrq %cl, %rdx
|
|
- shrq %cl, %rsi
|
|
- testq %rdx, %rdx
|
|
- je L(loop_header2)
|
|
- leaq -1(%rdx), %rax
|
|
- xorq %rdx, %rax
|
|
- andq %rax, %rsi
|
|
- je L(exit)
|
|
- bsrq %rsi, %rax
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ sarl %cl, %edx
|
|
+ jz L(cross_page_continue)
|
|
+ PCMPEQ %xmm0, %xmm1
|
|
+ pmovmskb %xmm1, %eax
|
|
+ sarl %cl, %eax
|
|
+ leal -1(%rdx), %ecx
|
|
+ xorl %edx, %ecx
|
|
+ andl %ecx, %eax
|
|
+ jz L(ret1)
|
|
+ bsrl %eax, %eax
|
|
addq %rdi, %rax
|
|
+#ifdef USE_AS_WCSRCHR
|
|
+ andq $-CHAR_SIZE, %rax
|
|
+#endif
|
|
+L(ret1):
|
|
ret
|
|
-END (strrchr)
|
|
+END(STRRCHR)
|
|
|
|
-weak_alias (strrchr, rindex)
|
|
-libc_hidden_builtin_def (strrchr)
|
|
+#ifndef USE_AS_WCSRCHR
|
|
+ weak_alias (STRRCHR, rindex)
|
|
+ libc_hidden_builtin_def (STRRCHR)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
|
|
index 6b318d3f29de9a9e..9006f2220963d76c 100644
|
|
--- a/sysdeps/x86_64/wcsrchr.S
|
|
+++ b/sysdeps/x86_64/wcsrchr.S
|
|
@@ -17,266 +17,12 @@
|
|
License along with the GNU C Library; if not, see
|
|
<https://www.gnu.org/licenses/>. */
|
|
|
|
-#include <sysdep.h>
|
|
|
|
- .text
|
|
-ENTRY (wcsrchr)
|
|
+#define USE_AS_WCSRCHR 1
|
|
+#define NO_PMINU 1
|
|
|
|
- movd %rsi, %xmm1
|
|
- mov %rdi, %rcx
|
|
- punpckldq %xmm1, %xmm1
|
|
- pxor %xmm2, %xmm2
|
|
- punpckldq %xmm1, %xmm1
|
|
- and $63, %rcx
|
|
- cmp $48, %rcx
|
|
- ja L(crosscache)
|
|
+#ifndef STRRCHR
|
|
+# define STRRCHR wcsrchr
|
|
+#endif
|
|
|
|
- movdqu (%rdi), %xmm0
|
|
- pcmpeqd %xmm0, %xmm2
|
|
- pcmpeqd %xmm1, %xmm0
|
|
- pmovmskb %xmm2, %rcx
|
|
- pmovmskb %xmm0, %rax
|
|
- add $16, %rdi
|
|
-
|
|
- test %rax, %rax
|
|
- jnz L(unaligned_match1)
|
|
-
|
|
- test %rcx, %rcx
|
|
- jnz L(return_null)
|
|
-
|
|
- and $-16, %rdi
|
|
- xor %r8, %r8
|
|
- jmp L(loop)
|
|
-
|
|
- .p2align 4
|
|
-L(unaligned_match1):
|
|
- test %rcx, %rcx
|
|
- jnz L(prolog_find_zero_1)
|
|
-
|
|
- mov %rax, %r8
|
|
- mov %rdi, %rsi
|
|
- and $-16, %rdi
|
|
- jmp L(loop)
|
|
-
|
|
- .p2align 4
|
|
-L(crosscache):
|
|
- and $15, %rcx
|
|
- and $-16, %rdi
|
|
- pxor %xmm3, %xmm3
|
|
- movdqa (%rdi), %xmm0
|
|
- pcmpeqd %xmm0, %xmm3
|
|
- pcmpeqd %xmm1, %xmm0
|
|
- pmovmskb %xmm3, %rdx
|
|
- pmovmskb %xmm0, %rax
|
|
- shr %cl, %rdx
|
|
- shr %cl, %rax
|
|
- add $16, %rdi
|
|
-
|
|
- test %rax, %rax
|
|
- jnz L(unaligned_match)
|
|
-
|
|
- test %rdx, %rdx
|
|
- jnz L(return_null)
|
|
-
|
|
- xor %r8, %r8
|
|
- jmp L(loop)
|
|
-
|
|
- .p2align 4
|
|
-L(unaligned_match):
|
|
- test %rdx, %rdx
|
|
- jnz L(prolog_find_zero)
|
|
-
|
|
- mov %rax, %r8
|
|
- lea (%rdi, %rcx), %rsi
|
|
-
|
|
-/* Loop start on aligned string. */
|
|
- .p2align 4
|
|
-L(loop):
|
|
- movdqa (%rdi), %xmm0
|
|
- pcmpeqd %xmm0, %xmm2
|
|
- add $16, %rdi
|
|
- pcmpeqd %xmm1, %xmm0
|
|
- pmovmskb %xmm2, %rcx
|
|
- pmovmskb %xmm0, %rax
|
|
- or %rax, %rcx
|
|
- jnz L(matches)
|
|
-
|
|
- movdqa (%rdi), %xmm3
|
|
- pcmpeqd %xmm3, %xmm2
|
|
- add $16, %rdi
|
|
- pcmpeqd %xmm1, %xmm3
|
|
- pmovmskb %xmm2, %rcx
|
|
- pmovmskb %xmm3, %rax
|
|
- or %rax, %rcx
|
|
- jnz L(matches)
|
|
-
|
|
- movdqa (%rdi), %xmm4
|
|
- pcmpeqd %xmm4, %xmm2
|
|
- add $16, %rdi
|
|
- pcmpeqd %xmm1, %xmm4
|
|
- pmovmskb %xmm2, %rcx
|
|
- pmovmskb %xmm4, %rax
|
|
- or %rax, %rcx
|
|
- jnz L(matches)
|
|
-
|
|
- movdqa (%rdi), %xmm5
|
|
- pcmpeqd %xmm5, %xmm2
|
|
- add $16, %rdi
|
|
- pcmpeqd %xmm1, %xmm5
|
|
- pmovmskb %xmm2, %rcx
|
|
- pmovmskb %xmm5, %rax
|
|
- or %rax, %rcx
|
|
- jz L(loop)
|
|
-
|
|
- .p2align 4
|
|
-L(matches):
|
|
- test %rax, %rax
|
|
- jnz L(match)
|
|
-L(return_value):
|
|
- test %r8, %r8
|
|
- jz L(return_null)
|
|
- mov %r8, %rax
|
|
- mov %rsi, %rdi
|
|
-
|
|
- test $15 << 4, %ah
|
|
- jnz L(match_fourth_wchar)
|
|
- test %ah, %ah
|
|
- jnz L(match_third_wchar)
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(match):
|
|
- pmovmskb %xmm2, %rcx
|
|
- test %rcx, %rcx
|
|
- jnz L(find_zero)
|
|
- mov %rax, %r8
|
|
- mov %rdi, %rsi
|
|
- jmp L(loop)
|
|
-
|
|
- .p2align 4
|
|
-L(find_zero):
|
|
- test $15, %cl
|
|
- jnz L(find_zero_in_first_wchar)
|
|
- test %cl, %cl
|
|
- jnz L(find_zero_in_second_wchar)
|
|
- test $15, %ch
|
|
- jnz L(find_zero_in_third_wchar)
|
|
-
|
|
- and $1 << 13 - 1, %rax
|
|
- jz L(return_value)
|
|
-
|
|
- test $15 << 4, %ah
|
|
- jnz L(match_fourth_wchar)
|
|
- test %ah, %ah
|
|
- jnz L(match_third_wchar)
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(find_zero_in_first_wchar):
|
|
- test $1, %rax
|
|
- jz L(return_value)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(find_zero_in_second_wchar):
|
|
- and $1 << 5 - 1, %rax
|
|
- jz L(return_value)
|
|
-
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(find_zero_in_third_wchar):
|
|
- and $1 << 9 - 1, %rax
|
|
- jz L(return_value)
|
|
-
|
|
- test %ah, %ah
|
|
- jnz L(match_third_wchar)
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(prolog_find_zero):
|
|
- add %rcx, %rdi
|
|
- mov %rdx, %rcx
|
|
-L(prolog_find_zero_1):
|
|
- test $15, %cl
|
|
- jnz L(prolog_find_zero_in_first_wchar)
|
|
- test %cl, %cl
|
|
- jnz L(prolog_find_zero_in_second_wchar)
|
|
- test $15, %ch
|
|
- jnz L(prolog_find_zero_in_third_wchar)
|
|
-
|
|
- and $1 << 13 - 1, %rax
|
|
- jz L(return_null)
|
|
-
|
|
- test $15 << 4, %ah
|
|
- jnz L(match_fourth_wchar)
|
|
- test %ah, %ah
|
|
- jnz L(match_third_wchar)
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(prolog_find_zero_in_first_wchar):
|
|
- test $1, %rax
|
|
- jz L(return_null)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(prolog_find_zero_in_second_wchar):
|
|
- and $1 << 5 - 1, %rax
|
|
- jz L(return_null)
|
|
-
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(prolog_find_zero_in_third_wchar):
|
|
- and $1 << 9 - 1, %rax
|
|
- jz L(return_null)
|
|
-
|
|
- test %ah, %ah
|
|
- jnz L(match_third_wchar)
|
|
- test $15 << 4, %al
|
|
- jnz L(match_second_wchar)
|
|
- lea -16(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(match_second_wchar):
|
|
- lea -12(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(match_third_wchar):
|
|
- lea -8(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(match_fourth_wchar):
|
|
- lea -4(%rdi), %rax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
-L(return_null):
|
|
- xor %rax, %rax
|
|
- ret
|
|
-
|
|
-END (wcsrchr)
|
|
+#include "../strrchr.S"
|