877 lines
18 KiB
Diff
877 lines
18 KiB
Diff
|
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
|
||
|
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||
|
Date: Thu, 21 Apr 2022 20:52:28 -0500
|
||
|
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
|
||
|
Content-type: text/plain; charset=UTF-8
|
||
|
|
||
|
The new code unrolls the main loop slightly without adding too much
|
||
|
overhead and minimizes the comparisons for the search CHAR.
|
||
|
|
||
|
Geometric Mean of all benchmarks New / Old: 0.741
|
||
|
See email for all results.
|
||
|
|
||
|
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||
|
---
|
||
|
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
|
||
|
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
|
||
|
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
|
||
|
sysdeps/x86_64/wcsrchr.S | 266 +-----------
|
||
|
4 files changed, 338 insertions(+), 443 deletions(-)
|
||
|
|
||
|
Conflicts:
|
||
|
sysdeps/x86_64/wcsrchr.S
|
||
|
(copyright header)
|
||
|
|
||
|
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||
|
index 0ec76fe9..6bb1284b 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||
|
@@ -17,7 +17,7 @@
|
||
|
<http://www.gnu.org/licenses/>. */
|
||
|
|
||
|
#if IS_IN (libc)
|
||
|
-# define strrchr __strrchr_sse2
|
||
|
+# define STRRCHR __strrchr_sse2
|
||
|
|
||
|
# undef weak_alias
|
||
|
# define weak_alias(strrchr, rindex)
|
||
|
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||
|
index d015e953..f26d53b5 100644
|
||
|
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||
|
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||
|
@@ -17,7 +17,6 @@
|
||
|
<http://www.gnu.org/licenses/>. */
|
||
|
|
||
|
#if IS_IN (libc)
|
||
|
-# define wcsrchr __wcsrchr_sse2
|
||
|
+# define STRRCHR __wcsrchr_sse2
|
||
|
#endif
|
||
|
-
|
||
|
#include "../wcsrchr.S"
|
||
|
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
|
||
|
index aca98e7e..a58cc220 100644
|
||
|
--- a/sysdeps/x86_64/strrchr.S
|
||
|
+++ b/sysdeps/x86_64/strrchr.S
|
||
|
@@ -19,210 +19,360 @@
|
||
|
|
||
|
#include <sysdep.h>
|
||
|
|
||
|
+#ifndef STRRCHR
|
||
|
+# define STRRCHR strrchr
|
||
|
+#endif
|
||
|
+
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+# define PCMPEQ pcmpeqd
|
||
|
+# define CHAR_SIZE 4
|
||
|
+# define PMINU pminud
|
||
|
+#else
|
||
|
+# define PCMPEQ pcmpeqb
|
||
|
+# define CHAR_SIZE 1
|
||
|
+# define PMINU pminub
|
||
|
+#endif
|
||
|
+
|
||
|
+#define PAGE_SIZE 4096
|
||
|
+#define VEC_SIZE 16
|
||
|
+
|
||
|
.text
|
||
|
-ENTRY (strrchr)
|
||
|
- movd %esi, %xmm1
|
||
|
+ENTRY(STRRCHR)
|
||
|
+ movd %esi, %xmm0
|
||
|
movq %rdi, %rax
|
||
|
- andl $4095, %eax
|
||
|
- punpcklbw %xmm1, %xmm1
|
||
|
- cmpq $4032, %rax
|
||
|
- punpcklwd %xmm1, %xmm1
|
||
|
- pshufd $0, %xmm1, %xmm1
|
||
|
+ andl $(PAGE_SIZE - 1), %eax
|
||
|
+#ifndef USE_AS_WCSRCHR
|
||
|
+ punpcklbw %xmm0, %xmm0
|
||
|
+ punpcklwd %xmm0, %xmm0
|
||
|
+#endif
|
||
|
+ pshufd $0, %xmm0, %xmm0
|
||
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||
|
ja L(cross_page)
|
||
|
- movdqu (%rdi), %xmm0
|
||
|
+
|
||
|
+L(cross_page_continue):
|
||
|
+ movups (%rdi), %xmm1
|
||
|
pxor %xmm2, %xmm2
|
||
|
- movdqa %xmm0, %xmm3
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- pcmpeqb %xmm2, %xmm3
|
||
|
- pmovmskb %xmm0, %ecx
|
||
|
- pmovmskb %xmm3, %edx
|
||
|
- testq %rdx, %rdx
|
||
|
- je L(next_48_bytes)
|
||
|
- leaq -1(%rdx), %rax
|
||
|
- xorq %rdx, %rax
|
||
|
- andq %rcx, %rax
|
||
|
- je L(exit)
|
||
|
- bsrq %rax, %rax
|
||
|
+ PCMPEQ %xmm1, %xmm2
|
||
|
+ pmovmskb %xmm2, %ecx
|
||
|
+ testl %ecx, %ecx
|
||
|
+ jz L(aligned_more)
|
||
|
+
|
||
|
+ PCMPEQ %xmm0, %xmm1
|
||
|
+ pmovmskb %xmm1, %eax
|
||
|
+ leal -1(%rcx), %edx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(ret0)
|
||
|
+ bsrl %eax, %eax
|
||
|
addq %rdi, %rax
|
||
|
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
||
|
+ search CHAR is zero we are correct. Either way `andq
|
||
|
+ -CHAR_SIZE, %rax` gets the correct result. */
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+L(ret0):
|
||
|
ret
|
||
|
|
||
|
+ /* Returns for first vec x1/x2 have hard coded backward search
|
||
|
+ path for earlier matches. */
|
||
|
.p2align 4
|
||
|
-L(next_48_bytes):
|
||
|
- movdqu 16(%rdi), %xmm4
|
||
|
- movdqa %xmm4, %xmm5
|
||
|
- movdqu 32(%rdi), %xmm3
|
||
|
- pcmpeqb %xmm1, %xmm4
|
||
|
- pcmpeqb %xmm2, %xmm5
|
||
|
- movdqu 48(%rdi), %xmm0
|
||
|
- pmovmskb %xmm5, %edx
|
||
|
- movdqa %xmm3, %xmm5
|
||
|
- pcmpeqb %xmm1, %xmm3
|
||
|
- pcmpeqb %xmm2, %xmm5
|
||
|
- pcmpeqb %xmm0, %xmm2
|
||
|
- salq $16, %rdx
|
||
|
- pmovmskb %xmm3, %r8d
|
||
|
- pmovmskb %xmm5, %eax
|
||
|
- pmovmskb %xmm2, %esi
|
||
|
- salq $32, %r8
|
||
|
- salq $32, %rax
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- orq %rdx, %rax
|
||
|
- movq %rsi, %rdx
|
||
|
- pmovmskb %xmm4, %esi
|
||
|
- salq $48, %rdx
|
||
|
- salq $16, %rsi
|
||
|
- orq %r8, %rsi
|
||
|
- orq %rcx, %rsi
|
||
|
- pmovmskb %xmm0, %ecx
|
||
|
- salq $48, %rcx
|
||
|
- orq %rcx, %rsi
|
||
|
- orq %rdx, %rax
|
||
|
- je L(loop_header2)
|
||
|
- leaq -1(%rax), %rcx
|
||
|
- xorq %rax, %rcx
|
||
|
- andq %rcx, %rsi
|
||
|
- je L(exit)
|
||
|
- bsrq %rsi, %rsi
|
||
|
- leaq (%rdi,%rsi), %rax
|
||
|
+L(first_vec_x0_test):
|
||
|
+ PCMPEQ %xmm0, %xmm1
|
||
|
+ pmovmskb %xmm1, %eax
|
||
|
+ testl %eax, %eax
|
||
|
+ jz L(ret0)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ addq %r8, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
ret
|
||
|
|
||
|
.p2align 4
|
||
|
-L(loop_header2):
|
||
|
- testq %rsi, %rsi
|
||
|
- movq %rdi, %rcx
|
||
|
- je L(no_c_found)
|
||
|
-L(loop_header):
|
||
|
- addq $64, %rdi
|
||
|
- pxor %xmm7, %xmm7
|
||
|
- andq $-64, %rdi
|
||
|
- jmp L(loop_entry)
|
||
|
+L(first_vec_x1):
|
||
|
+ PCMPEQ %xmm0, %xmm2
|
||
|
+ pmovmskb %xmm2, %eax
|
||
|
+ leal -1(%rcx), %edx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(first_vec_x0_test)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+ ret
|
||
|
|
||
|
.p2align 4
|
||
|
-L(loop64):
|
||
|
- testq %rdx, %rdx
|
||
|
- cmovne %rdx, %rsi
|
||
|
- cmovne %rdi, %rcx
|
||
|
- addq $64, %rdi
|
||
|
-L(loop_entry):
|
||
|
- movdqa 32(%rdi), %xmm3
|
||
|
- pxor %xmm6, %xmm6
|
||
|
- movdqa 48(%rdi), %xmm2
|
||
|
- movdqa %xmm3, %xmm0
|
||
|
- movdqa 16(%rdi), %xmm4
|
||
|
- pminub %xmm2, %xmm0
|
||
|
- movdqa (%rdi), %xmm5
|
||
|
- pminub %xmm4, %xmm0
|
||
|
- pminub %xmm5, %xmm0
|
||
|
- pcmpeqb %xmm7, %xmm0
|
||
|
- pmovmskb %xmm0, %eax
|
||
|
- movdqa %xmm5, %xmm0
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- pmovmskb %xmm0, %r9d
|
||
|
- movdqa %xmm4, %xmm0
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- pmovmskb %xmm0, %edx
|
||
|
- movdqa %xmm3, %xmm0
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- salq $16, %rdx
|
||
|
- pmovmskb %xmm0, %r10d
|
||
|
- movdqa %xmm2, %xmm0
|
||
|
- pcmpeqb %xmm1, %xmm0
|
||
|
- salq $32, %r10
|
||
|
- orq %r10, %rdx
|
||
|
- pmovmskb %xmm0, %r8d
|
||
|
- orq %r9, %rdx
|
||
|
- salq $48, %r8
|
||
|
- orq %r8, %rdx
|
||
|
+L(first_vec_x1_test):
|
||
|
+ PCMPEQ %xmm0, %xmm2
|
||
|
+ pmovmskb %xmm2, %eax
|
||
|
testl %eax, %eax
|
||
|
- je L(loop64)
|
||
|
- pcmpeqb %xmm6, %xmm4
|
||
|
- pcmpeqb %xmm6, %xmm3
|
||
|
- pcmpeqb %xmm6, %xmm5
|
||
|
- pmovmskb %xmm4, %eax
|
||
|
- pmovmskb %xmm3, %r10d
|
||
|
- pcmpeqb %xmm6, %xmm2
|
||
|
- pmovmskb %xmm5, %r9d
|
||
|
- salq $32, %r10
|
||
|
- salq $16, %rax
|
||
|
- pmovmskb %xmm2, %r8d
|
||
|
- orq %r10, %rax
|
||
|
- orq %r9, %rax
|
||
|
- salq $48, %r8
|
||
|
- orq %r8, %rax
|
||
|
- leaq -1(%rax), %r8
|
||
|
- xorq %rax, %r8
|
||
|
- andq %r8, %rdx
|
||
|
- cmovne %rdi, %rcx
|
||
|
- cmovne %rdx, %rsi
|
||
|
- bsrq %rsi, %rsi
|
||
|
- leaq (%rcx,%rsi), %rax
|
||
|
+ jz L(first_vec_x0_test)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+ ret
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(first_vec_x2):
|
||
|
+ PCMPEQ %xmm0, %xmm3
|
||
|
+ pmovmskb %xmm3, %eax
|
||
|
+ leal -1(%rcx), %edx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(first_vec_x1_test)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+ ret
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(aligned_more):
|
||
|
+ /* Save original pointer if match was in VEC 0. */
|
||
|
+ movq %rdi, %r8
|
||
|
+ andq $-VEC_SIZE, %rdi
|
||
|
+
|
||
|
+ movaps VEC_SIZE(%rdi), %xmm2
|
||
|
+ pxor %xmm3, %xmm3
|
||
|
+ PCMPEQ %xmm2, %xmm3
|
||
|
+ pmovmskb %xmm3, %ecx
|
||
|
+ testl %ecx, %ecx
|
||
|
+ jnz L(first_vec_x1)
|
||
|
+
|
||
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
|
||
|
+ pxor %xmm4, %xmm4
|
||
|
+ PCMPEQ %xmm3, %xmm4
|
||
|
+ pmovmskb %xmm4, %ecx
|
||
|
+ testl %ecx, %ecx
|
||
|
+ jnz L(first_vec_x2)
|
||
|
+
|
||
|
+ addq $VEC_SIZE, %rdi
|
||
|
+ /* Save pointer again before realigning. */
|
||
|
+ movq %rdi, %rsi
|
||
|
+ andq $-(VEC_SIZE * 2), %rdi
|
||
|
+ .p2align 4
|
||
|
+L(first_loop):
|
||
|
+ /* Do 2x VEC at a time. */
|
||
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||
|
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
||
|
+ detecting zero. Note if this is found to be a bottleneck it
|
||
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ movaps %xmm5, %xmm6
|
||
|
+ pxor %xmm8, %xmm8
|
||
|
+
|
||
|
+ PCMPEQ %xmm8, %xmm5
|
||
|
+ PCMPEQ %xmm4, %xmm8
|
||
|
+ por %xmm5, %xmm8
|
||
|
+#else
|
||
|
+ movaps %xmm5, %xmm6
|
||
|
+ PMINU %xmm4, %xmm5
|
||
|
+#endif
|
||
|
+
|
||
|
+ movaps %xmm4, %xmm9
|
||
|
+ PCMPEQ %xmm0, %xmm4
|
||
|
+ PCMPEQ %xmm0, %xmm6
|
||
|
+ movaps %xmm6, %xmm7
|
||
|
+ por %xmm4, %xmm6
|
||
|
+#ifndef USE_AS_WCSRCHR
|
||
|
+ pxor %xmm8, %xmm8
|
||
|
+ PCMPEQ %xmm5, %xmm8
|
||
|
+#endif
|
||
|
+ pmovmskb %xmm8, %ecx
|
||
|
+ pmovmskb %xmm6, %eax
|
||
|
+
|
||
|
+ addq $(VEC_SIZE * 2), %rdi
|
||
|
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
|
||
|
+ macro-fuse with `jz`. */
|
||
|
+ addl %ecx, %eax
|
||
|
+ jz L(first_loop)
|
||
|
+
|
||
|
+ /* Check if there is zero match. */
|
||
|
+ testl %ecx, %ecx
|
||
|
+ jz L(second_loop_match)
|
||
|
+
|
||
|
+ /* Check if there was a match in last iteration. */
|
||
|
+ subl %ecx, %eax
|
||
|
+ jnz L(new_match)
|
||
|
+
|
||
|
+L(first_loop_old_match):
|
||
|
+ PCMPEQ %xmm0, %xmm2
|
||
|
+ PCMPEQ %xmm0, %xmm3
|
||
|
+ pmovmskb %xmm2, %ecx
|
||
|
+ pmovmskb %xmm3, %eax
|
||
|
+ addl %eax, %ecx
|
||
|
+ jz L(first_vec_x0_test)
|
||
|
+ /* NB: We could move this shift to before the branch and save a
|
||
|
+ bit of code size / performance on the fall through. The
|
||
|
+ branch leads to the null case which generally seems hotter
|
||
|
+ than char in first 3x VEC. */
|
||
|
+ sall $16, %eax
|
||
|
+ orl %ecx, %eax
|
||
|
+
|
||
|
+ bsrl %eax, %eax
|
||
|
+ addq %rsi, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+ ret
|
||
|
+
|
||
|
+ .p2align 4
|
||
|
+L(new_match):
|
||
|
+ pxor %xmm6, %xmm6
|
||
|
+ PCMPEQ %xmm9, %xmm6
|
||
|
+ pmovmskb %xmm6, %eax
|
||
|
+ sall $16, %ecx
|
||
|
+ orl %eax, %ecx
|
||
|
+
|
||
|
+ /* We can't reuse either of the old comparisons since we mask
|
||
|
+ off zeros after first zero (instead of using the full
|
||
|
+ comparison) we can't guarantee no interference between match
|
||
|
+ after end of string and valid match. */
|
||
|
+ pmovmskb %xmm4, %eax
|
||
|
+ pmovmskb %xmm7, %edx
|
||
|
+ sall $16, %edx
|
||
|
+ orl %edx, %eax
|
||
|
+
|
||
|
+ leal -1(%ecx), %edx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(first_loop_old_match)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ addq %rdi, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
ret
|
||
|
|
||
|
+ /* Save minimum state for getting most recent match. We can
|
||
|
+ throw out all previous work. */
|
||
|
.p2align 4
|
||
|
-L(no_c_found):
|
||
|
- movl $1, %esi
|
||
|
- xorl %ecx, %ecx
|
||
|
- jmp L(loop_header)
|
||
|
+L(second_loop_match):
|
||
|
+ movq %rdi, %rsi
|
||
|
+ movaps %xmm4, %xmm2
|
||
|
+ movaps %xmm7, %xmm3
|
||
|
|
||
|
.p2align 4
|
||
|
-L(exit):
|
||
|
- xorl %eax, %eax
|
||
|
+L(second_loop):
|
||
|
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||
|
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||
|
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
||
|
+ detecting zero. Note if this is found to be a bottleneck it
|
||
|
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ movaps %xmm5, %xmm6
|
||
|
+ pxor %xmm8, %xmm8
|
||
|
+
|
||
|
+ PCMPEQ %xmm8, %xmm5
|
||
|
+ PCMPEQ %xmm4, %xmm8
|
||
|
+ por %xmm5, %xmm8
|
||
|
+#else
|
||
|
+ movaps %xmm5, %xmm6
|
||
|
+ PMINU %xmm4, %xmm5
|
||
|
+#endif
|
||
|
+
|
||
|
+ movaps %xmm4, %xmm9
|
||
|
+ PCMPEQ %xmm0, %xmm4
|
||
|
+ PCMPEQ %xmm0, %xmm6
|
||
|
+ movaps %xmm6, %xmm7
|
||
|
+ por %xmm4, %xmm6
|
||
|
+#ifndef USE_AS_WCSRCHR
|
||
|
+ pxor %xmm8, %xmm8
|
||
|
+ PCMPEQ %xmm5, %xmm8
|
||
|
+#endif
|
||
|
+
|
||
|
+ pmovmskb %xmm8, %ecx
|
||
|
+ pmovmskb %xmm6, %eax
|
||
|
+
|
||
|
+ addq $(VEC_SIZE * 2), %rdi
|
||
|
+ /* Either null term or new occurrence of CHAR. */
|
||
|
+ addl %ecx, %eax
|
||
|
+ jz L(second_loop)
|
||
|
+
|
||
|
+ /* No null term so must be new occurrence of CHAR. */
|
||
|
+ testl %ecx, %ecx
|
||
|
+ jz L(second_loop_match)
|
||
|
+
|
||
|
+
|
||
|
+ subl %ecx, %eax
|
||
|
+ jnz L(second_loop_new_match)
|
||
|
+
|
||
|
+L(second_loop_old_match):
|
||
|
+ pmovmskb %xmm2, %ecx
|
||
|
+ pmovmskb %xmm3, %eax
|
||
|
+ sall $16, %eax
|
||
|
+ orl %ecx, %eax
|
||
|
+ bsrl %eax, %eax
|
||
|
+ addq %rsi, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
ret
|
||
|
|
||
|
.p2align 4
|
||
|
+L(second_loop_new_match):
|
||
|
+ pxor %xmm6, %xmm6
|
||
|
+ PCMPEQ %xmm9, %xmm6
|
||
|
+ pmovmskb %xmm6, %eax
|
||
|
+ sall $16, %ecx
|
||
|
+ orl %eax, %ecx
|
||
|
+
|
||
|
+ /* We can't reuse either of the old comparisons since we mask
|
||
|
+ off zeros after first zero (instead of using the full
|
||
|
+ comparison) we can't guarantee no interference between match
|
||
|
+ after end of string and valid match. */
|
||
|
+ pmovmskb %xmm4, %eax
|
||
|
+ pmovmskb %xmm7, %edx
|
||
|
+ sall $16, %edx
|
||
|
+ orl %edx, %eax
|
||
|
+
|
||
|
+ leal -1(%ecx), %edx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(second_loop_old_match)
|
||
|
+ bsrl %eax, %eax
|
||
|
+ addq %rdi, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+ ret
|
||
|
+
|
||
|
+ .p2align 4,, 4
|
||
|
L(cross_page):
|
||
|
- movq %rdi, %rax
|
||
|
- pxor %xmm0, %xmm0
|
||
|
- andq $-64, %rax
|
||
|
- movdqu (%rax), %xmm5
|
||
|
- movdqa %xmm5, %xmm6
|
||
|
- movdqu 16(%rax), %xmm4
|
||
|
- pcmpeqb %xmm1, %xmm5
|
||
|
- pcmpeqb %xmm0, %xmm6
|
||
|
- movdqu 32(%rax), %xmm3
|
||
|
- pmovmskb %xmm6, %esi
|
||
|
- movdqa %xmm4, %xmm6
|
||
|
- movdqu 48(%rax), %xmm2
|
||
|
- pcmpeqb %xmm1, %xmm4
|
||
|
- pcmpeqb %xmm0, %xmm6
|
||
|
- pmovmskb %xmm6, %edx
|
||
|
- movdqa %xmm3, %xmm6
|
||
|
- pcmpeqb %xmm1, %xmm3
|
||
|
- pcmpeqb %xmm0, %xmm6
|
||
|
- pcmpeqb %xmm2, %xmm0
|
||
|
- salq $16, %rdx
|
||
|
- pmovmskb %xmm3, %r9d
|
||
|
- pmovmskb %xmm6, %r8d
|
||
|
- pmovmskb %xmm0, %ecx
|
||
|
- salq $32, %r9
|
||
|
- salq $32, %r8
|
||
|
- pcmpeqb %xmm1, %xmm2
|
||
|
- orq %r8, %rdx
|
||
|
- salq $48, %rcx
|
||
|
- pmovmskb %xmm5, %r8d
|
||
|
- orq %rsi, %rdx
|
||
|
- pmovmskb %xmm4, %esi
|
||
|
- orq %rcx, %rdx
|
||
|
- pmovmskb %xmm2, %ecx
|
||
|
- salq $16, %rsi
|
||
|
- salq $48, %rcx
|
||
|
- orq %r9, %rsi
|
||
|
- orq %r8, %rsi
|
||
|
- orq %rcx, %rsi
|
||
|
+ movq %rdi, %rsi
|
||
|
+ andq $-VEC_SIZE, %rsi
|
||
|
+ movaps (%rsi), %xmm1
|
||
|
+ pxor %xmm2, %xmm2
|
||
|
+ PCMPEQ %xmm1, %xmm2
|
||
|
+ pmovmskb %xmm2, %edx
|
||
|
movl %edi, %ecx
|
||
|
- subl %eax, %ecx
|
||
|
- shrq %cl, %rdx
|
||
|
- shrq %cl, %rsi
|
||
|
- testq %rdx, %rdx
|
||
|
- je L(loop_header2)
|
||
|
- leaq -1(%rdx), %rax
|
||
|
- xorq %rdx, %rax
|
||
|
- andq %rax, %rsi
|
||
|
- je L(exit)
|
||
|
- bsrq %rsi, %rax
|
||
|
+ andl $(VEC_SIZE - 1), %ecx
|
||
|
+ sarl %cl, %edx
|
||
|
+ jz L(cross_page_continue)
|
||
|
+ PCMPEQ %xmm0, %xmm1
|
||
|
+ pmovmskb %xmm1, %eax
|
||
|
+ sarl %cl, %eax
|
||
|
+ leal -1(%rdx), %ecx
|
||
|
+ xorl %edx, %ecx
|
||
|
+ andl %ecx, %eax
|
||
|
+ jz L(ret1)
|
||
|
+ bsrl %eax, %eax
|
||
|
addq %rdi, %rax
|
||
|
+#ifdef USE_AS_WCSRCHR
|
||
|
+ andq $-CHAR_SIZE, %rax
|
||
|
+#endif
|
||
|
+L(ret1):
|
||
|
ret
|
||
|
-END (strrchr)
|
||
|
+END(STRRCHR)
|
||
|
|
||
|
-weak_alias (strrchr, rindex)
|
||
|
-libc_hidden_builtin_def (strrchr)
|
||
|
+#ifndef USE_AS_WCSRCHR
|
||
|
+ weak_alias (STRRCHR, rindex)
|
||
|
+ libc_hidden_builtin_def (STRRCHR)
|
||
|
+#endif
|
||
|
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
|
||
|
index 2f388537..ae3cfa7d 100644
|
||
|
--- a/sysdeps/x86_64/wcsrchr.S
|
||
|
+++ b/sysdeps/x86_64/wcsrchr.S
|
||
|
@@ -17,266 +17,12 @@
|
||
|
License along with the GNU C Library; if not, see
|
||
|
<http://www.gnu.org/licenses/>. */
|
||
|
|
||
|
-#include <sysdep.h>
|
||
|
|
||
|
- .text
|
||
|
-ENTRY (wcsrchr)
|
||
|
+#define USE_AS_WCSRCHR 1
|
||
|
+#define NO_PMINU 1
|
||
|
|
||
|
- movd %rsi, %xmm1
|
||
|
- mov %rdi, %rcx
|
||
|
- punpckldq %xmm1, %xmm1
|
||
|
- pxor %xmm2, %xmm2
|
||
|
- punpckldq %xmm1, %xmm1
|
||
|
- and $63, %rcx
|
||
|
- cmp $48, %rcx
|
||
|
- ja L(crosscache)
|
||
|
+#ifndef STRRCHR
|
||
|
+# define STRRCHR wcsrchr
|
||
|
+#endif
|
||
|
|
||
|
- movdqu (%rdi), %xmm0
|
||
|
- pcmpeqd %xmm0, %xmm2
|
||
|
- pcmpeqd %xmm1, %xmm0
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- pmovmskb %xmm0, %rax
|
||
|
- add $16, %rdi
|
||
|
-
|
||
|
- test %rax, %rax
|
||
|
- jnz L(unaligned_match1)
|
||
|
-
|
||
|
- test %rcx, %rcx
|
||
|
- jnz L(return_null)
|
||
|
-
|
||
|
- and $-16, %rdi
|
||
|
- xor %r8, %r8
|
||
|
- jmp L(loop)
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(unaligned_match1):
|
||
|
- test %rcx, %rcx
|
||
|
- jnz L(prolog_find_zero_1)
|
||
|
-
|
||
|
- mov %rax, %r8
|
||
|
- mov %rdi, %rsi
|
||
|
- and $-16, %rdi
|
||
|
- jmp L(loop)
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(crosscache):
|
||
|
- and $15, %rcx
|
||
|
- and $-16, %rdi
|
||
|
- pxor %xmm3, %xmm3
|
||
|
- movdqa (%rdi), %xmm0
|
||
|
- pcmpeqd %xmm0, %xmm3
|
||
|
- pcmpeqd %xmm1, %xmm0
|
||
|
- pmovmskb %xmm3, %rdx
|
||
|
- pmovmskb %xmm0, %rax
|
||
|
- shr %cl, %rdx
|
||
|
- shr %cl, %rax
|
||
|
- add $16, %rdi
|
||
|
-
|
||
|
- test %rax, %rax
|
||
|
- jnz L(unaligned_match)
|
||
|
-
|
||
|
- test %rdx, %rdx
|
||
|
- jnz L(return_null)
|
||
|
-
|
||
|
- xor %r8, %r8
|
||
|
- jmp L(loop)
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(unaligned_match):
|
||
|
- test %rdx, %rdx
|
||
|
- jnz L(prolog_find_zero)
|
||
|
-
|
||
|
- mov %rax, %r8
|
||
|
- lea (%rdi, %rcx), %rsi
|
||
|
-
|
||
|
-/* Loop start on aligned string. */
|
||
|
- .p2align 4
|
||
|
-L(loop):
|
||
|
- movdqa (%rdi), %xmm0
|
||
|
- pcmpeqd %xmm0, %xmm2
|
||
|
- add $16, %rdi
|
||
|
- pcmpeqd %xmm1, %xmm0
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- pmovmskb %xmm0, %rax
|
||
|
- or %rax, %rcx
|
||
|
- jnz L(matches)
|
||
|
-
|
||
|
- movdqa (%rdi), %xmm3
|
||
|
- pcmpeqd %xmm3, %xmm2
|
||
|
- add $16, %rdi
|
||
|
- pcmpeqd %xmm1, %xmm3
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- pmovmskb %xmm3, %rax
|
||
|
- or %rax, %rcx
|
||
|
- jnz L(matches)
|
||
|
-
|
||
|
- movdqa (%rdi), %xmm4
|
||
|
- pcmpeqd %xmm4, %xmm2
|
||
|
- add $16, %rdi
|
||
|
- pcmpeqd %xmm1, %xmm4
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- pmovmskb %xmm4, %rax
|
||
|
- or %rax, %rcx
|
||
|
- jnz L(matches)
|
||
|
-
|
||
|
- movdqa (%rdi), %xmm5
|
||
|
- pcmpeqd %xmm5, %xmm2
|
||
|
- add $16, %rdi
|
||
|
- pcmpeqd %xmm1, %xmm5
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- pmovmskb %xmm5, %rax
|
||
|
- or %rax, %rcx
|
||
|
- jz L(loop)
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(matches):
|
||
|
- test %rax, %rax
|
||
|
- jnz L(match)
|
||
|
-L(return_value):
|
||
|
- test %r8, %r8
|
||
|
- jz L(return_null)
|
||
|
- mov %r8, %rax
|
||
|
- mov %rsi, %rdi
|
||
|
-
|
||
|
- test $15 << 4, %ah
|
||
|
- jnz L(match_fourth_wchar)
|
||
|
- test %ah, %ah
|
||
|
- jnz L(match_third_wchar)
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(match):
|
||
|
- pmovmskb %xmm2, %rcx
|
||
|
- test %rcx, %rcx
|
||
|
- jnz L(find_zero)
|
||
|
- mov %rax, %r8
|
||
|
- mov %rdi, %rsi
|
||
|
- jmp L(loop)
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(find_zero):
|
||
|
- test $15, %cl
|
||
|
- jnz L(find_zero_in_first_wchar)
|
||
|
- test %cl, %cl
|
||
|
- jnz L(find_zero_in_second_wchar)
|
||
|
- test $15, %ch
|
||
|
- jnz L(find_zero_in_third_wchar)
|
||
|
-
|
||
|
- and $1 << 13 - 1, %rax
|
||
|
- jz L(return_value)
|
||
|
-
|
||
|
- test $15 << 4, %ah
|
||
|
- jnz L(match_fourth_wchar)
|
||
|
- test %ah, %ah
|
||
|
- jnz L(match_third_wchar)
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(find_zero_in_first_wchar):
|
||
|
- test $1, %rax
|
||
|
- jz L(return_value)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(find_zero_in_second_wchar):
|
||
|
- and $1 << 5 - 1, %rax
|
||
|
- jz L(return_value)
|
||
|
-
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(find_zero_in_third_wchar):
|
||
|
- and $1 << 9 - 1, %rax
|
||
|
- jz L(return_value)
|
||
|
-
|
||
|
- test %ah, %ah
|
||
|
- jnz L(match_third_wchar)
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(prolog_find_zero):
|
||
|
- add %rcx, %rdi
|
||
|
- mov %rdx, %rcx
|
||
|
-L(prolog_find_zero_1):
|
||
|
- test $15, %cl
|
||
|
- jnz L(prolog_find_zero_in_first_wchar)
|
||
|
- test %cl, %cl
|
||
|
- jnz L(prolog_find_zero_in_second_wchar)
|
||
|
- test $15, %ch
|
||
|
- jnz L(prolog_find_zero_in_third_wchar)
|
||
|
-
|
||
|
- and $1 << 13 - 1, %rax
|
||
|
- jz L(return_null)
|
||
|
-
|
||
|
- test $15 << 4, %ah
|
||
|
- jnz L(match_fourth_wchar)
|
||
|
- test %ah, %ah
|
||
|
- jnz L(match_third_wchar)
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(prolog_find_zero_in_first_wchar):
|
||
|
- test $1, %rax
|
||
|
- jz L(return_null)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(prolog_find_zero_in_second_wchar):
|
||
|
- and $1 << 5 - 1, %rax
|
||
|
- jz L(return_null)
|
||
|
-
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(prolog_find_zero_in_third_wchar):
|
||
|
- and $1 << 9 - 1, %rax
|
||
|
- jz L(return_null)
|
||
|
-
|
||
|
- test %ah, %ah
|
||
|
- jnz L(match_third_wchar)
|
||
|
- test $15 << 4, %al
|
||
|
- jnz L(match_second_wchar)
|
||
|
- lea -16(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(match_second_wchar):
|
||
|
- lea -12(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(match_third_wchar):
|
||
|
- lea -8(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(match_fourth_wchar):
|
||
|
- lea -4(%rdi), %rax
|
||
|
- ret
|
||
|
-
|
||
|
- .p2align 4
|
||
|
-L(return_null):
|
||
|
- xor %rax, %rax
|
||
|
- ret
|
||
|
-
|
||
|
-END (wcsrchr)
|
||
|
+#include "../strrchr.S"
|
||
|
--
|
||
|
GitLab
|
||
|
|