Author: Florian Weimer Date: Wed Oct 18 11:12:29 2023 +0200 Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10" This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de. diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S index cd6a0a870a02b9bd..58b2853ab69265e8 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S @@ -1,4 +1,4 @@ -/* Implementation for strrchr using evex256 and evex512. +/* Placeholder function, not used by any processor at the moment. Copyright (C) 2022-2023 Free Software Foundation, Inc. This file is part of the GNU C Library. @@ -16,6 +16,8 @@ License along with the GNU C Library; if not, see . */ +/* UNUSED. Exists purely as reference implementation. */ + #include #if ISA_SHOULD_BUILD (4) @@ -23,351 +25,240 @@ # include # ifdef USE_AS_WCSRCHR -# if VEC_SIZE == 64 -# define RCX_M cx -# define KORTEST_M kortestw -# else -# define RCX_M cl -# define KORTEST_M kortestb -# endif - -# define SHIFT_REG VRCX # define CHAR_SIZE 4 -# define VPCMP vpcmpd -# define VPMIN vpminud -# define VPCOMPRESS vpcompressd -# define VPTESTN vptestnmd -# define VPTEST vptestmd -# define VPBROADCAST vpbroadcastd +# define VPBROADCAST vpbroadcastd # define VPCMPEQ vpcmpeqd - +# define VPMINU vpminud +# define VPTESTN vptestnmd # else -# define SHIFT_REG VRDI # define CHAR_SIZE 1 -# define VPCMP vpcmpb -# define VPMIN vpminub -# define VPCOMPRESS vpcompressb -# define VPTESTN vptestnmb -# define VPTEST vptestmb -# define VPBROADCAST vpbroadcastb +# define VPBROADCAST vpbroadcastb # define VPCMPEQ vpcmpeqb - -# define RCX_M VRCX -# define KORTEST_M KORTEST +# define VPMINU vpminub +# define VPTESTN vptestnmb # endif -# define VMATCH VMM(0) -# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) # define PAGE_SIZE 4096 +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) .section SECTION(.text), "ax", @progbits - /* Aligning entry point to 64 byte, provides better performance for - one vector length string. */ -ENTRY_P2ALIGN(STRRCHR, 6) - movl %edi, %eax - /* Broadcast CHAR to VMATCH. */ - VPBROADCAST %esi, %VMATCH +/* Aligning entry point to 64 byte, provides better performance for + one vector length string. */ +ENTRY_P2ALIGN (STRRCHR, 6) - andl $(PAGE_SIZE - 1), %eax - cmpl $(PAGE_SIZE - VEC_SIZE), %eax - jg L(cross_page_boundary) + /* Broadcast CHAR to VMM(0). */ + VPBROADCAST %esi, %VMM(0) + movl %edi, %eax + sall $20, %eax + cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax + ja L(page_cross) +L(page_cross_continue): + /* Compare [w]char for null, mask bit will be set for match. */ VMOVU (%rdi), %VMM(1) - /* k0 has a 1 for each zero CHAR in YMM1. */ - VPTESTN %VMM(1), %VMM(1), %k0 - KMOV %k0, %VGPR(rsi) - test %VGPR(rsi), %VGPR(rsi) - jz L(aligned_more) - /* fallthrough: zero CHAR in first VEC. */ -L(page_cross_return): - /* K1 has a 1 for each search CHAR match in VEC(1). */ - VPCMPEQ %VMATCH, %VMM(1), %k1 - KMOV %k1, %VGPR(rax) - /* Build mask up until first zero CHAR (used to mask of - potential search CHAR matches past the end of the string). */ - blsmsk %VGPR(rsi), %VGPR(rsi) - /* Use `and` here to remove any out of bounds matches so we can - do a reverse scan on `rax` to find the last match. */ - and %VGPR(rsi), %VGPR(rax) - jz L(ret0) - /* Get last match. 
*/ - bsr %VGPR(rax), %VGPR(rax) + + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRCX + test %VRCX, %VRCX + jz L(align_more) + + VPCMPEQ %VMM(1), %VMM(0), %k0 + KMOV %k0, %VRAX + BLSMSK %VRCX, %VRCX + and %VRCX, %VRAX + jz L(ret) + + BSR %VRAX, %VRAX # ifdef USE_AS_WCSRCHR leaq (%rdi, %rax, CHAR_SIZE), %rax # else - addq %rdi, %rax + add %rdi, %rax # endif -L(ret0): +L(ret): ret - /* Returns for first vec x1/x2/x3 have hard coded backward - search path for earlier matches. */ - .p2align 4,, 6 -L(first_vec_x1): - VPCMPEQ %VMATCH, %VMM(2), %k1 - KMOV %k1, %VGPR(rax) - blsmsk %VGPR(rcx), %VGPR(rcx) - /* eax non-zero if search CHAR in range. */ - and %VGPR(rcx), %VGPR(rax) - jnz L(first_vec_x1_return) - - /* fallthrough: no match in YMM2 then need to check for earlier - matches (in YMM1). */ - .p2align 4,, 4 -L(first_vec_x0_test): - VPCMPEQ %VMATCH, %VMM(1), %k1 - KMOV %k1, %VGPR(rax) - test %VGPR(rax), %VGPR(rax) - jz L(ret1) - bsr %VGPR(rax), %VGPR(rax) +L(vector_x2_end): + VPCMPEQ %VMM(2), %VMM(0), %k2 + KMOV %k2, %VRAX + BLSMSK %VRCX, %VRCX + and %VRCX, %VRAX + jz L(vector_x1_ret) + + BSR %VRAX, %VRAX + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* Check the first vector at very last to look for match. */ +L(vector_x1_ret): + VPCMPEQ %VMM(1), %VMM(0), %k2 + KMOV %k2, %VRAX + test %VRAX, %VRAX + jz L(ret) + + BSR %VRAX, %VRAX # ifdef USE_AS_WCSRCHR leaq (%rsi, %rax, CHAR_SIZE), %rax # else - addq %rsi, %rax + add %rsi, %rax # endif -L(ret1): ret - .p2align 4,, 10 -L(first_vec_x3): - VPCMPEQ %VMATCH, %VMM(4), %k1 - KMOV %k1, %VGPR(rax) - blsmsk %VGPR(rcx), %VGPR(rcx) - /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ - and %VGPR(rcx), %VGPR(rax) - jz L(first_vec_x1_or_x2) - bsr %VGPR(rax), %VGPR(rax) - leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax - ret - .p2align 4,, 4 - -L(first_vec_x2): - VPCMPEQ %VMATCH, %VMM(3), %k1 - KMOV %k1, %VGPR(rax) - blsmsk %VGPR(rcx), %VGPR(rcx) - /* Check YMM3 for last match first. If no match try YMM2/YMM1. */ - and %VGPR(rcx), %VGPR(rax) - jz L(first_vec_x0_x1_test) - bsr %VGPR(rax), %VGPR(rax) - leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax - ret - - .p2align 4,, 6 -L(first_vec_x0_x1_test): - VPCMPEQ %VMATCH, %VMM(2), %k1 - KMOV %k1, %VGPR(rax) - /* Check YMM2 for last match first. If no match try YMM1. */ - test %VGPR(rax), %VGPR(rax) - jz L(first_vec_x0_test) - .p2align 4,, 4 -L(first_vec_x1_return): - bsr %VGPR(rax), %VGPR(rax) - leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax - ret - - .p2align 4,, 12 -L(aligned_more): -L(page_cross_continue): - /* Need to keep original pointer incase VEC(1) has last match. */ +L(align_more): + /* Zero r8 to store match result. */ + xorl %r8d, %r8d + /* Save pointer of first vector, in case if no match found. */ movq %rdi, %rsi + /* Align pointer to vector size. */ andq $-VEC_SIZE, %rdi - - VMOVU VEC_SIZE(%rdi), %VMM(2) + /* Loop unroll for 2 vector loop. */ + VMOVA (VEC_SIZE)(%rdi), %VMM(2) VPTESTN %VMM(2), %VMM(2), %k0 KMOV %k0, %VRCX - movq %rdi, %r8 test %VRCX, %VRCX - jnz L(first_vec_x1) - - VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3) - VPTESTN %VMM(3), %VMM(3), %k0 - KMOV %k0, %VRCX - - test %VRCX, %VRCX - jnz L(first_vec_x2) - - VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4) - VPTESTN %VMM(4), %VMM(4), %k0 - KMOV %k0, %VRCX - - /* Intentionally use 64-bit here. EVEX256 version needs 1-byte - padding for efficient nop before loop alignment. */ - test %rcx, %rcx - jnz L(first_vec_x3) + jnz L(vector_x2_end) + /* Save pointer of second vector, in case if no match + found. 
*/ + movq %rdi, %r9 + /* Align address to VEC_SIZE * 2 for loop. */ andq $-(VEC_SIZE * 2), %rdi - .p2align 4 -L(first_aligned_loop): - /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can - gurantee they don't store a match. */ - VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5) - VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6) - - VPCMP $4, %VMM(5), %VMATCH, %k2 - VPCMP $4, %VMM(6), %VMATCH, %k3{%k2} - - VPMIN %VMM(5), %VMM(6), %VMM(7) - VPTEST %VMM(7), %VMM(7), %k1{%k3} - subq $(VEC_SIZE * -2), %rdi - KORTEST_M %k1, %k1 - jc L(first_aligned_loop) + .p2align 4,,11 +L(loop): + /* 2 vector loop, as it provide better performance as compared + to 4 vector loop. */ + VMOVA (VEC_SIZE * 2)(%rdi), %VMM(3) + VMOVA (VEC_SIZE * 3)(%rdi), %VMM(4) + VPCMPEQ %VMM(3), %VMM(0), %k1 + VPCMPEQ %VMM(4), %VMM(0), %k2 + VPMINU %VMM(3), %VMM(4), %VMM(5) + VPTESTN %VMM(5), %VMM(5), %k0 + KOR %k1, %k2, %k3 + subq $-(VEC_SIZE * 2), %rdi + /* If k0 and k3 zero, match and end of string not found. */ + KORTEST %k0, %k3 + jz L(loop) + + /* If k0 is non zero, end of string found. */ + KORTEST %k0, %k0 + jnz L(endloop) + + lea VEC_SIZE(%rdi), %r8 + /* A match found, it need to be stored in r8 before loop + continue. */ + /* Check second vector first. */ + KMOV %k2, %VRDX + test %VRDX, %VRDX + jnz L(loop_vec_x2_match) - VPTESTN %VMM(7), %VMM(7), %k1 KMOV %k1, %VRDX - test %VRDX, %VRDX - jz L(second_aligned_loop_prep) + /* Match is in first vector, rdi offset need to be subtracted + by VEC_SIZE. */ + sub $VEC_SIZE, %r8 + + /* If second vector doesn't have match, first vector must + have match. */ +L(loop_vec_x2_match): + BSR %VRDX, %VRDX +# ifdef USE_AS_WCSRCHR + sal $2, %rdx +# endif + add %rdx, %r8 + jmp L(loop) - KORTEST_M %k3, %k3 - jnc L(return_first_aligned_loop) +L(endloop): + /* Check if string end in first loop vector. */ + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRCX + test %VRCX, %VRCX + jnz L(loop_vector_x1_end) - .p2align 4,, 6 -L(first_vec_x1_or_x2_or_x3): - VPCMPEQ %VMM(4), %VMATCH, %k4 - KMOV %k4, %VRAX + /* Check if it has match in first loop vector. */ + KMOV %k1, %VRAX test %VRAX, %VRAX - jz L(first_vec_x1_or_x2) - bsr %VRAX, %VRAX - leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax - ret + jz L(loop_vector_x2_end) - .p2align 4,, 8 -L(return_first_aligned_loop): - VPTESTN %VMM(5), %VMM(5), %k0 + BSR %VRAX, %VRAX + leaq (%rdi, %rax, CHAR_SIZE), %r8 + + /* String must end in second loop vector. */ +L(loop_vector_x2_end): + VPTESTN %VMM(4), %VMM(4), %k0 KMOV %k0, %VRCX - blsmsk %VRCX, %VRCX - jnc L(return_first_new_match_first) - blsmsk %VRDX, %VRDX - VPCMPEQ %VMM(6), %VMATCH, %k0 - KMOV %k0, %VRAX - addq $VEC_SIZE, %rdi - and %VRDX, %VRAX - jnz L(return_first_new_match_ret) - subq $VEC_SIZE, %rdi -L(return_first_new_match_first): KMOV %k2, %VRAX -# ifdef USE_AS_WCSRCHR - xorl $((1 << CHAR_PER_VEC)- 1), %VRAX + BLSMSK %VRCX, %VRCX + /* Check if it has match in second loop vector. */ and %VRCX, %VRAX -# else - andn %VRCX, %VRAX, %VRAX -# endif - jz L(first_vec_x1_or_x2_or_x3) -L(return_first_new_match_ret): - bsr %VRAX, %VRAX - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax - ret + jz L(check_last_match) - .p2align 4,, 10 -L(first_vec_x1_or_x2): - VPCMPEQ %VMM(3), %VMATCH, %k3 - KMOV %k3, %VRAX - test %VRAX, %VRAX - jz L(first_vec_x0_x1_test) - bsr %VRAX, %VRAX - leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax + BSR %VRAX, %VRAX + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ret - .p2align 4 - /* We can throw away the work done for the first 4x checks here - as we have a later match. This is the 'fast' path persay. 
*/ -L(second_aligned_loop_prep): -L(second_aligned_loop_set_furthest_match): - movq %rdi, %rsi - VMOVA %VMM(5), %VMM(7) - VMOVA %VMM(6), %VMM(8) - .p2align 4 -L(second_aligned_loop): - VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5) - VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6) - VPCMP $4, %VMM(5), %VMATCH, %k2 - VPCMP $4, %VMM(6), %VMATCH, %k3{%k2} - - VPMIN %VMM(5), %VMM(6), %VMM(4) - - VPTEST %VMM(4), %VMM(4), %k1{%k3} - subq $(VEC_SIZE * -2), %rdi - KMOV %k1, %VRCX - inc %RCX_M - jz L(second_aligned_loop) - VPTESTN %VMM(4), %VMM(4), %k1 - KMOV %k1, %VRDX - test %VRDX, %VRDX - jz L(second_aligned_loop_set_furthest_match) - - KORTEST_M %k3, %k3 - jnc L(return_new_match) - /* branch here because there is a significant advantage interms - of output dependency chance in using edx. */ + /* String end in first loop vector. */ +L(loop_vector_x1_end): + KMOV %k1, %VRAX + BLSMSK %VRCX, %VRCX + /* Check if it has match in second loop vector. */ + and %VRCX, %VRAX + jz L(check_last_match) -L(return_old_match): - VPCMPEQ %VMM(8), %VMATCH, %k0 - KMOV %k0, %VRCX - bsr %VRCX, %VRCX - jnz L(return_old_match_ret) + BSR %VRAX, %VRAX + leaq (%rdi, %rax, CHAR_SIZE), %rax + ret - VPCMPEQ %VMM(7), %VMATCH, %k0 - KMOV %k0, %VRCX - bsr %VRCX, %VRCX - subq $VEC_SIZE, %rsi -L(return_old_match_ret): - leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax + /* No match in first and second loop vector. */ +L(check_last_match): + /* Check if any match recorded in r8. */ + test %r8, %r8 + jz L(vector_x2_ret) + movq %r8, %rax ret -L(return_new_match): - VPTESTN %VMM(5), %VMM(5), %k0 - KMOV %k0, %VRCX - blsmsk %VRCX, %VRCX - jnc L(return_new_match_first) - dec %VRDX - VPCMPEQ %VMM(6), %VMATCH, %k0 - KMOV %k0, %VRAX - addq $VEC_SIZE, %rdi - and %VRDX, %VRAX - jnz L(return_new_match_ret) - subq $VEC_SIZE, %rdi -L(return_new_match_first): + /* No match recorded in r8. Check the second saved vector + in beginning. */ +L(vector_x2_ret): + VPCMPEQ %VMM(2), %VMM(0), %k2 KMOV %k2, %VRAX -# ifdef USE_AS_WCSRCHR - xorl $((1 << CHAR_PER_VEC)- 1), %VRAX - and %VRCX, %VRAX -# else - andn %VRCX, %VRAX, %VRAX -# endif - jz L(return_old_match) -L(return_new_match_ret): - bsr %VRAX, %VRAX - leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + test %VRAX, %VRAX + jz L(vector_x1_ret) + + /* Match found in the second saved vector. */ + BSR %VRAX, %VRAX + leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax ret - .p2align 4,, 4 -L(cross_page_boundary): - xorq %rdi, %rax - mov $-1, %VRDX - VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) - VPTESTN %VMM(6), %VMM(6), %k0 - KMOV %k0, %VRSI +L(page_cross): + mov %rdi, %rax + movl %edi, %ecx # ifdef USE_AS_WCSRCHR - movl %edi, %ecx - and $(VEC_SIZE - 1), %ecx - shrl $2, %ecx + /* Calculate number of compare result bits to be skipped for + wide string alignment adjustment. */ + andl $(VEC_SIZE - 1), %ecx + sarl $2, %ecx # endif - shlx %SHIFT_REG, %VRDX, %VRDX - + /* ecx contains number of w[char] to be skipped as a result + of address alignment. */ + andq $-VEC_SIZE, %rax + VMOVA (%rax), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k1 + KMOV %k1, %VRAX + SHR %cl, %VRAX + jz L(page_cross_continue) + VPCMPEQ %VMM(1), %VMM(0), %k0 + KMOV %k0, %VRDX + SHR %cl, %VRDX + BLSMSK %VRAX, %VRAX + and %VRDX, %VRAX + jz L(ret) + BSR %VRAX, %VRAX # ifdef USE_AS_WCSRCHR - kmovw %edx, %k1 + leaq (%rdi, %rax, CHAR_SIZE), %rax # else - KMOV %VRDX, %k1 + add %rdi, %rax # endif - VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} - /* We could technically just jmp back after the vpcompress but - it doesn't save any 16-byte blocks. 
*/ - shrx %SHIFT_REG, %VRSI, %VRSI - test %VRSI, %VRSI - jnz L(page_cross_return) - jmp L(page_cross_continue) - /* 1-byte from cache line. */ -END(STRRCHR) + ret +END (STRRCHR) #endif diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S index 3bf6a5101422e4d1..85e3b0119f5dc923 100644 --- a/sysdeps/x86_64/multiarch/strrchr-evex.S +++ b/sysdeps/x86_64/multiarch/strrchr-evex.S @@ -1,8 +1,394 @@ +/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. + Copyright (C) 2021-2023 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or + modify it under the terms of the GNU Lesser General Public + License as published by the Free Software Foundation; either + version 2.1 of the License, or (at your option) any later version. + + The GNU C Library is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + Lesser General Public License for more details. + + You should have received a copy of the GNU Lesser General Public + License along with the GNU C Library; if not, see + . */ + +#include + +#if ISA_SHOULD_BUILD (4) + +# include + # ifndef STRRCHR # define STRRCHR __strrchr_evex # endif -#include "x86-evex256-vecs.h" -#include "reg-macros.h" +# include "x86-evex256-vecs.h" + +# ifdef USE_AS_WCSRCHR +# define SHIFT_REG rsi +# define kunpck_2x kunpckbw +# define kmov_2x kmovd +# define maskz_2x ecx +# define maskm_2x eax +# define CHAR_SIZE 4 +# define VPMIN vpminud +# define VPTESTN vptestnmd +# define VPTEST vptestmd +# define VPBROADCAST vpbroadcastd +# define VPCMPEQ vpcmpeqd +# define VPCMP vpcmpd + +# define USE_WIDE_CHAR +# else +# define SHIFT_REG rdi +# define kunpck_2x kunpckdq +# define kmov_2x kmovq +# define maskz_2x rcx +# define maskm_2x rax + +# define CHAR_SIZE 1 +# define VPMIN vpminub +# define VPTESTN vptestnmb +# define VPTEST vptestmb +# define VPBROADCAST vpbroadcastb +# define VPCMPEQ vpcmpeqb +# define VPCMP vpcmpb +# endif + +# include "reg-macros.h" + +# define VMATCH VMM(0) +# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +# define PAGE_SIZE 4096 + + .section SECTION(.text), "ax", @progbits +ENTRY_P2ALIGN(STRRCHR, 6) + movl %edi, %eax + /* Broadcast CHAR to VMATCH. */ + VPBROADCAST %esi, %VMATCH + + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - VEC_SIZE), %eax + jg L(cross_page_boundary) +L(page_cross_continue): + VMOVU (%rdi), %VMM(1) + /* k0 has a 1 for each zero CHAR in VEC(1). */ + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRSI + test %VRSI, %VRSI + jz L(aligned_more) + /* fallthrough: zero CHAR in first VEC. */ + /* K1 has a 1 for each search CHAR match in VEC(1). */ + VPCMPEQ %VMATCH, %VMM(1), %k1 + KMOV %k1, %VRAX + /* Build mask up until first zero CHAR (used to mask of + potential search CHAR matches past the end of the string). + */ + blsmsk %VRSI, %VRSI + and %VRSI, %VRAX + jz L(ret0) + /* Get last match (the `and` removed any out of bounds matches). + */ + bsr %VRAX, %VRAX +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +L(ret0): + ret + + /* Returns for first vec x1/x2/x3 have hard coded backward + search path for earlier matches. */ + .p2align 4,, 6 +L(first_vec_x1): + VPCMPEQ %VMATCH, %VMM(2), %k1 + KMOV %k1, %VRAX + blsmsk %VRCX, %VRCX + /* eax non-zero if search CHAR in range. 
*/ + and %VRCX, %VRAX + jnz L(first_vec_x1_return) + + /* fallthrough: no match in VEC(2) then need to check for + earlier matches (in VEC(1)). */ + .p2align 4,, 4 +L(first_vec_x0_test): + VPCMPEQ %VMATCH, %VMM(1), %k1 + KMOV %k1, %VRAX + test %VRAX, %VRAX + jz L(ret1) + bsr %VRAX, %VRAX +# ifdef USE_AS_WCSRCHR + leaq (%rsi, %rax, CHAR_SIZE), %rax +# else + addq %rsi, %rax +# endif +L(ret1): + ret + + .p2align 4,, 10 +L(first_vec_x1_or_x2): + VPCMPEQ %VMM(3), %VMATCH, %k3 + VPCMPEQ %VMM(2), %VMATCH, %k2 + /* K2 and K3 have 1 for any search CHAR match. Test if any + matches between either of them. Otherwise check VEC(1). */ + KORTEST %k2, %k3 + jz L(first_vec_x0_test) + + /* Guaranteed that VEC(2) and VEC(3) are within range so merge + the two bitmasks then get last result. */ + kunpck_2x %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax + ret + + .p2align 4,, 7 +L(first_vec_x3): + VPCMPEQ %VMATCH, %VMM(4), %k1 + KMOV %k1, %VRAX + blsmsk %VRCX, %VRCX + /* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3). + */ + and %VRCX, %VRAX + jz L(first_vec_x1_or_x2) + bsr %VRAX, %VRAX + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + + + .p2align 4,, 6 +L(first_vec_x0_x1_test): + VPCMPEQ %VMATCH, %VMM(2), %k1 + KMOV %k1, %VRAX + /* Check VEC(2) for last match first. If no match try VEC(1). + */ + test %VRAX, %VRAX + jz L(first_vec_x0_test) + .p2align 4,, 4 +L(first_vec_x1_return): + bsr %VRAX, %VRAX + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + + .p2align 4,, 10 +L(first_vec_x2): + VPCMPEQ %VMATCH, %VMM(3), %k1 + KMOV %k1, %VRAX + blsmsk %VRCX, %VRCX + /* Check VEC(3) for last match first. If no match try + VEC(2)/VEC(1). */ + and %VRCX, %VRAX + jz L(first_vec_x0_x1_test) + bsr %VRAX, %VRAX + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + + .p2align 4,, 12 +L(aligned_more): + /* Need to keep original pointer in case VEC(1) has last match. + */ + movq %rdi, %rsi + andq $-VEC_SIZE, %rdi + + VMOVU VEC_SIZE(%rdi), %VMM(2) + VPTESTN %VMM(2), %VMM(2), %k0 + KMOV %k0, %VRCX + + test %VRCX, %VRCX + jnz L(first_vec_x1) + + VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3) + VPTESTN %VMM(3), %VMM(3), %k0 + KMOV %k0, %VRCX + + test %VRCX, %VRCX + jnz L(first_vec_x2) + + VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4) + VPTESTN %VMM(4), %VMM(4), %k0 + KMOV %k0, %VRCX + movq %rdi, %r8 + test %VRCX, %VRCX + jnz L(first_vec_x3) + + andq $-(VEC_SIZE * 2), %rdi + .p2align 4,, 10 +L(first_aligned_loop): + /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can + guarantee they don't store a match. */ + VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5) + VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6) + + VPCMPEQ %VMM(5), %VMATCH, %k2 + vpxord %VMM(6), %VMATCH, %VMM(7) + + VPMIN %VMM(5), %VMM(6), %VMM(8) + VPMIN %VMM(8), %VMM(7), %VMM(7) + + VPTESTN %VMM(7), %VMM(7), %k1 + subq $(VEC_SIZE * -2), %rdi + KORTEST %k1, %k2 + jz L(first_aligned_loop) + + VPCMPEQ %VMM(6), %VMATCH, %k3 + VPTESTN %VMM(8), %VMM(8), %k1 + + /* If k1 is zero, then we found a CHAR match but no null-term. + We can now safely throw out VEC1-4. */ + KTEST %k1, %k1 + jz L(second_aligned_loop_prep) + + KORTEST %k2, %k3 + jnz L(return_first_aligned_loop) + + + .p2align 4,, 6 +L(first_vec_x1_or_x2_or_x3): + VPCMPEQ %VMM(4), %VMATCH, %k4 + KMOV %k4, %VRAX + bsr %VRAX, %VRAX + jz L(first_vec_x1_or_x2) + leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + + + .p2align 4,, 8 +L(return_first_aligned_loop): + VPTESTN %VMM(5), %VMM(5), %k0 + + /* Combined results from VEC5/6. 
*/ + kunpck_2x %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck_2x %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + and %maskz_2x, %maskm_2x + jz L(first_vec_x1_or_x2_or_x3) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + /* We can throw away the work done for the first 4x checks here + as we have a later match. This is the 'fast' path persay. + */ +L(second_aligned_loop_prep): +L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi + /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on + port0 and have noticeable overhead in the loop. */ + VMOVA %VMM(5), %VMM(7) + VMOVA %VMM(6), %VMM(8) + .p2align 4 +L(second_aligned_loop): + VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5) + VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6) + VPCMPEQ %VMM(5), %VMATCH, %k2 + vpxord %VMM(6), %VMATCH, %VMM(3) + + VPMIN %VMM(5), %VMM(6), %VMM(4) + VPMIN %VMM(3), %VMM(4), %VMM(3) + + VPTESTN %VMM(3), %VMM(3), %k1 + subq $(VEC_SIZE * -2), %rdi + KORTEST %k1, %k2 + jz L(second_aligned_loop) + VPCMPEQ %VMM(6), %VMATCH, %k3 + VPTESTN %VMM(4), %VMM(4), %k1 + KTEST %k1, %k1 + jz L(second_aligned_loop_set_furthest_match) + + /* branch here because we know we have a match in VEC7/8 but + might not in VEC5/6 so the latter is expected to be less + likely. */ + KORTEST %k2, %k3 + jnz L(return_new_match) + +L(return_old_match): + VPCMPEQ %VMM(8), %VMATCH, %k0 + KMOV %k0, %VRCX + bsr %VRCX, %VRCX + jnz L(return_old_match_ret) + + VPCMPEQ %VMM(7), %VMATCH, %k0 + KMOV %k0, %VRCX + bsr %VRCX, %VRCX + subq $VEC_SIZE, %rsi +L(return_old_match_ret): + leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax + ret + + .p2align 4,, 10 +L(return_new_match): + VPTESTN %VMM(5), %VMM(5), %k0 + + /* Combined results from VEC5/6. */ + kunpck_2x %k0, %k1, %k0 + kmov_2x %k0, %maskz_2x + + blsmsk %maskz_2x, %maskz_2x + kunpck_2x %k2, %k3, %k3 + kmov_2x %k3, %maskm_2x + + /* Match at end was out-of-bounds so use last known match. */ + and %maskz_2x, %maskm_2x + jz L(return_old_match) + + bsr %maskm_2x, %maskm_2x + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +L(cross_page_boundary): + /* eax contains all the page offset bits of src (rdi). `xor rdi, + rax` sets pointer will all page offset bits cleared so + offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC + before page cross (guaranteed to be safe to read). Doing this + as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves + a bit of code size. */ + xorq %rdi, %rax + VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) + VPTESTN %VMM(1), %VMM(1), %k0 + KMOV %k0, %VRCX + + /* Shift out zero CHAR matches that are before the beginning of + src (rdi). */ +# ifdef USE_AS_WCSRCHR + movl %edi, %esi + andl $(VEC_SIZE - 1), %esi + shrl $2, %esi +# endif + shrx %VGPR(SHIFT_REG), %VRCX, %VRCX + + test %VRCX, %VRCX + jz L(page_cross_continue) -#include "strrchr-evex-base.S" + /* Found zero CHAR so need to test for search CHAR. */ + VPCMP $0, %VMATCH, %VMM(1), %k1 + KMOV %k1, %VRAX + /* Shift out search CHAR matches that are before the beginning of + src (rdi). */ + shrx %VGPR(SHIFT_REG), %VRAX, %VRAX + + /* Check if any search CHAR match in range. 
*/ + blsmsk %VRCX, %VRCX + and %VRCX, %VRAX + jz L(ret3) + bsr %VRAX, %VRAX +# ifdef USE_AS_WCSRCHR + leaq (%rdi, %rax, CHAR_SIZE), %rax +# else + addq %rdi, %rax +# endif +L(ret3): + ret +END(STRRCHR) +#endif diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644 --- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S +++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S @@ -4,5 +4,4 @@ #define STRRCHR WCSRCHR #define USE_AS_WCSRCHR 1 -#define USE_WIDE_CHAR 1 #include "strrchr-evex.S"
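
For readers comparing the reverted and restored versions above: both pick the result the same way per vector. A zero-character mask and a match mask come from the vector compares, blsmsk limits the match mask to characters before the terminating NUL, and bsr selects the last surviving match. The C sketch below is not part of the commit; the function name and the 32-bit mask width are illustrative stand-ins for CHAR_PER_VEC, but the blsmsk/and/bsr sequence mirrors what the assembly comments describe.

#include <stdint.h>
#include <stdio.h>

/* Per-vector selection step: keep only matches that precede the first
   NUL, then take the highest-indexed one (bsr).  Returns the in-vector
   index of the last valid match, or -1 if there is none.  */
static int
last_match_before_nul (uint32_t match_mask, uint32_t zero_mask)
{
  /* blsmsk: all bits up to and including the lowest set bit of
     zero_mask; if zero_mask is 0 this yields all ones, i.e. no
     character is out of bounds.  */
  uint32_t in_bounds = zero_mask ^ (zero_mask - 1);
  match_mask &= in_bounds;   /* the `and` that drops matches past the NUL  */
  if (match_mask == 0)
    return -1;
  return 31 - __builtin_clz (match_mask);   /* bsr  */
}

int
main (void)
{
  /* "ab\0a" searched for 'a': matches at bits 0 and 3, NUL at bit 2;
     the match at bit 3 is past the terminator, so the answer is 0.  */
  printf ("%d\n", last_match_before_nul (0x9, 0x4));
  return 0;
}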