glibc/glibc-upstream-2.34-234.patch

commit 00f09a14d2818f438959e764834abb3913f2b20a
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date:   Thu Apr 21 20:52:29 2022 -0500

    x86: Optimize {str|wcs}rchr-avx2
    
    The new code unrolls the main loop slightly without adding too much
    overhead and minimizes the comparisons for the search CHAR.
    
    Geometric Mean of all benchmarks New / Old: 0.832
    See email for all results.
    
    Full xcheck passes on x86_64 with and without multiarch enabled.
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
    
    (cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 0deba97114d3b83d..b8dec737d5213b25 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,304 @@
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have hard coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	/* Check ymm2 for search CHAR match. If no match then check ymm1
+	   before returning.  */
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	/* If no in-range search CHAR match in ymm3 then need to check
+	   ymm1/ymm2 for an earlier match (we delay checking search
+	   CHAR matches until needed).  */
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x at a time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is in bounds and that
+	   ymm3/ymm2 have latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	/* Shift out zero CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+
+	/* Shift out search CHAR matches that are before the beginning of
+	   src (rdi).  */
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	/* Check if any search CHAR match in range.  */
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
Import glibc-2.34-35.fc35 from f35 * Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35 - Sync with upstream branch release/2.34/master, commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: - Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose - x86: Fallback {str\|wcs}cmp RTM in the ncmp overflow case [BZ #29127] - string.h: fix __fortified_attr_access macro call [BZ #29162] - linux: Add a getauxval test [BZ #23293] - rtld: Use generic argv adjustment in ld.so [BZ #23293] - S390: Enable static PIE * Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34 - Sync with upstream branch release/2.34/master, commit ede8d94d154157d269b18f3601440ac576c1f96a: - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str\|wcs}rchr-evex - x86: Optimize {str\|wcs}rchr-avx2 - x86: Optimize {str\|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in 
strcmp-sse42.S - x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x87: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] * Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33 - Sync with upstream branch release/2.34/master, commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to 
bits/socket.h Resolves: #2091541 2022-06-06 12:53:44 +00:00			`commit 00f09a14d2818f438959e764834abb3913f2b20a`
			`Author: Noah Goldstein <goldstein.w.n@gmail.com>`
			`Date: Thu Apr 21 20:52:29 2022 -0500`

			`x86: Optimize {str\|wcs}rchr-avx2`

			`The new code unrolls the main loop slightly without adding too much`
			`overhead and minimizes the comparisons for the search CHAR.`

			`Geometric Mean of all benchmarks New / Old: 0.832`
			`See email for all results.`

			`Full xcheck passes on x86_64 with and without multiarch enabled.`
			`Reviewed-by: H.J. Lu <hjl.tools@gmail.com>`

			`(cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6)`

			`diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S`
			`index 0deba97114d3b83d..b8dec737d5213b25 100644`
			`--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S`
			`+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S`
			`@@ -27,9 +27,13 @@`
			`# ifdef USE_AS_WCSRCHR`
			`# define VPBROADCAST vpbroadcastd`
			`# define VPCMPEQ vpcmpeqd`
			`+# define VPMIN vpminud`
			`+# define CHAR_SIZE 4`
			`# else`
			`# define VPBROADCAST vpbroadcastb`
			`# define VPCMPEQ vpcmpeqb`
			`+# define VPMIN vpminub`
			`+# define CHAR_SIZE 1`
			`# endif`

			`# ifndef VZEROUPPER`
			`@@ -41,196 +45,304 @@`
			`# endif`

			`# define VEC_SIZE 32`
			`+# define PAGE_SIZE 4096`

			`- .section SECTION(.text),"ax",@progbits`
			`-ENTRY (STRRCHR)`
			`- movd %esi, %xmm4`
			`- movl %edi, %ecx`
			`+ .section SECTION(.text), "ax", @progbits`
			`+ENTRY(STRRCHR)`
			`+ movd %esi, %xmm7`
			`+ movl %edi, %eax`
			`/* Broadcast CHAR to YMM4. */`
			`- VPBROADCAST %xmm4, %ymm4`
			`+ VPBROADCAST %xmm7, %ymm7`
			`vpxor %xmm0, %xmm0, %xmm0`

			`- /* Check if we may cross page boundary with one vector load. */`
			`- andl $(2 * VEC_SIZE - 1), %ecx`
			`- cmpl $VEC_SIZE, %ecx`
			`- ja L(cros_page_boundary)`
			+ /* Shift here instead of `andl` to save code size (saves a fetch
			`+ block). */`
			`+ sall $20, %eax`
			`+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax`
			`+ ja L(cross_page)`

			`+L(page_cross_continue):`
			`vmovdqu (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %ecx`
			`- vpmovmskb %ymm3, %eax`
			`- addq $VEC_SIZE, %rdi`
			`+ /* Check end of string match. */`
			`+ VPCMPEQ %ymm1, %ymm0, %ymm6`
			`+ vpmovmskb %ymm6, %ecx`
			`+ testl %ecx, %ecx`
			`+ jz L(aligned_more)`
			`+`
			`+ /* Only check match with search CHAR if needed. */`
			`+ VPCMPEQ %ymm1, %ymm7, %ymm1`
			`+ vpmovmskb %ymm1, %eax`
			`+ /* Check if match before first zero. */`
			`+ blsmskl %ecx, %ecx`
			`+ andl %ecx, %eax`
			`+ jz L(ret0)`
			`+ bsrl %eax, %eax`
			`+ addq %rdi, %rax`
			`+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If`
			+ search CHAR is zero we are correct. Either way `andq
			+ -CHAR_SIZE, %rax` gets the correct result. */
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+L(ret0):`
			`+L(return_vzeroupper):`
			`+ ZERO_UPPER_VEC_REGISTERS_RETURN`
			`+`
			`+ /* Returns for first vec x1/x2 have hard coded backward search`
			`+ path for earlier matches. */`
			`+ .p2align 4,, 10`
			`+L(first_vec_x1):`
			`+ VPCMPEQ %ymm2, %ymm7, %ymm6`
			`+ vpmovmskb %ymm6, %eax`
			`+ blsmskl %ecx, %ecx`
			`+ andl %ecx, %eax`
			`+ jnz L(first_vec_x1_return)`
			`+`
			`+ .p2align 4,, 4`
			`+L(first_vec_x0_test):`
			`+ VPCMPEQ %ymm1, %ymm7, %ymm6`
			`+ vpmovmskb %ymm6, %eax`
			`+ testl %eax, %eax`
			`+ jz L(ret1)`
			`+ bsrl %eax, %eax`
			`+ addq %r8, %rax`
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+L(ret1):`
			`+ VZEROUPPER_RETURN`

			`+ .p2align 4,, 10`
			`+L(first_vec_x0_x1_test):`
			`+ VPCMPEQ %ymm2, %ymm7, %ymm6`
			`+ vpmovmskb %ymm6, %eax`
			`+ /* Check ymm2 for search CHAR match. If no match then check ymm1`
			`+ before returning. */`
			`testl %eax, %eax`
			`- jnz L(first_vec)`
			`+ jz L(first_vec_x0_test)`
			`+ .p2align 4,, 4`
			`+L(first_vec_x1_return):`
			`+ bsrl %eax, %eax`
			`+ leaq 1(%rdi, %rax), %rax`
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+ VZEROUPPER_RETURN`

			`- testl %ecx, %ecx`
			`- jnz L(return_null)`

			`- andq $-VEC_SIZE, %rdi`
			`- xorl %edx, %edx`
			`- jmp L(aligned_loop)`
			`+ .p2align 4,, 10`
			`+L(first_vec_x2):`
			`+ VPCMPEQ %ymm3, %ymm7, %ymm6`
			`+ vpmovmskb %ymm6, %eax`
			`+ blsmskl %ecx, %ecx`
			`+ /* If no in-range search CHAR match in ymm3 then need to check`
			`+ ymm1/ymm2 for an earlier match (we delay checking search`
			`+ CHAR matches until needed). */`
			`+ andl %ecx, %eax`
			`+ jz L(first_vec_x0_x1_test)`
			`+ bsrl %eax, %eax`
			`+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax`
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+ VZEROUPPER_RETURN`
			`+`

			`.p2align 4`
			`-L(first_vec):`
			`- /* Check if there is a nul CHAR. */`
			`+L(aligned_more):`
			`+ /* Save original pointer if match was in VEC 0. */`
			`+ movq %rdi, %r8`
			`+`
			`+ /* Align src. */`
			`+ orq $(VEC_SIZE - 1), %rdi`
			`+ vmovdqu 1(%rdi), %ymm2`
			`+ VPCMPEQ %ymm2, %ymm0, %ymm6`
			`+ vpmovmskb %ymm6, %ecx`
			`testl %ecx, %ecx`
			`- jnz L(char_and_nul_in_first_vec)`
			`+ jnz L(first_vec_x1)`

			`- /* Remember the match and keep searching. */`
			`- movl %eax, %edx`
			`- movq %rdi, %rsi`
			`- andq $-VEC_SIZE, %rdi`
			`- jmp L(aligned_loop)`
			`+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3`
			`+ VPCMPEQ %ymm3, %ymm0, %ymm6`
			`+ vpmovmskb %ymm6, %ecx`
			`+ testl %ecx, %ecx`
			`+ jnz L(first_vec_x2)`

			`+ /* Save pointer again before realigning. */`
			`+ movq %rdi, %rsi`
			`+ addq $(VEC_SIZE + 1), %rdi`
			`+ andq $-(VEC_SIZE * 2), %rdi`
			`.p2align 4`
			`-L(cros_page_boundary):`
			`- andl $(VEC_SIZE - 1), %ecx`
			`- andq $-VEC_SIZE, %rdi`
			`- vmovdqa (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %edx`
			`- vpmovmskb %ymm3, %eax`
			`- shrl %cl, %edx`
			`- shrl %cl, %eax`
			`- addq $VEC_SIZE, %rdi`
			`-`
			`- /* Check if there is a CHAR. */`
			`+L(first_aligned_loop):`
			`+ /* Do 2x VEC at a time. Any more and the cost of finding the`
			`+ match outweighs loop benefit. */`
			`+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4`
			`+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5`
			`+`
			`+ VPCMPEQ %ymm4, %ymm7, %ymm6`
			`+ VPMIN %ymm4, %ymm5, %ymm8`
			`+ VPCMPEQ %ymm5, %ymm7, %ymm10`
			`+ vpor %ymm6, %ymm10, %ymm5`
			`+ VPCMPEQ %ymm8, %ymm0, %ymm8`
			`+ vpor %ymm5, %ymm8, %ymm9`
			`+`
			`+ vpmovmskb %ymm9, %eax`
			`+ addq $(VEC_SIZE * 2), %rdi`
			`+ /* No zero or search CHAR. */`
			`testl %eax, %eax`
			`- jnz L(found_char)`
			`-`
			`- testl %edx, %edx`
			`- jnz L(return_null)`
			`+ jz L(first_aligned_loop)`

			`- jmp L(aligned_loop)`
			`-`
			`- .p2align 4`
			`-L(found_char):`
			`- testl %edx, %edx`
			`- jnz L(char_and_nul)`
			`+ /* If no zero CHAR then go to second loop (this allows us to`
			`+ throw away all prior work). */`
			`+ vpmovmskb %ymm8, %ecx`
			`+ testl %ecx, %ecx`
			`+ jz L(second_aligned_loop_prep)`

			`- /* Remember the match and keep searching. */`
			`- movl %eax, %edx`
			`- leaq (%rdi, %rcx), %rsi`
			`+ /* Search char could be zero so we need to get the true match.`
			`+ */`
			`+ vpmovmskb %ymm5, %eax`
			`+ testl %eax, %eax`
			`+ jnz L(first_aligned_loop_return)`

			`- .p2align 4`
			`-L(aligned_loop):`
			`- vmovdqa (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- addq $VEC_SIZE, %rdi`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %ecx`
			`- vpmovmskb %ymm3, %eax`
			`- orl %eax, %ecx`
			`- jnz L(char_nor_null)`
			`-`
			`- vmovdqa (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- add $VEC_SIZE, %rdi`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %ecx`
			`+ .p2align 4,, 4`
			`+L(first_vec_x1_or_x2):`
			`+ VPCMPEQ %ymm3, %ymm7, %ymm3`
			`+ VPCMPEQ %ymm2, %ymm7, %ymm2`
			`vpmovmskb %ymm3, %eax`
			`- orl %eax, %ecx`
			`- jnz L(char_nor_null)`
			`-`
			`- vmovdqa (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- addq $VEC_SIZE, %rdi`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %ecx`
			`- vpmovmskb %ymm3, %eax`
			`- orl %eax, %ecx`
			`- jnz L(char_nor_null)`
			`-`
			`- vmovdqa (%rdi), %ymm1`
			`- VPCMPEQ %ymm1, %ymm0, %ymm2`
			`- addq $VEC_SIZE, %rdi`
			`- VPCMPEQ %ymm1, %ymm4, %ymm3`
			`- vpmovmskb %ymm2, %ecx`
			`- vpmovmskb %ymm3, %eax`
			`- orl %eax, %ecx`
			`- jz L(aligned_loop)`
			`-`
			`- .p2align 4`
			`-L(char_nor_null):`
			`- /* Find a CHAR or a nul CHAR in a loop. */`
			`- testl %eax, %eax`
			`- jnz L(match)`
			`-L(return_value):`
			`- testl %edx, %edx`
			`- jz L(return_null)`
			`- movl %edx, %eax`
			`- movq %rsi, %rdi`
			`+ vpmovmskb %ymm2, %edx`
			`+ /* Use add for macro-fusion. */`
			`+ addq %rax, %rdx`
			`+ jz L(first_vec_x0_test)`
			`+ /* NB: We could move this shift to before the branch and save a`
			`+ bit of code size / performance on the fall through. The`
			`+ branch leads to the null case which generally seems hotter`
			`+ than char in first 3x VEC. */`
			`+ salq $32, %rax`
			`+ addq %rdx, %rax`
			`+ bsrq %rax, %rax`
			`+ leaq 1(%rsi, %rax), %rax`
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+ VZEROUPPER_RETURN`

			`+ .p2align 4,, 8`
			`+L(first_aligned_loop_return):`
			`+ VPCMPEQ %ymm4, %ymm0, %ymm4`
			`+ vpmovmskb %ymm4, %edx`
			`+ salq $32, %rcx`
			`+ orq %rdx, %rcx`
			`+`
			`+ vpmovmskb %ymm10, %eax`
			`+ vpmovmskb %ymm6, %edx`
			`+ salq $32, %rax`
			`+ orq %rdx, %rax`
			`+ blsmskq %rcx, %rcx`
			`+ andq %rcx, %rax`
			`+ jz L(first_vec_x1_or_x2)`
			`+`
			`+ bsrq %rax, %rax`
			`+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax`
			`# ifdef USE_AS_WCSRCHR`
			`- /* Keep the first bit for each matching CHAR for bsr. */`
			`- andl $0x11111111, %eax`
			`+ andq $-CHAR_SIZE, %rax`
			`# endif`
			`- bsrl %eax, %eax`
			`- leaq -VEC_SIZE(%rdi, %rax), %rax`
			`-L(return_vzeroupper):`
			`- ZERO_UPPER_VEC_REGISTERS_RETURN`
			`+ VZEROUPPER_RETURN`

			`+ /* Search char cannot be zero. */`
			`.p2align 4`
			`-L(match):`
			`- /* Find a CHAR. Check if there is a nul CHAR. */`
			`- vpmovmskb %ymm2, %ecx`
			`- testl %ecx, %ecx`
			`- jnz L(find_nul)`
			`-`
			`- /* Remember the match and keep searching. */`
			`- movl %eax, %edx`
			`+L(second_aligned_loop_set_furthest_match):`
			`+ /* Save VEC and pointer from most recent match. */`
			`+L(second_aligned_loop_prep):`
			`movq %rdi, %rsi`
			`- jmp L(aligned_loop)`
			`+ vmovdqu %ymm6, %ymm2`
			`+ vmovdqu %ymm10, %ymm3`

			`.p2align 4`
			`-L(find_nul):`
			`-# ifdef USE_AS_WCSRCHR`
			`- /* Keep the first bit for each matching CHAR for bsr. */`
			`- andl $0x11111111, %ecx`
			`- andl $0x11111111, %eax`
			`-# endif`
			`- /* Mask out any matching bits after the nul CHAR. */`
			`- movl %ecx, %r8d`
			`- subl $1, %r8d`
			`- xorl %ecx, %r8d`
			`- andl %r8d, %eax`
			`+L(second_aligned_loop):`
			`+ /* Search 2x at a time. */`
			`+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4`
			`+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5`
			`+`
			`+ VPCMPEQ %ymm4, %ymm7, %ymm6`
			`+ VPMIN %ymm4, %ymm5, %ymm1`
			`+ VPCMPEQ %ymm5, %ymm7, %ymm10`
			`+ vpor %ymm6, %ymm10, %ymm5`
			`+ VPCMPEQ %ymm1, %ymm0, %ymm1`
			`+ vpor %ymm5, %ymm1, %ymm9`
			`+`
			`+ vpmovmskb %ymm9, %eax`
			`+ addq $(VEC_SIZE * 2), %rdi`
			`testl %eax, %eax`
			`- /* If there is no CHAR here, return the remembered one. */`
			`- jz L(return_value)`
			`- bsrl %eax, %eax`
			`- leaq -VEC_SIZE(%rdi, %rax), %rax`
			`- VZEROUPPER_RETURN`
			`-`
			`- .p2align 4`
			`-L(char_and_nul):`
			`- /* Find both a CHAR and a nul CHAR. */`
			`- addq %rcx, %rdi`
			`- movl %edx, %ecx`
			`-L(char_and_nul_in_first_vec):`
			`-# ifdef USE_AS_WCSRCHR`
			`- /* Keep the first bit for each matching CHAR for bsr. */`
			`- andl $0x11111111, %ecx`
			`- andl $0x11111111, %eax`
			`-# endif`
			`- /* Mask out any matching bits after the nul CHAR. */`
			`- movl %ecx, %r8d`
			`- subl $1, %r8d`
			`- xorl %ecx, %r8d`
			`- andl %r8d, %eax`
			`+ jz L(second_aligned_loop)`
			`+ vpmovmskb %ymm1, %ecx`
			`+ testl %ecx, %ecx`
			`+ jz L(second_aligned_loop_set_furthest_match)`
			`+ vpmovmskb %ymm5, %eax`
			`testl %eax, %eax`
			`- /* Return null pointer if the nul CHAR comes first. */`
			`- jz L(return_null)`
			`- bsrl %eax, %eax`
			`- leaq -VEC_SIZE(%rdi, %rax), %rax`
			`+ jnz L(return_new_match)`
			`+`
			`+ /* This is the hot path. We know CHAR is in bounds and that`
			`+ ymm3/ymm2 have latest match. */`
			`+ .p2align 4,, 4`
			`+L(return_old_match):`
			`+ vpmovmskb %ymm3, %eax`
			`+ vpmovmskb %ymm2, %edx`
			`+ salq $32, %rax`
			`+ orq %rdx, %rax`
			`+ bsrq %rax, %rax`
			`+ /* Search char cannot be zero so safe to just use lea for`
			`+ wcsrchr. */`
			`+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax`
			`VZEROUPPER_RETURN`

			`- .p2align 4`
			`-L(return_null):`
			`- xorl %eax, %eax`
			`+ /* Last iteration also potentially has a match. */`
			`+ .p2align 4,, 8`
			`+L(return_new_match):`
			`+ VPCMPEQ %ymm4, %ymm0, %ymm4`
			`+ vpmovmskb %ymm4, %edx`
			`+ salq $32, %rcx`
			`+ orq %rdx, %rcx`
			`+`
			`+ vpmovmskb %ymm10, %eax`
			`+ vpmovmskb %ymm6, %edx`
			`+ salq $32, %rax`
			`+ orq %rdx, %rax`
			`+ blsmskq %rcx, %rcx`
			`+ andq %rcx, %rax`
			`+ jz L(return_old_match)`
			`+ bsrq %rax, %rax`
			`+ /* Search char cannot be zero so safe to just use lea for`
			`+ wcsrchr. */`
			`+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax`
			`VZEROUPPER_RETURN`

			`-END (STRRCHR)`
			`+ .p2align 4,, 4`
			`+L(cross_page):`
			`+ movq %rdi, %rsi`
			`+ andq $-VEC_SIZE, %rsi`
			`+ vmovdqu (%rsi), %ymm1`
			`+ VPCMPEQ %ymm1, %ymm0, %ymm6`
			`+ vpmovmskb %ymm6, %ecx`
			`+ /* Shift out zero CHAR matches that are before the beginning of`
			`+ src (rdi). */`
			`+ shrxl %edi, %ecx, %ecx`
			`+ testl %ecx, %ecx`
			`+ jz L(page_cross_continue)`
			`+ VPCMPEQ %ymm1, %ymm7, %ymm1`
			`+ vpmovmskb %ymm1, %eax`
			`+`
			`+ /* Shift out search CHAR matches that are before the beginning of`
			`+ src (rdi). */`
			`+ shrxl %edi, %eax, %eax`
			`+ blsmskl %ecx, %ecx`
			`+ /* Check if any search CHAR match in range. */`
			`+ andl %ecx, %eax`
			`+ jz L(ret2)`
			`+ bsrl %eax, %eax`
			`+ addq %rdi, %rax`
			`+# ifdef USE_AS_WCSRCHR`
			`+ andq $-CHAR_SIZE, %rax`
			`+# endif`
			`+L(ret2):`
			`+ VZEROUPPER_RETURN`
			`+END(STRRCHR)`
			`#endif`