glibc/glibc-upstream-2.34-287.patch

131 lines
3.4 KiB
Diff
Raw Normal View History

Import glibc-2.34-40.fc35 from f35 * Fri Jul 22 2022 Arjun Shankar <arjun@redhat.com> - 2.34-40 - Sync with upstream branch release/2.34/master, commit b2f32e746492615a6eb3e66fac1e766e32e8deb1: - malloc: Simplify implementation of __malloc_assert - Update syscall-names.list for Linux 5.18 - x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S - x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file - x86: Move and slightly improve memset_erms - x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list - x86: Put wcs{n}len-sse4.1 in the sse4.1 text section - x86: Align entry for memrchr to 64-bytes. - x86: Add BMI1/BMI2 checks for ISA_V3 check - x86: Cleanup bounds checking in large memcpy case - x86: Add bounds `x86_non_temporal_threshold` - x86: Add sse42 implementation to strcmp's ifunc - x86: Fix misordered logic for setting `rep_movsb_stop_threshold` - x86: Align varshift table to 32-bytes - x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions - x86: Shrink code size of memchr-evex.S - x86: Shrink code size of memchr-avx2.S - x86: Optimize memrchr-avx2.S - x86: Optimize memrchr-evex.S - x86: Optimize memrchr-sse2.S - x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` - x86: Create header for VEC classes in x86 strings library - x86_64: Add strstr function with 512-bit EVEX - x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT - x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen - x86_64: Remove bzero optimization - x86_64: Remove end of line trailing spaces - nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore - linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304) Resolves: #2109505
2022-07-22 13:47:47 +00:00
commit 3c87383a20daff9a230439e31b778716bfed4d8b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:34 2022 -0700
x86: Shrink code size of memchr-evex.S
This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.
The total code size saving is: 64 bytes
There are no non-negligible changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 1.000
Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 4d0ed6d136f099e1..68381c99a4948134 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
xorl %eax, %eax
ret
- .p2align 5
+ .p2align 4
L(first_vec_x0):
- /* Check if first match was before length. */
- tzcntl %eax, %eax
- xorl %ecx, %ecx
- cmpl %eax, %edx
- leaq (%rdi, %rax, CHAR_SIZE), %rax
- cmovle %rcx, %rax
+ /* Check if first match was before length. NB: tzcnt has false data-
+ dependency on destination. eax already had a data-dependency on esi
+ so this should have no affect here. */
+ tzcntl %eax, %esi
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rsi, CHAR_SIZE), %rdi
+# else
+ addq %rsi, %rdi
+# endif
+ xorl %eax, %eax
+ cmpl %esi, %edx
+ cmovg %rdi, %rax
ret
-# else
- /* NB: first_vec_x0 is 17 bytes which will leave
- cross_page_boundary (which is relatively cold) close enough
- to ideal alignment. So only realign L(cross_page_boundary) if
- rawmemchr. */
- .p2align 4
# endif
+
+ .p2align 4
L(cross_page_boundary):
/* Save pointer before aligning as its original value is
necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
L(zero_end):
ret
+L(set_zero_end):
+ xorl %eax, %eax
+ ret
.p2align 4
L(first_vec_x1_check):
- tzcntl %eax, %eax
+ /* eax must be non-zero. Use bsfl to save code size. */
+ bsfl %eax, %eax
/* Adjust length. */
subl $-(CHAR_PER_VEC * 4), %edx
/* Check if match within remaining length. */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
ret
-L(set_zero_end):
- xorl %eax, %eax
- ret
.p2align 4
L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
# endif
ret
- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1_return):
tzcntl %eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
# endif
# ifndef USE_AS_RAWMEMCHR
+ .p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
kmovd %k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
# endif
andl %ecx, %eax
jz L(zero_end2)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
# endif
-
+ /* 7 bytes from next cache line. */
END (MEMCHR)
#endif