601650f878
* Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35 - Sync with upstream branch release/2.34/master, commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: - Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose - x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] - string.h: fix __fortified_attr_access macro call [BZ #29162] - linux: Add a getauxval test [BZ #23293] - rtld: Use generic argv adjustment in ld.so [BZ #23293] - S390: Enable static PIE * Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34 - Sync with upstream branch release/2.34/master, commit ede8d94d154157d269b18f3601440ac576c1f96a: - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str|wcs}rchr-evex - x86: Optimize {str|wcs}rchr-avx2 - x86: Optimize {str|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S - x86: Optimize 
str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x86: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] * Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33 - Sync with upstream branch release/2.34/master, commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h Resolves: #2091541
260 lines
8.3 KiB
Diff
260 lines
8.3 KiB
Diff
commit df5de87260dba479873b2850bbe5c0b81c2376f6
|
|
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
|
Date: Fri Apr 15 12:28:01 2022 -0500
|
|
|
|
x86: Cleanup page cross code in memcmp-avx2-movbe.S
|
|
|
|
Old code was both inefficient and wasted code size. New code is 62
|
|
bytes smaller, with comparable or better performance in the page cross case.
|
|
|
|
geometric_mean(N=20) of page cross cases New / Original: 0.960
|
|
|
|
size, align0, align1, ret, New Time/Old Time
|
|
1, 4095, 0, 0, 1.001
|
|
1, 4095, 0, 1, 0.999
|
|
1, 4095, 0, -1, 1.0
|
|
2, 4094, 0, 0, 1.0
|
|
2, 4094, 0, 1, 1.0
|
|
2, 4094, 0, -1, 1.0
|
|
3, 4093, 0, 0, 1.0
|
|
3, 4093, 0, 1, 1.0
|
|
3, 4093, 0, -1, 1.0
|
|
4, 4092, 0, 0, 0.987
|
|
4, 4092, 0, 1, 1.0
|
|
4, 4092, 0, -1, 1.0
|
|
5, 4091, 0, 0, 0.984
|
|
5, 4091, 0, 1, 1.002
|
|
5, 4091, 0, -1, 1.005
|
|
6, 4090, 0, 0, 0.993
|
|
6, 4090, 0, 1, 1.001
|
|
6, 4090, 0, -1, 1.003
|
|
7, 4089, 0, 0, 0.991
|
|
7, 4089, 0, 1, 1.0
|
|
7, 4089, 0, -1, 1.001
|
|
8, 4088, 0, 0, 0.875
|
|
8, 4088, 0, 1, 0.881
|
|
8, 4088, 0, -1, 0.888
|
|
9, 4087, 0, 0, 0.872
|
|
9, 4087, 0, 1, 0.879
|
|
9, 4087, 0, -1, 0.883
|
|
10, 4086, 0, 0, 0.878
|
|
10, 4086, 0, 1, 0.886
|
|
10, 4086, 0, -1, 0.873
|
|
11, 4085, 0, 0, 0.878
|
|
11, 4085, 0, 1, 0.881
|
|
11, 4085, 0, -1, 0.879
|
|
12, 4084, 0, 0, 0.873
|
|
12, 4084, 0, 1, 0.889
|
|
12, 4084, 0, -1, 0.875
|
|
13, 4083, 0, 0, 0.873
|
|
13, 4083, 0, 1, 0.863
|
|
13, 4083, 0, -1, 0.863
|
|
14, 4082, 0, 0, 0.838
|
|
14, 4082, 0, 1, 0.869
|
|
14, 4082, 0, -1, 0.877
|
|
15, 4081, 0, 0, 0.841
|
|
15, 4081, 0, 1, 0.869
|
|
15, 4081, 0, -1, 0.876
|
|
16, 4080, 0, 0, 0.988
|
|
16, 4080, 0, 1, 0.99
|
|
16, 4080, 0, -1, 0.989
|
|
17, 4079, 0, 0, 0.978
|
|
17, 4079, 0, 1, 0.981
|
|
17, 4079, 0, -1, 0.98
|
|
18, 4078, 0, 0, 0.981
|
|
18, 4078, 0, 1, 0.98
|
|
18, 4078, 0, -1, 0.985
|
|
19, 4077, 0, 0, 0.977
|
|
19, 4077, 0, 1, 0.979
|
|
19, 4077, 0, -1, 0.986
|
|
20, 4076, 0, 0, 0.977
|
|
20, 4076, 0, 1, 0.986
|
|
20, 4076, 0, -1, 0.984
|
|
21, 4075, 0, 0, 0.977
|
|
21, 4075, 0, 1, 0.983
|
|
21, 4075, 0, -1, 0.988
|
|
22, 4074, 0, 0, 0.983
|
|
22, 4074, 0, 1, 0.994
|
|
22, 4074, 0, -1, 0.993
|
|
23, 4073, 0, 0, 0.98
|
|
23, 4073, 0, 1, 0.992
|
|
23, 4073, 0, -1, 0.995
|
|
24, 4072, 0, 0, 0.989
|
|
24, 4072, 0, 1, 0.989
|
|
24, 4072, 0, -1, 0.991
|
|
25, 4071, 0, 0, 0.99
|
|
25, 4071, 0, 1, 0.999
|
|
25, 4071, 0, -1, 0.996
|
|
26, 4070, 0, 0, 0.993
|
|
26, 4070, 0, 1, 0.995
|
|
26, 4070, 0, -1, 0.998
|
|
27, 4069, 0, 0, 0.993
|
|
27, 4069, 0, 1, 0.999
|
|
27, 4069, 0, -1, 1.0
|
|
28, 4068, 0, 0, 0.997
|
|
28, 4068, 0, 1, 1.0
|
|
28, 4068, 0, -1, 0.999
|
|
29, 4067, 0, 0, 0.996
|
|
29, 4067, 0, 1, 0.999
|
|
29, 4067, 0, -1, 0.999
|
|
30, 4066, 0, 0, 0.991
|
|
30, 4066, 0, 1, 1.001
|
|
30, 4066, 0, -1, 0.999
|
|
31, 4065, 0, 0, 0.988
|
|
31, 4065, 0, 1, 0.998
|
|
31, 4065, 0, -1, 0.998
|
|
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
|
|
|
(cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
index 2621ec907aedb781..ec9cf0852edf216d 100644
|
|
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
|
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
|
# ifndef USE_AS_WMEMCMP
|
|
cmpl $8, %edx
|
|
jae L(between_8_15)
|
|
+ /* Fall through for [4, 7]. */
|
|
cmpl $4, %edx
|
|
- jae L(between_4_7)
|
|
+ jb L(between_2_3)
|
|
|
|
- /* Load as big endian to avoid branches. */
|
|
- movzwl (%rdi), %eax
|
|
- movzwl (%rsi), %ecx
|
|
- shll $8, %eax
|
|
- shll $8, %ecx
|
|
- bswap %eax
|
|
- bswap %ecx
|
|
- movzbl -1(%rdi, %rdx), %edi
|
|
- movzbl -1(%rsi, %rdx), %esi
|
|
- orl %edi, %eax
|
|
- orl %esi, %ecx
|
|
- /* Subtraction is okay because the upper 8 bits are zero. */
|
|
- subl %ecx, %eax
|
|
+ movbe (%rdi), %eax
|
|
+ movbe (%rsi), %ecx
|
|
+ shlq $32, %rax
|
|
+ shlq $32, %rcx
|
|
+ movbe -4(%rdi, %rdx), %edi
|
|
+ movbe -4(%rsi, %rdx), %esi
|
|
+ orq %rdi, %rax
|
|
+ orq %rsi, %rcx
|
|
+ subq %rcx, %rax
|
|
+ /* Fast path for return zero. */
|
|
+ jnz L(ret_nonzero)
|
|
/* No ymm register was touched. */
|
|
ret
|
|
|
|
@@ -457,9 +456,33 @@ L(one_or_less):
|
|
/* No ymm register was touched. */
|
|
ret
|
|
|
|
+ .p2align 4,, 5
|
|
+L(ret_nonzero):
|
|
+ sbbl %eax, %eax
|
|
+ orl $1, %eax
|
|
+ /* No ymm register was touched. */
|
|
+ ret
|
|
+
|
|
+ .p2align 4,, 2
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ /* No ymm register was touched. */
|
|
+ ret
|
|
+
|
|
.p2align 4
|
|
L(between_8_15):
|
|
-# endif
|
|
+ movbe (%rdi), %rax
|
|
+ movbe (%rsi), %rcx
|
|
+ subq %rcx, %rax
|
|
+ jnz L(ret_nonzero)
|
|
+ movbe -8(%rdi, %rdx), %rax
|
|
+ movbe -8(%rsi, %rdx), %rcx
|
|
+ subq %rcx, %rax
|
|
+ /* Fast path for return zero. */
|
|
+ jnz L(ret_nonzero)
|
|
+ /* No ymm register was touched. */
|
|
+ ret
|
|
+# else
|
|
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
|
vmovq (%rdi), %xmm1
|
|
vmovq (%rsi), %xmm2
|
|
@@ -475,16 +498,13 @@ L(between_8_15):
|
|
VPCMPEQ %xmm1, %xmm2, %xmm2
|
|
vpmovmskb %xmm2, %eax
|
|
subl $0xffff, %eax
|
|
+ /* Fast path for return zero. */
|
|
jnz L(return_vec_0)
|
|
/* No ymm register was touched. */
|
|
ret
|
|
+# endif
|
|
|
|
- .p2align 4
|
|
-L(zero):
|
|
- xorl %eax, %eax
|
|
- ret
|
|
-
|
|
- .p2align 4
|
|
+ .p2align 4,, 10
|
|
L(between_16_31):
|
|
/* From 16 to 31 bytes. No branch when size == 16. */
|
|
vmovdqu (%rsi), %xmm2
|
|
@@ -501,11 +521,17 @@ L(between_16_31):
|
|
VPCMPEQ (%rdi), %xmm2, %xmm2
|
|
vpmovmskb %xmm2, %eax
|
|
subl $0xffff, %eax
|
|
+ /* Fast path for return zero. */
|
|
jnz L(return_vec_0)
|
|
/* No ymm register was touched. */
|
|
ret
|
|
|
|
# ifdef USE_AS_WMEMCMP
|
|
+ .p2align 4,, 2
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
.p2align 4
|
|
L(one_or_less):
|
|
jb L(zero)
|
|
@@ -520,22 +546,20 @@ L(one_or_less):
|
|
# else
|
|
|
|
.p2align 4
|
|
-L(between_4_7):
|
|
- /* Load as big endian with overlapping movbe to avoid branches.
|
|
- */
|
|
- movbe (%rdi), %eax
|
|
- movbe (%rsi), %ecx
|
|
- shlq $32, %rax
|
|
- shlq $32, %rcx
|
|
- movbe -4(%rdi, %rdx), %edi
|
|
- movbe -4(%rsi, %rdx), %esi
|
|
- orq %rdi, %rax
|
|
- orq %rsi, %rcx
|
|
- subq %rcx, %rax
|
|
- jz L(zero_4_7)
|
|
- sbbl %eax, %eax
|
|
- orl $1, %eax
|
|
-L(zero_4_7):
|
|
+L(between_2_3):
|
|
+ /* Load as big endian to avoid branches. */
|
|
+ movzwl (%rdi), %eax
|
|
+ movzwl (%rsi), %ecx
|
|
+ bswap %eax
|
|
+ bswap %ecx
|
|
+ shrl %eax
|
|
+ shrl %ecx
|
|
+ movzbl -1(%rdi, %rdx), %edi
|
|
+ movzbl -1(%rsi, %rdx), %esi
|
|
+ orl %edi, %eax
|
|
+ orl %esi, %ecx
|
|
+ /* Subtraction is okay because the upper bit is zero. */
|
|
+ subl %ecx, %eax
|
|
/* No ymm register was touched. */
|
|
ret
|
|
# endif
|