glibc/glibc-upstream-2.34-230.patch
Arjun Shankar 601650f878 Import glibc-2.34-35.fc35 from f35
* Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35
- Sync with upstream branch release/2.34/master,
  commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31:
- Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose
- x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127]
- string.h: fix __fortified_attr_access macro call [BZ #29162]
- linux: Add a getauxval test [BZ #23293]
- rtld: Use generic argv adjustment in ld.so [BZ #23293]
- S390: Enable static PIE

* Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34
- Sync with upstream branch release/2.34/master,
  commit ede8d94d154157d269b18f3601440ac576c1f96a:
- csu: Implement and use _dl_early_allocate during static startup
- Linux: Introduce __brk_call for invoking the brk system call
- Linux: Implement a useful version of _startup_fatal
- ia64: Always define IA64_USE_NEW_STUB as a flag macro
- Linux: Define MMAP_CALL_INTERNAL
- i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls
- i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S
- elf: Remove __libc_init_secure
- Linux: Consolidate auxiliary vector parsing (redo)
- Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED
- Revert "Linux: Consolidate auxiliary vector parsing"
- Linux: Consolidate auxiliary vector parsing
- Linux: Assume that NEED_DL_SYSINFO_DSO is always defined
- Linux: Remove DL_FIND_ARG_COMPONENTS
- Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE
- elf: Merge dl-sysdep.c into the Linux version
- elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr
- x86: Optimize {str|wcs}rchr-evex
- x86: Optimize {str|wcs}rchr-avx2
- x86: Optimize {str|wcs}rchr-sse2
- x86: Cleanup page cross code in memcmp-avx2-movbe.S
- x86: Remove memcmp-sse4.S
- x86: Small improvements for wcslen
- x86: Remove AVX str{n}casecmp
- x86: Add EVEX optimized str{n}casecmp
- x86: Add AVX2 optimized str{n}casecmp
- x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S
- x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S
- x86: Remove strspn-sse2.S and use the generic implementation
- x86: Remove strpbrk-sse2.S and use the generic implementation
- x86: Remove strcspn-sse2.S and use the generic implementation
- x86: Optimize strspn in strspn-c.c
- x86: Optimize strcspn and strpbrk in strcspn-c.c
- x86: Code cleanup in strchr-evex and comment justifying branch
- x86: Code cleanup in strchr-avx2 and comment justifying branch
- x86_64: Remove bcopy optimizations
- x86-64: Remove bzero weak alias in SSE2 memset
- x86_64/multiarch: Sort sysdep_routines and put one entry per line
- x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ))
- fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141]

* Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33
- Sync with upstream branch release/2.34/master,
  commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23:
- dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo
- manual: Document the dlinfo function
- x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896]
- x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
- x86: Set .text section in memset-vec-unaligned-erms
- x86-64: Optimize bzero
- x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only)
- x86: Improve vec generation in memset-vec-unaligned-erms.S
- x86-64: Fix strcmp-evex.S
- x86-64: Fix strcmp-avx2.S
- x86: Optimize strcmp-evex.S
- x86: Optimize strcmp-avx2.S
- manual: Clarify that abbreviations of long options are allowed
- Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h
- aarch64: Add HWCAP2_ECV from Linux 5.16
- Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h
- Update kernel version to 5.17 in tst-mman-consts.py
- Update kernel version to 5.16 in tst-mman-consts.py
- Update syscall lists for Linux 5.17
- Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h
- Update kernel version to 5.15 in tst-mman-consts.py
- Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h

Resolves: #2091541
2022-06-06 16:33:33 +02:00

254 lines
4.3 KiB
Diff

commit 4ff6ae069b7caacd5f99088abd755717b994f660
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Mar 25 17:13:33 2022 -0500
x86: Small improvements for wcslen
Just a few QOL changes.
1. Prefer `add` > `lea` as it has more execution units it can run
on.
2. Don't break macro-fusion between `test` and `jcc`
3. Reduce code size by removing gratuitous padding bytes (-90
bytes).
geometric_mean(N=20) of all benchmarks New / Original: 0.959
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 244b415d386487521882debb845a040a4758cb18)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 61edea1d14d454c6..ad066863a44ea0a5 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
@@ -133,104 +133,100 @@ L(aligned_64_loop):
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)