glibc/glibc-upstream-2.34-221.patch

144 lines
4.5 KiB
Diff
Raw Normal View History

Import glibc-2.34-35.fc35 from f35 * Tue May 31 2022 Arjun Shankar <arjun@redhat.com> - 2.34-35 - Sync with upstream branch release/2.34/master, commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: - Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose - x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] - string.h: fix __fortified_attr_access macro call [BZ #29162] - linux: Add a getauxval test [BZ #23293] - rtld: Use generic argv adjustment in ld.so [BZ #23293] - S390: Enable static PIE * Thu May 19 2022 Florian Weimer <fweimer@redhat.com> - 2.34-34 - Sync with upstream branch release/2.34/master, commit ede8d94d154157d269b18f3601440ac576c1f96a: - csu: Implement and use _dl_early_allocate during static startup - Linux: Introduce __brk_call for invoking the brk system call - Linux: Implement a useful version of _startup_fatal - ia64: Always define IA64_USE_NEW_STUB as a flag macro - Linux: Define MMAP_CALL_INTERNAL - i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls - i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S - elf: Remove __libc_init_secure - Linux: Consolidate auxiliary vector parsing (redo) - Linux: Include <dl-auxv.h> in dl-sysdep.c only for SHARED - Revert "Linux: Consolidate auxiliary vector parsing" - Linux: Consolidate auxiliary vector parsing - Linux: Assume that NEED_DL_SYSINFO_DSO is always defined - Linux: Remove DL_FIND_ARG_COMPONENTS - Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE - elf: Merge dl-sysdep.c into the Linux version - elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr - x86: Optimize {str|wcs}rchr-evex - x86: Optimize {str|wcs}rchr-avx2 - x86: Optimize {str|wcs}rchr-sse2 - x86: Cleanup page cross code in memcmp-avx2-movbe.S - x86: Remove memcmp-sse4.S - x86: Small improvements for wcslen - x86: Remove AVX str{n}casecmp - x86: Add EVEX optimized str{n}casecmp - x86: Add AVX2 optimized str{n}casecmp - x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S - x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S - x86: Remove strspn-sse2.S and use the generic implementation - x86: Remove strpbrk-sse2.S and use the generic implementation - x87: Remove strcspn-sse2.S and use the generic implementation - x86: Optimize strspn in strspn-c.c - x86: Optimize strcspn and strpbrk in strcspn-c.c - x86: Code cleanup in strchr-evex and comment justifying branch - x86: Code cleanup in strchr-avx2 and comment justifying branch - x86_64: Remove bcopy optimizations - x86-64: Remove bzero weak alias in SS2 memset - x86_64/multiarch: Sort sysdep_routines and put one entry per line - x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) - fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] * Thu May 12 2022 Florian Weimer <fweimer@redhat.com> - 2.34-33 - Sync with upstream branch release/2.34/master, commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: - dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo - manual: Document the dlinfo function - x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] - x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] - x86: Set .text section in memset-vec-unaligned-erms - x86-64: Optimize bzero - x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) - x86: Improve vec generation in memset-vec-unaligned-erms.S - x86-64: Fix strcmp-evex.S - x86-64: Fix strcmp-avx2.S - x86: Optimize strcmp-evex.S - x86: Optimize strcmp-avx2.S - manual: Clarify that abbreviations of long options are allowed - Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h - aarch64: Add HWCAP2_ECV from Linux 5.16 - Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h - Update kernel version to 5.17 in tst-mman-consts.py - Update kernel version to 5.16 in tst-mman-consts.py - Update syscall lists for Linux 5.17 - Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h - Update kernel version to 5.15 in tst-mman-consts.py - Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h Resolves: #2091541
2022-06-06 12:53:44 +00:00
commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Mar 23 16:57:24 2022 -0500
x86: Optimize strspn in strspn-c.c
Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of
_mm_cmpistri. Also change offset to unsigned to avoid unnecessary
sign extensions.
geometric_mean(N=20) of all benchmarks that dont fallback on
sse2; New / Original: .901
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
(cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046)
diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c
index a17196296b9ebe52..3bcc479f1b52ff6a 100644
--- a/sysdeps/x86_64/multiarch/strspn-c.c
+++ b/sysdeps/x86_64/multiarch/strspn-c.c
@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a)
return 0;
const char *aligned;
- __m128i mask;
- int offset = (int) ((size_t) a & 15);
+ __m128i mask, maskz, zero;
+ unsigned int maskz_bits;
+ unsigned int offset = (int) ((size_t) a & 15);
+ zero = _mm_set1_epi8 (0);
if (offset != 0)
{
/* Load masks. */
aligned = (const char *) ((size_t) a & -16L);
__m128i mask0 = _mm_load_si128 ((__m128i *) aligned);
-
- mask = __m128i_shift_right (mask0, offset);
+ maskz = _mm_cmpeq_epi8 (mask0, zero);
/* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16 - offset)
- {
- /* There is no NULL terminator. */
- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16));
- int index = _mm_cmpistri (mask1, mask1, 0x3a);
- length += index;
-
- /* Don't use SSE4.2 if the length of A > 16. */
- if (length > 16)
- return __strspn_sse2 (s, a);
-
- if (index != 0)
- {
- /* Combine mask0 and mask1. We could play games with
- palignr, but frankly this data should be in L1 now
- so do the merge via an unaligned load. */
- mask = _mm_loadu_si128 ((__m128i *) a);
- }
- }
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
+ {
+ mask = __m128i_shift_right (mask0, offset);
+ offset = (unsigned int) ((size_t) s & 15);
+ if (offset)
+ goto start_unaligned;
+
+ aligned = s;
+ goto start_loop;
+ }
}
- else
- {
- /* A is aligned. */
- mask = _mm_load_si128 ((__m128i *) a);
- /* Find where the NULL terminator is. */
- int length = _mm_cmpistri (mask, mask, 0x3a);
- if (length == 16)
- {
- /* There is no NULL terminator. Don't use SSE4.2 if the length
- of A > 16. */
- if (a[16] != 0)
- return __strspn_sse2 (s, a);
- }
+ /* A is aligned. */
+ mask = _mm_loadu_si128 ((__m128i *) a);
+
+ /* Find where the NULL terminator is. */
+ maskz = _mm_cmpeq_epi8 (mask, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz);
+ if (maskz_bits == 0)
+ {
+ /* There is no NULL terminator. Don't use SSE4.2 if the length
+ of A > 16. */
+ if (a[16] != 0)
+ return __strspn_sse2 (s, a);
}
+ aligned = s;
+ offset = (unsigned int) ((size_t) s & 15);
- offset = (int) ((size_t) s & 15);
if (offset != 0)
{
+ start_unaligned:
/* Check partial string. */
aligned = (const char *) ((size_t) s & -16L);
__m128i value = _mm_load_si128 ((__m128i *) aligned);
+ __m128i adj_value = __m128i_shift_right (value, offset);
- value = __m128i_shift_right (value, offset);
-
- int length = _mm_cmpistri (mask, value, 0x12);
+ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12);
/* No need to check CFlag since it is always 1. */
if (length < 16 - offset)
return length;
/* Find where the NULL terminator is. */
- int index = _mm_cmpistri (value, value, 0x3a);
- if (index < 16 - offset)
+ maskz = _mm_cmpeq_epi8 (value, zero);
+ maskz_bits = _mm_movemask_epi8 (maskz) >> offset;
+ if (maskz_bits != 0)
return length;
aligned += 16;
}
- else
- aligned = s;
+start_loop:
while (1)
{
__m128i value = _mm_load_si128 ((__m128i *) aligned);
- int index = _mm_cmpistri (mask, value, 0x12);
- int cflag = _mm_cmpistrc (mask, value, 0x12);
+ unsigned int index = _mm_cmpistri (mask, value, 0x12);
+ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12);
if (cflag)
return (size_t) (aligned + index - s);
aligned += 16;