Import glibc-2.34-40.fc35 from f35
* Fri Jul 22 2022 Arjun Shankar <arjun@redhat.com> - 2.34-40
- Sync with upstream branch release/2.34/master,
  commit b2f32e746492615a6eb3e66fac1e766e32e8deb1:
- malloc: Simplify implementation of __malloc_assert
- Update syscall-names.list for Linux 5.18
- x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
- x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
- x86: Move and slightly improve memset_erms
- x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
- x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
- x86: Align entry for memrchr to 64-bytes.
- x86: Add BMI1/BMI2 checks for ISA_V3 check
- x86: Cleanup bounds checking in large memcpy case
- x86: Add bounds `x86_non_temporal_threshold`
- x86: Add sse42 implementation to strcmp's ifunc
- x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
- x86: Align varshift table to 32-bytes
- x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
- x86: Shrink code size of memchr-evex.S
- x86: Shrink code size of memchr-avx2.S
- x86: Optimize memrchr-avx2.S
- x86: Optimize memrchr-evex.S
- x86: Optimize memrchr-sse2.S
- x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
- x86: Create header for VEC classes in x86 strings library
- x86_64: Add strstr function with 512-bit EVEX
- x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
- x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
- x86_64: Remove bzero optimization
- x86_64: Remove end of line trailing spaces
- nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
- linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
Resolves: #2109505
This commit: 668eaab0c7 (parent: bc1778ead1)
glibc-upstream-2.34-274.patch (new file, 27 lines)
@@ -0,0 +1,27 @@
commit 4b246b2bbd1d5a77035bb990d6097b7337c34bbb
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Thu Jun 30 09:08:31 2022 -0300

    linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)

    On success, mq_receive() and mq_timedreceive() return the number of
    bytes in the received message, so the check must accept any
    non-negative return value.

    Checked on i686-linux-gnu.

    (cherry picked from commit 71d87d85bf54f6522813aec97c19bdd24997341e)

diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c
index 7f3a112d7f2cbbe7..1fc98752e7d6d506 100644
--- a/sysdeps/unix/sysv/linux/mq_timedreceive.c
+++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c
@@ -41,7 +41,7 @@ ___mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len
 {
   int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len,
                           msg_prio, abs_timeout);
-  if (r == 0 || errno != ENOSYS)
+  if (r >= 0 || errno != ENOSYS)
     return r;
   __set_errno (EOVERFLOW);
   return -1;
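To illustrate the contract the one-line fix above relies on, here is a minimal caller sketch (hypothetical descriptor and buffer names, not glibc internals): mq_receive returns the received message's byte count on success, zero-length messages are legal, so 0 is just one of many success values and only -1 signals failure.

    #include <sys/types.h>
    #include <mqueue.h>
    #include <stdio.h>

    /* Drain one message from an already-opened queue `mq`.  The success
       test must be `r >= 0`: an empty message returns 0, any other
       message returns its positive byte count, and only -1 indicates an
       error with errno set -- exactly why the fallback's `r == 0` test
       misclassified every non-empty message.  */
    static void drain_one (mqd_t mq, char *buf, size_t len)
    {
      unsigned int prio;
      ssize_t r = mq_receive (mq, buf, len, &prio);
      if (r >= 0)
        printf ("got %zd byte(s) at priority %u\n", r, prio);
      else
        perror ("mq_receive");
    }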
glibc-upstream-2.34-275.patch (new file, 25 lines)
@@ -0,0 +1,25 @@
commit 7789a849234f8b303a571134abe72691ce8c2540
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Wed Jul 13 10:37:32 2022 -0300

    nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore

    This was due to a wrong revert done on 404656009b459658.

    Checked on x86_64-linux-gnu and i686-linux-gnu.

    (cherry picked from commit f27e5e21787abc9f719879af47687221aa1027b3)

diff --git a/nptl/cleanup_defer.c b/nptl/cleanup_defer.c
index 35ba40fb0247c7cc..59571229d8ccf481 100644
--- a/nptl/cleanup_defer.c
+++ b/nptl/cleanup_defer.c
@@ -72,7 +72,7 @@ ___pthread_unregister_cancel_restore (__pthread_unwind_buf_t *buf)
     return;

   int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
-  if (cancelhandling & CANCELTYPE_BITMASK)
+  if ((cancelhandling & CANCELTYPE_BITMASK) == 0)
     {
       int newval;
       do
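For context, a set CANCELTYPE bit in the thread's cancelhandling word means asynchronous cancellation is already in effect, so the cleanup-pop path should only flip the type back to asynchronous when the bit is currently clear. A stand-alone sketch of the predicate (simplified model with an assumed bit value; the real code follows this test with an atomic CAS loop on cancelhandling):

    /* Simplified model: bit set => asynchronous cancellation active.  */
    #define CANCELTYPE_BITMASK 0x02

    /* Re-enable asynchronous cancellation only when it is not already
       enabled.  The bad revert tested the bit being *set*, inverting the
       condition, so the restore ran for exactly the wrong threads.  */
    static int must_restore_async (int cancelhandling)
    {
      return (cancelhandling & CANCELTYPE_BITMASK) == 0;
    }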
glibc-upstream-2.34-276.patch (new file, 29 lines)
@@ -0,0 +1,29 @@
commit 8d324019e69203f5998f223d0e905de1395330ea
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date:   Mon Jul 18 18:38:48 2022 -0700

    x86_64: Remove end of line trailing spaces

    This commit removes trailing spaces introduced by the following commit.

        commit a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb
        Author: Noah Goldstein <goldstein.w.n@gmail.com>
        Date:   Wed Jun 23 01:56:29 2021 -0400

            x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]

diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index b7657282bd2e6fa5..031753a91763b351 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -66,8 +66,8 @@ ENTRY(strlen)
 L(n_nonzero):
 # ifdef AS_WCSLEN
 /* Check for overflow from maxlen * sizeof(wchar_t). If it would
-   overflow the only way this program doesn't have undefined behavior
-   is if there is a null terminator in valid memory so wcslen will
+   overflow the only way this program doesn't have undefined behavior
+   is if there is a null terminator in valid memory so wcslen will
    suffice. */
	mov	%RSI_LP, %R10_LP
	sar	$62, %R10_LP
glibc-upstream-2.34-277.patch (new file, 457 lines)
@@ -0,0 +1,457 @@
commit eb9aa96facc5231e208de0946870f10c21aee9f3
Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Date:   Fri May 13 09:33:30 2022 -0300

    x86_64: Remove bzero optimization

    Both symbols are marked as legacy in POSIX.1-2001 and removed in
    POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
    or _DEFAULT_SOURCE.

    GCC also replaces bcopy with a memmove and bzero with memset on default
    configuration (to actually get a bzero libc call the code needs to
    omit the string.h inclusion and be built with -fno-builtin), so it is
    highly unlikely programs are actually calling the libc bzero symbol.

    On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
    in the installed binaries.

      $ cat count_bstring.sh
      #!/bin/bash

      files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
      total=0
      for file in $files; do
        symbols=`objdump -R $file 2>&1`
        if [ $? -eq 0 ]; then
          ncalls=`echo $symbols | grep -w $1 | wc -l`
          ((total=total+ncalls))
          if [ $ncalls -gt 0 ]; then
            echo "$file: $ncalls"
          fi
        fi
      done
      echo "TOTAL=$total"
      $ ./count_bstring.sh bzero
      TOTAL=0

    Checked on x86_64-linux-gnu.

    (cherry picked from commit 9403b71ae97e3f1a91c796ddcbb4e6f044434734)

diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
deleted file mode 100644
index f96d567fd87696af..0000000000000000
--- a/sysdeps/x86_64/bzero.S
+++ /dev/null
@@ -1 +0,0 @@
-/* Implemented in memset.S.  */
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 0358210c7ff3a976..2b64741fd10a8ec2 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -1,4 +1,4 @@
-/* memset/bzero -- set memory area to CH/0
+/* memset -- set memory area to CH/0
    Optimized version for x86-64.
    Copyright (C) 2002-2021 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
@@ -35,9 +35,6 @@
   punpcklwd %xmm0, %xmm0; \
   pshufd $0, %xmm0, %xmm0

-# define BZERO_ZERO_VEC0() \
-  pxor %xmm0, %xmm0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   movd d, %xmm0; \
   pshufd $0, %xmm0, %xmm0; \
@@ -56,10 +53,6 @@
 # define MEMSET_SYMBOL(p,s)	memset
 #endif

-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s)	__bzero
-#endif
-
 #ifndef WMEMSET_SYMBOL
 # define WMEMSET_CHK_SYMBOL(p,s) p
 # define WMEMSET_SYMBOL(p,s)	__wmemset
@@ -70,7 +63,6 @@
 libc_hidden_builtin_def (memset)

 #if IS_IN (libc)
-weak_alias (__bzero, bzero)
 libc_hidden_def (__wmemset)
 weak_alias (__wmemset, wmemset)
 libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index b503e4b81e92a11c..67401162d526f664 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,7 +1,6 @@
 ifeq ($(subdir),string)

 sysdep_routines += \
-  bzero \
   memchr-avx2 \
   memchr-avx2-rtm \
   memchr-evex \
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
deleted file mode 100644
index 13e399a9a1fbdeb2..0000000000000000
--- a/sysdeps/x86_64/multiarch/bzero.c
+++ /dev/null
@@ -1,108 +0,0 @@
-/* Multiple versions of bzero.
-   All versions must be listed in ifunc-impl-list.c.
-   Copyright (C) 2022 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-/* Define multiple versions only for the definition in libc.  */
-#if IS_IN (libc)
-# define __bzero __redirect___bzero
-# include <string.h>
-# undef __bzero
-
-/* OPTIMIZE1 definition required for bzero patch.  */
-# define OPTIMIZE1(name)	EVALUATOR1 (SYMBOL_NAME, name)
-# define SYMBOL_NAME __bzero
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
-  attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
-  attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
-  const struct cpu_features* cpu_features = __get_cpu_features ();
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
-      && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
-    {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
-        {
-          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-            return OPTIMIZE1 (avx512_unaligned_erms);
-
-          return OPTIMIZE1 (avx512_unaligned);
-        }
-    }
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
-    {
-      if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
-          && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
-          && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
-        {
-          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-            return OPTIMIZE1 (evex_unaligned_erms);
-
-          return OPTIMIZE1 (evex_unaligned);
-        }
-
-      if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
-        {
-          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-            return OPTIMIZE1 (avx2_unaligned_erms_rtm);
-
-          return OPTIMIZE1 (avx2_unaligned_rtm);
-        }
-
-      if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
-        {
-          if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-            return OPTIMIZE1 (avx2_unaligned_erms);
-
-          return OPTIMIZE1 (avx2_unaligned);
-        }
-    }
-
-  if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
-    return OPTIMIZE1 (sse2_unaligned_erms);
-
-  return OPTIMIZE1 (sse2_unaligned);
-}
-
-libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
-
-weak_alias (__bzero, bzero)
-#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index e5e48b36c3175e68..d990a7149489efd9 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -280,48 +280,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			      __memset_avx512_no_vzeroupper)
	     )

-  /* Support sysdeps/x86_64/multiarch/bzero.c.  */
-  IFUNC_IMPL (i, name, bzero,
-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
-			      __bzero_sse2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero, 1,
-			      __bzero_sse2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      CPU_FEATURE_USABLE (AVX2),
-			      __bzero_avx2_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      CPU_FEATURE_USABLE (AVX2),
-			      __bzero_avx2_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX2)
-			       && CPU_FEATURE_USABLE (RTM)),
-			      __bzero_avx2_unaligned_rtm)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX2)
-			       && CPU_FEATURE_USABLE (RTM)),
-			      __bzero_avx2_unaligned_erms_rtm)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_evex_unaligned)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_evex_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_avx512_unaligned_erms)
-	      IFUNC_IMPL_ADD (array, i, bzero,
-			      (CPU_FEATURE_USABLE (AVX512VL)
-			       && CPU_FEATURE_USABLE (AVX512BW)
-			       && CPU_FEATURE_USABLE (BMI2)),
-			      __bzero_avx512_unaligned)
-	     )
-
   /* Support sysdeps/x86_64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
	      IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 5a5ee6f67299400b..8ac3e479bba488be 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,7 +5,6 @@

 #define SECTION(p) p##.avx.rtm
 #define MEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm
-#define BZERO_SYMBOL(p,s)	p##_avx2_##s##_rtm
 #define WMEMSET_SYMBOL(p,s)	p##_avx2_##s##_rtm

 #include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index a093a2831f3dfa0d..c0bf2875d03d51ab 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,9 +14,6 @@
   vmovd d, %xmm0; \
   movq r, %rax;

-# define BZERO_ZERO_VEC0() \
-  vpxor %xmm0, %xmm0, %xmm0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   MEMSET_SET_VEC0_AND_SET_RETURN(d, r)

@@ -32,9 +29,6 @@
 # ifndef MEMSET_SYMBOL
 #  define MEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
-# ifndef BZERO_SYMBOL
-#  define BZERO_SYMBOL(p,s)	p##_avx2_##s
-# endif
 # ifndef WMEMSET_SYMBOL
 #  define WMEMSET_SYMBOL(p,s)	p##_avx2_##s
 # endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 727c92133a15900f..5241216a77bf72b7 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,9 +19,6 @@
   vpbroadcastb d, %VEC0; \
   movq r, %rax

-# define BZERO_ZERO_VEC0() \
-  vpxorq %XMM0, %XMM0, %XMM0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 5d8fa78f05476b10..637002150659123c 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,9 +19,6 @@
   vpbroadcastb d, %VEC0; \
   movq r, %rax

-# define BZERO_ZERO_VEC0() \
-  vpxorq %XMM0, %XMM0, %XMM0
-
 # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
   vpbroadcastd d, %VEC0; \
   movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 2951f7f5f70e274a..c47f3a9c955508a2 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,7 +22,6 @@

 #if IS_IN (libc)
 # define MEMSET_SYMBOL(p,s)	p##_sse2_##s
-# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
 # define WMEMSET_SYMBOL(p,s)	p##_sse2_##s

 # ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index d9c577fb5ff9700f..abc12d9cda1b3843 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -1,5 +1,5 @@
-/* memset/bzero with unaligned store and rep stosb
-   Copyright (C) 2016-2021 Free Software Foundation, Inc.
+/* memset with unaligned store and rep stosb
+   Copyright (C) 2016-2022 Free Software Foundation, Inc.
    This file is part of the GNU C Library.

    The GNU C Library is free software; you can redistribute it and/or
@@ -26,10 +26,6 @@

 #include <sysdep.h>

-#ifndef BZERO_SYMBOL
-# define BZERO_SYMBOL(p,s)	MEMSET_SYMBOL (p, s)
-#endif
-
 #ifndef MEMSET_CHK_SYMBOL
 # define MEMSET_CHK_SYMBOL(p,s)	MEMSET_SYMBOL(p, s)
 #endif
@@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
 END (WMEMSET_SYMBOL (__wmemset, unaligned))
 #endif

-ENTRY (BZERO_SYMBOL(__bzero, unaligned))
-#if VEC_SIZE > 16
-	BZERO_ZERO_VEC0 ()
-#endif
-	mov	%RDI_LP, %RAX_LP
-	mov	%RSI_LP, %RDX_LP
-#ifndef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-#endif
-	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec_no_vdup)
-#ifdef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-#endif
-#if VEC_SIZE <= 16
-	BZERO_ZERO_VEC0 ()
-#endif
-	cmp	$(VEC_SIZE * 2), %RDX_LP
-	ja	L(more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
-	VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned))
-
 #if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
	cmp	%RDX_LP, %RCX_LP
@@ -216,31 +187,6 @@ END (__memset_erms)
 END (MEMSET_SYMBOL (__memset, erms))
 # endif

-ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
-# if VEC_SIZE > 16
-	BZERO_ZERO_VEC0 ()
-# endif
-	mov	%RDI_LP, %RAX_LP
-	mov	%RSI_LP, %RDX_LP
-# ifndef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-# endif
-	cmp	$VEC_SIZE, %RDX_LP
-	jb	L(less_vec_no_vdup)
-# ifdef USE_LESS_VEC_MASK_STORE
-	xorl	%esi, %esi
-# endif
-# if VEC_SIZE <= 16
-	BZERO_ZERO_VEC0 ()
-# endif
-	cmp	$(VEC_SIZE * 2), %RDX_LP
-	ja	L(stosb_more_2x_vec)
-	/* From VEC and to 2 * VEC.  No branch when size == VEC_SIZE.  */
-	VMOVU	%VEC(0), (%rdi)
-	VMOVU	%VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
-	VZEROUPPER_RETURN
-END (BZERO_SYMBOL(__bzero, unaligned_erms))
-
 # if defined SHARED && IS_IN (libc)
 ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
	cmp	%RDX_LP, %RCX_LP
@@ -282,7 +228,6 @@ L(last_2x_vec):
 #ifdef USE_LESS_VEC_MASK_STORE
	.p2align 4,, 10
 L(less_vec):
-L(less_vec_no_vdup):
 L(less_vec_from_wmemset):
	/* Less than 1 VEC.  */
 # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
@@ -430,9 +375,6 @@ L(less_vec):
	   xmm).  This is only does anything for AVX2.  */
	MEMSET_VDUP_TO_VEC0_LOW ()
 L(less_vec_from_wmemset):
-#if VEC_SIZE > 16
-L(less_vec_no_vdup):
-#endif
 #endif
 L(cross_page):
 #if VEC_SIZE > 32
@@ -445,9 +387,6 @@ L(cross_page):
 #endif
 #ifndef USE_XMM_LESS_VEC
	MOVQ	%XMM0, %SET_REG64
-#endif
-#if VEC_SIZE <= 16
-L(less_vec_no_vdup):
 #endif
	cmpl	$8, %edx
	jge	L(between_8_15)
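The claim about GCC's default lowering is easy to check with a minimal sketch (assumed flags, standard GCC behavior): with default options both calls below compile to the same memset, so no bzero relocation is emitted, and reaching the libc symbol takes -fno-builtin.

    #include <string.h>
    #include <strings.h>  /* bzero: legacy in POSIX.1-2001, removed in POSIX.1-2008 */

    int main (void)
    {
      char buf[64];
      bzero (buf, sizeof buf);      /* GCC normally lowers this to memset (buf, 0, 64) */
      memset (buf, 0, sizeof buf);  /* the modern spelling */
      return buf[0];
    }

Building with gcc -O2 and inspecting the result with objdump -R shows only a memset relocation; adding -fno-builtin keeps the bzero call, which matches the count_bstring.sh measurement quoted in the commit message.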
glibc-upstream-2.34-278.patch (new file, 460 lines)
@@ -0,0 +1,460 @@
commit 8ab861d295b90177b89288a2bc95c5de5e4e5bc6
Author: Sunil K Pandey <skpgkp2@gmail.com>
Date:   Sun Feb 27 16:39:47 2022 -0800

    x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen

    This patch implements the following evex512 versions of string functions.
    Perf gain for the evex512 version is up to 50% compared to evex,
    depending on length and alignment.

    Placeholder function, not used by any processor at the moment.

    - String length function using 512 bit vectors.
    - String N length using 512 bit vectors.
    - Wide string length using 512 bit vectors.
    - Wide string N length using 512 bit vectors.

    Reviewed-by: Noah Goldstein <goldstein.w.n@gmail.com>
    (cherry picked from commit 9c66efb86fe384f77435f7e326333fb2e4e10676)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 67401162d526f664..4d4ad2a3686b5bc3 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -87,6 +87,7 @@ sysdep_routines += \
   strlen-avx2 \
   strlen-avx2-rtm \
   strlen-evex \
+  strlen-evex512 \
   strlen-sse2 \
   strncase_l-avx2 \
   strncase_l-avx2-rtm \
@@ -115,6 +116,7 @@ sysdep_routines += \
   strnlen-avx2 \
   strnlen-avx2-rtm \
   strnlen-evex \
+  strnlen-evex512 \
   strnlen-sse2 \
   strpbrk-c \
   strpbrk-sse2 \
@@ -148,6 +150,7 @@ sysdep_routines += \
   wcslen-avx2 \
   wcslen-avx2-rtm \
   wcslen-evex \
+  wcslen-evex512 \
   wcslen-sse2 \
   wcslen-sse4_1 \
   wcsncmp-avx2 \
@@ -158,6 +161,7 @@ sysdep_routines += \
   wcsnlen-avx2-rtm \
   wcsnlen-c \
   wcsnlen-evex \
+  wcsnlen-evex512 \
   wcsnlen-sse4_1 \
   wcsrchr-avx2 \
   wcsrchr-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d990a7149489efd9..6b75a7106e174bce 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)
			       && CPU_FEATURE_USABLE (BMI2)),
			      __strlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strlen_evex512)
	      IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))

   /* Support sysdeps/x86_64/multiarch/strnlen.c.  */
@@ -335,6 +340,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)
			       && CPU_FEATURE_USABLE (BMI2)),
			      __strnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, strnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strnlen_evex512)
	      IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))

   /* Support sysdeps/x86_64/multiarch/stpncpy.c.  */
@@ -714,6 +724,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)
			       && CPU_FEATURE_USABLE (BMI2)),
			      __wcslen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcslen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcslen_evex512)
	      IFUNC_IMPL_ADD (array, i, wcslen,
			      CPU_FEATURE_USABLE (SSE4_1),
			      __wcslen_sse4_1)
@@ -735,6 +750,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
			       && CPU_FEATURE_USABLE (AVX512BW)
			       && CPU_FEATURE_USABLE (BMI2)),
			      __wcsnlen_evex)
+	      IFUNC_IMPL_ADD (array, i, wcsnlen,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __wcsnlen_evex512)
	      IFUNC_IMPL_ADD (array, i, wcsnlen,
			      CPU_FEATURE_USABLE (SSE4_1),
			      __wcsnlen_sse4_1)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S
new file mode 100644
index 0000000000000000..278c899691d89ba7
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S
@@ -0,0 +1,302 @@
+/* Placeholder function, not used by any processor at the moment.
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#if IS_IN (libc)
+
+# include <sysdep.h>
+
+# ifdef USE_AS_WCSLEN
+#  define VPCMP		vpcmpd
+#  define VPTESTN	vptestnmd
+#  define VPMINU	vpminud
+#  define CHAR_SIZE	4
+# else
+#  define VPCMP		vpcmpb
+#  define VPTESTN	vptestnmb
+#  define VPMINU	vpminub
+#  define CHAR_SIZE	1
+# endif
+
+# define XMM0		xmm16
+# define PAGE_SIZE	4096
+# define CHAR_PER_VEC	(VEC_SIZE / CHAR_SIZE)
+
+# if VEC_SIZE == 64
+#  define KMOV		kmovq
+#  define KORTEST	kortestq
+#  define RAX		rax
+#  define RCX		rcx
+#  define RDX		rdx
+#  define SHR		shrq
+#  define TEXTSUFFIX	evex512
+#  define VMM0		zmm16
+#  define VMM1		zmm17
+#  define VMM2		zmm18
+#  define VMM3		zmm19
+#  define VMM4		zmm20
+#  define VMOVA		vmovdqa64
+# elif VEC_SIZE == 32
+/* Currently Unused.  */
+#  define KMOV		kmovd
+#  define KORTEST	kortestd
+#  define RAX		eax
+#  define RCX		ecx
+#  define RDX		edx
+#  define SHR		shrl
+#  define TEXTSUFFIX	evex256
+#  define VMM0		ymm16
+#  define VMM1		ymm17
+#  define VMM2		ymm18
+#  define VMM3		ymm19
+#  define VMM4		ymm20
+#  define VMOVA		vmovdqa32
+# endif
+
+	.section .text.TEXTSUFFIX, "ax", @progbits
+/* Aligning entry point to 64 byte, provides better performance for
+   one vector length string.  */
+ENTRY_P2ALIGN (STRLEN, 6)
+# ifdef USE_AS_STRNLEN
+	/* Check zero length.  */
+	test	%RSI_LP, %RSI_LP
+	jz	L(ret_max)
+#  ifdef __ILP32__
+	/* Clear the upper 32 bits.  */
+	movl	%esi, %esi
+#  endif
+# endif
+
+	movl	%edi, %eax
+	vpxorq	%XMM0, %XMM0, %XMM0
+	andl	$(PAGE_SIZE - 1), %eax
+	cmpl	$(PAGE_SIZE - VEC_SIZE), %eax
+	ja	L(page_cross)
+
+	/* Compare [w]char for null, mask bit will be set for match.  */
+	VPCMP	$0, (%rdi), %VMM0, %k0
+	KMOV	%k0, %RAX
+	test	%RAX, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+	/* At this point vector max length reached.  */
+# ifdef USE_AS_STRNLEN
+	.p2align 4,,3
+L(ret_max):
+	movq	%rsi, %rax
+	ret
+# endif
+
+L(align_more):
+	leaq	VEC_SIZE(%rdi), %rax
+	/* Align rax to VEC_SIZE.  */
+	andq	$-VEC_SIZE, %rax
+# ifdef USE_AS_STRNLEN
+	movq	%rax, %rdx
+	subq	%rdi, %rdx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RDX
+#  endif
+	/* At this point rdx contains [w]chars already compared.  */
+	subq	%rsi, %rdx
+	jae	L(ret_max)
+	negq	%rdx
+	/* At this point rdx contains number of w[char] needs to go.
+	   Now onwards rdx will keep decrementing with each compare.  */
+# endif
+
+	/* Loop unroll 4 times for 4 vector loop.  */
+	VPCMP	$0, (%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, VEC_SIZE(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 2)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+# endif
+
+	VPCMP	$0, (VEC_SIZE * 3)(%rax), %VMM0, %k0
+	KMOV	%k0, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x4)
+
+# ifdef USE_AS_STRNLEN
+	subq	$CHAR_PER_VEC, %rdx
+	jbe	L(ret_max)
+	/* Save pointer before 4 x VEC_SIZE alignment.  */
+	movq	%rax, %rcx
+# endif
+
+	/* Align address to VEC_SIZE * 4 for loop.  */
+	andq	$-(VEC_SIZE * 4), %rax
+
+# ifdef USE_AS_STRNLEN
+	subq	%rax, %rcx
+#  ifdef USE_AS_WCSLEN
+	SHR	$2, %RCX
+#  endif
+	/* rcx contains number of [w]char will be recompared due to
+	   alignment fixes.  rdx must be incremented by rcx to offset
+	   alignment adjustment.  */
+	addq	%rcx, %rdx
+	/* Need jump as we don't want to add/subtract rdx for first
+	   iteration of 4 x VEC_SIZE aligned loop.  */
+	jmp	L(loop_entry)
+# endif
+
+	.p2align 4,,11
+L(loop):
+# ifdef USE_AS_STRNLEN
+	subq	$(CHAR_PER_VEC * 4), %rdx
+	jbe	L(ret_max)
+L(loop_entry):
+# endif
+	/* VPMINU and VPCMP combination provide better performance as
+	   compared to alternative combinations.  */
+	VMOVA	(VEC_SIZE * 4)(%rax), %VMM1
+	VPMINU	(VEC_SIZE * 5)(%rax), %VMM1, %VMM2
+	VMOVA	(VEC_SIZE * 6)(%rax), %VMM3
+	VPMINU	(VEC_SIZE * 7)(%rax), %VMM3, %VMM4
+
+	VPTESTN	%VMM2, %VMM2, %k0
+	VPTESTN	%VMM4, %VMM4, %k1
+
+	subq	$-(VEC_SIZE * 4), %rax
+	KORTEST	%k0, %k1
+	jz	L(loop)
+
+	VPTESTN	%VMM1, %VMM1, %k2
+	KMOV	%k2, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x1)
+
+	KMOV	%k0, %RCX
+	/* At this point, if k0 is non zero, null char must be in the
+	   second vector.  */
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x2)
+
+	VPTESTN	%VMM3, %VMM3, %k3
+	KMOV	%k3, %RCX
+	test	%RCX, %RCX
+	jnz	L(ret_vec_x3)
+	/* At this point null [w]char must be in the fourth vector so no
+	   need to check.  */
+	KMOV	%k1, %RCX
+
+	/* Fourth, third, second vector terminating are pretty much
+	   same, implemented this way to avoid branching and reuse code
+	   from pre loop exit condition.  */
+L(ret_vec_x4):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 3), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 3)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x3):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	subq	$-(VEC_SIZE * 2), %rax
+	shrq	$2, %rax
+	addq	%rcx, %rax
+# else
+	leaq	(VEC_SIZE * 2)(%rcx, %rax), %rax
+# endif
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(ret_vec_x2):
+	subq	$-VEC_SIZE, %rax
+L(ret_vec_x1):
+	bsf	%RCX, %RCX
+	subq	%rdi, %rax
+# ifdef USE_AS_WCSLEN
+	shrq	$2, %rax
+# endif
+	addq	%rcx, %rax
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+L(page_cross):
+	movl	%eax, %ecx
+# ifdef USE_AS_WCSLEN
+	andl	$(VEC_SIZE - 1), %ecx
+	sarl	$2, %ecx
+# endif
+	/* ecx contains number of w[char] to be skipped as a result
+	   of address alignment.  */
+	xorq	%rdi, %rax
+	VPCMP	$0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0
+	KMOV	%k0, %RAX
+	/* Ignore number of character for alignment adjustment.  */
+	SHR	%cl, %RAX
+	jz	L(align_more)
+
+	bsf	%RAX, %RAX
+# ifdef USE_AS_STRNLEN
+	cmpq	%rsi, %rax
+	cmovnb	%rsi, %rax
+# endif
+	ret
+
+END (STRLEN)
+#endif
diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S
new file mode 100644
index 0000000000000000..116f8981c8954e2e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-evex512.S
@@ -0,0 +1,7 @@
+#ifndef STRLEN
+# define STRLEN __strlen_evex512
+#endif
+
+#define VEC_SIZE 64
+
+#include "strlen-evex-base.S"
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S
new file mode 100644
index 0000000000000000..0b7f220214a7c33c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __strnlen_evex512
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S
new file mode 100644
index 0000000000000000..f59c372b78b4fb8c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S
@@ -0,0 +1,4 @@
+#define STRLEN __wcslen_evex512
+#define USE_AS_WCSLEN 1
+
+#include "strlen-evex512.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
new file mode 100644
index 0000000000000000..73dcf2f210a85aac
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S
@@ -0,0 +1,5 @@
+#define STRLEN __wcsnlen_evex512
+#define USE_AS_WCSLEN 1
+#define USE_AS_STRNLEN 1
+
+#include "strlen-evex512.S"
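The structure of strlen-evex-base.S (align, test one vector, then a 4x-unrolled loop with VPMINU folding) is the wide-register version of the classic word-at-a-time scan. A portable C sketch of the same idea at 8-byte granularity, illustrative only (the real code uses 64-byte vectors and mask registers, and this sketch ignores the strict-aliasing concerns that assembly sidesteps):

    #include <stddef.h>
    #include <stdint.h>

    /* Word-at-a-time strlen sketch.  Aligning first guarantees the wide
       loads never cross a page boundary, the same reason the assembly
       has a dedicated L(page_cross) path.  */
    size_t wordwise_strlen (const char *s)
    {
      const char *p = s;
      while ((uintptr_t) p % sizeof (uint64_t) != 0)
        {
          if (*p == '\0')
            return p - s;
          p++;
        }
      const uint64_t *w = (const uint64_t *) p;
      for (;;)
        {
          uint64_t v = *w;
          /* Classic has-zero-byte test: nonzero iff some byte is 0.  */
          if ((v - 0x0101010101010101ULL) & ~v & 0x8080808080808080ULL)
            break;
          w++;
        }
      p = (const char *) w;
      while (*p != '\0')
        p++;
      return p - s;
    }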
glibc-upstream-2.34-279.patch (new file, 33 lines)
@@ -0,0 +1,33 @@
commit f6bc52f080e4a0195c707c01f54e2eae0ff89010
Author: H.J. Lu <hjl.tools@gmail.com>
Date:   Fri May 20 19:21:48 2022 -0700

    x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT

    According to the x86-64 psABI, r_addend should be ignored for
    R_X86_64_GLOB_DAT and R_X86_64_JUMP_SLOT.  Since linkers always set
    their r_addends to 0, we can ignore their r_addends.

    Reviewed-by: Fangrui Song <maskray@google.com>
    (cherry picked from commit f8587a61892cbafd98ce599131bf4f103466f084)

diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h
index 94296719d4d9fb82..742682517179fab5 100644
--- a/sysdeps/x86_64/dl-machine.h
+++ b/sysdeps/x86_64/dl-machine.h
@@ -347,11 +347,13 @@ and creates an unsatisfiable circular dependency.\n",
 # endif
	  /* Set to symbol size plus addend.  */
	  value = sym->st_size;
+	  *reloc_addr = value + reloc->r_addend;
+	  break;
 # endif
-	  /* Fall through.  */
+
	case R_X86_64_GLOB_DAT:
	case R_X86_64_JUMP_SLOT:
-	  *reloc_addr = value + reloc->r_addend;
+	  *reloc_addr = value;
	  break;

 # ifndef RESOLVE_CONFLICT_FIND_MAP
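The psABI formulas behind this change, as a toy sketch (hypothetical variable names, not the dl-machine.h code): R_X86_64_64 computes S + A, while GLOB_DAT and JUMP_SLOT compute just S, so removing the fall-through is a correctness cleanup as much as a simplification.

    #include <elf.h>

    /* Toy relocation applier following the x86-64 psABI table.  */
    static void apply_reloc (unsigned int r_type, Elf64_Addr *where,
                             Elf64_Addr sym_value, Elf64_Sxword addend)
    {
      switch (r_type)
        {
        case R_X86_64_64:        /* word64: S + A */
          *where = sym_value + addend;
          break;
        case R_X86_64_GLOB_DAT:  /* wordclass: S, addend ignored */
        case R_X86_64_JUMP_SLOT: /* wordclass: S, addend ignored */
          *where = sym_value;
          break;
        }
    }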
glibc-upstream-2.34-280.patch (new file, 356 lines)
@@ -0,0 +1,356 @@
commit 82a707aeb74f23bb1783af0f9e93790a2038ff7e
Author: Raghuveer Devulapalli <raghuveer.devulapalli@intel.com>
Date:   Mon Jun 6 12:17:43 2022 -0700

    x86_64: Add strstr function with 512-bit EVEX

    Adding a 512-bit EVEX version of strstr. The algorithm works as follows:

    (1) We spend a few cycles at the beginning to peek into the needle. We
    locate an edge in the needle (first occurrence of 2 consecutive distinct
    characters) and also store the first 64 bytes into a zmm register.

    (2) We search for the edge in the haystack by looking into one cache
    line of the haystack at a time. This avoids having to read past a page
    boundary which can cause a seg fault.

    (3) If an edge is found in the haystack we first compare the first
    64 bytes of the needle (already stored in a zmm register) before we
    proceed with a full string compare performed byte by byte.

    Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512)

    Geometric mean of all benchmarks: new / old = 0.66

    Difficult skiptable(0)    : new / old = 0.02
    Difficult skiptable(1)    : new / old = 0.01
    Difficult 2-way           : new / old = 0.25
    Difficult testing first 2 : new / old = 1.26
    Difficult skiptable(0)    : new / old = 0.05
    Difficult skiptable(1)    : new / old = 0.06
    Difficult 2-way           : new / old = 0.26
    Difficult testing first 2 : new / old = 1.05
    Difficult skiptable(0)    : new / old = 0.42
    Difficult skiptable(1)    : new / old = 0.24
    Difficult 2-way           : new / old = 0.21
    Difficult testing first 2 : new / old = 1.04
    Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

    (cherry picked from commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c)

    x86: Remove __mmask intrinsics in strstr-avx512.c

    The intrinsics are not available before GCC7 and using standard
    operators generates code of equivalent or better quality.

    Removed:
        _cvtmask64_u64
        _kshiftri_mask64
        _kand_mask64

    Geometric Mean of 5 Runs of Full Benchmark Suite New / Old: 0.958

    (cherry picked from commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4d4ad2a3686b5bc3..0e39e63ef6be6a86 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -126,6 +126,7 @@ sysdep_routines += \
   strrchr-sse2 \
   strspn-c \
   strspn-sse2 \
+  strstr-avx512 \
   strstr-sse2-unaligned \
   varshift \
 # sysdep_routines
@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4
 CFLAGS-strcspn-c.c += -msse4
 CFLAGS-strpbrk-c.c += -msse4
 CFLAGS-strspn-c.c += -msse4
+CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3
 endif

 ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6b75a7106e174bce..043821278fdb6d8f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -633,6 +633,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

   /* Support sysdeps/x86_64/multiarch/strstr.c.  */
   IFUNC_IMPL (i, name, strstr,
+	      IFUNC_IMPL_ADD (array, i, strstr,
+			      (CPU_FEATURE_USABLE (AVX512VL)
+			       && CPU_FEATURE_USABLE (AVX512BW)
+			       && CPU_FEATURE_USABLE (AVX512DQ)
+			       && CPU_FEATURE_USABLE (BMI2)),
+			      __strstr_avx512)
	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned)
	      IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2))

diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c
new file mode 100644
index 0000000000000000..e44c1a05dc0007e5
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strstr-avx512.c
@@ -0,0 +1,218 @@
+/* strstr optimized with 512-bit AVX-512 instructions
+   Copyright (C) 2022 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <immintrin.h>
+#include <inttypes.h>
+#include <stdbool.h>
+#include <string.h>
+
+#define FULL_MMASK64 0xffffffffffffffff
+#define ONE_64BIT 0x1ull
+#define ZMM_SIZE_IN_BYTES 64
+#define PAGESIZE 4096
+
+#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__)
+#define kshiftri_mask64(x, y) ((x) >> (y))
+#define kand_mask64(x, y) ((x) & (y))
+
+/*
+  Returns the index of the first edge within the needle, returns 0 if no edge
+  is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg'
+ */
+static inline size_t
+find_edge_in_needle (const char *ned)
+{
+  size_t ind = 0;
+  while (ned[ind + 1] != '\0')
+    {
+      if (ned[ind] != ned[ind + 1])
+        return ind;
+      else
+        ind = ind + 1;
+    }
+  return 0;
+}
+
+/*
+  Compare needle with haystack byte by byte at specified location
+ */
+static inline bool
+verify_string_match (const char *hay, const size_t hay_index, const char *ned,
+                     size_t ind)
+{
+  while (ned[ind] != '\0')
+    {
+      if (ned[ind] != hay[hay_index + ind])
+        return false;
+      ind = ind + 1;
+    }
+  return true;
+}
+
+/*
+  Compare needle with haystack at specified location. The first 64 bytes are
+  compared using a ZMM register.
+ */
+static inline bool
+verify_string_match_avx512 (const char *hay, const size_t hay_index,
+                            const char *ned, const __mmask64 ned_mask,
+                            const __m512i ned_zmm)
+{
+  /* check first 64 bytes using zmm and then scalar */
+  __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so
+  __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm);
+  if (match != 0x0) // failed the first few chars
+    return false;
+  else if (ned_mask == FULL_MMASK64)
+    return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES);
+  return true;
+}
+
+char *
+__strstr_avx512 (const char *haystack, const char *ned)
+{
+  char first = ned[0];
+  if (first == '\0')
+    return (char *)haystack;
+  if (ned[1] == '\0')
+    return (char *)strchr (haystack, ned[0]);
+
+  size_t edge = find_edge_in_needle (ned);
+
+  /* ensure haystack is as long as the pos of edge in needle */
+  for (int ii = 0; ii < edge; ++ii)
+    {
+      if (haystack[ii] == '\0')
+        return NULL;
+    }
+
+  /*
+     Load 64 bytes of the needle and save it to a zmm register
+     Read one cache line at a time to avoid loading across a page boundary
+   */
+  __mmask64 ned_load_mask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63));
+  __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned);
+  __mmask64 ned_nullmask
+      = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm);
+
+  if (__glibc_unlikely (ned_nullmask == 0x0))
+    {
+      ned_zmm = _mm512_loadu_si512 (ned);
+      ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm);
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      if (ned_nullmask != 0x0)
+        ned_load_mask = ned_load_mask >> 1;
+    }
+  else
+    {
+      ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT);
+      ned_load_mask = ned_load_mask >> 1;
+    }
+  const __m512i ned0 = _mm512_set1_epi8 (ned[edge]);
+  const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]);
+
+  /*
+     Read the bytes of haystack in the current cache line
+   */
+  size_t hay_index = edge;
+  __mmask64 loadmask = _bzhi_u64 (
+      FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63));
+  /* First load is a partial cache line */
+  __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index);
+  /* Search for NULL and compare only till null char */
+  uint64_t nullmask
+      = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0));
+  uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+  cmpmask = cmpmask & cvtmask64_u64 (loadmask);
+  /* Search for the 2 charaters of needle */
+  __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+  __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1);
+  k1 = kshiftri_mask64 (k1, 1);
+  /* k2 masks tell us if both chars from needle match */
+  uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+  /* For every match, search for the entire needle for a full match */
+  while (k2)
+    {
+      uint64_t bitcount = _tzcnt_u64 (k2);
+      k2 = _blsr_u64 (k2);
+      size_t match_pos = hay_index + bitcount - edge;
+      if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+          < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+        {
+          /*
+           * Use vector compare as long as you are not crossing a page
+           */
+          if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                          ned_load_mask, ned_zmm))
+            return (char *)haystack + match_pos;
+        }
+      else
+        {
+          if (verify_string_match (haystack, match_pos, ned, 0))
+            return (char *)haystack + match_pos;
+        }
+    }
+  /* We haven't checked for potential match at the last char yet */
+  haystack = (const char *)(((uintptr_t) (haystack + hay_index) | 63));
+  hay_index = 0;
+
+  /*
+     Loop over one cache line at a time to prevent reading over page
+     boundary
+   */
+  __m512i hay1;
+  while (nullmask == 0)
+    {
+      hay0 = _mm512_loadu_si512 (haystack + hay_index);
+      hay1 = _mm512_load_si512 (haystack + hay_index
+                                + 1); // Always 64 byte aligned
+      nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1));
+      /* Compare only till null char */
+      cmpmask = nullmask ^ (nullmask - ONE_64BIT);
+      k0 = _mm512_cmpeq_epi8_mask (hay0, ned0);
+      k1 = _mm512_cmpeq_epi8_mask (hay1, ned1);
+      /* k2 masks tell us if both chars from needle match */
+      k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask;
+      /* For every match, compare full strings for potential match */
+      while (k2)
+        {
+          uint64_t bitcount = _tzcnt_u64 (k2);
+          k2 = _blsr_u64 (k2);
+          size_t match_pos = hay_index + bitcount - edge;
+          if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1))
+              < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES)
+            {
+              /*
+               * Use vector compare as long as you are not crossing a page
+               */
+              if (verify_string_match_avx512 (haystack, match_pos, ned,
+                                              ned_load_mask, ned_zmm))
+                return (char *)haystack + match_pos;
+            }
+          else
+            {
+              /* Compare byte by byte */
+              if (verify_string_match (haystack, match_pos, ned, 0))
+                return (char *)haystack + match_pos;
+            }
+        }
+      hay_index += ZMM_SIZE_IN_BYTES;
+    }
+  return NULL;
+}
diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c
index 848601bde7583ca3..9474d6234e9b62d3 100644
--- a/sysdeps/x86_64/multiarch/strstr.c
+++ b/sysdeps/x86_64/multiarch/strstr.c
@@ -35,16 +35,32 @@

 extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden;
 extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden;
+extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden;

 #include "init-arch.h"

 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 extern __typeof (__redirect_strstr) __libc_strstr;
-libc_ifunc (__libc_strstr,
-	    HAS_ARCH_FEATURE (Fast_Unaligned_Load)
-	    ? __strstr_sse2_unaligned
-	    : __strstr_sse2)

+static inline void *
+IFUNC_SELECTOR (void)
+{
+  const struct cpu_features *cpu_features = __get_cpu_features ();
+
+  if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+      && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ)
+      && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+    return __strstr_avx512;
+
+  if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+    return __strstr_sse2_unaligned;
+
+  return __strstr_sse2;
+}
+
+libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ());
 #undef strstr
 strong_alias (__libc_strstr, strstr)
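A scalar rendering of steps (1)-(3) may make the vector code easier to follow. This sketch (a hypothetical helper, not part of the patch) uses the same "edge" filter and then verifies candidates with a full comparison; the AVX-512 version runs the two-character test on 64 positions at a time with broadcast registers and does the verification with a masked vector compare first.

    #include <stddef.h>
    #include <string.h>

    /* First index where two consecutive needle characters differ; 0 if
       the needle is all one character.  */
    static size_t find_edge (const char *ned)
    {
      for (size_t i = 0; ned[i + 1] != '\0'; i++)
        if (ned[i] != ned[i + 1])
          return i;
      return 0;
    }

    char *edge_strstr (const char *hay, const char *ned)
    {
      if (ned[0] == '\0')
        return (char *) hay;
      if (ned[1] == '\0')
        return strchr (hay, ned[0]);

      size_t nlen = strlen (ned);
      size_t edge = find_edge (ned);

      /* Ensure the haystack reaches the edge position before reading
         hay[i + edge] below (mirrors the check in __strstr_avx512).  */
      for (size_t i = 0; i < edge; i++)
        if (hay[i] == '\0')
          return NULL;

      /* Candidate filter: both edge characters must match before we pay
         for a full comparison.  */
      for (size_t i = 0; hay[i + edge] != '\0'; i++)
        if (hay[i + edge] == ned[edge]
            && hay[i + edge + 1] == ned[edge + 1]
            && strncmp (hay + i, ned, nlen) == 0)
          return (char *) hay + i;
      return NULL;
    }

The edge filter is what defeats the "skiptable" adversarial cases in the benchmark table: a run of identical characters produces no false candidates, so almost every position is rejected by the two-byte test alone.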
385
glibc-upstream-2.34-281.patch
Normal file
385
glibc-upstream-2.34-281.patch
Normal file
@ -0,0 +1,385 @@
|
||||
commit 70be93d1c58916d289a5e6e7c7d9b989707a9e41
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Jun 6 21:11:27 2022 -0700
|
||||
|
||||
x86: Create header for VEC classes in x86 strings library
|
||||
|
||||
This patch does not touch any existing code and is only meant to be a
|
||||
tool for future patches so that simple source files can more easily be
|
||||
maintained to target multiple VEC classes.
|
||||
|
||||
There is no difference in the objdump of libc.so before and after this
|
||||
patch.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit 8a780a6b910023e71f3173f37f0793834c047554)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
|
||||
new file mode 100644
|
||||
index 0000000000000000..3f531dd47fceefe9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
|
||||
@@ -0,0 +1,34 @@
|
||||
+/* Common config for AVX-RTM VECs
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef _AVX_RTM_VECS_H
|
||||
+#define _AVX_RTM_VECS_H 1
|
||||
+
|
||||
+#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
+
|
||||
+#define VZEROUPPER_RETURN jmp L(return_vzeroupper)
|
||||
+
|
||||
+#define USE_WITH_RTM 1
|
||||
+#include "avx-vecs.h"
|
||||
+
|
||||
+#undef SECTION
|
||||
+#define SECTION(p) p##.avx.rtm
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h
new file mode 100644
index 0000000000000000..89680f5db827c332
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/avx-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for AVX VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _AVX_VECS_H
+#define _AVX_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "vec-macros.h"
+
+#define USE_WITH_AVX 1
+#define SECTION(p) p##.avx
+
+/* 4-byte mov instructions with AVX2. */
+#define MOV_SIZE 4
+/* 1 (ret) + 3 (vzeroupper). */
+#define RET_SIZE 4
+#define VZEROUPPER vzeroupper
+
+#define VMOVU vmovdqu
+#define VMOVA vmovdqa
+#define VMOVNT vmovntdq
+
+/* Often need to access xmm portion. */
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h
new file mode 100644
index 0000000000000000..99806ebcd7bde53d
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h
@@ -0,0 +1,39 @@
+/* Common config for EVEX256 and EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX_VECS_COMMON_H
+#define _EVEX_VECS_COMMON_H 1
+
+#include "vec-macros.h"
+
+/* 6-byte mov instructions with EVEX. */
+#define MOV_SIZE 6
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU vmovdqu64
+#define VMOVA vmovdqa64
+#define VMOVNT vmovntdq
+
+#define VEC_xmm VEC_hi_xmm
+#define VEC_ymm VEC_hi_ymm
+#define VEC_zmm VEC_hi_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h
new file mode 100644
index 0000000000000000..222ba46dc74cfcbd
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex256-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX256 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX256_VECS_H
+#define _EVEX256_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 32
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX256 1
+#define SECTION(p) p##.evex
+
+#define VEC VEC_ymm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h
new file mode 100644
index 0000000000000000..d1784d5368d8cebe
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/evex512-vecs.h
@@ -0,0 +1,35 @@
+/* Common config for EVEX512 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _EVEX512_VECS_H
+#define _EVEX512_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 64
+#include "evex-vecs-common.h"
+
+#define USE_WITH_EVEX512 1
+#define SECTION(p) p##.evex512
+
+#define VEC VEC_zmm
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h
new file mode 100644
index 0000000000000000..2b77a59d56ff2660
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/sse2-vecs.h
@@ -0,0 +1,47 @@
+/* Common config for SSE2 VECs
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _SSE2_VECS_H
+#define _SSE2_VECS_H 1
+
+#ifdef VEC_SIZE
+# error "Multiple VEC configs included!"
+#endif
+
+#define VEC_SIZE 16
+#include "vec-macros.h"
+
+#define USE_WITH_SSE2 1
+#define SECTION(p) p
+
+/* 3-byte mov instructions with SSE2. */
+#define MOV_SIZE 3
+/* No vzeroupper needed. */
+#define RET_SIZE 1
+#define VZEROUPPER
+
+#define VMOVU movups
+#define VMOVA movaps
+#define VMOVNT movntdq
+
+#define VEC_xmm VEC_any_xmm
+#define VEC VEC_any_xmm
+
+
+#endif
diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h
new file mode 100644
index 0000000000000000..9f3ffecede9feb26
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/vec-macros.h
@@ -0,0 +1,90 @@
+/* Macro helpers for VEC_{type}({vec_num})
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _VEC_MACROS_H
+#define _VEC_MACROS_H 1
+
+#ifndef VEC_SIZE
+# error "Never include this file directly. Always include a vector config."
+#endif
+
+/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same
+ VEC(N) values. */
+#define VEC_hi_xmm0 xmm16
+#define VEC_hi_xmm1 xmm17
+#define VEC_hi_xmm2 xmm18
+#define VEC_hi_xmm3 xmm19
+#define VEC_hi_xmm4 xmm20
+#define VEC_hi_xmm5 xmm21
+#define VEC_hi_xmm6 xmm22
+#define VEC_hi_xmm7 xmm23
+#define VEC_hi_xmm8 xmm24
+#define VEC_hi_xmm9 xmm25
+#define VEC_hi_xmm10 xmm26
+#define VEC_hi_xmm11 xmm27
+#define VEC_hi_xmm12 xmm28
+#define VEC_hi_xmm13 xmm29
+#define VEC_hi_xmm14 xmm30
+#define VEC_hi_xmm15 xmm31
+
+#define VEC_hi_ymm0 ymm16
+#define VEC_hi_ymm1 ymm17
+#define VEC_hi_ymm2 ymm18
+#define VEC_hi_ymm3 ymm19
+#define VEC_hi_ymm4 ymm20
+#define VEC_hi_ymm5 ymm21
+#define VEC_hi_ymm6 ymm22
+#define VEC_hi_ymm7 ymm23
+#define VEC_hi_ymm8 ymm24
+#define VEC_hi_ymm9 ymm25
+#define VEC_hi_ymm10 ymm26
+#define VEC_hi_ymm11 ymm27
+#define VEC_hi_ymm12 ymm28
+#define VEC_hi_ymm13 ymm29
+#define VEC_hi_ymm14 ymm30
+#define VEC_hi_ymm15 ymm31
+
+#define VEC_hi_zmm0 zmm16
+#define VEC_hi_zmm1 zmm17
+#define VEC_hi_zmm2 zmm18
+#define VEC_hi_zmm3 zmm19
+#define VEC_hi_zmm4 zmm20
+#define VEC_hi_zmm5 zmm21
+#define VEC_hi_zmm6 zmm22
+#define VEC_hi_zmm7 zmm23
+#define VEC_hi_zmm8 zmm24
+#define VEC_hi_zmm9 zmm25
+#define VEC_hi_zmm10 zmm26
+#define VEC_hi_zmm11 zmm27
+#define VEC_hi_zmm12 zmm28
+#define VEC_hi_zmm13 zmm29
+#define VEC_hi_zmm14 zmm30
+#define VEC_hi_zmm15 zmm31
+
+#define PRIMITIVE_VEC(vec, num) vec##num
+
+#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i)
+#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i)
+#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i)
+
+#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i)
+#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
+#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i)
+
+#endif
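The whole header boils down to two rounds of token pasting. A minimal, compilable demonstration of the same pattern (reduced to a single register; STR/STR2 are local helpers added here for printing, not part of the patch):

```c
#include <stdio.h>

/* Same scheme as vec-macros.h: PRIMITIVE_VEC glues a register family
   name to an index, and the VEC_hi_* object-like macros remap indices
   0..15 onto the EVEX-only registers 16..31.  */
#define VEC_hi_ymm1 ymm17
#define PRIMITIVE_VEC(vec, num) vec##num
#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i)
#define STR2(x) #x
#define STR(x) STR2(x)

int main (void)
{
  /* VEC_hi_ymm(1) -> VEC_hi_ymm1 -> ymm17, which is how an EVEX256
     config ends up spelling VEC(1) as a high-bank register.  */
  puts (STR (VEC_hi_ymm (1)));   /* prints "ymm17" */
  return 0;
}
```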
90
glibc-upstream-2.34-282.patch
Normal file
@@ -0,0 +1,90 @@
commit e805606193e1a39956ca5ef73cb44a8796730686
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Jun 6 21:11:28 2022 -0700
|
||||
|
||||
x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
|
||||
|
||||
The RTM vzeroupper mitigation has no way of replacing inline
|
||||
vzeroupper not before a return.
|
||||
|
||||
This can be useful when hoisting a vzeroupper to save code size
|
||||
for example:
|
||||
|
||||
```
|
||||
L(foo):
|
||||
cmpl %eax, %edx
|
||||
jz L(bar)
|
||||
tzcntl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(bar):
|
||||
xorl %eax, %eax
|
||||
VZEROUPPER_RETURN
|
||||
```
|
||||
|
||||
Can become:
|
||||
|
||||
```
|
||||
L(foo):
|
||||
COND_VZEROUPPER
|
||||
cmpl %eax, %edx
|
||||
jz L(bar)
|
||||
tzcntl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
ret
|
||||
|
||||
L(bar):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
```
|
||||
|
||||
This code does not change any existing functionality.
|
||||
|
||||
There is no difference in the objdump of libc.so before and after this
|
||||
patch.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
(cherry picked from commit dd5c483b2598f411428df4d8864c15c4b8a3cd68)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
|
||||
index 3f531dd47fceefe9..6ca9f5e6bae7ba72 100644
|
||||
--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
|
||||
+++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h
|
||||
@@ -20,6 +20,7 @@
|
||||
#ifndef _AVX_RTM_VECS_H
|
||||
#define _AVX_RTM_VECS_H 1
|
||||
|
||||
+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
|
||||
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST
|
||||
|
||||
diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
|
||||
index 7bebdeb21095eda0..93e44be22e2275f1 100644
|
||||
--- a/sysdeps/x86_64/sysdep.h
|
||||
+++ b/sysdeps/x86_64/sysdep.h
|
||||
@@ -106,6 +106,24 @@ lose: \
|
||||
vzeroupper; \
|
||||
ret
|
||||
|
||||
+/* Can be used to replace vzeroupper that is not directly before a
|
||||
+ return. This is useful when hoisting a vzeroupper from multiple
|
||||
+ return paths to decrease the total number of vzerouppers and code
|
||||
+ size. */
|
||||
+#define COND_VZEROUPPER_XTEST \
|
||||
+ xtest; \
|
||||
+ jz 1f; \
|
||||
+ vzeroall; \
|
||||
+ jmp 2f; \
|
||||
+1: \
|
||||
+ vzeroupper; \
|
||||
+2:
|
||||
+
|
||||
+/* In RTM define this as COND_VZEROUPPER_XTEST. */
|
||||
+#ifndef COND_VZEROUPPER
|
||||
+# define COND_VZEROUPPER vzeroupper
|
||||
+#endif
|
||||
+
|
||||
/* Zero upper vector registers and return. */
|
||||
#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
# define ZERO_UPPER_VEC_REGISTERS_RETURN \
|
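In C-intrinsics terms, the new COND_VZEROUPPER_XTEST sequence is roughly the following (a sketch assuming GCC/Clang with -mavx -mrtm; glibc itself only ever emits the assembly above). The point of the branch is that vzeroupper aborts a running RTM transaction while vzeroall does not, so the transactional path pays for zeroing the full registers instead:

```c
#include <immintrin.h>

/* Illustrative equivalent of COND_VZEROUPPER_XTEST, not glibc code.  */
static inline void cond_vzeroupper (void)
{
  if (_xtest ())           /* xtest: are we inside an RTM transaction?  */
    _mm256_zeroall ();     /* vzeroall: safe inside a transaction  */
  else
    _mm256_zeroupper ();   /* vzeroupper: the cheap common case  */
}
```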
696
glibc-upstream-2.34-283.patch
Normal file
@@ -0,0 +1,696 @@
commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:30 2022 -0700

x86: Optimize memrchr-sse2.S

The new code:
1. prioritizes smaller lengths more.
2. optimizes target placement more carefully.
3. reuses logic more.
4. fixes up various inefficiencies in the logic.

The total code size saving is: 394 bytes
Geometric Mean of all benchmarks New / Old: 0.874

Regressions:
1. The page cross case is now colder, especially re-entry from the
   page cross case if a match is not found in the first VEC
   (roughly 50%). My general opinion with this patch is this is
   acceptable given the "coldness" of this case (less than 4%) and
   the general performance improvement in the other far more common
   cases.

2. There are some regressions of 5-15% for medium/large user-arg
   lengths that have a match in the first VEC. This is because the
   logic was rewritten to optimize finds in the first VEC if the
   user-arg length is shorter (where we see roughly 20-50%
   performance improvements). It is not always the case that this is
   a regression. My intuition is some frontend quirk partially
   explains the data, although I haven't been able to find the
   root cause.

Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 731feee3869550e93177e604604c1765d81de571)
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index cc2001167d77c83c..c2a5902bf9385c67 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -19,362 +19,333 @@
 <https://www.gnu.org/licenses/>. */

 #include <sysdep.h>
+#define VEC_SIZE 16
+#define PAGE_SIZE 4096

 .text
-ENTRY (__memrchr)
- movd %esi, %xmm1
-
- sub $16, %RDX_LP
- jbe L(length_less16)
-
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add %RDX_LP, %RDI_LP
- pshufd $0, %xmm1, %xmm1
-
- movdqu (%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
-
-/* Check if there is a match. */
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- mov %edi, %ecx
- and $15, %ecx
- jz L(loop_prolog)
-
- add $16, %rdi
- add $16, %rdx
- and $-16, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(loop_prolog):
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm4
- pcmpeqb %xmm1, %xmm4
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches0)
-
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16)
-
- movdqa (%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches0)
-
- mov %edi, %ecx
- and $63, %ecx
- jz L(align64_loop)
-
- add $64, %rdi
- add $64, %rdx
- and $-64, %rdi
- sub %rcx, %rdx
-
- .p2align 4
-L(align64_loop):
- sub $64, %rdi
- sub $64, %rdx
- jbe L(exit_loop)
-
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm2
- movdqa 32(%rdi), %xmm3
- movdqa 48(%rdi), %xmm4
-
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm1, %xmm2
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm1, %xmm4
-
- pmaxub %xmm3, %xmm0
- pmaxub %xmm4, %xmm2
- pmaxub %xmm0, %xmm2
- pmovmskb %xmm2, %eax
-
- test %eax, %eax
- jz L(align64_loop)
-
- pmovmskb %xmm4, %eax
- test %eax, %eax
- jnz L(matches48)
-
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm2
-
- pcmpeqb %xmm1, %xmm2
- pcmpeqb (%rdi), %xmm1
-
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches16)
-
- pmovmskb %xmm1, %eax
- bsr %eax, %eax
-
- add %rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+ /* Clear upper bits. */
+ mov %RDX_LP, %RDX_LP
+#endif
+ movd %esi, %xmm0
+
+ /* Get end pointer. */
+ leaq (%rdx, %rdi), %rcx
+
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0
+
+ /* Check if we can load 1x VEC without crossing a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %ecx
+ jz L(page_cross)
+
+ /* NB: This load happens regardless of whether rdx (len) is zero. Since
+ it doesn't cross a page and the standard guarantees any pointer has
+ at least one valid byte this load must be safe. For the entire
+ history of the x86 memrchr implementation this has been possible so
+ no code "should" be relying on a zero-length check before this load.
+ The zero-length check is moved to the page cross case because it is
+ 1) pretty cold and including it pushes the hot case len <= VEC_SIZE
+ onto 2 cache lines. */
+ movups -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+ zero. */
+ bsrl %eax, %eax
+ jz L(ret_0)
+ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+ if out of bounds. */
+ addl %edx, %eax
+ jl L(zero_0)
+ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+ ptr. */
+ addq %rdi, %rax
+L(ret_0):
 ret

- .p2align 4
-L(exit_loop):
- add $64, %edx
- cmp $32, %edx
- jbe L(exit_loop_32)
-
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48)
-
- movdqa 32(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
- test %eax, %eax
- jnz L(matches32)
-
- movdqa 16(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm3
- pmovmskb %xmm3, %eax
- test %eax, %eax
- jnz L(matches16_1)
- cmp $48, %edx
- jbe L(return_null)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches0_1)
- xor %eax, %eax
+ .p2align 4,, 5
+L(ret_vec_x0):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE)(%rcx, %rax), %rax
 ret

- .p2align 4
-L(exit_loop_32):
- movdqa 48(%rdi), %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %eax
- test %eax, %eax
- jnz L(matches48_1)
- cmp $16, %edx
- jbe L(return_null)
-
- pcmpeqb 32(%rdi), %xmm1
- pmovmskb %xmm1, %eax
- test %eax, %eax
- jnz L(matches32_1)
- xor %eax, %eax
+ .p2align 4,, 2
+L(zero_0):
+ xorl %eax, %eax
 ret

- .p2align 4
-L(matches0):
- bsr %eax, %eax
- add %rdi, %rax
- ret
-
- .p2align 4
-L(matches16):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
- ret

- .p2align 4
-L(matches32):
- bsr %eax, %eax
- lea 32(%rax, %rdi), %rax
+ .p2align 4,, 8
+L(more_1x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ /* Align rcx (pointer to string). */
+ decq %rcx
+ andq $-VEC_SIZE, %rcx
+
+ movq %rcx, %rdx
+ /* NB: We could consistently save 1-byte in this pattern with `movaps
+ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+ it adds more frontend uops (even if the moves can be eliminated) and
+ some percentage of the time actual backend uops. */
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ subq %rdi, %rdx
+ pmovmskb %xmm1, %eax
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
+L(last_2x_vec):
+ subl $VEC_SIZE, %edx
+ jbe L(ret_vec_x0_test)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_1)
+ addl %edx, %eax
+ jl L(zero_0)
+ addq %rdi, %rax
+L(ret_1):
 ret

- .p2align 4
-L(matches48):
- bsr %eax, %eax
- lea 48(%rax, %rdi), %rax
+ /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross)
+ causes the hot path (length <= VEC_SIZE) to span multiple cache
+ lines. Naturally aligned % 16 to 8-bytes. */
+L(page_cross):
+ /* Zero length check. */
+ testq %rdx, %rdx
+ jz L(zero_0)
+
+ leaq -1(%rcx), %r8
+ andq $-(VEC_SIZE), %r8
+
+ movaps (%r8), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %esi
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ negl %ecx
+ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+ explicitly. */
+ andl $(VEC_SIZE - 1), %ecx
+ shl %cl, %esi
+ movzwl %si, %eax
+ leaq (%rdi, %rdx), %rcx
+ cmpq %rdi, %r8
+ ja L(more_1x_vec)
+ subl $VEC_SIZE, %edx
+ bsrl %eax, %eax
+ jz L(ret_2)
+ addl %edx, %eax
+ jl L(zero_1)
+ addq %rdi, %rax
+L(ret_2):
 ret

- .p2align 4
-L(matches0_1):
- bsr %eax, %eax
- sub $64, %rdx
- add %rax, %rdx
- jl L(return_null)
- add %rdi, %rax
+ /* Fits in aligning bytes. */
+L(zero_1):
+ xorl %eax, %eax
 ret

- .p2align 4
-L(matches16_1):
- bsr %eax, %eax
- sub $48, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 16(%rdi, %rax), %rax
+ .p2align 4,, 5
+L(ret_vec_x1):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
 ret

- .p2align 4
-L(matches32_1):
- bsr %eax, %eax
- sub $32, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 32(%rdi, %rax), %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x0)

- .p2align 4
-L(matches48_1):
- bsr %eax, %eax
- sub $16, %rdx
- add %rax, %rdx
- jl L(return_null)
- lea 48(%rdi, %rax), %rax
- ret
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_x1)

- .p2align 4
-L(return_null):
- xor %eax, %eax
- ret

- .p2align 4
-L(length_less16_offset0):
- test %edx, %edx
- jz L(return_null)
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax

- mov %dl, %cl
- pcmpeqb (%rdi), %xmm1
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)

- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
+ addl $(VEC_SIZE), %edx
+ jle L(ret_vec_x2_test)

- pmovmskb %xmm1, %eax
+L(last_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)

- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax

- bsr %eax, %eax
- add %rdi, %rax
+ subl $(VEC_SIZE), %edx
+ bsrl %eax, %eax
+ jz L(ret_3)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
+L(ret_3):
 ret

- .p2align 4
-L(length_less16):
- punpcklbw %xmm1, %xmm1
- punpcklbw %xmm1, %xmm1
-
- add $16, %edx
-
- pshufd $0, %xmm1, %xmm1
-
- mov %edi, %ecx
- and $15, %ecx
- jz L(length_less16_offset0)
-
- mov %cl, %dh
- mov %ecx, %esi
- add %dl, %dh
- and $-16, %rdi
-
- sub $16, %dh
- ja L(length_less16_part2)
-
- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
-
- sar %cl, %eax
- mov %dl, %cl
-
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
- test %eax, %eax
- jz L(return_null)
-
- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 6
+L(ret_vec_x2_test):
+ bsrl %eax, %eax
+ jz L(zero_2)
+ addl %edx, %eax
+ jl L(zero_2)
+ addq %rdi, %rax
 ret

- .p2align 4
-L(length_less16_part2):
- movdqa 16(%rdi), %xmm2
- pcmpeqb %xmm1, %xmm2
- pmovmskb %xmm2, %eax
-
- mov %dh, %cl
- mov $1, %edx
- sal %cl, %edx
- sub $1, %edx
-
- and %edx, %eax
+L(zero_2):
+ xorl %eax, %eax
+ ret

- test %eax, %eax
- jnz L(length_less16_part2_return)

- pcmpeqb (%rdi), %xmm1
- pmovmskb %xmm1, %eax
+ .p2align 4,, 5
+L(ret_vec_x2):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret

- mov %esi, %ecx
- sar %cl, %eax
- test %eax, %eax
- jz L(return_null)
+ .p2align 4,, 5
+L(ret_vec_x3):
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret

- bsr %eax, %eax
- add %rdi, %rax
- add %rsi, %rax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %eax, %eax
+ jnz L(ret_vec_x2)
+
+ movaps -(VEC_SIZE * 4)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_x3)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)
+
+ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+ keeping the code from spilling to the next cache line. */
+ addq $(VEC_SIZE * 4 - 1), %rcx
+ andq $-(VEC_SIZE * 4), %rcx
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
+ andq $-(VEC_SIZE * 4), %rdx
+
+ .p2align 4,, 11
+L(loop_4x_vec):
+ movaps (VEC_SIZE * -1)(%rcx), %xmm1
+ movaps (VEC_SIZE * -2)(%rcx), %xmm2
+ movaps (VEC_SIZE * -3)(%rcx), %xmm3
+ movaps (VEC_SIZE * -4)(%rcx), %xmm4
+ pcmpeqb %xmm0, %xmm1
+ pcmpeqb %xmm0, %xmm2
+ pcmpeqb %xmm0, %xmm3
+ pcmpeqb %xmm0, %xmm4
+
+ por %xmm1, %xmm2
+ por %xmm3, %xmm4
+ por %xmm2, %xmm4
+
+ pmovmskb %xmm4, %esi
+ testl %esi, %esi
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rcx
+ cmpq %rdx, %rcx
+ jne L(loop_4x_vec)
+
+ subl %edi, %edx
+
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 2
+L(last_4x_vec):
+ movaps -(VEC_SIZE)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)
+
+ testl %eax, %eax
+ jnz L(ret_vec_x0)
+
+
+ movaps -(VEC_SIZE * 2)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ movaps -(VEC_SIZE * 3)(%rcx), %xmm1
+ pcmpeqb %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+
+ subl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+ bsrl %eax, %eax
+ jz L(ret_4)
+ addl %edx, %eax
+ jl L(zero_3)
+ addq %rdi, %rax
+L(ret_4):
 ret

- .p2align 4
-L(length_less16_part2_return):
- bsr %eax, %eax
- lea 16(%rax, %rdi), %rax
+ /* Ends up being 1-byte nop. */
+ .p2align 4,, 3
+L(loop_end):
+ pmovmskb %xmm1, %eax
+ sall $16, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm2, %eax
+ testl %eax, %eax
+ jnz L(ret_vec_end)
+
+ pmovmskb %xmm3, %eax
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR in VEC3 and bsrq will use that position. */
+ sall $16, %eax
+ orl %esi, %eax
+ bsrl %eax, %eax
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
 ret

-END (__memrchr)
+L(ret_vec_end):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax
+ ret
+ /* Used in L(last_4x_vec). In the same cache line. These are just spare
+ aligning bytes. */
+L(zero_3):
+ xorl %eax, %eax
+ ret
+ /* 2-bytes from next cache line. */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)
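The kernel of both the old and the new SSE2 code is the same three-instruction idiom: compare 16 bytes against the broadcast character, compress the result to a bitmask, then bsr for the highest (right-most) match. A standalone intrinsics rendering of one such 16-byte probe, for illustration only; the real routine adds the alignment, length, and page-cross handling seen above:

```c
#include <immintrin.h>
#include <stddef.h>

/* Probe the 16 bytes ending at 'end' for the last occurrence of 'c'.
   Returns a pointer to the match or NULL.  The caller must guarantee
   the 16-byte load is safe, as the assembly's page-cross logic does. */
static const char *probe16_last (const char *end, int c)
{
  __m128i match = _mm_set1_epi8 ((char) c);                 /* punpcklbw/pshufd */
  __m128i data = _mm_loadu_si128 ((const __m128i *) (end - 16)); /* movups */
  int mask = _mm_movemask_epi8 (_mm_cmpeq_epi8 (data, match));   /* pcmpeqb + pmovmskb */
  if (mask == 0)
    return NULL;
  /* bsr: index of the highest set bit = right-most matching byte.  */
  return end - 16 + (31 - __builtin_clz ((unsigned int) mask));
}
```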
624
glibc-upstream-2.34-284.patch
Normal file
@@ -0,0 +1,624 @@
commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:31 2022 -0700

x86: Optimize memrchr-evex.S

The new code:
1. prioritizes smaller user-arg lengths more.
2. optimizes target placement more carefully
3. reuses logic more
4. fixes up various inefficiencies in the logic. The biggest
   case here is the `lzcnt` logic for checking returns which
   saves either a branch or multiple instructions.

The total code size saving is: 263 bytes
Geometric Mean of all benchmarks New / Old: 0.755

Regressions:
There are some regressions. Particularly where the length (user arg
length) is large but the position of the match char is near the
beginning of the string (in first VEC). This case has roughly a
20% regression.

This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). This case has roughly
a 35% speedup.

Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9)
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
index 16bf8e02b1e80c84..bddc89c3754894ed 100644
--- a/sysdeps/x86_64/multiarch/memrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
@@ -19,319 +19,316 @@
 #if IS_IN (libc)

 # include <sysdep.h>
+# include "evex256-vecs.h"
+# if VEC_SIZE != 32
+# error "VEC_SIZE != 32 unimplemented"
+# endif
+
+# ifndef MEMRCHR
+# define MEMRCHR __memrchr_evex
+# endif
+
+# define PAGE_SIZE 4096
+# define VECMATCH VEC(0)
+
+ .section SECTION(.text), "ax", @progbits
+ENTRY_P2ALIGN(MEMRCHR, 6)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)
+
+ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+ correct page cross check and 2) it correctly sets up the end ptr to be
+ subtracted by lzcnt. */
+ leaq -1(%rdi, %rdx), %rax
+ vpbroadcastb %esi, %VECMATCH
+
+ /* Check if we can load 1x VEC without crossing a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+
+ /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+L(ret_vec_x0_test):
+
+ /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+ will guarantee edx (len) is less than it. */
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret

-# define VMOVA vmovdqa64
-
-# define YMMMATCH ymm16
-
-# define VEC_SIZE 32
-
- .section .text.evex,"ax",@progbits
-ENTRY (__memrchr_evex)
- /* Broadcast CHAR to YMMMATCH. */
- vpbroadcastb %esi, %YMMMATCH
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
-
- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret

- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jz L(loop_4x_vec)
-
- /* There is a match. */
- kmovd %k4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- kmovd %k1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0_dec):
+ decq %rax
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
 ret

- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)

- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
+ /* Align rax (pointer to string). */
+ andq $-VEC_SIZE, %rax

- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
+ /* Recompute length after aligning. */
+ movq %rax, %rdx

- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- vpcmpb $0, (%rdi), %YMMMATCH, %k4
- kmovd %k4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- ret
+ subq %rdi, %rdx

- .p2align 4
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
+
+ /* Must dec rax because L(ret_vec_x0_test) expects it. */
+ decq %rax
 cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Don't use rax for pointer here because EVEX has better encoding with
+ offset % VEC_SIZE == 0. */
+ vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ /* NB: 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
 ret

- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
+ /* Inexpensive place to put this regarding code size / target alignments
+ / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+ in first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpb $0, (%rsi), %VECMATCH, %k0
+ kmovd %k0, %r8d
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %ecx
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %ecx
+ shlxl %ecx, %r8d, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ cmpl %ecx, %edx
+ jle L(zero_1)
+ subq %rcx, %rax
 ret

- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
+ /* Continue creating zero labels that fit in aligning bytes and get
+ 2-byte encoding / are in the same cache line as condition. */
+L(zero_1):
+ xorl %eax, %eax
 ret

- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax
 ret

- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ .p2align 4,, 8
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)

- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- ret
+ vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)

- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- .p2align 4
-L(zero):
- xorl %eax, %eax
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+
+ /* Need no matter what. */
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
 ret

- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
-
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
-
- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
-
- kmovd %k1, %eax
-
- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
-
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 8
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_1)
 ret

- .p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
-
- /* Check for zero length. */
- testl %edx, %edx
- jz L(zero)
-
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
-
- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ .p2align 4,, 8
+L(ret_vec_x2):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax
+ ret

- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ .p2align 4,, 8
+L(ret_vec_x3):
+ bsrl %ecx, %ecx
+ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax
+ ret

- /* Check the last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ .p2align 4,, 8
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)

- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)

- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ /* Check if near end before re-aligning (otherwise might do an
+ unnecessary loop iteration). */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)

- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- ret
+ decq %rax
+ andq $-(VEC_SIZE * 4), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ andq $-(VEC_SIZE * 4), %rdx

 .p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
-
- /* Check the last VEC. */
- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
+L(loop_4x_vec):
+ /* Store 1 where not-equals and 0 where equals in k1 (used to mask later
+ on). */
+ vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1
+
+ /* VEC(2/3) will have zero-byte where we found a CHAR. */
+ vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2)
+ vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3)
+ vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4
+
+ /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where
+ CHAR is found and VEC(2/3) have zero-byte where CHAR is found). */
+ vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z}
+ vptestnmb %VEC(3), %VEC(3), %k2
+
+ /* Any 1s and we found CHAR. */
+ kortestd %k2, %k4
+ jnz L(loop_end)
+
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)
+
+ /* Need to re-adjust rdx / rax for L(last_4x_vec). */
+ subq $-(VEC_SIZE * 4), %rdx
+ movq %rdx, %rax
+ subl %edi, %edx
+L(last_4x_vec):
+
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)

- kmovd %k1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_dec)

- /* Remove the trailing bytes. */
- andl %edx, %eax

- testl %eax, %eax
- jnz L(last_vec_x1)
+ vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- /* Check the second last VEC. */
- vpcmpb $0, (%rdi), %YMMMATCH, %k1
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)

- movl %r8d, %ecx
+ /* Used no matter what. */
+ vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0
+ kmovd %k0, %ecx

- kmovd %k1, %eax
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)

- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2 + 1), %rax
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret_1)
+ xorl %eax, %eax
+L(ret_1):
+ ret

- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 6
+L(loop_end):
+ kmovd %k1, %ecx
+ notl %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vptestnmb %VEC(2), %VEC(2), %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ kmovd %k2, %ecx
+ kmovd %k4, %esi
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ addq %rcx, %rax
+ ret
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ addq $(VEC_SIZE), %rax
+L(ret_vec_x1_end):
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * 2)(%rax, %rcx), %rax
 ret
-END (__memrchr_evex)
+
+END(MEMRCHR)
 #endif
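The `lzcnt` trick the commit message refers to works because the scan starts from an end pointer: counting leading zeros of the 32-bit match mask directly gives the distance back from the end, and lzcnt's well-defined result of 32 for a zero mask folds the no-match test into the same bounds compare. Roughly, in C (a sketch assuming -mlzcnt; the names are illustrative, not glibc's):

```c
#include <immintrin.h>
#include <stddef.h>

/* 'mask' has bit i set if byte i of the 32-byte vector ending at 'end'
   matched; 'len' says how many of those bytes are in bounds.  Mirrors
   the L(ret_vec_x0_test) logic: lzcnt gives the distance of the last
   match from 'end', and a zero mask yields 32, which always fails the
   bounds check. */
static const char *last_match (const char *end, unsigned int mask,
                               size_t len)
{
  unsigned int dist = _lzcnt_u32 (mask);   /* 32 when mask == 0 */
  if (len <= dist)
    return NULL;                           /* no match in bounds */
  return end - 1 - dist;                   /* subq %rcx, %rax */
}
```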
645
glibc-upstream-2.34-285.patch
Normal file
@@ -0,0 +1,645 @@
commit b05bd59823bcedee281d3fd5bd4928698ea9d69d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:32 2022 -0700

x86: Optimize memrchr-avx2.S

The new code:
1. prioritizes smaller user-arg lengths more.
2. optimizes target placement more carefully
3. reuses logic more
4. fixes up various inefficiencies in the logic. The biggest
   case here is the `lzcnt` logic for checking returns which
   saves either a branch or multiple instructions.

The total code size saving is: 306 bytes
Geometric Mean of all benchmarks New / Old: 0.760

Regressions:
There are some regressions. Particularly where the length (user arg
length) is large but the position of the match char is near the
beginning of the string (in first VEC). This case has roughly a
10-20% regression.

This is because the new logic gives the hot path for immediate matches
to shorter lengths (the more common input). This case has roughly
a 15-45% speedup.

Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87)
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
index cea2d2a72db7406a..5e9beeeef2677c9f 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S
@@ -2,6 +2,7 @@
# define MEMRCHR __memrchr_avx2_rtm
#endif

+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -21,340 +21,318 @@
# include <sysdep.h>

# ifndef MEMRCHR
-# define MEMRCHR __memrchr_avx2
+# define MEMRCHR __memrchr_avx2
# endif

# ifndef VZEROUPPER
-# define VZEROUPPER vzeroupper
+# define VZEROUPPER vzeroupper
# endif

# ifndef SECTION
# define SECTION(p) p##.avx
# endif

-# define VEC_SIZE 32
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section SECTION(.text), "ax", @progbits
+ENTRY(MEMRCHR)
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
+# endif
+ jz L(zero_0)

- .section SECTION(.text),"ax",@progbits
-ENTRY (MEMRCHR)
- /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
- vpbroadcastb %xmm0, %ymm0
-
- sub $VEC_SIZE, %RDX_LP
- jbe L(last_vec_or_less)
-
- add %RDX_LP, %RDI_LP
-
- /* Check the last VEC_SIZE bytes. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
+ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a
+ correct page cross check and 2) it correctly sets up end ptr to be
+ subtract by lzcnt aligned. */
+ leaq -1(%rdx, %rdi), %rax

- subq $(VEC_SIZE * 4), %rdi
- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(aligned_more)
+ vpbroadcastb %xmm0, %ymm0

- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rdx
- andq $-VEC_SIZE, %rdi
- subq %rcx, %rdx
+ /* Check if we can load 1x VEC without cross a page. */
+ testl $(PAGE_SIZE - VEC_SIZE), %eax
+ jz L(page_cross)
+
+ vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ cmpq $VEC_SIZE, %rdx
+ ja L(more_1x_vec)
+
+L(ret_vec_x0_test):
+ /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which
+ will gurantee edx (len) is less than it. */
+ lzcntl %ecx, %ecx
+
+ /* Hoist vzeroupper (not great for RTM) to save code size. This allows
+ all logic for edx (len) <= VEC_SIZE to fit in first cache line. */
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret

- .p2align 4
-L(aligned_more):
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x0)
-
- /* Align data to 4 * VEC_SIZE for loop with fewer branches.
- There are some overlaps with above if data isn't aligned
- to 4 * VEC_SIZE. */
- movl %edi, %ecx
- andl $(VEC_SIZE * 4 - 1), %ecx
- jz L(loop_4x_vec)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rdx
- andq $-(VEC_SIZE * 4), %rdi
- subq %rcx, %rdx
+ /* Fits in aligning bytes of first cache line. */
+L(zero_0):
+ xorl %eax, %eax
+ ret

- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- subq $(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
-
- vpcmpeqb %ymm1, %ymm0, %ymm1
- vpcmpeqb %ymm2, %ymm0, %ymm2
- vpcmpeqb %ymm3, %ymm0, %ymm3
- vpcmpeqb %ymm4, %ymm0, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jz L(loop_4x_vec)
-
- /* There is a match. */
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x1)
-
- vpmovmskb %ymm1, %eax
- bsrl %eax, %eax
- addq %rdi, %rax
+ .p2align 4,, 9
+L(ret_vec_x0):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN

- .p2align 4
-L(last_4x_vec_or_less):
- addl $(VEC_SIZE * 4), %edx
- cmpl $(VEC_SIZE * 2), %edx
- jbe L(last_2x_vec)
-
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(last_vec_x2)
-
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(last_vec_x1_check)
- cmpl $(VEC_SIZE * 3), %edx
- jbe L(zero)
-
- vpcmpeqb (%rdi), %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 4), %rdx
- addq %rax, %rdx
- jl L(zero)
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
+ .p2align 4,, 10
+L(more_1x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ /* Align rax (string pointer). */
+ andq $-VEC_SIZE, %rax
+
+ /* Recompute remaining length after aligning. */
+ movq %rax, %rdx
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1
+ subq %rdi, %rdx
+ decq %rax
+ vpmovmskb %ymm1, %ecx
+ /* Fall through for short (hotter than length). */
+ cmpq $(VEC_SIZE * 2), %rdx
+ ja L(more_2x_vec)
L(last_2x_vec):
- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(last_vec_x3_check)
cmpl $VEC_SIZE, %edx
- jbe L(zero)
-
- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jz L(zero)
- bsrl %eax, %eax
- subq $(VEC_SIZE * 2), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(last_vec_x0):
- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ jbe L(ret_vec_x0_test)
+
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)
+
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* 64-bit lzcnt. This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret

- .p2align 4
-L(last_vec_x1):
- bsrl %eax, %eax
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN

- .p2align 4
-L(last_vec_x2):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 2), %eax
- addq %rdi, %rax
+ /* Inexpensive place to put this regarding code size / target alignments
+ / ICache NLP. Necessary for 2-byte encoding of jump to page cross
+ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit
+ in first cache line. */
+L(page_cross):
+ movq %rax, %rsi
+ andq $-VEC_SIZE, %rsi
+ vpcmpeqb (%rsi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ /* Shift out negative alignment (because we are starting from endptr and
+ working backwards). */
+ movl %eax, %r8d
+ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */
+ notl %r8d
+ shlxl %r8d, %ecx, %ecx
+ cmpq %rdi, %rsi
+ ja L(more_1x_vec)
+ lzcntl %ecx, %ecx
+ COND_VZEROUPPER
+ cmpl %ecx, %edx
+ jle L(zero_0)
+ subq %rcx, %rax
+ ret
+ .p2align 4,, 11
+L(ret_vec_x1):
+ /* This will naturally add 32 to position. */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(more_2x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0)

- .p2align 4
-L(last_vec_x3):
- bsrl %eax, %eax
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- ret
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1)

- .p2align 4
-L(last_vec_x1_check):
- bsrl %eax, %eax
- subq $(VEC_SIZE * 3), %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $VEC_SIZE, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN

- .p2align 4
-L(last_vec_x3_check):
- bsrl %eax, %eax
- subq $VEC_SIZE, %rdx
- addq %rax, %rdx
- jl L(zero)
- addl $(VEC_SIZE * 3), %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx

- .p2align 4
-L(zero):
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(more_4x_vec)
+
+ cmpl $(VEC_SIZE * -1), %edx
+ jle L(ret_vec_x2_test)
+
+L(last_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)
+
+ /* Needed no matter what. */
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 3), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret

- .p2align 4
-L(null):
+ /* First in aligning bytes. */
+L(zero_2):
xorl %eax, %eax
ret

- .p2align 4
-L(last_vec_or_less_aligned):
- movl %edx, %ecx
+ .p2align 4,, 4
+L(ret_vec_x2_test):
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ ja L(zero_2)
+ ret

- vpcmpeqb (%rdi), %ymm0, %ymm1

- movl $1, %edx
- /* Support rdx << 32. */
- salq %cl, %rdx
- subq $1, %rdx
+ .p2align 4,, 11
+L(ret_vec_x2):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN

- vpmovmskb %ymm1, %eax
+ .p2align 4,, 14
+L(ret_vec_x3):
+ /* ecx must be non-zero. */
+ bsrl %ecx, %ecx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN

- /* Remove the trailing bytes. */
- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)

- bsrl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN

.p2align 4
-L(last_vec_or_less):
- addl $VEC_SIZE, %edx
+L(more_4x_vec):
+ testl %ecx, %ecx
+ jnz L(ret_vec_x2)

- /* Check for zero length. */
- testl %edx, %edx
- jz L(null)
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx

- movl %edi, %ecx
- andl $(VEC_SIZE - 1), %ecx
- jz L(last_vec_or_less_aligned)
+ testl %ecx, %ecx
+ jnz L(ret_vec_x3)

- movl %ecx, %esi
- movl %ecx, %r8d
- addl %edx, %esi
- andq $-VEC_SIZE, %rdi
+ /* Check if near end before re-aligning (otherwise might do an
+ unnecissary loop iteration). */
+ addq $-(VEC_SIZE * 4), %rax
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_4x_vec)

- subl $VEC_SIZE, %esi
- ja L(last_vec_2x_aligned)
+ /* Align rax to (VEC_SIZE - 1). */
+ orq $(VEC_SIZE * 4 - 1), %rax
+ movq %rdi, %rdx
+ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because
+ lengths that overflow can be valid and break the comparison. */
+ orq $(VEC_SIZE * 4 - 1), %rdx

- /* Check the last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-
- /* Remove the leading and trailing bytes. */
- sarl %cl, %eax
- movl %edx, %ecx
+ .p2align 4
+L(loop_4x_vec):
+ /* Need this comparison next no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3
+ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4

- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ vpor %ymm1, %ymm2, %ymm2
+ vpor %ymm3, %ymm4, %ymm4
+ vpor %ymm2, %ymm4, %ymm4
+ vpmovmskb %ymm4, %esi

- andl %edx, %eax
- testl %eax, %eax
- jz L(zero)
+ testl %esi, %esi
+ jnz L(loop_end)

- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
- VZEROUPPER_RETURN
+ addq $(VEC_SIZE * -4), %rax
+ cmpq %rdx, %rax
+ jne L(loop_4x_vec)

- .p2align 4
-L(last_vec_2x_aligned):
- movl %esi, %ecx
+ subl %edi, %edx
+ incl %edx

- /* Check the last VEC. */
- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1
+L(last_4x_vec):
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx

- movl $1, %edx
- sall %cl, %edx
- subl $1, %edx
+ cmpl $(VEC_SIZE * 2), %edx
+ jbe L(last_2x_vec)

- vpmovmskb %ymm1, %eax
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)

- /* Remove the trailing bytes. */
- andl %edx, %eax
+ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)

- testl %eax, %eax
- jnz L(last_vec_x1)
+ /* Used no matter what. */
+ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1
+ vpmovmskb %ymm1, %ecx

- /* Check the second last VEC. */
- vpcmpeqb (%rdi), %ymm0, %ymm1
+ cmpl $(VEC_SIZE * 3), %edx
+ ja L(last_vec)
+
+ lzcntl %ecx, %ecx
+ subq $(VEC_SIZE * 2), %rax
+ COND_VZEROUPPER
+ subq %rcx, %rax
+ cmpq %rax, %rdi
+ jbe L(ret0)
+ xorl %eax, %eax
+L(ret0):
+ ret

- movl %r8d, %ecx

- vpmovmskb %ymm1, %eax
+ .p2align 4
+L(loop_end):
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x0_end)
+
+ vpmovmskb %ymm2, %ecx
+ testl %ecx, %ecx
+ jnz L(ret_vec_x1_end)
+
+ vpmovmskb %ymm3, %ecx
+ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3)
+ then it won't affect the result in esi (VEC4). If ecx is non-zero
+ then CHAR in VEC3 and bsrq will use that position. */
+ salq $32, %rcx
+ orq %rsi, %rcx
+ bsrq %rcx, %rcx
+ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax
+ VZEROUPPER_RETURN

- /* Remove the leading bytes. Must use unsigned right shift for
- bsrl below. */
- shrl %cl, %eax
- testl %eax, %eax
- jz L(zero)
+ .p2align 4,, 4
+L(ret_vec_x1_end):
+ /* 64-bit version will automatically add 32 (VEC_SIZE). */
+ lzcntq %rcx, %rcx
+ subq %rcx, %rax
+ VZEROUPPER_RETURN

- bsrl %eax, %eax
- addq %rdi, %rax
- addq %r8, %rax
+ .p2align 4,, 4
+L(ret_vec_x0_end):
+ lzcntl %ecx, %ecx
+ subq %rcx, %rax
VZEROUPPER_RETURN
-END (MEMRCHR)
+
+ /* 2 bytes until next cache line. */
+END(MEMRCHR)
#endif
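
The return-value computation that replaces the old bsrl/add sequences is worth spelling out. A minimal C sketch of the idea, assuming a 32-byte vector (illustrative helper, not glibc code; __builtin_clz models lzcnt and is only defined for nonzero input, hence the guard):

    #include <stdint.h>

    /* Sketch: MASK bit k corresponds to the byte at END - (31 - k), where
       END points at the last byte of the window (s + len - 1).  The last
       match is END minus the count of leading zeros -- the `lzcntl';
       `subq %rcx, %rax' pair in the assembly above.  */
    static const unsigned char *
    last_match_in_vec (const unsigned char *end, uint32_t mask)
    {
      if (mask == 0)
        return 0;   /* lzcnt of 0 is 32 (VEC_SIZE) in the asm version.  */
      return end - __builtin_clz (mask);
    }

The zero-mask case also explains the short-length fast path: lzcnt returning VEC_SIZE makes the `cmpl %ecx, %edx; jle L(zero_0)` pair send any length up to VEC_SIZE to the NULL return without a separate branch.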
323
glibc-upstream-2.34-286.patch
Normal file
@@ -0,0 +1,323 @@
commit a910d7e164f1d9b8e77bbea35a2d2ab89a5e26cc
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:33 2022 -0700

x86: Shrink code size of memchr-avx2.S

This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.

The total code size saving is: 59 bytes

There are no major changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 0.967

Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35)

x86: Fix page cross case in rawmemchr-avx2 [BZ #29234]

commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:33 2022 -0700

x86: Shrink code size of memchr-avx2.S

Changed how the page cross case aligned string (rdi) in
rawmemchr. This was incompatible with how
`L(cross_page_continue)` expected the pointer to be aligned and
would cause rawmemchr to read data starting before the
beginning of the string. What it would read was in valid memory
but could count CHAR matches resulting in an incorrect return
value.

This commit fixes that issue by essentially reverting the changes to
the L(page_cross) case as they didn't really matter.

Test cases added and all pass with the new code (and were confirmed
to fail with the old code).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 2c9af8421d2b4a7fcce163e7bc81a118d22fd346)

diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c
index 085098aba8fdbc13..327c0654e69e7669 100644
--- a/string/test-rawmemchr.c
+++ b/string/test-rawmemchr.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */

#include <assert.h>
+#include <support/xunistd.h>

#define TEST_MAIN
#define TEST_NAME "rawmemchr"
@@ -51,13 +52,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res)
}
}

+static void
+do_test_bz29234 (void)
+{
+ size_t i, j;
+ char *ptr_start;
+ char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE,
+ MAP_PRIVATE | MAP_ANONYMOUS, -1);
+
+ memset (buf, -1, 8192);
+
+ ptr_start = buf + 4096 - 8;
+
+ /* Out of range matches before the start of a page. */
+ memset (ptr_start - 8, 0x1, 8);
+
+ for (j = 0; j < 8; ++j)
+ {
+ for (i = 0; i < 128; ++i)
+ {
+ ptr_start[i + j] = 0x1;
+
+ FOR_EACH_IMPL (impl, 0)
+ do_one_test (impl, (char *) (ptr_start + j), 0x1,
+ ptr_start + i + j);
+
+ ptr_start[i + j] = 0xff;
+ }
+ }
+
+ xmunmap (buf, 8192);
+}
+
static void
do_test (size_t align, size_t pos, size_t len, int seek_char)
{
size_t i;
char *result;

- align &= 7;
+ align &= getpagesize () - 1;
if (align + len >= page_size)
return;

@@ -115,6 +148,13 @@ do_random_tests (void)
}
}

+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
+
assert (pos < len);
size_t r = random ();
if ((r & 31) == 0)
@@ -130,6 +170,13 @@ do_random_tests (void)
result, p);
ret = 1;
}
+
+ if (align)
+ {
+ p[align - 1] = seek_char;
+ if (align > 4)
+ p[align - 4] = seek_char;
+ }
}
}

@@ -151,14 +198,22 @@ test_main (void)
do_test (i, 64, 256, 23);
do_test (0, 16 << i, 2048, 0);
do_test (i, 64, 256, 0);
+
+ do_test (getpagesize () - i, 64, 256, 23);
+ do_test (getpagesize () - i, 64, 256, 0);
}
for (i = 1; i < 32; ++i)
{
do_test (0, i, i + 1, 23);
do_test (0, i, i + 1, 0);
+
+ do_test (getpagesize () - 7, i, i + 1, 23);
+ do_test (getpagesize () - i / 2, i, i + 1, 23);
+ do_test (getpagesize () - i, i, i + 1, 23);
}

do_random_tests ();
+ do_test_bz29234 ();
return ret;
}

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
index 87b076c7c403ba85..c4d71938c5a3ed24 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S
@@ -2,6 +2,7 @@
# define MEMCHR __memchr_avx2_rtm
#endif

+#define COND_VZEROUPPER COND_VZEROUPPER_XTEST
#define ZERO_UPPER_VEC_REGISTERS_RETURN \
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST

diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index afdb95650232fdac..9e0b7dd1f4fe9909 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -57,7 +57,7 @@
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)

.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 5)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
# ifdef __ILP32__
@@ -87,12 +87,14 @@ ENTRY (MEMCHR)
# endif
testl %eax, %eax
jz L(aligned_more)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
addq %rdi, %rax
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+

# ifndef USE_AS_RAWMEMCHR
- .p2align 5
+ .p2align 4
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
@@ -100,58 +102,31 @@ L(first_vec_x0):
/* NB: Multiply length by 4 to get byte count. */
sall $2, %edx
# endif
- xorl %ecx, %ecx
+ COND_VZEROUPPER
+ /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch
+ block. branch here as opposed to cmovcc is not that costly. Common
+ usage of memchr is to check if the return was NULL (if string was
+ known to contain CHAR user would use rawmemchr). This branch will be
+ highly correlated with the user branch and can be used by most
+ modern branch predictors to predict the user branch. */
cmpl %eax, %edx
- leaq (%rdi, %rax), %rax
- cmovle %rcx, %rax
- VZEROUPPER_RETURN
-
-L(null):
- xorl %eax, %eax
- ret
-# endif
- .p2align 4
-L(cross_page_boundary):
- /* Save pointer before aligning as its original value is
- necessary for computer return address if byte is found or
- adjusting length if it is not and this is memchr. */
- movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
- and rdi for rawmemchr. */
- orq $(VEC_SIZE - 1), %ALGN_PTR_REG
- VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate length until end of page (length checked for a
- match). */
- leaq 1(%ALGN_PTR_REG), %rsi
- subq %RRAW_PTR_REG, %rsi
-# ifdef USE_AS_WMEMCHR
- /* NB: Divide bytes by 4 to get wchar_t count. */
- shrl $2, %esi
-# endif
-# endif
- /* Remove the leading bytes. */
- sarxl %ERAW_PTR_REG, %eax, %eax
-# ifndef USE_AS_RAWMEMCHR
- /* Check the end of data. */
- cmpq %rsi, %rdx
- jbe L(first_vec_x0)
+ jle L(null)
+ addq %rdi, %rax
+ ret
# endif
- testl %eax, %eax
- jz L(cross_page_continue)
- tzcntl %eax, %eax
- addq %RRAW_PTR_REG, %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN

- .p2align 4
+ .p2align 4,, 10
L(first_vec_x1):
- tzcntl %eax, %eax
+ bsfl %eax, %eax
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
-
+# ifndef USE_AS_RAWMEMCHR
+ /* First in aligning bytes here. */
+L(null):
+ xorl %eax, %eax
+ ret
+# endif
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
@@ -340,7 +315,7 @@ L(first_vec_x1_check):
incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
- .p2align 4
+ .p2align 4,, 6
L(set_zero_end):
xorl %eax, %eax
VZEROUPPER_RETURN
@@ -428,5 +403,39 @@ L(last_vec_x3):
VZEROUPPER_RETURN
# endif

+ .p2align 4
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary for
+ computer return address if byte is found or adjusting length if it
+ is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
+# endif
+ /* Remove the leading bytes. */
+ sarxl %ERAW_PTR_REG, %eax, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Check the end of data. */
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ bsfl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
+ VZEROUPPER_RETURN
+
+
END (MEMCHR)
#endif
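
The page-cross pattern at the heart of both the bug and the fix: align the pointer down so one vector load cannot touch the next page, then discard mask bits for bytes that precede the real start of the buffer. A 16-byte SSE2 sketch of that pattern (hypothetical helper, not the glibc routine, which does this with AVX2 in assembly):

    #include <emmintrin.h>
    #include <stdint.h>

    /* Check the first, possibly page-crossing, vector for byte C.  The
       aligned load stays within the page; the shift removes match bits
       for the MISALIGN bytes that sit before S.  */
    static const char *
    first_vec_check (const char *s, int c)
    {
      uintptr_t misalign = (uintptr_t) s & 15;
      const __m128i *aligned = (const __m128i *) ((uintptr_t) s - misalign);
      __m128i eq = _mm_cmpeq_epi8 (_mm_load_si128 (aligned),
                                   _mm_set1_epi8 ((char) c));
      unsigned int mask = (unsigned int) _mm_movemask_epi8 (eq) >> misalign;
      if (mask != 0)
        return s + __builtin_ctz (mask);
      return 0;   /* No match yet; continue from the next aligned vector.  */
    }

The BZ #29234 regression was precisely a mismatch between this alignment step and what the continuation label assumed about the pointer afterwards.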
130
glibc-upstream-2.34-287.patch
Normal file
@@ -0,0 +1,130 @@
commit 3c87383a20daff9a230439e31b778716bfed4d8b
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jun 6 21:11:34 2022 -0700

x86: Shrink code size of memchr-evex.S

This is not meant as a performance optimization. The previous code was
far too liberal in aligning targets and wasted code size unnecessarily.

The total code size saving is: 64 bytes

There are no non-negligible changes in the benchmarks.
Geometric Mean of all benchmarks New / Old: 1.000

Full xcheck passes on x86_64.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62)

diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 4d0ed6d136f099e1..68381c99a4948134 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -88,7 +88,7 @@
# define PAGE_SIZE 4096

.section SECTION(.text),"ax",@progbits
-ENTRY (MEMCHR)
+ENTRY_P2ALIGN (MEMCHR, 6)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
test %RDX_LP, %RDX_LP
@@ -131,22 +131,24 @@ L(zero):
xorl %eax, %eax
ret

- .p2align 5
+ .p2align 4
L(first_vec_x0):
- /* Check if first match was before length. */
- tzcntl %eax, %eax
- xorl %ecx, %ecx
- cmpl %eax, %edx
- leaq (%rdi, %rax, CHAR_SIZE), %rax
- cmovle %rcx, %rax
+ /* Check if first match was before length. NB: tzcnt has false data-
+ dependency on destination. eax already had a data-dependency on esi
+ so this should have no affect here. */
+ tzcntl %eax, %esi
+# ifdef USE_AS_WMEMCHR
+ leaq (%rdi, %rsi, CHAR_SIZE), %rdi
+# else
+ addq %rsi, %rdi
+# endif
+ xorl %eax, %eax
+ cmpl %esi, %edx
+ cmovg %rdi, %rax
ret
-# else
- /* NB: first_vec_x0 is 17 bytes which will leave
- cross_page_boundary (which is relatively cold) close enough
- to ideal alignment. So only realign L(cross_page_boundary) if
- rawmemchr. */
- .p2align 4
# endif
+
+ .p2align 4
L(cross_page_boundary):
/* Save pointer before aligning as its original value is
necessary for computer return address if byte is found or
@@ -400,10 +402,14 @@ L(last_2x_vec):
L(zero_end):
ret

+L(set_zero_end):
+ xorl %eax, %eax
+ ret

.p2align 4
L(first_vec_x1_check):
- tzcntl %eax, %eax
+ /* eax must be non-zero. Use bsfl to save code size. */
+ bsfl %eax, %eax
/* Adjust length. */
subl $-(CHAR_PER_VEC * 4), %edx
/* Check if match within remaining length. */
@@ -412,9 +418,6 @@ L(first_vec_x1_check):
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
ret
-L(set_zero_end):
- xorl %eax, %eax
- ret

.p2align 4
L(loop_4x_vec_end):
@@ -464,7 +467,7 @@ L(loop_4x_vec_end):
# endif
ret

- .p2align 4
+ .p2align 4,, 10
L(last_vec_x1_return):
tzcntl %eax, %eax
# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
@@ -496,6 +499,7 @@ L(last_vec_x3_return):
# endif

# ifndef USE_AS_RAWMEMCHR
+ .p2align 4,, 5
L(last_4x_vec_or_less_cmpeq):
VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
kmovd %k0, %eax
@@ -546,7 +550,7 @@ L(last_4x_vec):
# endif
andl %ecx, %eax
jz L(zero_end2)
- tzcntl %eax, %eax
+ bsfl %eax, %eax
leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
L(zero_end2):
ret
@@ -562,6 +566,6 @@ L(last_vec_x3):
leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
# endif
-
+ /* 7 bytes from next cache line. */
END (MEMCHR)
#endif
33
glibc-upstream-2.34-288.patch
Normal file
@@ -0,0 +1,33 @@
commit 820504e3edd7276bf869d543ad5b57187ff9c9b6
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 3 18:52:37 2022 -0500

x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions

Give fall-through path to `vzeroupper` and taken-path to `vzeroall`.

Generally even on machines with RTM the expectation is the
string-library functions will not be called in transactions.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>

(cherry picked from commit c28db9cb29a7d6cf3ce08fd8445e6b7dea03f35b)

diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h
index 93e44be22e2275f1..04478b097cdffe20 100644
--- a/sysdeps/x86_64/sysdep.h
+++ b/sysdeps/x86_64/sysdep.h
@@ -99,11 +99,11 @@ lose: \
to avoid RTM abort triggered by VZEROUPPER inside transactionally. */
#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \
xtest; \
- jz 1f; \
- vzeroall; \
+ jnz 1f; \
+ vzeroupper; \
ret; \
1: \
- vzeroupper; \
+ vzeroall; \
ret

/* Can be used to replace vzeroupper that is not directly before a
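
In intrinsics terms the macro now reads as follows: `_xtest()` reports whether the CPU is executing inside an RTM transaction, and only the rare transactional path pays for `vzeroall`. An illustrative sketch (compile with -mrtm -mavx; the helper name is made up):

    #include <immintrin.h>

    /* Clear upper YMM state before returning from a string function.
       Outside a transaction (the common case, now the fall-through path)
       vzeroupper is the cheap choice; inside one it would abort the
       transaction, so vzeroall is used instead.  */
    static inline void
    clear_upper_vec_state (void)
    {
      if (_xtest ())
        _mm256_zeroall ();
      else
        _mm256_zeroupper ();
    }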
41
glibc-upstream-2.34-289.patch
Normal file
@@ -0,0 +1,41 @@
commit fc54e1fae854e6ee6361cd4ddf900c36fce8158e
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 8 21:16:51 2022 -0700

x86: Align varshift table to 32-bytes

This ensures the load will never split a cache line.

(cherry picked from commit 0f91811333f23b61cf681cab2704b35a0a073b97)

diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c
index 45267b0a6823459a..1f563542666bc4f1 100644
--- a/sysdeps/x86_64/multiarch/varshift.c
+++ b/sysdeps/x86_64/multiarch/varshift.c
@@ -16,9 +16,10 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

-#include "varshift.h"
+#include <stdint.h>

-const int8_t ___m128i_shift_right[31] attribute_hidden =
+const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__((aligned(32))) =
{
0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h
index 32f2173dd2a95b3a..745d48fa7c775136 100644
--- a/sysdeps/x86_64/multiarch/varshift.h
+++ b/sysdeps/x86_64/multiarch/varshift.h
@@ -19,7 +19,8 @@
#include <stdint.h>
#include <tmmintrin.h>

-extern const int8_t ___m128i_shift_right[31] attribute_hidden;
+extern const int8_t ___m128i_shift_right[31] attribute_hidden
+ __attribute__ ((aligned (32)));

static __inline__ __m128i
__m128i_shift_right (__m128i value, unsigned long int offset)
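
Why 32-byte alignment suffices for a 31-byte table: a 32-byte-aligned object of at most 32 bytes occupies either the first or the second half of a 64-byte cache line, so no load from it can span two lines. The declaration pattern, shown on a hypothetical table of the same shape:

    #include <stdint.h>

    /* 32-byte alignment pins the 31-byte table inside one half of a
       64-byte cache line, so a 16-byte load from table + k (0 <= k <= 15)
       never splits a line.  */
    static const int8_t shift_table[31] __attribute__ ((aligned (32))) =
    {
      0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
      -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };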
56
glibc-upstream-2.34-290.patch
Normal file
@@ -0,0 +1,56 @@
commit 6e008c884dad5a25f91085c68d044bb5e2d63761
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Jun 14 13:50:11 2022 -0700

x86: Fix misordered logic for setting `rep_movsb_stop_threshold`

Move the setting of `rep_movsb_stop_threshold` to after the tunables
have been collected so that the `rep_movsb_stop_threshold` (which
is used to redirect control flow to the non_temporal case) will
use any user value for `non_temporal_threshold` (set using
glibc.cpu.x86_non_temporal_threshold).

(cherry picked from commit 035591551400cfc810b07244a015c9411e8bff7c)

diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 2e43e67e4f4037d3..560bf260e8fbd7bf 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
rep_movsb_threshold = 2112;

- unsigned long int rep_movsb_stop_threshold;
- /* ERMS feature is implemented from AMD Zen3 architecture and it is
- performing poorly for data above L2 cache size. Henceforth, adding
- an upper bound threshold parameter to limit the usage of Enhanced
- REP MOVSB operations and setting its value to L2 cache size. */
- if (cpu_features->basic.kind == arch_kind_amd)
- rep_movsb_stop_threshold = core;
- /* Setting the upper bound of ERMS to the computed value of
- non-temporal threshold for architectures other than AMD. */
- else
- rep_movsb_stop_threshold = non_temporal_threshold;
-
/* The default threshold to use Enhanced REP STOSB. */
unsigned long int rep_stosb_threshold = 2048;

@@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)
SIZE_MAX);
#endif

+ unsigned long int rep_movsb_stop_threshold;
+ /* ERMS feature is implemented from AMD Zen3 architecture and it is
+ performing poorly for data above L2 cache size. Henceforth, adding
+ an upper bound threshold parameter to limit the usage of Enhanced
+ REP MOVSB operations and setting its value to L2 cache size. */
+ if (cpu_features->basic.kind == arch_kind_amd)
+ rep_movsb_stop_threshold = core;
+ /* Setting the upper bound of ERMS to the computed value of
+ non-temporal threshold for architectures other than AMD. */
+ else
+ rep_movsb_stop_threshold = non_temporal_threshold;
+
cpu_features->data_cache_size = data;
cpu_features->shared_cache_size = shared;
cpu_features->non_temporal_threshold = non_temporal_threshold;
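
The bug class here is a plain ordering hazard: a value derived from `non_temporal_threshold` was snapshotted before the tunable override was applied, so the override never propagated. A compact C model of the before/after (all names illustrative; `user_override` stands for glibc.cpu.x86_non_temporal_threshold, with zero meaning "not set"):

    #include <stddef.h>

    static size_t
    final_stop_threshold (size_t computed, size_t user_override)
    {
      /* Buggy order: `stop = computed;' taken before the tunable was
         read, so the user value was ignored by the stop threshold.
         Fixed order (what the patch does): read the tunable first,
         then derive everything from the final value.  */
      size_t non_temporal_threshold
        = user_override != 0 ? user_override : computed;
      return non_temporal_threshold;  /* stop threshold now tracks it.  */
    }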
38
glibc-upstream-2.34-291.patch
Normal file
@@ -0,0 +1,38 @@
commit 9d50e162eef88e1f870a941b0a973060e984e7ca
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue Jun 14 15:37:28 2022 -0700

x86: Add sse42 implementation to strcmp's ifunc

This has been missing since the ifuncs were added.

The performance of SSE4.2 is preferable to SSE2.

Measured on Tigerlake with N = 20 runs.
Geometric Mean of all benchmarks SSE4.2 / SSE2: 0.906

(cherry picked from commit ff439c47173565fbff4f0f78d07b0f14e4a7db05)

diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
index 7c2901bf44456259..b457fb4c150e4407 100644
--- a/sysdeps/x86_64/multiarch/strcmp.c
+++ b/sysdeps/x86_64/multiarch/strcmp.c
@@ -29,6 +29,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
@@ -53,6 +54,10 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2);
}

+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
+ return OPTIMIZE (sse42);
+
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
return OPTIMIZE (sse2_unaligned);

54
glibc-upstream-2.34-292.patch
Normal file
@@ -0,0 +1,54 @@
commit 94b0dc9419bd038cb85b364a7556569386c31741
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 15 10:41:29 2022 -0700

x86: Add bounds `x86_non_temporal_threshold`

The lower-bound (16448) and upper-bound (SIZE_MAX / 16) are assumed
by memmove-vec-unaligned-erms.

The lower-bound is needed because memmove-vec-unaligned-erms unrolls
the loop aggressively in the L(large_memset_4x) case.

The upper-bound is needed because memmove-vec-unaligned-erms
right-shifts the value of `x86_non_temporal_threshold` by
LOG_4X_MEMCPY_THRESH (4) which without a bound may overflow.

The lack of lower-bound can be a correctness issue. The lack of
upper-bound cannot.

(cherry picked from commit b446822b6ae4e8149902a78cdd4a886634ad6321)

diff --git a/manual/tunables.texi b/manual/tunables.texi
index 28ff502990c2a10f..5ab3212f34e3dc37 100644
--- a/manual/tunables.texi
+++ b/manual/tunables.texi
@@ -47,7 +47,7 @@ glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.elision.skip_lock_busy: 3 (min: -2147483648, max: 2147483647)
glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0xffffffffffffffff)
glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff)
-glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x0, max: 0xffffffffffffffff)
+glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0x0fffffffffffffff)
glibc.cpu.x86_shstk:
glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff)
glibc.malloc.mmap_max: 0 (min: -2147483648, max: 2147483647)
diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h
index 560bf260e8fbd7bf..8f85f70858413ebe 100644
--- a/sysdeps/x86/dl-cacheinfo.h
+++ b/sysdeps/x86/dl-cacheinfo.h
@@ -931,8 +931,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features)

TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX);
+ /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of
+ 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best
+ if that operation cannot overflow. Minimum of 0x4040 (16448) because the
+ L(large_memset_4x) loops need 64-byte to cache align and enough space for
+ at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are
+ reflected in the manual. */
TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold,
- 0, SIZE_MAX);
+ 0x4040, SIZE_MAX >> 4);
TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold,
minimum_rep_movsb_threshold, SIZE_MAX);
TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,
88
glibc-upstream-2.34-293.patch
Normal file
@@ -0,0 +1,88 @@
commit ba1c3f23d9ba63c38333116eec6043c471c378c4
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 15 10:41:28 2022 -0700

x86: Cleanup bounds checking in large memcpy case

1. Fix incorrect lower-bound threshold in L(large_memcpy_2x).
Previously was using `__x86_rep_movsb_threshold` and should
have been using `__x86_shared_non_temporal_threshold`.

2. Avoid reloading __x86_shared_non_temporal_threshold before
the L(large_memcpy_4x) bounds check.

3. Document the second bounds check for L(large_memcpy_4x)
more clearly.

(cherry picked from commit 89a25c6f64746732b87eaf433af0964b564d4a92)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 7b27cbdda5fb99f7..618d46d8ce28828c 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -118,7 +118,13 @@
# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
#endif

-/* Amount to shift rdx by to compare for memcpy_large_4x. */
+/* Amount to shift __x86_shared_non_temporal_threshold by for
+ bound for memcpy_large_4x. This is essentially use to to
+ indicate that the copy is far beyond the scope of L3
+ (assuming no user config x86_non_temporal_threshold) and to
+ use a more aggressively unrolled loop. NB: before
+ increasing the value also update initialization of
+ x86_non_temporal_threshold. */
#ifndef LOG_4X_MEMCPY_THRESH
# define LOG_4X_MEMCPY_THRESH 4
#endif
@@ -724,9 +730,14 @@ L(skip_short_movsb_check):
.p2align 4,, 10
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
L(large_memcpy_2x_check):
- cmp __x86_rep_movsb_threshold(%rip), %RDX_LP
- jb L(more_8x_vec_check)
+ /* Entry from L(large_memcpy_2x) has a redundant load of
+ __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x)
+ is only use for the non-erms memmove which is generally less
+ common. */
L(large_memcpy_2x):
+ mov __x86_shared_non_temporal_threshold(%rip), %R11_LP
+ cmp %R11_LP, %RDX_LP
+ jb L(more_8x_vec_check)
/* To reach this point it is impossible for dst > src and
overlap. Remaining to check is src > dst and overlap. rcx
already contains dst - src. Negate rcx to get src - dst. If
@@ -774,18 +785,21 @@ L(large_memcpy_2x):
/* ecx contains -(dst - src). not ecx will return dst - src - 1
which works for testing aliasing. */
notl %ecx
+ movq %rdx, %r10
testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
jz L(large_memcpy_4x)

- movq %rdx, %r10
- shrq $LOG_4X_MEMCPY_THRESH, %r10
- cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ /* r11 has __x86_shared_non_temporal_threshold. Shift it left
+ by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold.
+ */
+ shlq $LOG_4X_MEMCPY_THRESH, %r11
+ cmp %r11, %rdx
jae L(large_memcpy_4x)

/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 2 - 1), %edx
/* r10 stores outer loop counter. */
- shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+ shrq $(LOG_PAGE_SIZE + 1), %r10
/* Copy 4x VEC at a time from 2 pages. */
.p2align 4
L(loop_large_memcpy_2x_outer):
@@ -850,7 +864,6 @@ L(large_memcpy_2x_end):

.p2align 4
L(large_memcpy_4x):
- movq %rdx, %r10
/* edx will store remainder size for copying tail. */
andl $(PAGE_SIZE * 4 - 1), %edx
/* r10 stores outer loop counter. */
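
Taken together with the previous patch's bounds, the fixed dispatch can be modelled in C: one load of the threshold gates the 2x-page path, and the same value shifted left (safe thanks to the SIZE_MAX >> 4 cap) gates the 4x-page path. A rough model with illustrative names, reconstructed from the assembly above:

    #include <stddef.h>
    #include <stdbool.h>

    #define LOG_4X_MEMCPY_THRESH 4

    /* 0 = regular path, 2 = L(large_memcpy_2x), 4 = L(large_memcpy_4x).
       page_aliased models the (dst - src) page-offset test in the asm.  */
    static int
    pick_large_copy_path (size_t len, size_t non_temporal_threshold,
                          bool page_aliased)
    {
      if (len < non_temporal_threshold)
        return 0;
      if (page_aliased
          || len >= (non_temporal_threshold << LOG_4X_MEMCPY_THRESH))
        return 4;
      return 2;
    }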
27
glibc-upstream-2.34-294.patch
Normal file
@@ -0,0 +1,27 @@
commit c51d8d383cfb92142b86d8d1822159f3bea10d16
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu Jun 16 15:01:08 2022 -0700

x86: Add BMI1/BMI2 checks for ISA_V3 check

BMI1/BMI2 are part of the ISA V3 requirements:
https://en.wikipedia.org/wiki/X86-64

They are defined by GCC when building with `-march=x86-64-v3`.

(cherry picked from commit 8da9f346cb2051844348785b8a932ec44489e0b7)

diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c
index 49ef4aa6122072cf..07815381122c94c3 100644
--- a/sysdeps/x86/isa-level.c
+++ b/sysdeps/x86/isa-level.c
@@ -47,7 +47,8 @@
# endif

# if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \
- && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE
+ && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \
+ && defined __BMI__ && defined __BMI2__
/* NB: ISAs in x86-64 ISA level v3 are used. */
# define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3
# else
28
glibc-upstream-2.34-295.patch
Normal file
@@ -0,0 +1,28 @@
commit d201c59177b98946d7f80145e7b4d02991d04805
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 24 09:42:12 2022 -0700

x86: Align entry for memrchr to 64-bytes.

The function was tuned around 64-byte entry alignment and performs
better for all sizes with it.

As well, the different code paths were explicitly written to touch the
minimum number of cache lines, i.e. sizes <= 32 touch only the entry
cache line.

(cherry picked from commit 227afaa67213efcdce6a870ef5086200f1076438)

diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index 5f8e0be18cfe4fad..edd8180ba1ede9a5 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -35,7 +35,7 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
.section SECTION(.text), "ax", @progbits
-ENTRY(MEMRCHR)
+ENTRY_P2ALIGN(MEMRCHR, 6)
# ifdef __ILP32__
/* Clear upper bits. */
and %RDX_LP, %RDX_LP
56
glibc-upstream-2.34-296.patch
Normal file
56
glibc-upstream-2.34-296.patch
Normal file
@ -0,0 +1,56 @@
commit aadd0a1c7c89d016e1186c81c0efcafa36bf84fc
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri Jun 24 09:42:15 2022 -0700

x86: Put wcs{n}len-sse4.1 in the sse4.1 text section

This was previously missing, but the two implementations shouldn't end
up in the sse2 (generic) text section.

(cherry picked from commit afc6e4328ff80973bde50d5401691b4c4b2e522c)

diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 031753a91763b351..762f4755020c35f9 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -28,6 +28,10 @@
# define SHIFT_RETURN
#endif

+#ifndef SECTION
+# define SECTION(p) p
+#endif
+
/* Long lived register in strlen(s), strnlen(s, n) are:

%xmm3 - zero
@@ -37,7 +41,7 @@
*/


-.text
+ .section SECTION(.text),"ax",@progbits
ENTRY(strlen)

/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
index 7e62621afc729492..e306a77f51e650d1 100644
--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -1,4 +1,5 @@
#define AS_WCSLEN
#define strlen __wcslen_sse4_1
+#define SECTION(p) p##.sse4.1

#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index 5fa51fe07cbbdf5c..d2f7dd6e2254736c 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -1,5 +1,6 @@
#define AS_WCSLEN
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
+#define SECTION(p) p##.sse4.1

#include "strlen-vec.S"
25
glibc-upstream-2.34-297.patch
Normal file
@@ -0,0 +1,25 @@
commit f4598f0351559f1a4176d7ce0154423d98bcfb0d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 16:07:04 2022 -0700

x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list

This was simply missing and meant we weren't testing it properly.

(cherry picked from commit 2a1099020cdc1e4c9c928156aa85c8cf9d540291)

diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 043821278fdb6d8f..8d649e263eb24b8a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -1032,6 +1032,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __wmemset_chk_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
124
glibc-upstream-2.34-298.patch
Normal file
@@ -0,0 +1,124 @@
commit 7079931c51547854323fe2ed6fdccf2a1b8b04d7
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 16:07:05 2022 -0700

x86: Move and slightly improve memset_erms

Implementation wise:
1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not
use the L(stosb) label that was previously defined.

2. Don't give the hotpath (fallthrough) to zero size.

Code positioning wise:

Move memset_{chk}_erms to its own file. Leaving it in between the
memset_{impl}_unaligned both adds unnecessary complexity to the
file and wastes space in a relatively hot cache section.

(cherry picked from commit 4a3f29e7e475dd4e7cce2a24c187e6fb7b5b0a05)

diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 0e39e63ef6be6a86..da9f16286a763556 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -29,6 +29,7 @@ sysdep_routines += \
memset-avx2-unaligned-erms-rtm \
memset-avx512-no-vzeroupper \
memset-avx512-unaligned-erms \
+ memset-erms \
memset-evex-unaligned-erms \
memset-sse2-unaligned-erms \
rawmemchr-avx2 \
diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S
new file mode 100644
index 0000000000000000..e83cccc731f0a7ea
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-erms.S
@@ -0,0 +1,44 @@
+/* memset implement with rep stosb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__memset_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memset_chk_erms)
+
+/* Only used to measure performance of REP STOSB. */
+ENTRY (__memset_erms)
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz L(stosb_return_zero)
+ mov %RDX_LP, %RCX_LP
+ movzbl %sil, %eax
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ ret
+L(stosb_return_zero):
+ movq %rdi, %rax
+ ret
+END (__memset_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index abc12d9cda1b3843..905d0fa4643d5768 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -156,37 +156,6 @@ L(entry_from_wmemset):
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMSET_SYMBOL (__memset, unaligned))

-# if VEC_SIZE == 16
-ENTRY (__memset_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memset_chk_erms)
-
-/* Only used to measure performance of REP STOSB. */
-ENTRY (__memset_erms)
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jnz L(stosb)
- movq %rdi, %rax
- ret
-# else
-/* Provide a hidden symbol to debugger. */
- .hidden MEMSET_SYMBOL (__memset, erms)
-ENTRY (MEMSET_SYMBOL (__memset, erms))
-# endif
-L(stosb):
- mov %RDX_LP, %RCX_LP
- movzbl %sil, %eax
- mov %RDI_LP, %RDX_LP
- rep stosb
- mov %RDX_LP, %RAX_LP
- VZEROUPPER_RETURN
-# if VEC_SIZE == 16
-END (__memset_erms)
-# else
-END (MEMSET_SYMBOL (__memset, erms))
-# endif
-
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
163
glibc-upstream-2.34-299.patch
Normal file
@@ -0,0 +1,163 @@
|
||||
commit 35f9c72c8bd7bc30deb412e966e2f548241b15d2
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed Jun 29 16:07:15 2022 -0700
|
||||
|
||||
x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
|
||||
|
||||
The primary memmove_{impl}_unaligned_erms implementations don't
|
||||
interact with this function. Putting them in same file both
|
||||
wastes space and unnecessarily bloats a hot code section.
|
||||
|
||||
(cherry picked from commit 21925f64730d52eb7d8b2fb62b412f8ab92b0caf)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index da9f16286a763556..b9ea5b60c2be1b0a 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -17,6 +17,7 @@ sysdep_routines += \
memmove-avx-unaligned-erms-rtm \
memmove-avx512-no-vzeroupper \
memmove-avx512-unaligned-erms \
+ memmove-erms \
memmove-evex-unaligned-erms \
memmove-sse2-unaligned-erms \
memmove-ssse3 \
diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S
new file mode 100644
index 0000000000000000..2d3a6ccb76d77052
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-erms.S
@@ -0,0 +1,72 @@
+/* memcpy/mempcpy/memmove implement with rep movsb
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+
+#include <sysdep.h>
+
+#if defined USE_MULTIARCH && IS_IN (libc)
+ .text
+ENTRY (__mempcpy_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__mempcpy_chk_erms)
+
+/* Only used to measure performance of REP MOVSB. */
+ENTRY (__mempcpy_erms)
+ mov %RDI_LP, %RAX_LP
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+ add %RDX_LP, %RAX_LP
+ jmp L(start_movsb)
+END (__mempcpy_erms)
+
+ENTRY (__memmove_chk_erms)
+ cmp %RDX_LP, %RCX_LP
+ jb HIDDEN_JUMPTARGET (__chk_fail)
+END (__memmove_chk_erms)
+
+ENTRY (__memmove_erms)
+ movq %rdi, %rax
+ /* Skip zero length. */
+ test %RDX_LP, %RDX_LP
+ jz 2f
+L(start_movsb):
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
+ jb 1f
+ /* Source == destination is less common. */
+ je 2f
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
+ jb L(movsb_backward)
+1:
+ rep movsb
+2:
+ ret
+L(movsb_backward):
+ leaq -1(%rdi,%rcx), %rdi
+ leaq -1(%rsi,%rcx), %rsi
+ std
+ rep movsb
+ cld
+ ret
+END (__memmove_erms)
+strong_alias (__memmove_erms, __memcpy_erms)
+strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 618d46d8ce28828c..93c7e6883a254434 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -239,56 +239,6 @@ L(start):
#endif
#if defined USE_MULTIARCH && IS_IN (libc)
END (MEMMOVE_SYMBOL (__memmove, unaligned))
-# if VEC_SIZE == 16
-ENTRY (__mempcpy_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__mempcpy_chk_erms)
-
-/* Only used to measure performance of REP MOVSB. */
-ENTRY (__mempcpy_erms)
- mov %RDI_LP, %RAX_LP
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
- add %RDX_LP, %RAX_LP
- jmp L(start_movsb)
-END (__mempcpy_erms)
-
-ENTRY (__memmove_chk_erms)
- cmp %RDX_LP, %RCX_LP
- jb HIDDEN_JUMPTARGET (__chk_fail)
-END (__memmove_chk_erms)
-
-ENTRY (__memmove_erms)
- movq %rdi, %rax
- /* Skip zero length. */
- test %RDX_LP, %RDX_LP
- jz 2f
-L(start_movsb):
- mov %RDX_LP, %RCX_LP
- cmp %RSI_LP, %RDI_LP
- jb 1f
- /* Source == destination is less common. */
- je 2f
- lea (%rsi,%rcx), %RDX_LP
- cmp %RDX_LP, %RDI_LP
- jb L(movsb_backward)
-1:
- rep movsb
-2:
- ret
-L(movsb_backward):
- leaq -1(%rdi,%rcx), %rdi
- leaq -1(%rsi,%rcx), %rsi
- std
- rep movsb
- cld
- ret
-END (__memmove_erms)
-strong_alias (__memmove_erms, __memcpy_erms)
-strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
-# endif

# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
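The overlap handling in __memmove_erms is easy to misread in flat assembly, so here is the same decision tree as a C sketch. memmove_erms_model is an illustrative name, not a glibc symbol; the real routine performs both copies with rep movsb, the backward one under the std direction flag.

#include <stddef.h>

/* C sketch of the branch structure of __memmove_erms above.  */
static void *
memmove_erms_model (void *dst, const void *src, size_t len)
{
  unsigned char *d = dst;
  const unsigned char *s = src;

  if (len == 0 || d == s)       /* jz 2f / je 2f: nothing to copy */
    return dst;
  if (d < s || d >= s + len)    /* jb 1f, or dst beyond src + len */
    while (len--)               /* forward rep movsb */
      *d++ = *s++;
  else                          /* dst inside [src, src + len) */
    while (len--)               /* L(movsb_backward): std; rep movsb; cld */
      d[len] = s[len];
  return dst;                   /* RAX holds the original dst */
}

__mempcpy_erms differs only in loading dst + len into RAX before jumping into the shared L(start_movsb) body, so it returns the end of the destination rather than its start.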
38
glibc-upstream-2.34-300.patch
Normal file
@ -0,0 +1,38 @@
commit ccc54bd61c768b6a27f9305a0831b76a7b6d706f
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 29 18:56:18 2022 -0700

x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S

Was missing to for the multiarch build rtld-strncmp-sse4_2.os was
being built and exporting symbols:

build/glibc/string/rtld-strncmp-sse4_2.os:
0000000000000000 T __strncmp_sse42

Introduced in:

commit 11ffcacb64a939c10cfc713746b8ec88837f5c4a
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Wed Jun 21 12:10:50 2017 -0700

x86-64: Implement strcmp family IFUNC selectors in C

(cherry picked from commit 96ac447d915ea5ecef3f9168cc13f4e731349a3b)

diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
index 22f51a0dfd2770c9..85dc363bf9d6273d 100644
--- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S
@@ -16,6 +16,8 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */

-#define STRCMP_SSE42 __strncmp_sse42
-#define USE_AS_STRNCMP
-#include "strcmp-sse42.S"
+#if IS_IN (libc)
+# define STRCMP_SSE42 __strncmp_sse42
+# define USE_AS_STRNCMP
+# include "strcmp-sse42.S"
+#endif
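The two added preprocessor lines work because glibc compiles each source file once per target module (libc, the dynamic loader, and so on). The following is a simplified model of that machinery, an assumption-laden sketch rather than a copy of glibc's internal headers:

/* Simplified model of glibc's per-module build guard.  The build
   system gives every object a module identity (faked here with a
   hard-coded IN_MODULE); IS_IN(m) is then a compile-time comparison.  */
#define MODULE_libc 1
#define MODULE_rtld 2
#define IN_MODULE   MODULE_rtld   /* e.g. when building rtld-*.os */
#define IS_IN(m)    (IN_MODULE == MODULE_##m)

#if IS_IN (libc)
/* With the rtld identity above, this branch compiles to nothing, so
   rtld-strncmp-sse4_2.os no longer defines __strncmp_sse42.  */
#endif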
28
glibc-upstream-2.34-301.patch
Normal file
@ -0,0 +1,28 @@
commit b991af50632a2da262b00b2375d9120f23b25525
Author: Joseph Myers <joseph@codesourcery.com>
Date: Wed May 25 14:37:28 2022 +0000

Update syscall-names.list for Linux 5.18

Linux 5.18 has no new syscalls. Update the version number in
syscall-names.list to reflect that it is still current for 5.18.

Tested with build-many-glibcs.py.

(cherry picked from commit 3d9926663cba19f40d26d8a8ab3b2a7cc09ffb13)

diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list
index e2743c649586d97a..95370e2ec5dbc4a7 100644
--- a/sysdeps/unix/sysv/linux/syscall-names.list
+++ b/sysdeps/unix/sysv/linux/syscall-names.list
@@ -21,8 +21,8 @@
# This file can list all potential system calls. The names are only
# used if the installed kernel headers also provide them.

-# The list of system calls is current as of Linux 5.17.
-kernel 5.17
+# The list of system calls is current as of Linux 5.18.
+kernel 5.18

FAST_atomic_update
FAST_cmpxchg
44
glibc-upstream-2.34-302.patch
Normal file
@ -0,0 +1,44 @@
commit b2f32e746492615a6eb3e66fac1e766e32e8deb1
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Jul 21 12:12:08 2022 +0200

malloc: Simplify implementation of __malloc_assert

It is prudent not to run too much code after detecting heap
corruption, and __fxprintf is really complex. The line number
and file name do not carry much information, so it is not included
in the error message. (__libc_message only supports %s formatting.)
The function name and assertion should provide some context.

Reviewed-by: Siddhesh Poyarekar <siddhesh@sourceware.org>
(cherry picked from commit ac8047cdf326504f652f7db97ec96c0e0cee052f)

diff --git a/malloc/malloc.c b/malloc/malloc.c
index 7882c70f0a0312d1..d31e985ecce968fe 100644
--- a/malloc/malloc.c
+++ b/malloc/malloc.c
@@ -294,19 +294,14 @@
# define __assert_fail(assertion, file, line, function) \
__malloc_assert(assertion, file, line, function)

-extern const char *__progname;
-
-static void
+_Noreturn static void
__malloc_assert (const char *assertion, const char *file, unsigned int line,
const char *function)
{
- (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n",
- __progname, __progname[0] ? ": " : "",
- file, line,
- function ? function : "", function ? ": " : "",
- assertion);
- fflush (stderr);
- abort ();
+ __libc_message (do_abort, "\
+Fatal glibc error: malloc assertion failure in %s: %s\n",
+ function, assertion);
+ __builtin_unreachable ();
}
#endif
#endif
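The rewrite trades formatting flexibility for robustness: once heap corruption has been detected, the less code that runs before abort, the better. Below is a standalone sketch of the resulting pattern; the names are hypothetical, and it uses plain stdio where glibc's actual helper is the lower-level __libc_message (do_abort, ...).

#include <stdio.h>
#include <stdlib.h>

/* Sketch of the post-patch error path: one fixed format string with
   two %s arguments, then abort.  (glibc avoids even stdio on this
   path; fprintf merely keeps the sketch self-contained.)  */
_Noreturn static void
malloc_assert_model (const char *assertion, const char *function)
{
  fprintf (stderr,
           "Fatal glibc error: malloc assertion failure in %s: %s\n",
           function, assertion);
  abort ();
}

int
main (void)
{
  /* Example trigger with made-up arguments.  */
  malloc_assert_model ("chunk_main_arena (p)", "sysmalloc");
}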
64
glibc.spec
@ -148,7 +148,7 @@ end \
 Summary: The GNU libc libraries
 Name: glibc
 Version: %{glibcversion}
-Release: 39%{?dist}
+Release: 40%{?dist}
 
 # In general, GPLv2+ is used by programs, LGPLv2+ is used for
 # libraries.
@ -550,6 +550,35 @@ Patch342: glibc-upstream-2.34-272.patch
 Patch343: glibc-upstream-2.34-273.patch
 Patch344: glibc-rh2096191-1.patch
 Patch345: glibc-rh2096191-2.patch
+Patch346: glibc-upstream-2.34-274.patch
+Patch347: glibc-upstream-2.34-275.patch
+Patch348: glibc-upstream-2.34-276.patch
+Patch349: glibc-upstream-2.34-277.patch
+Patch350: glibc-upstream-2.34-278.patch
+Patch351: glibc-upstream-2.34-279.patch
+Patch352: glibc-upstream-2.34-280.patch
+Patch353: glibc-upstream-2.34-281.patch
+Patch354: glibc-upstream-2.34-282.patch
+Patch355: glibc-upstream-2.34-283.patch
+Patch356: glibc-upstream-2.34-284.patch
+Patch357: glibc-upstream-2.34-285.patch
+Patch358: glibc-upstream-2.34-286.patch
+Patch359: glibc-upstream-2.34-287.patch
+Patch360: glibc-upstream-2.34-288.patch
+Patch361: glibc-upstream-2.34-289.patch
+Patch362: glibc-upstream-2.34-290.patch
+Patch363: glibc-upstream-2.34-291.patch
+Patch364: glibc-upstream-2.34-292.patch
+Patch365: glibc-upstream-2.34-293.patch
+Patch366: glibc-upstream-2.34-294.patch
+Patch367: glibc-upstream-2.34-295.patch
+Patch368: glibc-upstream-2.34-296.patch
+Patch369: glibc-upstream-2.34-297.patch
+Patch370: glibc-upstream-2.34-298.patch
+Patch371: glibc-upstream-2.34-299.patch
+Patch372: glibc-upstream-2.34-300.patch
+Patch373: glibc-upstream-2.34-301.patch
+Patch374: glibc-upstream-2.34-302.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2606,6 +2635,39 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Fri Jul 22 2022 Arjun Shankar <arjun@redhat.com> - 2.34-40
+- Sync with upstream branch release/2.34/master,
+  commit b2f32e746492615a6eb3e66fac1e766e32e8deb1:
+- malloc: Simplify implementation of __malloc_assert
+- Update syscall-names.list for Linux 5.18
+- x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
+- x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file
+- x86: Move and slightly improve memset_erms
+- x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
+- x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
+- x86: Align entry for memrchr to 64-bytes.
+- x86: Add BMI1/BMI2 checks for ISA_V3 check
+- x86: Cleanup bounds checking in large memcpy case
+- x86: Add bounds `x86_non_temporal_threshold`
+- x86: Add sse42 implementation to strcmp's ifunc
+- x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
+- x86: Align varshift table to 32-bytes
+- x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
+- x86: Shrink code size of memchr-evex.S
+- x86: Shrink code size of memchr-avx2.S
+- x86: Optimize memrchr-avx2.S
+- x86: Optimize memrchr-evex.S
+- x86: Optimize memrchr-sse2.S
+- x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
+- x86: Create header for VEC classes in x86 strings library
+- x86_64: Add strstr function with 512-bit EVEX
+- x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
+- x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
+- x86_64: Remove bzero optimization
+- x86_64: Remove end of line trailing spaces
+- nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
+- linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
+
 * Fri Jun 24 2022 Florian Weimer <fweimer@redhat.com> - 2.34-39
 - Add the no-aaaa DNS stub resolver option (#2096191)