diff --git a/glibc-upstream-2.34-274.patch b/glibc-upstream-2.34-274.patch
new file mode 100644
index 0000000..c56ed93
--- /dev/null
+++ b/glibc-upstream-2.34-274.patch
@@ -0,0 +1,27 @@
+commit 4b246b2bbd1d5a77035bb990d6097b7337c34bbb
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Thu Jun 30 09:08:31 2022 -0300
+
+    linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
+
+    On success, mq_receive() and mq_timedreceive() return the number of
+    bytes in the received message, so the fallback needs to check whether
+    the return value is non-negative.
+
+    Checked on i686-linux-gnu.
+
+    (cherry picked from commit 71d87d85bf54f6522813aec97c19bdd24997341e)
+
+diff --git a/sysdeps/unix/sysv/linux/mq_timedreceive.c b/sysdeps/unix/sysv/linux/mq_timedreceive.c
+index 7f3a112d7f2cbbe7..1fc98752e7d6d506 100644
+--- a/sysdeps/unix/sysv/linux/mq_timedreceive.c
++++ b/sysdeps/unix/sysv/linux/mq_timedreceive.c
+@@ -41,7 +41,7 @@ ___mq_timedreceive_time64 (mqd_t mqdes, char *__restrict msg_ptr, size_t msg_len
+ {
+   int r = SYSCALL_CANCEL (mq_timedreceive_time64, mqdes, msg_ptr, msg_len,
+ 			  msg_prio, abs_timeout);
+-  if (r == 0 || errno != ENOSYS)
++  if (r >= 0 || errno != ENOSYS)
+     return r;
+   __set_errno (EOVERFLOW);
+   return -1;
diff --git a/glibc-upstream-2.34-275.patch b/glibc-upstream-2.34-275.patch
new file mode 100644
index 0000000..155d9e1
--- /dev/null
+++ b/glibc-upstream-2.34-275.patch
@@ -0,0 +1,25 @@
+commit 7789a849234f8b303a571134abe72691ce8c2540
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Wed Jul 13 10:37:32 2022 -0300
+
+    nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
+
+    This was due to a wrong revert done in commit 404656009b459658.
+
+    Checked on x86_64-linux-gnu and i686-linux-gnu.
+
+    (cherry picked from commit f27e5e21787abc9f719879af47687221aa1027b3)
+
+diff --git a/nptl/cleanup_defer.c b/nptl/cleanup_defer.c
+index 35ba40fb0247c7cc..59571229d8ccf481 100644
+--- a/nptl/cleanup_defer.c
++++ b/nptl/cleanup_defer.c
+@@ -72,7 +72,7 @@ ___pthread_unregister_cancel_restore (__pthread_unwind_buf_t *buf)
+     return;
+
+   int cancelhandling = atomic_load_relaxed (&self->cancelhandling);
+-  if (cancelhandling & CANCELTYPE_BITMASK)
++  if ((cancelhandling & CANCELTYPE_BITMASK) == 0)
+     {
+       int newval;
+       do
diff --git a/glibc-upstream-2.34-276.patch b/glibc-upstream-2.34-276.patch
new file mode 100644
index 0000000..5e9bb63
--- /dev/null
+++ b/glibc-upstream-2.34-276.patch
@@ -0,0 +1,29 @@
+commit 8d324019e69203f5998f223d0e905de1395330ea
+Author: Sunil K Pandey <skpgkp2@gmail.com>
+Date:   Mon Jul 18 18:38:48 2022 -0700
+
+    x86_64: Remove end of line trailing spaces
+
+    This commit removes trailing spaces introduced by the following commit.
+
+    commit a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb
+    Author: Noah Goldstein <goldstein.w.n@gmail.com>
+    Date:   Wed Jun 23 01:56:29 2021 -0400
+
+        x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ #27974]
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index b7657282bd2e6fa5..031753a91763b351 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -66,8 +66,8 @@ ENTRY(strlen)
+ L(n_nonzero):
+ # ifdef AS_WCSLEN
+ /* Check for overflow from maxlen * sizeof(wchar_t). If it would
+-   overflow the only way this program doesn't have undefined behavior 
+-   is if there is a null terminator in valid memory so wcslen will 
++   overflow the only way this program doesn't have undefined behavior
++   is if there is a null terminator in valid memory so wcslen will
+    suffice.  */
+ 	mov	%RSI_LP, %R10_LP
+ 	sar	$62, %R10_LP
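The mq_timedreceive fix in glibc-upstream-2.34-274.patch above is easy to misread: the syscall returns a byte count, not a status code. With the old `r == 0` test, every successful non-empty receive fell through to the errno check, and a stale ENOSYS in errno then diverted it into the EOVERFLOW fallback path. The standalone C sketch below is illustrative only, not glibc source; fake_time64_receive and fallback_receive are invented stand-ins for the SYSCALL_CANCEL wrapper and the 32-bit fallback logic:

#include <errno.h>
#include <stdio.h>

/* Invented stand-in for the time64 receive syscall: returns the number
   of received bytes on success.  errno is deliberately left holding a
   stale ENOSYS, since a successful syscall does not reset errno.  */
static int
fake_time64_receive (void)
{
  errno = ENOSYS;		/* Stale value from an earlier call.  */
  return 42;			/* A successful 42-byte receive.  */
}

/* Simplified model of the fallback check; FIXED selects the patched
   condition.  */
static int
fallback_receive (int fixed)
{
  int r = fake_time64_receive ();
  /* Buggy: `r == 0` treats the 42-byte receive as a failure, and the
     stale ENOSYS then sends it into the EOVERFLOW path.  Fixed: any
     non-negative byte count is accepted as success.  */
  if ((fixed ? r >= 0 : r == 0) || errno != ENOSYS)
    return r;
  errno = EOVERFLOW;
  return -1;
}

int
main (void)
{
  printf ("buggy check: %d\n", fallback_receive (0));	/* -1 (EOVERFLOW) */
  printf ("fixed check: %d\n", fallback_receive (1));	/* 42 */
  return 0;
}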
diff --git a/glibc-upstream-2.34-277.patch b/glibc-upstream-2.34-277.patch
new file mode 100644
index 0000000..3f15409
--- /dev/null
+++ b/glibc-upstream-2.34-277.patch
@@ -0,0 +1,457 @@
+commit eb9aa96facc5231e208de0946870f10c21aee9f3
+Author: Adhemerval Zanella <adhemerval.zanella@linaro.org>
+Date:   Fri May 13 09:33:30 2022 -0300
+
+    x86_64: Remove bzero optimization
+
+    Both symbols are marked as legacy in POSIX.1-2001 and removed in
+    POSIX.1-2008, although the prototypes are defined for _GNU_SOURCE
+    or _DEFAULT_SOURCE.
+
+    GCC also replaces bcopy with memmove and bzero with memset in its
+    default configuration (to actually get a bzero libc call, the code
+    must omit the string.h inclusion and be built with -fno-builtin), so
+    it is highly unlikely that programs actually call the libc bzero symbol.
+
+    On a recent Linux distro (Ubuntu 22.04), there are no bzero calls
+    in the installed binaries.
+
+    $ cat count_bstring.sh
+    #!/bin/bash
+
+    files=`IFS=':';for i in $PATH; do test -d "$i" && find "$i" -maxdepth 1 -executable -type f; done`
+    total=0
+    for file in $files; do
+      symbols=`objdump -R $file 2>&1`
+      if [ $? -eq 0 ]; then
+        ncalls=`echo $symbols | grep -w $1 | wc -l`
+        ((total=total+ncalls))
+        if [ $ncalls -gt 0 ]; then
+          echo "$file: $ncalls"
+        fi
+      fi
+    done
+    echo "TOTAL=$total"
+    $ ./count_bstring.sh bzero
+    TOTAL=0
+
+    Checked on x86_64-linux-gnu.
+
+    (cherry picked from commit 9403b71ae97e3f1a91c796ddcbb4e6f044434734)
+
+diff --git a/sysdeps/x86_64/bzero.S b/sysdeps/x86_64/bzero.S
+deleted file mode 100644
+index f96d567fd87696af..0000000000000000
+--- a/sysdeps/x86_64/bzero.S
++++ /dev/null
+@@ -1 +0,0 @@
+-/* Implemented in memset.S. */
+diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
+index 0358210c7ff3a976..2b64741fd10a8ec2 100644
+--- a/sysdeps/x86_64/memset.S
++++ b/sysdeps/x86_64/memset.S
+@@ -1,4 +1,4 @@
+-/* memset/bzero -- set memory area to CH/0
++/* memset -- set memory area to CH/0
+    Optimized version for x86-64.
+    Copyright (C) 2002-2021 Free Software Foundation, Inc.
+    This file is part of the GNU C Library.
+@@ -35,9 +35,6 @@
+   punpcklwd %xmm0, %xmm0; \
+   pshufd $0, %xmm0, %xmm0
+
+-# define BZERO_ZERO_VEC0() \
+-  pxor %xmm0, %xmm0
+-
+ # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+   movd d, %xmm0; \
+   pshufd $0, %xmm0, %xmm0; \
+@@ -56,10 +53,6 @@
+ # define MEMSET_SYMBOL(p,s)	memset
+ #endif
+
+-#ifndef BZERO_SYMBOL
+-# define BZERO_SYMBOL(p,s)	__bzero
+-#endif
+-
+ #ifndef WMEMSET_SYMBOL
+ # define WMEMSET_CHK_SYMBOL(p,s) p
+ # define WMEMSET_SYMBOL(p,s)	__wmemset
+@@ -70,7 +63,6 @@
+ libc_hidden_builtin_def (memset)
+
+ #if IS_IN (libc)
+-weak_alias (__bzero, bzero)
+ libc_hidden_def (__wmemset)
+ weak_alias (__wmemset, wmemset)
+ libc_hidden_weak (wmemset)
+diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
+index b503e4b81e92a11c..67401162d526f664 100644
+--- a/sysdeps/x86_64/multiarch/Makefile
++++ b/sysdeps/x86_64/multiarch/Makefile
+@@ -1,7 +1,6 @@
+ ifeq ($(subdir),string)
+
+ sysdep_routines += \
+-  bzero \
+   memchr-avx2 \
+   memchr-avx2-rtm \
+   memchr-evex \
+diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
+deleted file mode 100644
+index 13e399a9a1fbdeb2..0000000000000000
+--- a/sysdeps/x86_64/multiarch/bzero.c
++++ /dev/null
+@@ -1,108 +0,0 @@
+-/* Multiple versions of bzero.
+-   All versions must be listed in ifunc-impl-list.c.
+-   Copyright (C) 2022 Free Software Foundation, Inc.
+-   This file is part of the GNU C Library.
+- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-/* Define multiple versions only for the definition in libc. */ +-#if IS_IN (libc) +-# define __bzero __redirect___bzero +-# include +-# undef __bzero +- +-/* OPTIMIZE1 definition required for bzero patch. */ +-# define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) +-# define SYMBOL_NAME __bzero +-# include +- +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) +- attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) +- attribute_hidden; +- +-static inline void * +-IFUNC_SELECTOR (void) +-{ +- const struct cpu_features* cpu_features = __get_cpu_features (); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE1 (avx512_unaligned_erms); +- +- return OPTIMIZE1 (avx512_unaligned); +- } +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE1 (evex_unaligned_erms); +- +- return OPTIMIZE1 (evex_unaligned); +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE1 (avx2_unaligned_erms_rtm); +- +- return OPTIMIZE1 (avx2_unaligned_rtm); +- } +- +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE1 (avx2_unaligned_erms); +- +- return OPTIMIZE1 (avx2_unaligned); +- } +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE1 (sse2_unaligned_erms); +- +- return OPTIMIZE1 (sse2_unaligned); +-} +- +-libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); +- +-weak_alias (__bzero, bzero) +-#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 
e5e48b36c3175e68..d990a7149489efd9 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -280,48 +280,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + +- /* Support sysdeps/x86_64/multiarch/bzero.c. */ +- IFUNC_IMPL (i, name, bzero, +- IFUNC_IMPL_ADD (array, i, bzero, 1, +- __bzero_sse2_unaligned) +- IFUNC_IMPL_ADD (array, i, bzero, 1, +- __bzero_sse2_unaligned_erms) +- IFUNC_IMPL_ADD (array, i, bzero, +- CPU_FEATURE_USABLE (AVX2), +- __bzero_avx2_unaligned) +- IFUNC_IMPL_ADD (array, i, bzero, +- CPU_FEATURE_USABLE (AVX2), +- __bzero_avx2_unaligned_erms) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX2) +- && CPU_FEATURE_USABLE (RTM)), +- __bzero_avx2_unaligned_rtm) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX2) +- && CPU_FEATURE_USABLE (RTM)), +- __bzero_avx2_unaligned_erms_rtm) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW) +- && CPU_FEATURE_USABLE (BMI2)), +- __bzero_evex_unaligned) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW) +- && CPU_FEATURE_USABLE (BMI2)), +- __bzero_evex_unaligned_erms) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW) +- && CPU_FEATURE_USABLE (BMI2)), +- __bzero_avx512_unaligned_erms) +- IFUNC_IMPL_ADD (array, i, bzero, +- (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW) +- && CPU_FEATURE_USABLE (BMI2)), +- __bzero_avx512_unaligned) +- ) +- + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 5a5ee6f67299400b..8ac3e479bba488be 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,7 +5,6 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm +-#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index a093a2831f3dfa0d..c0bf2875d03d51ab 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,9 +14,6 @@ + vmovd d, %xmm0; \ + movq r, %rax; + +-# define BZERO_ZERO_VEC0() \ +- vpxor %xmm0, %xmm0, %xmm0 +- + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -32,9 +29,6 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +-# ifndef BZERO_SYMBOL +-# define BZERO_SYMBOL(p,s) p##_avx2_##s +-# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 727c92133a15900f..5241216a77bf72b7 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,9 +19,6 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + +-# define BZERO_ZERO_VEC0() \ +- vpxorq %XMM0, %XMM0, %XMM0 +- + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, 
r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 5d8fa78f05476b10..637002150659123c 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,9 +19,6 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + +-# define BZERO_ZERO_VEC0() \ +- vpxorq %XMM0, %XMM0, %XMM0 +- + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 2951f7f5f70e274a..c47f3a9c955508a2 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,7 +22,6 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s +-# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index d9c577fb5ff9700f..abc12d9cda1b3843 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -1,5 +1,5 @@ +-/* memset/bzero with unaligned store and rep stosb +- Copyright (C) 2016-2021 Free Software Foundation, Inc. ++/* memset with unaligned store and rep stosb ++ Copyright (C) 2016-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -26,10 +26,6 @@ + + #include + +-#ifndef BZERO_SYMBOL +-# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) +-#endif +- + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -134,31 +130,6 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +-ENTRY (BZERO_SYMBOL(__bzero, unaligned)) +-#if VEC_SIZE > 16 +- BZERO_ZERO_VEC0 () +-#endif +- mov %RDI_LP, %RAX_LP +- mov %RSI_LP, %RDX_LP +-#ifndef USE_LESS_VEC_MASK_STORE +- xorl %esi, %esi +-#endif +- cmp $VEC_SIZE, %RDX_LP +- jb L(less_vec_no_vdup) +-#ifdef USE_LESS_VEC_MASK_STORE +- xorl %esi, %esi +-#endif +-#if VEC_SIZE <= 16 +- BZERO_ZERO_VEC0 () +-#endif +- cmp $(VEC_SIZE * 2), %RDX_LP +- ja L(more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) +- VZEROUPPER_RETURN +-END (BZERO_SYMBOL(__bzero, unaligned)) +- + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -216,31 +187,6 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + +-ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) +-# if VEC_SIZE > 16 +- BZERO_ZERO_VEC0 () +-# endif +- mov %RDI_LP, %RAX_LP +- mov %RSI_LP, %RDX_LP +-# ifndef USE_LESS_VEC_MASK_STORE +- xorl %esi, %esi +-# endif +- cmp $VEC_SIZE, %RDX_LP +- jb L(less_vec_no_vdup) +-# ifdef USE_LESS_VEC_MASK_STORE +- xorl %esi, %esi +-# endif +-# if VEC_SIZE <= 16 +- BZERO_ZERO_VEC0 () +-# endif +- cmp $(VEC_SIZE * 2), %RDX_LP +- ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) +- VZEROUPPER_RETURN +-END (BZERO_SYMBOL(__bzero, unaligned_erms)) +- + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -282,7 +228,6 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): +-L(less_vec_no_vdup): + L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +@@ -430,9 +375,6 @@ L(less_vec): + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () + L(less_vec_from_wmemset): +-#if VEC_SIZE > 16 +-L(less_vec_no_vdup): +-#endif + #endif + L(cross_page): + #if VEC_SIZE > 32 +@@ -445,9 +387,6 @@ L(cross_page): + #endif + #ifndef USE_XMM_LESS_VEC + MOVQ %XMM0, %SET_REG64 +-#endif +-#if VEC_SIZE <= 16 +-L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) diff --git a/glibc-upstream-2.34-278.patch b/glibc-upstream-2.34-278.patch new file mode 100644 index 0000000..26b047f --- /dev/null +++ b/glibc-upstream-2.34-278.patch @@ -0,0 +1,460 @@ +commit 8ab861d295b90177b89288a2bc95c5de5e4e5bc6 +Author: Sunil K Pandey +Date: Sun Feb 27 16:39:47 2022 -0800 + + x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen + + This patch implements following evex512 version of string functions. + Perf gain for evex512 version is up to 50% as compared to evex, + depending on length and alignment. + + Placeholder function, not used by any processor at the moment. + + - String length function using 512 bit vectors. + - String N length using 512 bit vectors. + - Wide string length using 512 bit vectors. + - Wide string N length using 512 bit vectors. + + Reviewed-by: Noah Goldstein + (cherry picked from commit 9c66efb86fe384f77435f7e326333fb2e4e10676) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 67401162d526f664..4d4ad2a3686b5bc3 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -87,6 +87,7 @@ sysdep_routines += \ + strlen-avx2 \ + strlen-avx2-rtm \ + strlen-evex \ ++ strlen-evex512 \ + strlen-sse2 \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ +@@ -115,6 +116,7 @@ sysdep_routines += \ + strnlen-avx2 \ + strnlen-avx2-rtm \ + strnlen-evex \ ++ strnlen-evex512 \ + strnlen-sse2 \ + strpbrk-c \ + strpbrk-sse2 \ +@@ -148,6 +150,7 @@ sysdep_routines += \ + wcslen-avx2 \ + wcslen-avx2-rtm \ + wcslen-evex \ ++ wcslen-evex512 \ + wcslen-sse2 \ + wcslen-sse4_1 \ + wcsncmp-avx2 \ +@@ -158,6 +161,7 @@ sysdep_routines += \ + wcsnlen-avx2-rtm \ + wcsnlen-c \ + wcsnlen-evex \ ++ wcsnlen-evex512 \ + wcsnlen-sse4_1 \ + wcsrchr-avx2 \ + wcsrchr-avx2-rtm \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d990a7149489efd9..6b75a7106e174bce 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -317,6 +317,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strlen_evex512) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/strnlen.c. 
*/ +@@ -335,6 +340,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strnlen_evex512) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpncpy.c. */ +@@ -714,6 +724,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcslen_evex512) + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (SSE4_1), + __wcslen_sse4_1) +@@ -735,6 +750,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_evex) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsnlen_evex512) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) +diff --git a/sysdeps/x86_64/multiarch/strlen-evex-base.S b/sysdeps/x86_64/multiarch/strlen-evex-base.S +new file mode 100644 +index 0000000000000000..278c899691d89ba7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-evex-base.S +@@ -0,0 +1,302 @@ ++/* Placeholder function, not used by any processor at the moment. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMP vpcmpd ++# define VPTESTN vptestnmd ++# define VPMINU vpminud ++# define CHAR_SIZE 4 ++# else ++# define VPCMP vpcmpb ++# define VPTESTN vptestnmb ++# define VPMINU vpminub ++# define CHAR_SIZE 1 ++# endif ++ ++# define XMM0 xmm16 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++ ++# if VEC_SIZE == 64 ++# define KMOV kmovq ++# define KORTEST kortestq ++# define RAX rax ++# define RCX rcx ++# define RDX rdx ++# define SHR shrq ++# define TEXTSUFFIX evex512 ++# define VMM0 zmm16 ++# define VMM1 zmm17 ++# define VMM2 zmm18 ++# define VMM3 zmm19 ++# define VMM4 zmm20 ++# define VMOVA vmovdqa64 ++# elif VEC_SIZE == 32 ++/* Currently Unused. 
*/ ++# define KMOV kmovd ++# define KORTEST kortestd ++# define RAX eax ++# define RCX ecx ++# define RDX edx ++# define SHR shrl ++# define TEXTSUFFIX evex256 ++# define VMM0 ymm16 ++# define VMM1 ymm17 ++# define VMM2 ymm18 ++# define VMM3 ymm19 ++# define VMM4 ymm20 ++# define VMOVA vmovdqa32 ++# endif ++ ++ .section .text.TEXTSUFFIX, "ax", @progbits ++/* Aligning entry point to 64 byte, provides better performance for ++ one vector length string. */ ++ENTRY_P2ALIGN (STRLEN, 6) ++# ifdef USE_AS_STRNLEN ++ /* Check zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(ret_max) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++# endif ++ ++ movl %edi, %eax ++ vpxorq %XMM0, %XMM0, %XMM0 ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(page_cross) ++ ++ /* Compare [w]char for null, mask bit will be set for match. */ ++ VPCMP $0, (%rdi), %VMM0, %k0 ++ KMOV %k0, %RAX ++ test %RAX, %RAX ++ jz L(align_more) ++ ++ bsf %RAX, %RAX ++# ifdef USE_AS_STRNLEN ++ cmpq %rsi, %rax ++ cmovnb %rsi, %rax ++# endif ++ ret ++ ++ /* At this point vector max length reached. */ ++# ifdef USE_AS_STRNLEN ++ .p2align 4,,3 ++L(ret_max): ++ movq %rsi, %rax ++ ret ++# endif ++ ++L(align_more): ++ leaq VEC_SIZE(%rdi), %rax ++ /* Align rax to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rax ++# ifdef USE_AS_STRNLEN ++ movq %rax, %rdx ++ subq %rdi, %rdx ++# ifdef USE_AS_WCSLEN ++ SHR $2, %RDX ++# endif ++ /* At this point rdx contains [w]chars already compared. */ ++ subq %rsi, %rdx ++ jae L(ret_max) ++ negq %rdx ++ /* At this point rdx contains number of w[char] needs to go. ++ Now onwards rdx will keep decrementing with each compare. */ ++# endif ++ ++ /* Loop unroll 4 times for 4 vector loop. */ ++ VPCMP $0, (%rax), %VMM0, %k0 ++ KMOV %k0, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x1) ++ ++# ifdef USE_AS_STRNLEN ++ subq $CHAR_PER_VEC, %rdx ++ jbe L(ret_max) ++# endif ++ ++ VPCMP $0, VEC_SIZE(%rax), %VMM0, %k0 ++ KMOV %k0, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x2) ++ ++# ifdef USE_AS_STRNLEN ++ subq $CHAR_PER_VEC, %rdx ++ jbe L(ret_max) ++# endif ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rax), %VMM0, %k0 ++ KMOV %k0, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x3) ++ ++# ifdef USE_AS_STRNLEN ++ subq $CHAR_PER_VEC, %rdx ++ jbe L(ret_max) ++# endif ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rax), %VMM0, %k0 ++ KMOV %k0, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x4) ++ ++# ifdef USE_AS_STRNLEN ++ subq $CHAR_PER_VEC, %rdx ++ jbe L(ret_max) ++ /* Save pointer before 4 x VEC_SIZE alignment. */ ++ movq %rax, %rcx ++# endif ++ ++ /* Align address to VEC_SIZE * 4 for loop. */ ++ andq $-(VEC_SIZE * 4), %rax ++ ++# ifdef USE_AS_STRNLEN ++ subq %rax, %rcx ++# ifdef USE_AS_WCSLEN ++ SHR $2, %RCX ++# endif ++ /* rcx contains number of [w]char will be recompared due to ++ alignment fixes. rdx must be incremented by rcx to offset ++ alignment adjustment. */ ++ addq %rcx, %rdx ++ /* Need jump as we don't want to add/subtract rdx for first ++ iteration of 4 x VEC_SIZE aligned loop. */ ++ jmp L(loop_entry) ++# endif ++ ++ .p2align 4,,11 ++L(loop): ++# ifdef USE_AS_STRNLEN ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_max) ++L(loop_entry): ++# endif ++ /* VPMINU and VPCMP combination provide better performance as ++ compared to alternative combinations. 
*/ ++ VMOVA (VEC_SIZE * 4)(%rax), %VMM1 ++ VPMINU (VEC_SIZE * 5)(%rax), %VMM1, %VMM2 ++ VMOVA (VEC_SIZE * 6)(%rax), %VMM3 ++ VPMINU (VEC_SIZE * 7)(%rax), %VMM3, %VMM4 ++ ++ VPTESTN %VMM2, %VMM2, %k0 ++ VPTESTN %VMM4, %VMM4, %k1 ++ ++ subq $-(VEC_SIZE * 4), %rax ++ KORTEST %k0, %k1 ++ jz L(loop) ++ ++ VPTESTN %VMM1, %VMM1, %k2 ++ KMOV %k2, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x1) ++ ++ KMOV %k0, %RCX ++ /* At this point, if k0 is non zero, null char must be in the ++ second vector. */ ++ test %RCX, %RCX ++ jnz L(ret_vec_x2) ++ ++ VPTESTN %VMM3, %VMM3, %k3 ++ KMOV %k3, %RCX ++ test %RCX, %RCX ++ jnz L(ret_vec_x3) ++ /* At this point null [w]char must be in the fourth vector so no ++ need to check. */ ++ KMOV %k1, %RCX ++ ++ /* Fourth, third, second vector terminating are pretty much ++ same, implemented this way to avoid branching and reuse code ++ from pre loop exit condition. */ ++L(ret_vec_x4): ++ bsf %RCX, %RCX ++ subq %rdi, %rax ++# ifdef USE_AS_WCSLEN ++ subq $-(VEC_SIZE * 3), %rax ++ shrq $2, %rax ++ addq %rcx, %rax ++# else ++ leaq (VEC_SIZE * 3)(%rcx, %rax), %rax ++# endif ++# ifdef USE_AS_STRNLEN ++ cmpq %rsi, %rax ++ cmovnb %rsi, %rax ++# endif ++ ret ++ ++L(ret_vec_x3): ++ bsf %RCX, %RCX ++ subq %rdi, %rax ++# ifdef USE_AS_WCSLEN ++ subq $-(VEC_SIZE * 2), %rax ++ shrq $2, %rax ++ addq %rcx, %rax ++# else ++ leaq (VEC_SIZE * 2)(%rcx, %rax), %rax ++# endif ++# ifdef USE_AS_STRNLEN ++ cmpq %rsi, %rax ++ cmovnb %rsi, %rax ++# endif ++ ret ++ ++L(ret_vec_x2): ++ subq $-VEC_SIZE, %rax ++L(ret_vec_x1): ++ bsf %RCX, %RCX ++ subq %rdi, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ addq %rcx, %rax ++# ifdef USE_AS_STRNLEN ++ cmpq %rsi, %rax ++ cmovnb %rsi, %rax ++# endif ++ ret ++ ++L(page_cross): ++ movl %eax, %ecx ++# ifdef USE_AS_WCSLEN ++ andl $(VEC_SIZE - 1), %ecx ++ sarl $2, %ecx ++# endif ++ /* ecx contains number of w[char] to be skipped as a result ++ of address alignment. */ ++ xorq %rdi, %rax ++ VPCMP $0, (PAGE_SIZE - VEC_SIZE)(%rax), %VMM0, %k0 ++ KMOV %k0, %RAX ++ /* Ignore number of character for alignment adjustment. 
*/ ++ SHR %cl, %RAX ++ jz L(align_more) ++ ++ bsf %RAX, %RAX ++# ifdef USE_AS_STRNLEN ++ cmpq %rsi, %rax ++ cmovnb %rsi, %rax ++# endif ++ ret ++ ++END (STRLEN) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strlen-evex512.S b/sysdeps/x86_64/multiarch/strlen-evex512.S +new file mode 100644 +index 0000000000000000..116f8981c8954e2e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-evex512.S +@@ -0,0 +1,7 @@ ++#ifndef STRLEN ++# define STRLEN __strlen_evex512 ++#endif ++ ++#define VEC_SIZE 64 ++ ++#include "strlen-evex-base.S" +diff --git a/sysdeps/x86_64/multiarch/strnlen-evex512.S b/sysdeps/x86_64/multiarch/strnlen-evex512.S +new file mode 100644 +index 0000000000000000..0b7f220214a7c33c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-evex512.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_evex512 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex512.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-evex512.S b/sysdeps/x86_64/multiarch/wcslen-evex512.S +new file mode 100644 +index 0000000000000000..f59c372b78b4fb8c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-evex512.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_evex512 ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-evex512.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex512.S b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S +new file mode 100644 +index 0000000000000000..73dcf2f210a85aac +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex512.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_evex512 ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex512.S" diff --git a/glibc-upstream-2.34-279.patch b/glibc-upstream-2.34-279.patch new file mode 100644 index 0000000..21c99ad --- /dev/null +++ b/glibc-upstream-2.34-279.patch @@ -0,0 +1,33 @@ +commit f6bc52f080e4a0195c707c01f54e2eae0ff89010 +Author: H.J. Lu +Date: Fri May 20 19:21:48 2022 -0700 + + x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT + + According to x86-64 psABI, r_addend should be ignored for R_X86_64_GLOB_DAT + and R_X86_64_JUMP_SLOT. Since linkers always set their r_addends to 0, we + can ignore their r_addends. + + Reviewed-by: Fangrui Song + (cherry picked from commit f8587a61892cbafd98ce599131bf4f103466f084) + +diff --git a/sysdeps/x86_64/dl-machine.h b/sysdeps/x86_64/dl-machine.h +index 94296719d4d9fb82..742682517179fab5 100644 +--- a/sysdeps/x86_64/dl-machine.h ++++ b/sysdeps/x86_64/dl-machine.h +@@ -347,11 +347,13 @@ and creates an unsatisfiable circular dependency.\n", + # endif + /* Set to symbol size plus addend. */ + value = sym->st_size; ++ *reloc_addr = value + reloc->r_addend; ++ break; + # endif +- /* Fall through. */ ++ + case R_X86_64_GLOB_DAT: + case R_X86_64_JUMP_SLOT: +- *reloc_addr = value + reloc->r_addend; ++ *reloc_addr = value; + break; + + # ifndef RESOLVE_CONFLICT_FIND_MAP diff --git a/glibc-upstream-2.34-280.patch b/glibc-upstream-2.34-280.patch new file mode 100644 index 0000000..95a58fe --- /dev/null +++ b/glibc-upstream-2.34-280.patch @@ -0,0 +1,356 @@ +commit 82a707aeb74f23bb1783af0f9e93790a2038ff7e +Author: Raghuveer Devulapalli +Date: Mon Jun 6 12:17:43 2022 -0700 + + x86_64: Add strstr function with 512-bit EVEX + + Adding a 512-bit EVEX version of strstr. The algorithm works as follows: + + (1) We spend a few cycles at the begining to peek into the needle. We + locate an edge in the needle (first occurance of 2 consequent distinct + characters) and also store the first 64-bytes into a zmm register. 
+ + (2) We search for the edge in the haystack by looking into one cache + line of the haystack at a time. This avoids having to read past a page + boundary which can cause a seg fault. + + (3) If an edge is found in the haystack we first compare the first + 64-bytes of the needle (already stored in a zmm register) before we + proceed with a full string compare performed byte by byte. + + Benchmarking results: (old = strstr_sse2_unaligned, new = strstr_avx512) + + Geometric mean of all benchmarks: new / old = 0.66 + + Difficult skiptable(0) : new / old = 0.02 + Difficult skiptable(1) : new / old = 0.01 + Difficult 2-way : new / old = 0.25 + Difficult testing first 2 : new / old = 1.26 + Difficult skiptable(0) : new / old = 0.05 + Difficult skiptable(1) : new / old = 0.06 + Difficult 2-way : new / old = 0.26 + Difficult testing first 2 : new / old = 1.05 + Difficult skiptable(0) : new / old = 0.42 + Difficult skiptable(1) : new / old = 0.24 + Difficult 2-way : new / old = 0.21 + Difficult testing first 2 : new / old = 1.04 + Reviewed-by: H.J. Lu + + (cherry picked from commit 5082a287d5e9a1f9cb98b7c982a708a3684f1d5c) + + x86: Remove __mmask intrinsics in strstr-avx512.c + + The intrinsics are not available before GCC7 and using standard + operators generates code of equivalent or better quality. + + Removed: + _cvtmask64_u64 + _kshiftri_mask64 + _kand_mask64 + + Geometric Mean of 5 Runs of Full Benchmark Suite New / Old: 0.958 + + (cherry picked from commit f2698954ff9c2f9626d4bcb5a30eb5729714e0b0) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 4d4ad2a3686b5bc3..0e39e63ef6be6a86 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -126,6 +126,7 @@ sysdep_routines += \ + strrchr-sse2 \ + strspn-c \ + strspn-sse2 \ ++ strstr-avx512 \ + strstr-sse2-unaligned \ + varshift \ + # sysdep_routines +@@ -133,6 +134,7 @@ CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 + CFLAGS-strspn-c.c += -msse4 ++CFLAGS-strstr-avx512.c += -mavx512f -mavx512vl -mavx512dq -mavx512bw -mbmi -mbmi2 -O3 + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 6b75a7106e174bce..043821278fdb6d8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -633,6 +633,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strstr.c. */ + IFUNC_IMPL (i, name, strstr, ++ IFUNC_IMPL_ADD (array, i, strstr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (AVX512DQ) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strstr_avx512) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, strstr, 1, __strstr_sse2)) + +diff --git a/sysdeps/x86_64/multiarch/strstr-avx512.c b/sysdeps/x86_64/multiarch/strstr-avx512.c +new file mode 100644 +index 0000000000000000..e44c1a05dc0007e5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strstr-avx512.c +@@ -0,0 +1,218 @@ ++/* strstr optimized with 512-bit AVX-512 instructions ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++ ++#define FULL_MMASK64 0xffffffffffffffff ++#define ONE_64BIT 0x1ull ++#define ZMM_SIZE_IN_BYTES 64 ++#define PAGESIZE 4096 ++ ++#define cvtmask64_u64(...) (uint64_t) (__VA_ARGS__) ++#define kshiftri_mask64(x, y) ((x) >> (y)) ++#define kand_mask64(x, y) ((x) & (y)) ++ ++/* ++ Returns the index of the first edge within the needle, returns 0 if no edge ++ is found. Example: 'ab' is the first edge in 'aaaaaaaaaabaarddg' ++ */ ++static inline size_t ++find_edge_in_needle (const char *ned) ++{ ++ size_t ind = 0; ++ while (ned[ind + 1] != '\0') ++ { ++ if (ned[ind] != ned[ind + 1]) ++ return ind; ++ else ++ ind = ind + 1; ++ } ++ return 0; ++} ++ ++/* ++ Compare needle with haystack byte by byte at specified location ++ */ ++static inline bool ++verify_string_match (const char *hay, const size_t hay_index, const char *ned, ++ size_t ind) ++{ ++ while (ned[ind] != '\0') ++ { ++ if (ned[ind] != hay[hay_index + ind]) ++ return false; ++ ind = ind + 1; ++ } ++ return true; ++} ++ ++/* ++ Compare needle with haystack at specified location. The first 64 bytes are ++ compared using a ZMM register. 
++ */ ++static inline bool ++verify_string_match_avx512 (const char *hay, const size_t hay_index, ++ const char *ned, const __mmask64 ned_mask, ++ const __m512i ned_zmm) ++{ ++ /* check first 64 bytes using zmm and then scalar */ ++ __m512i hay_zmm = _mm512_loadu_si512 (hay + hay_index); // safe to do so ++ __mmask64 match = _mm512_mask_cmpneq_epi8_mask (ned_mask, hay_zmm, ned_zmm); ++ if (match != 0x0) // failed the first few chars ++ return false; ++ else if (ned_mask == FULL_MMASK64) ++ return verify_string_match (hay, hay_index, ned, ZMM_SIZE_IN_BYTES); ++ return true; ++} ++ ++char * ++__strstr_avx512 (const char *haystack, const char *ned) ++{ ++ char first = ned[0]; ++ if (first == '\0') ++ return (char *)haystack; ++ if (ned[1] == '\0') ++ return (char *)strchr (haystack, ned[0]); ++ ++ size_t edge = find_edge_in_needle (ned); ++ ++ /* ensure haystack is as long as the pos of edge in needle */ ++ for (int ii = 0; ii < edge; ++ii) ++ { ++ if (haystack[ii] == '\0') ++ return NULL; ++ } ++ ++ /* ++ Load 64 bytes of the needle and save it to a zmm register ++ Read one cache line at a time to avoid loading across a page boundary ++ */ ++ __mmask64 ned_load_mask = _bzhi_u64 ( ++ FULL_MMASK64, 64 - ((uintptr_t) (ned) & 63)); ++ __m512i ned_zmm = _mm512_maskz_loadu_epi8 (ned_load_mask, ned); ++ __mmask64 ned_nullmask ++ = _mm512_mask_testn_epi8_mask (ned_load_mask, ned_zmm, ned_zmm); ++ ++ if (__glibc_unlikely (ned_nullmask == 0x0)) ++ { ++ ned_zmm = _mm512_loadu_si512 (ned); ++ ned_nullmask = _mm512_testn_epi8_mask (ned_zmm, ned_zmm); ++ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); ++ if (ned_nullmask != 0x0) ++ ned_load_mask = ned_load_mask >> 1; ++ } ++ else ++ { ++ ned_load_mask = ned_nullmask ^ (ned_nullmask - ONE_64BIT); ++ ned_load_mask = ned_load_mask >> 1; ++ } ++ const __m512i ned0 = _mm512_set1_epi8 (ned[edge]); ++ const __m512i ned1 = _mm512_set1_epi8 (ned[edge + 1]); ++ ++ /* ++ Read the bytes of haystack in the current cache line ++ */ ++ size_t hay_index = edge; ++ __mmask64 loadmask = _bzhi_u64 ( ++ FULL_MMASK64, 64 - ((uintptr_t) (haystack + hay_index) & 63)); ++ /* First load is a partial cache line */ ++ __m512i hay0 = _mm512_maskz_loadu_epi8 (loadmask, haystack + hay_index); ++ /* Search for NULL and compare only till null char */ ++ uint64_t nullmask ++ = cvtmask64_u64 (_mm512_mask_testn_epi8_mask (loadmask, hay0, hay0)); ++ uint64_t cmpmask = nullmask ^ (nullmask - ONE_64BIT); ++ cmpmask = cmpmask & cvtmask64_u64 (loadmask); ++ /* Search for the 2 charaters of needle */ ++ __mmask64 k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); ++ __mmask64 k1 = _mm512_cmpeq_epi8_mask (hay0, ned1); ++ k1 = kshiftri_mask64 (k1, 1); ++ /* k2 masks tell us if both chars from needle match */ ++ uint64_t k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; ++ /* For every match, search for the entire needle for a full match */ ++ while (k2) ++ { ++ uint64_t bitcount = _tzcnt_u64 (k2); ++ k2 = _blsr_u64 (k2); ++ size_t match_pos = hay_index + bitcount - edge; ++ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) ++ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) ++ { ++ /* ++ * Use vector compare as long as you are not crossing a page ++ */ ++ if (verify_string_match_avx512 (haystack, match_pos, ned, ++ ned_load_mask, ned_zmm)) ++ return (char *)haystack + match_pos; ++ } ++ else ++ { ++ if (verify_string_match (haystack, match_pos, ned, 0)) ++ return (char *)haystack + match_pos; ++ } ++ } ++ /* We haven't checked for potential match at the last char yet */ ++ haystack = (const char 
*)(((uintptr_t) (haystack + hay_index) | 63)); ++ hay_index = 0; ++ ++ /* ++ Loop over one cache line at a time to prevent reading over page ++ boundary ++ */ ++ __m512i hay1; ++ while (nullmask == 0) ++ { ++ hay0 = _mm512_loadu_si512 (haystack + hay_index); ++ hay1 = _mm512_load_si512 (haystack + hay_index ++ + 1); // Always 64 byte aligned ++ nullmask = cvtmask64_u64 (_mm512_testn_epi8_mask (hay1, hay1)); ++ /* Compare only till null char */ ++ cmpmask = nullmask ^ (nullmask - ONE_64BIT); ++ k0 = _mm512_cmpeq_epi8_mask (hay0, ned0); ++ k1 = _mm512_cmpeq_epi8_mask (hay1, ned1); ++ /* k2 masks tell us if both chars from needle match */ ++ k2 = cvtmask64_u64 (kand_mask64 (k0, k1)) & cmpmask; ++ /* For every match, compare full strings for potential match */ ++ while (k2) ++ { ++ uint64_t bitcount = _tzcnt_u64 (k2); ++ k2 = _blsr_u64 (k2); ++ size_t match_pos = hay_index + bitcount - edge; ++ if (((uintptr_t) (haystack + match_pos) & (PAGESIZE - 1)) ++ < PAGESIZE - 1 - ZMM_SIZE_IN_BYTES) ++ { ++ /* ++ * Use vector compare as long as you are not crossing a page ++ */ ++ if (verify_string_match_avx512 (haystack, match_pos, ned, ++ ned_load_mask, ned_zmm)) ++ return (char *)haystack + match_pos; ++ } ++ else ++ { ++ /* Compare byte by byte */ ++ if (verify_string_match (haystack, match_pos, ned, 0)) ++ return (char *)haystack + match_pos; ++ } ++ } ++ hay_index += ZMM_SIZE_IN_BYTES; ++ } ++ return NULL; ++} +diff --git a/sysdeps/x86_64/multiarch/strstr.c b/sysdeps/x86_64/multiarch/strstr.c +index 848601bde7583ca3..9474d6234e9b62d3 100644 +--- a/sysdeps/x86_64/multiarch/strstr.c ++++ b/sysdeps/x86_64/multiarch/strstr.c +@@ -35,16 +35,32 @@ + + extern __typeof (__redirect_strstr) __strstr_sse2_unaligned attribute_hidden; + extern __typeof (__redirect_strstr) __strstr_sse2 attribute_hidden; ++extern __typeof (__redirect_strstr) __strstr_avx512 attribute_hidden; + + #include "init-arch.h" + + /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. */ + extern __typeof (__redirect_strstr) __libc_strstr; +-libc_ifunc (__libc_strstr, +- HAS_ARCH_FEATURE (Fast_Unaligned_Load) +- ? __strstr_sse2_unaligned +- : __strstr_sse2) + ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features *cpu_features = __get_cpu_features (); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512DQ) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return __strstr_avx512; ++ ++ if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) ++ return __strstr_sse2_unaligned; ++ ++ return __strstr_sse2; ++} ++ ++libc_ifunc_redirected (__redirect_strstr, __libc_strstr, IFUNC_SELECTOR ()); + #undef strstr + strong_alias (__libc_strstr, strstr) diff --git a/glibc-upstream-2.34-281.patch b/glibc-upstream-2.34-281.patch new file mode 100644 index 0000000..3a32378 --- /dev/null +++ b/glibc-upstream-2.34-281.patch @@ -0,0 +1,385 @@ +commit 70be93d1c58916d289a5e6e7c7d9b989707a9e41 +Author: Noah Goldstein +Date: Mon Jun 6 21:11:27 2022 -0700 + + x86: Create header for VEC classes in x86 strings library + + This patch does not touch any existing code and is only meant to be a + tool for future patches so that simple source files can more easily be + maintained to target multiple VEC classes. + + There is no difference in the objdump of libc.so before and after this + patch. + Reviewed-by: H.J. 
Lu + + (cherry picked from commit 8a780a6b910023e71f3173f37f0793834c047554) + +diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +new file mode 100644 +index 0000000000000000..3f531dd47fceefe9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +@@ -0,0 +1,34 @@ ++/* Common config for AVX-RTM VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _AVX_RTM_VECS_H ++#define _AVX_RTM_VECS_H 1 ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define USE_WITH_RTM 1 ++#include "avx-vecs.h" ++ ++#undef SECTION ++#define SECTION(p) p##.avx.rtm ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/avx-vecs.h b/sysdeps/x86_64/multiarch/avx-vecs.h +new file mode 100644 +index 0000000000000000..89680f5db827c332 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/avx-vecs.h +@@ -0,0 +1,47 @@ ++/* Common config for AVX VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _AVX_VECS_H ++#define _AVX_VECS_H 1 ++ ++#ifdef VEC_SIZE ++# error "Multiple VEC configs included!" ++#endif ++ ++#define VEC_SIZE 32 ++#include "vec-macros.h" ++ ++#define USE_WITH_AVX 1 ++#define SECTION(p) p##.avx ++ ++/* 4-byte mov instructions with AVX2. */ ++#define MOV_SIZE 4 ++/* 1 (ret) + 3 (vzeroupper). */ ++#define RET_SIZE 4 ++#define VZEROUPPER vzeroupper ++ ++#define VMOVU vmovdqu ++#define VMOVA vmovdqa ++#define VMOVNT vmovntdq ++ ++/* Often need to access xmm portion. */ ++#define VEC_xmm VEC_any_xmm ++#define VEC VEC_any_ymm ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/evex-vecs-common.h b/sysdeps/x86_64/multiarch/evex-vecs-common.h +new file mode 100644 +index 0000000000000000..99806ebcd7bde53d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/evex-vecs-common.h +@@ -0,0 +1,39 @@ ++/* Common config for EVEX256 and EVEX512 VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _EVEX_VECS_COMMON_H ++#define _EVEX_VECS_COMMON_H 1 ++ ++#include "vec-macros.h" ++ ++/* 6-byte mov instructions with EVEX. */ ++#define MOV_SIZE 6 ++/* No vzeroupper needed. */ ++#define RET_SIZE 1 ++#define VZEROUPPER ++ ++#define VMOVU vmovdqu64 ++#define VMOVA vmovdqa64 ++#define VMOVNT vmovntdq ++ ++#define VEC_xmm VEC_hi_xmm ++#define VEC_ymm VEC_hi_ymm ++#define VEC_zmm VEC_hi_zmm ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/evex256-vecs.h b/sysdeps/x86_64/multiarch/evex256-vecs.h +new file mode 100644 +index 0000000000000000..222ba46dc74cfcbd +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/evex256-vecs.h +@@ -0,0 +1,35 @@ ++/* Common config for EVEX256 VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _EVEX256_VECS_H ++#define _EVEX256_VECS_H 1 ++ ++#ifdef VEC_SIZE ++# error "Multiple VEC configs included!" ++#endif ++ ++#define VEC_SIZE 32 ++#include "evex-vecs-common.h" ++ ++#define USE_WITH_EVEX256 1 ++#define SECTION(p) p##.evex ++ ++#define VEC VEC_ymm ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/evex512-vecs.h b/sysdeps/x86_64/multiarch/evex512-vecs.h +new file mode 100644 +index 0000000000000000..d1784d5368d8cebe +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/evex512-vecs.h +@@ -0,0 +1,35 @@ ++/* Common config for EVEX512 VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _EVEX512_VECS_H ++#define _EVEX512_VECS_H 1 ++ ++#ifdef VEC_SIZE ++# error "Multiple VEC configs included!" ++#endif ++ ++#define VEC_SIZE 64 ++#include "evex-vecs-common.h" ++ ++#define USE_WITH_EVEX512 1 ++#define SECTION(p) p##.evex512 ++ ++#define VEC VEC_zmm ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/sse2-vecs.h b/sysdeps/x86_64/multiarch/sse2-vecs.h +new file mode 100644 +index 0000000000000000..2b77a59d56ff2660 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/sse2-vecs.h +@@ -0,0 +1,47 @@ ++/* Common config for SSE2 VECs ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _SSE2_VECS_H ++#define _SSE2_VECS_H 1 ++ ++#ifdef VEC_SIZE ++# error "Multiple VEC configs included!" ++#endif ++ ++#define VEC_SIZE 16 ++#include "vec-macros.h" ++ ++#define USE_WITH_SSE2 1 ++#define SECTION(p) p ++ ++/* 3-byte mov instructions with SSE2. */ ++#define MOV_SIZE 3 ++/* No vzeroupper needed. */ ++#define RET_SIZE 1 ++#define VZEROUPPER ++ ++#define VMOVU movups ++#define VMOVA movaps ++#define VMOVNT movntdq ++ ++#define VEC_xmm VEC_any_xmm ++#define VEC VEC_any_xmm ++ ++ ++#endif +diff --git a/sysdeps/x86_64/multiarch/vec-macros.h b/sysdeps/x86_64/multiarch/vec-macros.h +new file mode 100644 +index 0000000000000000..9f3ffecede9feb26 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/vec-macros.h +@@ -0,0 +1,90 @@ ++/* Macro helpers for VEC_{type}({vec_num}) ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef _VEC_MACROS_H ++#define _VEC_MACROS_H 1 ++ ++#ifndef VEC_SIZE ++# error "Never include this file directly. Always include a vector config." ++#endif ++ ++/* Defines so we can use SSE2 / AVX2 / EVEX / EVEX512 encoding with same ++ VEC(N) values. 
*/ ++#define VEC_hi_xmm0 xmm16 ++#define VEC_hi_xmm1 xmm17 ++#define VEC_hi_xmm2 xmm18 ++#define VEC_hi_xmm3 xmm19 ++#define VEC_hi_xmm4 xmm20 ++#define VEC_hi_xmm5 xmm21 ++#define VEC_hi_xmm6 xmm22 ++#define VEC_hi_xmm7 xmm23 ++#define VEC_hi_xmm8 xmm24 ++#define VEC_hi_xmm9 xmm25 ++#define VEC_hi_xmm10 xmm26 ++#define VEC_hi_xmm11 xmm27 ++#define VEC_hi_xmm12 xmm28 ++#define VEC_hi_xmm13 xmm29 ++#define VEC_hi_xmm14 xmm30 ++#define VEC_hi_xmm15 xmm31 ++ ++#define VEC_hi_ymm0 ymm16 ++#define VEC_hi_ymm1 ymm17 ++#define VEC_hi_ymm2 ymm18 ++#define VEC_hi_ymm3 ymm19 ++#define VEC_hi_ymm4 ymm20 ++#define VEC_hi_ymm5 ymm21 ++#define VEC_hi_ymm6 ymm22 ++#define VEC_hi_ymm7 ymm23 ++#define VEC_hi_ymm8 ymm24 ++#define VEC_hi_ymm9 ymm25 ++#define VEC_hi_ymm10 ymm26 ++#define VEC_hi_ymm11 ymm27 ++#define VEC_hi_ymm12 ymm28 ++#define VEC_hi_ymm13 ymm29 ++#define VEC_hi_ymm14 ymm30 ++#define VEC_hi_ymm15 ymm31 ++ ++#define VEC_hi_zmm0 zmm16 ++#define VEC_hi_zmm1 zmm17 ++#define VEC_hi_zmm2 zmm18 ++#define VEC_hi_zmm3 zmm19 ++#define VEC_hi_zmm4 zmm20 ++#define VEC_hi_zmm5 zmm21 ++#define VEC_hi_zmm6 zmm22 ++#define VEC_hi_zmm7 zmm23 ++#define VEC_hi_zmm8 zmm24 ++#define VEC_hi_zmm9 zmm25 ++#define VEC_hi_zmm10 zmm26 ++#define VEC_hi_zmm11 zmm27 ++#define VEC_hi_zmm12 zmm28 ++#define VEC_hi_zmm13 zmm29 ++#define VEC_hi_zmm14 zmm30 ++#define VEC_hi_zmm15 zmm31 ++ ++#define PRIMITIVE_VEC(vec, num) vec##num ++ ++#define VEC_any_xmm(i) PRIMITIVE_VEC(xmm, i) ++#define VEC_any_ymm(i) PRIMITIVE_VEC(ymm, i) ++#define VEC_any_zmm(i) PRIMITIVE_VEC(zmm, i) ++ ++#define VEC_hi_xmm(i) PRIMITIVE_VEC(VEC_hi_xmm, i) ++#define VEC_hi_ymm(i) PRIMITIVE_VEC(VEC_hi_ymm, i) ++#define VEC_hi_zmm(i) PRIMITIVE_VEC(VEC_hi_zmm, i) ++ ++#endif diff --git a/glibc-upstream-2.34-282.patch b/glibc-upstream-2.34-282.patch new file mode 100644 index 0000000..b1c9c0d --- /dev/null +++ b/glibc-upstream-2.34-282.patch @@ -0,0 +1,90 @@ +commit e805606193e1a39956ca5ef73cb44a8796730686 +Author: Noah Goldstein +Date: Mon Jun 6 21:11:28 2022 -0700 + + x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret` + + The RTM vzeroupper mitigation has no way of replacing inline + vzeroupper not before a return. + + This can be useful when hoisting a vzeroupper to save code size + for example: + + ``` + L(foo): + cmpl %eax, %edx + jz L(bar) + tzcntl %eax, %eax + addq %rdi, %rax + VZEROUPPER_RETURN + + L(bar): + xorl %eax, %eax + VZEROUPPER_RETURN + ``` + + Can become: + + ``` + L(foo): + COND_VZEROUPPER + cmpl %eax, %edx + jz L(bar) + tzcntl %eax, %eax + addq %rdi, %rax + ret + + L(bar): + xorl %eax, %eax + ret + ``` + + This code does not change any existing functionality. + + There is no difference in the objdump of libc.so before and after this + patch. + Reviewed-by: H.J. 
Lu + + (cherry picked from commit dd5c483b2598f411428df4d8864c15c4b8a3cd68) + +diff --git a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +index 3f531dd47fceefe9..6ca9f5e6bae7ba72 100644 +--- a/sysdeps/x86_64/multiarch/avx-rtm-vecs.h ++++ b/sysdeps/x86_64/multiarch/avx-rtm-vecs.h +@@ -20,6 +20,7 @@ + #ifndef _AVX_RTM_VECS_H + #define _AVX_RTM_VECS_H 1 + ++#define COND_VZEROUPPER COND_VZEROUPPER_XTEST + #define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h +index 7bebdeb21095eda0..93e44be22e2275f1 100644 +--- a/sysdeps/x86_64/sysdep.h ++++ b/sysdeps/x86_64/sysdep.h +@@ -106,6 +106,24 @@ lose: \ + vzeroupper; \ + ret + ++/* Can be used to replace vzeroupper that is not directly before a ++ return. This is useful when hoisting a vzeroupper from multiple ++ return paths to decrease the total number of vzerouppers and code ++ size. */ ++#define COND_VZEROUPPER_XTEST \ ++ xtest; \ ++ jz 1f; \ ++ vzeroall; \ ++ jmp 2f; \ ++1: \ ++ vzeroupper; \ ++2: ++ ++/* In RTM define this as COND_VZEROUPPER_XTEST. */ ++#ifndef COND_VZEROUPPER ++# define COND_VZEROUPPER vzeroupper ++#endif ++ + /* Zero upper vector registers and return. */ + #ifndef ZERO_UPPER_VEC_REGISTERS_RETURN + # define ZERO_UPPER_VEC_REGISTERS_RETURN \ diff --git a/glibc-upstream-2.34-283.patch b/glibc-upstream-2.34-283.patch new file mode 100644 index 0000000..7745fef --- /dev/null +++ b/glibc-upstream-2.34-283.patch @@ -0,0 +1,696 @@ +commit 4901009dad8b3ab141ac6e0caebe99e03a67f5eb +Author: Noah Goldstein +Date: Mon Jun 6 21:11:30 2022 -0700 + + x86: Optimize memrchr-sse2.S + + The new code: + 1. prioritizes smaller lengths more. + 2. optimizes target placement more carefully. + 3. reuses logic more. + 4. fixes up various inefficiencies in the logic. + + The total code size saving is: 394 bytes + Geometric Mean of all benchmarks New / Old: 0.874 + + Regressions: + 1. The page cross case is now colder, especially re-entry from the + page cross case if a match is not found in the first VEC + (roughly 50%). My general opinion with this patch is this is + acceptable given the "coldness" of this case (less than 4%) and + generally performance improvement in the other far more common + cases. + + 2. There are some regressions 5-15% for medium/large user-arg + lengths that have a match in the first VEC. This is because the + logic was rewritten to optimize finds in the first VEC if the + user-arg length is shorter (where we see roughly 20-50% + performance improvements). It is not always the case this is a + regression. My intuition is some frontend quirk is partially + explaining the data although I haven't been able to find the + root cause. + + Full xcheck passes on x86_64. + Reviewed-by: H.J. Lu + + (cherry picked from commit 731feee3869550e93177e604604c1765d81de571) + +diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S +index cc2001167d77c83c..c2a5902bf9385c67 100644 +--- a/sysdeps/x86_64/memrchr.S ++++ b/sysdeps/x86_64/memrchr.S +@@ -19,362 +19,333 @@ + . */ + + #include ++#define VEC_SIZE 16 ++#define PAGE_SIZE 4096 + + .text +-ENTRY (__memrchr) +- movd %esi, %xmm1 +- +- sub $16, %RDX_LP +- jbe L(length_less16) +- +- punpcklbw %xmm1, %xmm1 +- punpcklbw %xmm1, %xmm1 +- +- add %RDX_LP, %RDI_LP +- pshufd $0, %xmm1, %xmm1 +- +- movdqu (%rdi), %xmm0 +- pcmpeqb %xmm1, %xmm0 +- +-/* Check if there is a match. 
*/ +- pmovmskb %xmm0, %eax +- test %eax, %eax +- jnz L(matches0) +- +- sub $64, %rdi +- mov %edi, %ecx +- and $15, %ecx +- jz L(loop_prolog) +- +- add $16, %rdi +- add $16, %rdx +- and $-16, %rdi +- sub %rcx, %rdx +- +- .p2align 4 +-L(loop_prolog): +- sub $64, %rdx +- jbe L(exit_loop) +- +- movdqa 48(%rdi), %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %eax +- test %eax, %eax +- jnz L(matches48) +- +- movdqa 32(%rdi), %xmm2 +- pcmpeqb %xmm1, %xmm2 +- pmovmskb %xmm2, %eax +- test %eax, %eax +- jnz L(matches32) +- +- movdqa 16(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm3 +- pmovmskb %xmm3, %eax +- test %eax, %eax +- jnz L(matches16) +- +- movdqa (%rdi), %xmm4 +- pcmpeqb %xmm1, %xmm4 +- pmovmskb %xmm4, %eax +- test %eax, %eax +- jnz L(matches0) +- +- sub $64, %rdi +- sub $64, %rdx +- jbe L(exit_loop) +- +- movdqa 48(%rdi), %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %eax +- test %eax, %eax +- jnz L(matches48) +- +- movdqa 32(%rdi), %xmm2 +- pcmpeqb %xmm1, %xmm2 +- pmovmskb %xmm2, %eax +- test %eax, %eax +- jnz L(matches32) +- +- movdqa 16(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm3 +- pmovmskb %xmm3, %eax +- test %eax, %eax +- jnz L(matches16) +- +- movdqa (%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm3 +- pmovmskb %xmm3, %eax +- test %eax, %eax +- jnz L(matches0) +- +- mov %edi, %ecx +- and $63, %ecx +- jz L(align64_loop) +- +- add $64, %rdi +- add $64, %rdx +- and $-64, %rdi +- sub %rcx, %rdx +- +- .p2align 4 +-L(align64_loop): +- sub $64, %rdi +- sub $64, %rdx +- jbe L(exit_loop) +- +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm2 +- movdqa 32(%rdi), %xmm3 +- movdqa 48(%rdi), %xmm4 +- +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm1, %xmm2 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm1, %xmm4 +- +- pmaxub %xmm3, %xmm0 +- pmaxub %xmm4, %xmm2 +- pmaxub %xmm0, %xmm2 +- pmovmskb %xmm2, %eax +- +- test %eax, %eax +- jz L(align64_loop) +- +- pmovmskb %xmm4, %eax +- test %eax, %eax +- jnz L(matches48) +- +- pmovmskb %xmm3, %eax +- test %eax, %eax +- jnz L(matches32) +- +- movdqa 16(%rdi), %xmm2 +- +- pcmpeqb %xmm1, %xmm2 +- pcmpeqb (%rdi), %xmm1 +- +- pmovmskb %xmm2, %eax +- test %eax, %eax +- jnz L(matches16) +- +- pmovmskb %xmm1, %eax +- bsr %eax, %eax +- +- add %rdi, %rax ++ENTRY_P2ALIGN(__memrchr, 6) ++#ifdef __ILP32__ ++ /* Clear upper bits. */ ++ mov %RDX_LP, %RDX_LP ++#endif ++ movd %esi, %xmm0 ++ ++ /* Get end pointer. */ ++ leaq (%rdx, %rdi), %rcx ++ ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0 ++ ++ /* Check if we can load 1x VEC without cross a page. */ ++ testl $(PAGE_SIZE - VEC_SIZE), %ecx ++ jz L(page_cross) ++ ++ /* NB: This load happens regardless of whether rdx (len) is zero. Since ++ it doesn't cross a page and the standard gurantees any pointer have ++ at least one-valid byte this load must be safe. For the entire ++ history of the x86 memrchr implementation this has been possible so ++ no code "should" be relying on a zero-length check before this load. ++ The zero-length check is moved to the page cross case because it is ++ 1) pretty cold and including it pushes the hot case len <= VEC_SIZE ++ into 2-cache lines. */ ++ movups -(VEC_SIZE)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ subq $VEC_SIZE, %rdx ++ ja L(more_1x_vec) ++L(ret_vec_x0_test): ++ /* Zero-flag set if eax (src) is zero. Destination unchanged if src is ++ zero. */ ++ bsrl %eax, %eax ++ jz L(ret_0) ++ /* Check if the CHAR match is in bounds. Need to truly zero `eax` here ++ if out of bounds. 
*/ ++ addl %edx, %eax ++ jl L(zero_0) ++ /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base ++ ptr. */ ++ addq %rdi, %rax ++L(ret_0): + ret + +- .p2align 4 +-L(exit_loop): +- add $64, %edx +- cmp $32, %edx +- jbe L(exit_loop_32) +- +- movdqa 48(%rdi), %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %eax +- test %eax, %eax +- jnz L(matches48) +- +- movdqa 32(%rdi), %xmm2 +- pcmpeqb %xmm1, %xmm2 +- pmovmskb %xmm2, %eax +- test %eax, %eax +- jnz L(matches32) +- +- movdqa 16(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm3 +- pmovmskb %xmm3, %eax +- test %eax, %eax +- jnz L(matches16_1) +- cmp $48, %edx +- jbe L(return_null) +- +- pcmpeqb (%rdi), %xmm1 +- pmovmskb %xmm1, %eax +- test %eax, %eax +- jnz L(matches0_1) +- xor %eax, %eax ++ .p2align 4,, 5 ++L(ret_vec_x0): ++ bsrl %eax, %eax ++ leaq -(VEC_SIZE)(%rcx, %rax), %rax + ret + +- .p2align 4 +-L(exit_loop_32): +- movdqa 48(%rdi), %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %eax +- test %eax, %eax +- jnz L(matches48_1) +- cmp $16, %edx +- jbe L(return_null) +- +- pcmpeqb 32(%rdi), %xmm1 +- pmovmskb %xmm1, %eax +- test %eax, %eax +- jnz L(matches32_1) +- xor %eax, %eax ++ .p2align 4,, 2 ++L(zero_0): ++ xorl %eax, %eax + ret + +- .p2align 4 +-L(matches0): +- bsr %eax, %eax +- add %rdi, %rax +- ret +- +- .p2align 4 +-L(matches16): +- bsr %eax, %eax +- lea 16(%rax, %rdi), %rax +- ret + +- .p2align 4 +-L(matches32): +- bsr %eax, %eax +- lea 32(%rax, %rdi), %rax ++ .p2align 4,, 8 ++L(more_1x_vec): ++ testl %eax, %eax ++ jnz L(ret_vec_x0) ++ ++ /* Align rcx (pointer to string). */ ++ decq %rcx ++ andq $-VEC_SIZE, %rcx ++ ++ movq %rcx, %rdx ++ /* NB: We could consistenyl save 1-byte in this pattern with `movaps ++ %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is ++ it adds more frontend uops (even if the moves can be eliminated) and ++ some percentage of the time actual backend uops. */ ++ movaps -(VEC_SIZE)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ subq %rdi, %rdx ++ pmovmskb %xmm1, %eax ++ ++ cmpq $(VEC_SIZE * 2), %rdx ++ ja L(more_2x_vec) ++L(last_2x_vec): ++ subl $VEC_SIZE, %edx ++ jbe L(ret_vec_x0_test) ++ ++ testl %eax, %eax ++ jnz L(ret_vec_x0) ++ ++ movaps -(VEC_SIZE * 2)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ subl $VEC_SIZE, %edx ++ bsrl %eax, %eax ++ jz L(ret_1) ++ addl %edx, %eax ++ jl L(zero_0) ++ addq %rdi, %rax ++L(ret_1): + ret + +- .p2align 4 +-L(matches48): +- bsr %eax, %eax +- lea 48(%rax, %rdi), %rax ++ /* Don't align. Otherwise lose 2-byte encoding in jump to L(page_cross) ++ causes the hot pause (length <= VEC_SIZE) to span multiple cache ++ lines. Naturally aligned % 16 to 8-bytes. */ ++L(page_cross): ++ /* Zero length check. */ ++ testq %rdx, %rdx ++ jz L(zero_0) ++ ++ leaq -1(%rcx), %r8 ++ andq $-(VEC_SIZE), %r8 ++ ++ movaps (%r8), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %esi ++ /* Shift out negative alignment (because we are starting from endptr and ++ working backwards). */ ++ negl %ecx ++ /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count ++ explicitly. */ ++ andl $(VEC_SIZE - 1), %ecx ++ shl %cl, %esi ++ movzwl %si, %eax ++ leaq (%rdi, %rdx), %rcx ++ cmpq %rdi, %r8 ++ ja L(more_1x_vec) ++ subl $VEC_SIZE, %edx ++ bsrl %eax, %eax ++ jz L(ret_2) ++ addl %edx, %eax ++ jl L(zero_1) ++ addq %rdi, %rax ++L(ret_2): + ret + +- .p2align 4 +-L(matches0_1): +- bsr %eax, %eax +- sub $64, %rdx +- add %rax, %rdx +- jl L(return_null) +- add %rdi, %rax ++ /* Fits in aliging bytes. 
*/ ++L(zero_1): ++ xorl %eax, %eax + ret + +- .p2align 4 +-L(matches16_1): +- bsr %eax, %eax +- sub $48, %rdx +- add %rax, %rdx +- jl L(return_null) +- lea 16(%rdi, %rax), %rax ++ .p2align 4,, 5 ++L(ret_vec_x1): ++ bsrl %eax, %eax ++ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax + ret + +- .p2align 4 +-L(matches32_1): +- bsr %eax, %eax +- sub $32, %rdx +- add %rax, %rdx +- jl L(return_null) +- lea 32(%rdi, %rax), %rax +- ret ++ .p2align 4,, 8 ++L(more_2x_vec): ++ testl %eax, %eax ++ jnz L(ret_vec_x0) + +- .p2align 4 +-L(matches48_1): +- bsr %eax, %eax +- sub $16, %rdx +- add %rax, %rdx +- jl L(return_null) +- lea 48(%rdi, %rax), %rax +- ret ++ movaps -(VEC_SIZE * 2)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jnz L(ret_vec_x1) + +- .p2align 4 +-L(return_null): +- xor %eax, %eax +- ret + +- .p2align 4 +-L(length_less16_offset0): +- test %edx, %edx +- jz L(return_null) ++ movaps -(VEC_SIZE * 3)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax + +- mov %dl, %cl +- pcmpeqb (%rdi), %xmm1 ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(more_4x_vec) + +- mov $1, %edx +- sal %cl, %edx +- sub $1, %edx ++ addl $(VEC_SIZE), %edx ++ jle L(ret_vec_x2_test) + +- pmovmskb %xmm1, %eax ++L(last_vec): ++ testl %eax, %eax ++ jnz L(ret_vec_x2) + +- and %edx, %eax +- test %eax, %eax +- jz L(return_null) ++ movaps -(VEC_SIZE * 4)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax + +- bsr %eax, %eax +- add %rdi, %rax ++ subl $(VEC_SIZE), %edx ++ bsrl %eax, %eax ++ jz L(ret_3) ++ addl %edx, %eax ++ jl L(zero_2) ++ addq %rdi, %rax ++L(ret_3): + ret + +- .p2align 4 +-L(length_less16): +- punpcklbw %xmm1, %xmm1 +- punpcklbw %xmm1, %xmm1 +- +- add $16, %edx +- +- pshufd $0, %xmm1, %xmm1 +- +- mov %edi, %ecx +- and $15, %ecx +- jz L(length_less16_offset0) +- +- mov %cl, %dh +- mov %ecx, %esi +- add %dl, %dh +- and $-16, %rdi +- +- sub $16, %dh +- ja L(length_less16_part2) +- +- pcmpeqb (%rdi), %xmm1 +- pmovmskb %xmm1, %eax +- +- sar %cl, %eax +- mov %dl, %cl +- +- mov $1, %edx +- sal %cl, %edx +- sub $1, %edx +- +- and %edx, %eax +- test %eax, %eax +- jz L(return_null) +- +- bsr %eax, %eax +- add %rdi, %rax +- add %rsi, %rax ++ .p2align 4,, 6 ++L(ret_vec_x2_test): ++ bsrl %eax, %eax ++ jz L(zero_2) ++ addl %edx, %eax ++ jl L(zero_2) ++ addq %rdi, %rax + ret + +- .p2align 4 +-L(length_less16_part2): +- movdqa 16(%rdi), %xmm2 +- pcmpeqb %xmm1, %xmm2 +- pmovmskb %xmm2, %eax +- +- mov %dh, %cl +- mov $1, %edx +- sal %cl, %edx +- sub $1, %edx +- +- and %edx, %eax ++L(zero_2): ++ xorl %eax, %eax ++ ret + +- test %eax, %eax +- jnz L(length_less16_part2_return) + +- pcmpeqb (%rdi), %xmm1 +- pmovmskb %xmm1, %eax ++ .p2align 4,, 5 ++L(ret_vec_x2): ++ bsrl %eax, %eax ++ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax ++ ret + +- mov %esi, %ecx +- sar %cl, %eax +- test %eax, %eax +- jz L(return_null) ++ .p2align 4,, 5 ++L(ret_vec_x3): ++ bsrl %eax, %eax ++ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ++ ret + +- bsr %eax, %eax +- add %rdi, %rax +- add %rsi, %rax ++ .p2align 4,, 8 ++L(more_4x_vec): ++ testl %eax, %eax ++ jnz L(ret_vec_x2) ++ ++ movaps -(VEC_SIZE * 4)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ testl %eax, %eax ++ jnz L(ret_vec_x3) ++ ++ addq $-(VEC_SIZE * 4), %rcx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec) ++ ++ /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end ++ keeping the code from spilling to the next cache line. 
*/ ++ addq $(VEC_SIZE * 4 - 1), %rcx ++ andq $-(VEC_SIZE * 4), %rcx ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ andq $-(VEC_SIZE * 4), %rdx ++ ++ .p2align 4,, 11 ++L(loop_4x_vec): ++ movaps (VEC_SIZE * -1)(%rcx), %xmm1 ++ movaps (VEC_SIZE * -2)(%rcx), %xmm2 ++ movaps (VEC_SIZE * -3)(%rcx), %xmm3 ++ movaps (VEC_SIZE * -4)(%rcx), %xmm4 ++ pcmpeqb %xmm0, %xmm1 ++ pcmpeqb %xmm0, %xmm2 ++ pcmpeqb %xmm0, %xmm3 ++ pcmpeqb %xmm0, %xmm4 ++ ++ por %xmm1, %xmm2 ++ por %xmm3, %xmm4 ++ por %xmm2, %xmm4 ++ ++ pmovmskb %xmm4, %esi ++ testl %esi, %esi ++ jnz L(loop_end) ++ ++ addq $-(VEC_SIZE * 4), %rcx ++ cmpq %rdx, %rcx ++ jne L(loop_4x_vec) ++ ++ subl %edi, %edx ++ ++ /* Ends up being 1-byte nop. */ ++ .p2align 4,, 2 ++L(last_4x_vec): ++ movaps -(VEC_SIZE)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ testl %eax, %eax ++ jnz L(ret_vec_x0) ++ ++ ++ movaps -(VEC_SIZE * 2)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ testl %eax, %eax ++ jnz L(ret_vec_end) ++ ++ movaps -(VEC_SIZE * 3)(%rcx), %xmm1 ++ pcmpeqb %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ ++ subl $(VEC_SIZE * 3), %edx ++ ja L(last_vec) ++ bsrl %eax, %eax ++ jz L(ret_4) ++ addl %edx, %eax ++ jl L(zero_3) ++ addq %rdi, %rax ++L(ret_4): + ret + +- .p2align 4 +-L(length_less16_part2_return): +- bsr %eax, %eax +- lea 16(%rax, %rdi), %rax ++ /* Ends up being 1-byte nop. */ ++ .p2align 4,, 3 ++L(loop_end): ++ pmovmskb %xmm1, %eax ++ sall $16, %eax ++ jnz L(ret_vec_end) ++ ++ pmovmskb %xmm2, %eax ++ testl %eax, %eax ++ jnz L(ret_vec_end) ++ ++ pmovmskb %xmm3, %eax ++ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) ++ then it won't affect the result in esi (VEC4). If ecx is non-zero ++ then CHAR in VEC3 and bsrq will use that position. */ ++ sall $16, %eax ++ orl %esi, %eax ++ bsrl %eax, %eax ++ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax + ret + +-END (__memrchr) ++L(ret_vec_end): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * -2)(%rax, %rcx), %rax ++ ret ++ /* Use in L(last_4x_vec). In the same cache line. This is just a spare ++ aligning bytes. */ ++L(zero_3): ++ xorl %eax, %eax ++ ret ++ /* 2-bytes from next cache line. */ ++END(__memrchr) + weak_alias (__memrchr, memrchr) diff --git a/glibc-upstream-2.34-284.patch b/glibc-upstream-2.34-284.patch new file mode 100644 index 0000000..846f807 --- /dev/null +++ b/glibc-upstream-2.34-284.patch @@ -0,0 +1,624 @@ +commit 83a986e9fbc301e6056dbc9d9ec6888621b60f67 +Author: Noah Goldstein +Date: Mon Jun 6 21:11:31 2022 -0700 + + x86: Optimize memrchr-evex.S + + The new code: + 1. prioritizes smaller user-arg lengths more. + 2. optimizes target placement more carefully + 3. reuses logic more + 4. fixes up various inefficiencies in the logic. The biggest + case here is the `lzcnt` logic for checking returns which + saves either a branch or multiple instructions. + + The total code size saving is: 263 bytes + Geometric Mean of all benchmarks New / Old: 0.755 + + Regressions: + There are some regressions. Particularly where the length (user arg + length) is large but the position of the match char is near the + beginning of the string (in first VEC). This case has roughly a + 20% regression. + + This is because the new logic gives the hot path for immediate matches + to shorter lengths (the more common input). This case has roughly + a 35% speedup. + + Full xcheck passes on x86_64. + Reviewed-by: H.J. 
Lu + + (cherry picked from commit b4209615a06b01c974f47b4998b00e4c7b1aa5d9) + +diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S +index 16bf8e02b1e80c84..bddc89c3754894ed 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S +@@ -19,319 +19,316 @@ + #if IS_IN (libc) + + # include ++# include "evex256-vecs.h" ++# if VEC_SIZE != 32 ++# error "VEC_SIZE != 32 unimplemented" ++# endif ++ ++# ifndef MEMRCHR ++# define MEMRCHR __memrchr_evex ++# endif ++ ++# define PAGE_SIZE 4096 ++# define VECMATCH VEC(0) ++ ++ .section SECTION(.text), "ax", @progbits ++ENTRY_P2ALIGN(MEMRCHR, 6) ++# ifdef __ILP32__ ++ /* Clear upper bits. */ ++ and %RDX_LP, %RDX_LP ++# else ++ test %RDX_LP, %RDX_LP ++# endif ++ jz L(zero_0) ++ ++ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a ++ correct page cross check and 2) it correctly sets up end ptr to be ++ subtract by lzcnt aligned. */ ++ leaq -1(%rdi, %rdx), %rax ++ vpbroadcastb %esi, %VECMATCH ++ ++ /* Check if we can load 1x VEC without cross a page. */ ++ testl $(PAGE_SIZE - VEC_SIZE), %eax ++ jz L(page_cross) ++ ++ /* Don't use rax for pointer here because EVEX has better encoding with ++ offset % VEC_SIZE == 0. */ ++ vpcmpb $0, -(VEC_SIZE)(%rdi, %rdx), %VECMATCH, %k0 ++ kmovd %k0, %ecx ++ ++ /* Fall through for rdx (len) <= VEC_SIZE (expect small sizes). */ ++ cmpq $VEC_SIZE, %rdx ++ ja L(more_1x_vec) ++L(ret_vec_x0_test): ++ ++ /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which ++ will guarantee edx (len) is less than it. */ ++ lzcntl %ecx, %ecx ++ cmpl %ecx, %edx ++ jle L(zero_0) ++ subq %rcx, %rax ++ ret + +-# define VMOVA vmovdqa64 +- +-# define YMMMATCH ymm16 +- +-# define VEC_SIZE 32 +- +- .section .text.evex,"ax",@progbits +-ENTRY (__memrchr_evex) +- /* Broadcast CHAR to YMMMATCH. */ +- vpbroadcastb %esi, %YMMMATCH +- +- sub $VEC_SIZE, %RDX_LP +- jbe L(last_vec_or_less) +- +- add %RDX_LP, %RDI_LP +- +- /* Check the last VEC_SIZE bytes. */ +- vpcmpb $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(last_vec_x0) +- +- subq $(VEC_SIZE * 4), %rdi +- movl %edi, %ecx +- andl $(VEC_SIZE - 1), %ecx +- jz L(aligned_more) +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rdx +- andq $-VEC_SIZE, %rdi +- subq %rcx, %rdx +- +- .p2align 4 +-L(aligned_more): +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +- +- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) +- +- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) +- +- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(last_vec_x1) +- +- vpcmpb $0, (%rdi), %YMMMATCH, %k4 +- kmovd %k4, %eax +- testl %eax, %eax +- jnz L(last_vec_x0) +- +- /* Align data to 4 * VEC_SIZE for loop with fewer branches. +- There are some overlaps with above if data isn't aligned +- to 4 * VEC_SIZE. */ +- movl %edi, %ecx +- andl $(VEC_SIZE * 4 - 1), %ecx +- jz L(loop_4x_vec) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rdx +- andq $-(VEC_SIZE * 4), %rdi +- subq %rcx, %rdx ++ /* Fits in aligning bytes of first cache line. */ ++L(zero_0): ++ xorl %eax, %eax ++ ret + +- .p2align 4 +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- subq $(VEC_SIZE * 4), %rdi +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +- +- vpcmpb $0, (%rdi), %YMMMATCH, %k1 +- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 +- kord %k1, %k2, %k5 +- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 +- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 +- +- kord %k3, %k4, %k6 +- kortestd %k5, %k6 +- jz L(loop_4x_vec) +- +- /* There is a match. */ +- kmovd %k4, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) +- +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) +- +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(last_vec_x1) +- +- kmovd %k1, %eax +- bsrl %eax, %eax +- addq %rdi, %rax ++ .p2align 4,, 9 ++L(ret_vec_x0_dec): ++ decq %rax ++L(ret_vec_x0): ++ lzcntl %ecx, %ecx ++ subq %rcx, %rax + ret + +- .p2align 4 +-L(last_4x_vec_or_less): +- addl $(VEC_SIZE * 4), %edx +- cmpl $(VEC_SIZE * 2), %edx +- jbe L(last_2x_vec) ++ .p2align 4,, 10 ++L(more_1x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0) + +- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) ++ /* Align rax (pointer to string). */ ++ andq $-VEC_SIZE, %rax + +- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) ++ /* Recompute length after aligning. */ ++ movq %rax, %rdx + +- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(last_vec_x1_check) +- cmpl $(VEC_SIZE * 3), %edx +- jbe L(zero) ++ /* Need no matter what. */ ++ vpcmpb $0, -(VEC_SIZE)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- vpcmpb $0, (%rdi), %YMMMATCH, %k4 +- kmovd %k4, %eax +- testl %eax, %eax +- jz L(zero) +- bsrl %eax, %eax +- subq $(VEC_SIZE * 4), %rdx +- addq %rax, %rdx +- jl L(zero) +- addq %rdi, %rax +- ret ++ subq %rdi, %rdx + +- .p2align 4 ++ cmpq $(VEC_SIZE * 2), %rdx ++ ja L(more_2x_vec) + L(last_2x_vec): +- vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3_check) ++ ++ /* Must dec rax because L(ret_vec_x0_test) expects it. */ ++ decq %rax + cmpl $VEC_SIZE, %edx +- jbe L(zero) +- +- vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jz L(zero) +- bsrl %eax, %eax +- subq $(VEC_SIZE * 2), %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $(VEC_SIZE * 2), %eax +- addq %rdi, %rax ++ jbe L(ret_vec_x0_test) ++ ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0) ++ ++ /* Don't use rax for pointer here because EVEX has better encoding with ++ offset % VEC_SIZE == 0. */ ++ vpcmpb $0, -(VEC_SIZE * 2)(%rdi, %rdx), %VECMATCH, %k0 ++ kmovd %k0, %ecx ++ /* NB: 64-bit lzcnt. This will naturally add 32 to position. */ ++ lzcntq %rcx, %rcx ++ cmpl %ecx, %edx ++ jle L(zero_0) ++ subq %rcx, %rax + ret + +- .p2align 4 +-L(last_vec_x0): +- bsrl %eax, %eax +- addq %rdi, %rax ++ /* Inexpensive place to put this regarding code size / target alignments ++ / ICache NLP. Necessary for 2-byte encoding of jump to page cross ++ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit ++ in first cache line. */ ++L(page_cross): ++ movq %rax, %rsi ++ andq $-VEC_SIZE, %rsi ++ vpcmpb $0, (%rsi), %VECMATCH, %k0 ++ kmovd %k0, %r8d ++ /* Shift out negative alignment (because we are starting from endptr and ++ working backwards). */ ++ movl %eax, %ecx ++ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). 
*/ ++ notl %ecx ++ shlxl %ecx, %r8d, %ecx ++ cmpq %rdi, %rsi ++ ja L(more_1x_vec) ++ lzcntl %ecx, %ecx ++ cmpl %ecx, %edx ++ jle L(zero_1) ++ subq %rcx, %rax + ret + +- .p2align 4 +-L(last_vec_x1): +- bsrl %eax, %eax +- addl $VEC_SIZE, %eax +- addq %rdi, %rax ++ /* Continue creating zero labels that fit in aligning bytes and get ++ 2-byte encoding / are in the same cache line as condition. */ ++L(zero_1): ++ xorl %eax, %eax + ret + +- .p2align 4 +-L(last_vec_x2): +- bsrl %eax, %eax +- addl $(VEC_SIZE * 2), %eax +- addq %rdi, %rax ++ .p2align 4,, 8 ++L(ret_vec_x1): ++ /* This will naturally add 32 to position. */ ++ bsrl %ecx, %ecx ++ leaq -(VEC_SIZE * 2)(%rcx, %rax), %rax + ret + +- .p2align 4 +-L(last_vec_x3): +- bsrl %eax, %eax +- addl $(VEC_SIZE * 3), %eax +- addq %rdi, %rax +- ret ++ .p2align 4,, 8 ++L(more_2x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0_dec) + +- .p2align 4 +-L(last_vec_x1_check): +- bsrl %eax, %eax +- subq $(VEC_SIZE * 3), %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $VEC_SIZE, %eax +- addq %rdi, %rax +- ret ++ vpcmpb $0, -(VEC_SIZE * 2)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1) + +- .p2align 4 +-L(last_vec_x3_check): +- bsrl %eax, %eax +- subq $VEC_SIZE, %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $(VEC_SIZE * 3), %eax +- addq %rdi, %rax +- ret ++ /* Need no matter what. */ ++ vpcmpb $0, -(VEC_SIZE * 3)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(more_4x_vec) ++ ++ cmpl $(VEC_SIZE * -1), %edx ++ jle L(ret_vec_x2_test) ++L(last_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x2) ++ ++ ++ /* Need no matter what. */ ++ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 3 + 1), %rax ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ ja L(zero_1) + ret + +- .p2align 4 +-L(last_vec_or_less_aligned): +- movl %edx, %ecx +- +- vpcmpb $0, (%rdi), %YMMMATCH, %k1 +- +- movl $1, %edx +- /* Support rdx << 32. */ +- salq %cl, %rdx +- subq $1, %rdx +- +- kmovd %k1, %eax +- +- /* Remove the trailing bytes. */ +- andl %edx, %eax +- testl %eax, %eax +- jz L(zero) +- +- bsrl %eax, %eax +- addq %rdi, %rax ++ .p2align 4,, 8 ++L(ret_vec_x2_test): ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 2 + 1), %rax ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ ja L(zero_1) + ret + +- .p2align 4 +-L(last_vec_or_less): +- addl $VEC_SIZE, %edx +- +- /* Check for zero length. */ +- testl %edx, %edx +- jz L(zero) +- +- movl %edi, %ecx +- andl $(VEC_SIZE - 1), %ecx +- jz L(last_vec_or_less_aligned) +- +- movl %ecx, %esi +- movl %ecx, %r8d +- addl %edx, %esi +- andq $-VEC_SIZE, %rdi ++ .p2align 4,, 8 ++L(ret_vec_x2): ++ bsrl %ecx, %ecx ++ leaq -(VEC_SIZE * 3)(%rcx, %rax), %rax ++ ret + +- subl $VEC_SIZE, %esi +- ja L(last_vec_2x_aligned) ++ .p2align 4,, 8 ++L(ret_vec_x3): ++ bsrl %ecx, %ecx ++ leaq -(VEC_SIZE * 4)(%rcx, %rax), %rax ++ ret + +- /* Check the last VEC. */ +- vpcmpb $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ .p2align 4,, 8 ++L(more_4x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x2) + +- /* Remove the leading and trailing bytes. */ +- sarl %cl, %eax +- movl %edx, %ecx ++ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- movl $1, %edx +- sall %cl, %edx +- subl $1, %edx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x3) + +- andl %edx, %eax +- testl %eax, %eax +- jz L(zero) ++ /* Check if near end before re-aligning (otherwise might do an ++ unnecessary loop iteration). 
*/ ++ addq $-(VEC_SIZE * 4), %rax ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec) + +- bsrl %eax, %eax +- addq %rdi, %rax +- addq %r8, %rax +- ret ++ decq %rax ++ andq $-(VEC_SIZE * 4), %rax ++ movq %rdi, %rdx ++ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because ++ lengths that overflow can be valid and break the comparison. */ ++ andq $-(VEC_SIZE * 4), %rdx + + .p2align 4 +-L(last_vec_2x_aligned): +- movl %esi, %ecx +- +- /* Check the last VEC. */ +- vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++L(loop_4x_vec): ++ /* Store 1 were not-equals and 0 where equals in k1 (used to mask later ++ on). */ ++ vpcmpb $4, (VEC_SIZE * 3)(%rax), %VECMATCH, %k1 ++ ++ /* VEC(2/3) will have zero-byte where we found a CHAR. */ ++ vpxorq (VEC_SIZE * 2)(%rax), %VECMATCH, %VEC(2) ++ vpxorq (VEC_SIZE * 1)(%rax), %VECMATCH, %VEC(3) ++ vpcmpb $0, (VEC_SIZE * 0)(%rax), %VECMATCH, %k4 ++ ++ /* Combine VEC(2/3) with min and maskz with k1 (k1 has zero bit where ++ CHAR is found and VEC(2/3) have zero-byte where CHAR is found. */ ++ vpminub %VEC(2), %VEC(3), %VEC(3){%k1}{z} ++ vptestnmb %VEC(3), %VEC(3), %k2 ++ ++ /* Any 1s and we found CHAR. */ ++ kortestd %k2, %k4 ++ jnz L(loop_end) ++ ++ addq $-(VEC_SIZE * 4), %rax ++ cmpq %rdx, %rax ++ jne L(loop_4x_vec) ++ ++ /* Need to re-adjust rdx / rax for L(last_4x_vec). */ ++ subq $-(VEC_SIZE * 4), %rdx ++ movq %rdx, %rax ++ subl %edi, %edx ++L(last_4x_vec): ++ ++ /* Used no matter what. */ ++ vpcmpb $0, (VEC_SIZE * -1)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- movl $1, %edx +- sall %cl, %edx +- subl $1, %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) + +- kmovd %k1, %eax ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0_dec) + +- /* Remove the trailing bytes. */ +- andl %edx, %eax + +- testl %eax, %eax +- jnz L(last_vec_x1) ++ vpcmpb $0, (VEC_SIZE * -2)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- /* Check the second last VEC. */ +- vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1) + +- movl %r8d, %ecx ++ /* Used no matter what. */ ++ vpcmpb $0, (VEC_SIZE * -3)(%rax), %VECMATCH, %k0 ++ kmovd %k0, %ecx + +- kmovd %k1, %eax ++ cmpl $(VEC_SIZE * 3), %edx ++ ja L(last_vec) + +- /* Remove the leading bytes. Must use unsigned right shift for +- bsrl below. */ +- shrl %cl, %eax +- testl %eax, %eax +- jz L(zero) ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 2 + 1), %rax ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ jbe L(ret_1) ++ xorl %eax, %eax ++L(ret_1): ++ ret + +- bsrl %eax, %eax +- addq %rdi, %rax +- addq %r8, %rax ++ .p2align 4,, 6 ++L(loop_end): ++ kmovd %k1, %ecx ++ notl %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0_end) ++ ++ vptestnmb %VEC(2), %VEC(2), %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1_end) ++ ++ kmovd %k2, %ecx ++ kmovd %k4, %esi ++ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) ++ then it won't affect the result in esi (VEC4). If ecx is non-zero ++ then CHAR in VEC3 and bsrq will use that position. 
*/ ++ salq $32, %rcx ++ orq %rsi, %rcx ++ bsrq %rcx, %rcx ++ addq %rcx, %rax ++ ret ++ .p2align 4,, 4 ++L(ret_vec_x0_end): ++ addq $(VEC_SIZE), %rax ++L(ret_vec_x1_end): ++ bsrl %ecx, %ecx ++ leaq (VEC_SIZE * 2)(%rax, %rcx), %rax + ret +-END (__memrchr_evex) ++ ++END(MEMRCHR) + #endif diff --git a/glibc-upstream-2.34-285.patch b/glibc-upstream-2.34-285.patch new file mode 100644 index 0000000..c3b0837 --- /dev/null +++ b/glibc-upstream-2.34-285.patch @@ -0,0 +1,645 @@ +commit b05bd59823bcedee281d3fd5bd4928698ea9d69d +Author: Noah Goldstein +Date: Mon Jun 6 21:11:32 2022 -0700 + + x86: Optimize memrchr-avx2.S + + The new code: + 1. prioritizes smaller user-arg lengths more. + 2. optimizes target placement more carefully + 3. reuses logic more + 4. fixes up various inefficiencies in the logic. The biggest + case here is the `lzcnt` logic for checking returns which + saves either a branch or multiple instructions. + + The total code size saving is: 306 bytes + Geometric Mean of all benchmarks New / Old: 0.760 + + Regressions: + There are some regressions. Particularly where the length (user arg + length) is large but the position of the match char is near the + beginning of the string (in first VEC). This case has roughly a + 10-20% regression. + + This is because the new logic gives the hot path for immediate matches + to shorter lengths (the more common input). This case has roughly + a 15-45% speedup. + + Full xcheck passes on x86_64. + Reviewed-by: H.J. Lu + + (cherry picked from commit af5306a735eb0966fdc2f8ccdafa8888e2df0c87) + +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +index cea2d2a72db7406a..5e9beeeef2677c9f 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +@@ -2,6 +2,7 @@ + # define MEMRCHR __memrchr_avx2_rtm + #endif + ++#define COND_VZEROUPPER COND_VZEROUPPER_XTEST + #define ZERO_UPPER_VEC_REGISTERS_RETURN \ + ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index ac7370cb06e9a0fd..5f8e0be18cfe4fad 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -21,340 +21,318 @@ + # include + + # ifndef MEMRCHR +-# define MEMRCHR __memrchr_avx2 ++# define MEMRCHR __memrchr_avx2 + # endif + + # ifndef VZEROUPPER +-# define VZEROUPPER vzeroupper ++# define VZEROUPPER vzeroupper + # endif + + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + +-# define VEC_SIZE 32 ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section SECTION(.text), "ax", @progbits ++ENTRY(MEMRCHR) ++# ifdef __ILP32__ ++ /* Clear upper bits. */ ++ and %RDX_LP, %RDX_LP ++# else ++ test %RDX_LP, %RDX_LP ++# endif ++ jz L(zero_0) + +- .section SECTION(.text),"ax",@progbits +-ENTRY (MEMRCHR) +- /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 +- vpbroadcastb %xmm0, %ymm0 +- +- sub $VEC_SIZE, %RDX_LP +- jbe L(last_vec_or_less) +- +- add %RDX_LP, %RDI_LP +- +- /* Check the last VEC_SIZE bytes. */ +- vpcmpeqb (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(last_vec_x0) ++ /* Get end pointer. Minus one for two reasons. 1) It is necessary for a ++ correct page cross check and 2) it correctly sets up end ptr to be ++ subtract by lzcnt aligned. 
*/ ++ leaq -1(%rdx, %rdi), %rax + +- subq $(VEC_SIZE * 4), %rdi +- movl %edi, %ecx +- andl $(VEC_SIZE - 1), %ecx +- jz L(aligned_more) ++ vpbroadcastb %xmm0, %ymm0 + +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rdx +- andq $-VEC_SIZE, %rdi +- subq %rcx, %rdx ++ /* Check if we can load 1x VEC without cross a page. */ ++ testl $(PAGE_SIZE - VEC_SIZE), %eax ++ jz L(page_cross) ++ ++ vpcmpeqb -(VEC_SIZE - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ cmpq $VEC_SIZE, %rdx ++ ja L(more_1x_vec) ++ ++L(ret_vec_x0_test): ++ /* If ecx is zero (no matches) lzcnt will set it 32 (VEC_SIZE) which ++ will gurantee edx (len) is less than it. */ ++ lzcntl %ecx, %ecx ++ ++ /* Hoist vzeroupper (not great for RTM) to save code size. This allows ++ all logic for edx (len) <= VEC_SIZE to fit in first cache line. */ ++ COND_VZEROUPPER ++ cmpl %ecx, %edx ++ jle L(zero_0) ++ subq %rcx, %rax ++ ret + +- .p2align 4 +-L(aligned_more): +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +- +- /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) +- +- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) +- +- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(last_vec_x1) +- +- vpcmpeqb (%rdi), %ymm0, %ymm4 +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +- jnz L(last_vec_x0) +- +- /* Align data to 4 * VEC_SIZE for loop with fewer branches. +- There are some overlaps with above if data isn't aligned +- to 4 * VEC_SIZE. */ +- movl %edi, %ecx +- andl $(VEC_SIZE * 4 - 1), %ecx +- jz L(loop_4x_vec) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rdx +- andq $-(VEC_SIZE * 4), %rdi +- subq %rcx, %rdx ++ /* Fits in aligning bytes of first cache line. */ ++L(zero_0): ++ xorl %eax, %eax ++ ret + +- .p2align 4 +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- subq $(VEC_SIZE * 4), %rdi +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +- +- vmovdqa (%rdi), %ymm1 +- vmovdqa VEC_SIZE(%rdi), %ymm2 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 +- +- vpcmpeqb %ymm1, %ymm0, %ymm1 +- vpcmpeqb %ymm2, %ymm0, %ymm2 +- vpcmpeqb %ymm3, %ymm0, %ymm3 +- vpcmpeqb %ymm4, %ymm0, %ymm4 +- +- vpor %ymm1, %ymm2, %ymm5 +- vpor %ymm3, %ymm4, %ymm6 +- vpor %ymm5, %ymm6, %ymm5 +- +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jz L(loop_4x_vec) +- +- /* There is a match. 
*/ +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) +- +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) +- +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(last_vec_x1) +- +- vpmovmskb %ymm1, %eax +- bsrl %eax, %eax +- addq %rdi, %rax ++ .p2align 4,, 9 ++L(ret_vec_x0): ++ lzcntl %ecx, %ecx ++ subq %rcx, %rax + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(last_4x_vec_or_less): +- addl $(VEC_SIZE * 4), %edx +- cmpl $(VEC_SIZE * 2), %edx +- jbe L(last_2x_vec) +- +- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3) +- +- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(last_vec_x2) +- +- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(last_vec_x1_check) +- cmpl $(VEC_SIZE * 3), %edx +- jbe L(zero) +- +- vpcmpeqb (%rdi), %ymm0, %ymm4 +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +- jz L(zero) +- bsrl %eax, %eax +- subq $(VEC_SIZE * 4), %rdx +- addq %rax, %rdx +- jl L(zero) +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- .p2align 4 ++ .p2align 4,, 10 ++L(more_1x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0) ++ ++ /* Align rax (string pointer). */ ++ andq $-VEC_SIZE, %rax ++ ++ /* Recompute remaining length after aligning. */ ++ movq %rax, %rdx ++ /* Need this comparison next no matter what. */ ++ vpcmpeqb -(VEC_SIZE)(%rax), %ymm0, %ymm1 ++ subq %rdi, %rdx ++ decq %rax ++ vpmovmskb %ymm1, %ecx ++ /* Fall through for short (hotter than length). */ ++ cmpq $(VEC_SIZE * 2), %rdx ++ ja L(more_2x_vec) + L(last_2x_vec): +- vpcmpeqb (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(last_vec_x3_check) + cmpl $VEC_SIZE, %edx +- jbe L(zero) +- +- vpcmpeqb (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jz L(zero) +- bsrl %eax, %eax +- subq $(VEC_SIZE * 2), %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $(VEC_SIZE * 2), %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(last_vec_x0): +- bsrl %eax, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ jbe L(ret_vec_x0_test) ++ ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0) ++ ++ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ /* 64-bit lzcnt. This will naturally add 32 to position. */ ++ lzcntq %rcx, %rcx ++ COND_VZEROUPPER ++ cmpl %ecx, %edx ++ jle L(zero_0) ++ subq %rcx, %rax ++ ret + +- .p2align 4 +-L(last_vec_x1): +- bsrl %eax, %eax +- addl $VEC_SIZE, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN + +- .p2align 4 +-L(last_vec_x2): +- bsrl %eax, %eax +- addl $(VEC_SIZE * 2), %eax +- addq %rdi, %rax ++ /* Inexpensive place to put this regarding code size / target alignments ++ / ICache NLP. Necessary for 2-byte encoding of jump to page cross ++ case which in turn is necessary for hot path (len <= VEC_SIZE) to fit ++ in first cache line. */ ++L(page_cross): ++ movq %rax, %rsi ++ andq $-VEC_SIZE, %rsi ++ vpcmpeqb (%rsi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ /* Shift out negative alignment (because we are starting from endptr and ++ working backwards). */ ++ movl %eax, %r8d ++ /* notl because eax already has endptr - 1. (-x = ~(x - 1)). */ ++ notl %r8d ++ shlxl %r8d, %ecx, %ecx ++ cmpq %rdi, %rsi ++ ja L(more_1x_vec) ++ lzcntl %ecx, %ecx ++ COND_VZEROUPPER ++ cmpl %ecx, %edx ++ jle L(zero_0) ++ subq %rcx, %rax ++ ret ++ .p2align 4,, 11 ++L(ret_vec_x1): ++ /* This will naturally add 32 to position. 
*/ ++ lzcntq %rcx, %rcx ++ subq %rcx, %rax + VZEROUPPER_RETURN ++ .p2align 4,, 10 ++L(more_2x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0) + +- .p2align 4 +-L(last_vec_x3): +- bsrl %eax, %eax +- addl $(VEC_SIZE * 3), %eax +- addq %rdi, %rax +- ret ++ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1) + +- .p2align 4 +-L(last_vec_x1_check): +- bsrl %eax, %eax +- subq $(VEC_SIZE * 3), %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $VEC_SIZE, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN + +- .p2align 4 +-L(last_vec_x3_check): +- bsrl %eax, %eax +- subq $VEC_SIZE, %rdx +- addq %rax, %rdx +- jl L(zero) +- addl $(VEC_SIZE * 3), %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ /* Needed no matter what. */ ++ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- VZEROUPPER_RETURN ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(more_4x_vec) ++ ++ cmpl $(VEC_SIZE * -1), %edx ++ jle L(ret_vec_x2_test) ++ ++L(last_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x2) ++ ++ /* Needed no matter what. */ ++ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 3), %rax ++ COND_VZEROUPPER ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ ja L(zero_2) ++ ret + +- .p2align 4 +-L(null): ++ /* First in aligning bytes. */ ++L(zero_2): + xorl %eax, %eax + ret + +- .p2align 4 +-L(last_vec_or_less_aligned): +- movl %edx, %ecx ++ .p2align 4,, 4 ++L(ret_vec_x2_test): ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 2), %rax ++ COND_VZEROUPPER ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ ja L(zero_2) ++ ret + +- vpcmpeqb (%rdi), %ymm0, %ymm1 + +- movl $1, %edx +- /* Support rdx << 32. */ +- salq %cl, %rdx +- subq $1, %rdx ++ .p2align 4,, 11 ++L(ret_vec_x2): ++ /* ecx must be non-zero. */ ++ bsrl %ecx, %ecx ++ leaq (VEC_SIZE * -3 + 1)(%rcx, %rax), %rax ++ VZEROUPPER_RETURN + +- vpmovmskb %ymm1, %eax ++ .p2align 4,, 14 ++L(ret_vec_x3): ++ /* ecx must be non-zero. */ ++ bsrl %ecx, %ecx ++ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax ++ VZEROUPPER_RETURN + +- /* Remove the trailing bytes. */ +- andl %edx, %eax +- testl %eax, %eax +- jz L(zero) + +- bsrl %eax, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN + + .p2align 4 +-L(last_vec_or_less): +- addl $VEC_SIZE, %edx ++L(more_4x_vec): ++ testl %ecx, %ecx ++ jnz L(ret_vec_x2) + +- /* Check for zero length. */ +- testl %edx, %edx +- jz L(null) ++ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx + +- movl %edi, %ecx +- andl $(VEC_SIZE - 1), %ecx +- jz L(last_vec_or_less_aligned) ++ testl %ecx, %ecx ++ jnz L(ret_vec_x3) + +- movl %ecx, %esi +- movl %ecx, %r8d +- addl %edx, %esi +- andq $-VEC_SIZE, %rdi ++ /* Check if near end before re-aligning (otherwise might do an ++ unnecissary loop iteration). */ ++ addq $-(VEC_SIZE * 4), %rax ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec) + +- subl $VEC_SIZE, %esi +- ja L(last_vec_2x_aligned) ++ /* Align rax to (VEC_SIZE - 1). */ ++ orq $(VEC_SIZE * 4 - 1), %rax ++ movq %rdi, %rdx ++ /* Get endptr for loop in rdx. NB: Can't just do while rax > rdi because ++ lengths that overflow can be valid and break the comparison. */ ++ orq $(VEC_SIZE * 4 - 1), %rdx + +- /* Check the last VEC. */ +- vpcmpeqb (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- +- /* Remove the leading and trailing bytes. */ +- sarl %cl, %eax +- movl %edx, %ecx ++ .p2align 4 ++L(loop_4x_vec): ++ /* Need this comparison next no matter what. 
*/ ++ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 ++ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm2 ++ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm3 ++ vpcmpeqb -(VEC_SIZE * 4 - 1)(%rax), %ymm0, %ymm4 + +- movl $1, %edx +- sall %cl, %edx +- subl $1, %edx ++ vpor %ymm1, %ymm2, %ymm2 ++ vpor %ymm3, %ymm4, %ymm4 ++ vpor %ymm2, %ymm4, %ymm4 ++ vpmovmskb %ymm4, %esi + +- andl %edx, %eax +- testl %eax, %eax +- jz L(zero) ++ testl %esi, %esi ++ jnz L(loop_end) + +- bsrl %eax, %eax +- addq %rdi, %rax +- addq %r8, %rax +- VZEROUPPER_RETURN ++ addq $(VEC_SIZE * -4), %rax ++ cmpq %rdx, %rax ++ jne L(loop_4x_vec) + +- .p2align 4 +-L(last_vec_2x_aligned): +- movl %esi, %ecx ++ subl %edi, %edx ++ incl %edx + +- /* Check the last VEC. */ +- vpcmpeqb VEC_SIZE(%rdi), %ymm0, %ymm1 ++L(last_4x_vec): ++ /* Used no matter what. */ ++ vpcmpeqb -(VEC_SIZE * 1 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx + +- movl $1, %edx +- sall %cl, %edx +- subl $1, %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) + +- vpmovmskb %ymm1, %eax ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0_end) + +- /* Remove the trailing bytes. */ +- andl %edx, %eax ++ vpcmpeqb -(VEC_SIZE * 2 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1_end) + +- testl %eax, %eax +- jnz L(last_vec_x1) ++ /* Used no matter what. */ ++ vpcmpeqb -(VEC_SIZE * 3 - 1)(%rax), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %ecx + +- /* Check the second last VEC. */ +- vpcmpeqb (%rdi), %ymm0, %ymm1 ++ cmpl $(VEC_SIZE * 3), %edx ++ ja L(last_vec) ++ ++ lzcntl %ecx, %ecx ++ subq $(VEC_SIZE * 2), %rax ++ COND_VZEROUPPER ++ subq %rcx, %rax ++ cmpq %rax, %rdi ++ jbe L(ret0) ++ xorl %eax, %eax ++L(ret0): ++ ret + +- movl %r8d, %ecx + +- vpmovmskb %ymm1, %eax ++ .p2align 4 ++L(loop_end): ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x0_end) ++ ++ vpmovmskb %ymm2, %ecx ++ testl %ecx, %ecx ++ jnz L(ret_vec_x1_end) ++ ++ vpmovmskb %ymm3, %ecx ++ /* Combine last 2 VEC matches. If ecx (VEC3) is zero (no CHAR in VEC3) ++ then it won't affect the result in esi (VEC4). If ecx is non-zero ++ then CHAR in VEC3 and bsrq will use that position. */ ++ salq $32, %rcx ++ orq %rsi, %rcx ++ bsrq %rcx, %rcx ++ leaq (VEC_SIZE * -4 + 1)(%rcx, %rax), %rax ++ VZEROUPPER_RETURN + +- /* Remove the leading bytes. Must use unsigned right shift for +- bsrl below. */ +- shrl %cl, %eax +- testl %eax, %eax +- jz L(zero) ++ .p2align 4,, 4 ++L(ret_vec_x1_end): ++ /* 64-bit version will automatically add 32 (VEC_SIZE). */ ++ lzcntq %rcx, %rcx ++ subq %rcx, %rax ++ VZEROUPPER_RETURN + +- bsrl %eax, %eax +- addq %rdi, %rax +- addq %r8, %rax ++ .p2align 4,, 4 ++L(ret_vec_x0_end): ++ lzcntl %ecx, %ecx ++ subq %rcx, %rax + VZEROUPPER_RETURN +-END (MEMRCHR) ++ ++ /* 2 bytes until next cache line. */ ++END(MEMRCHR) + #endif diff --git a/glibc-upstream-2.34-286.patch b/glibc-upstream-2.34-286.patch new file mode 100644 index 0000000..41a0188 --- /dev/null +++ b/glibc-upstream-2.34-286.patch @@ -0,0 +1,323 @@ +commit a910d7e164f1d9b8e77bbea35a2d2ab89a5e26cc +Author: Noah Goldstein +Date: Mon Jun 6 21:11:33 2022 -0700 + + x86: Shrink code size of memchr-avx2.S + + This is not meant as a performance optimization. The previous code was + far to liberal in aligning targets and wasted code size unnecissarily. + + The total code size saving is: 59 bytes + + There are no major changes in the benchmarks. + Geometric Mean of all benchmarks New / Old: 0.967 + + Full xcheck passes on x86_64. + Reviewed-by: H.J. 
Lu + + (cherry picked from commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35) + + x86: Fix page cross case in rawmemchr-avx2 [BZ #29234] + + commit 6dcbb7d95dded20153b12d76d2f4e0ef0cda4f35 + Author: Noah Goldstein + Date: Mon Jun 6 21:11:33 2022 -0700 + + x86: Shrink code size of memchr-avx2.S + + Changed how the page cross case aligned string (rdi) in + rawmemchr. This was incompatible with how + `L(cross_page_continue)` expected the pointer to be aligned and + would cause rawmemchr to read data start started before the + beginning of the string. What it would read was in valid memory + but could count CHAR matches resulting in an incorrect return + value. + + This commit fixes that issue by essentially reverting the changes to + the L(page_cross) case as they didn't really matter. + + Test cases added and all pass with the new code (and where confirmed + to fail with the old code). + Reviewed-by: H.J. Lu + + (cherry picked from commit 2c9af8421d2b4a7fcce163e7bc81a118d22fd346) + +diff --git a/string/test-rawmemchr.c b/string/test-rawmemchr.c +index 085098aba8fdbc13..327c0654e69e7669 100644 +--- a/string/test-rawmemchr.c ++++ b/string/test-rawmemchr.c +@@ -18,6 +18,7 @@ + . */ + + #include ++#include + + #define TEST_MAIN + #define TEST_NAME "rawmemchr" +@@ -51,13 +52,45 @@ do_one_test (impl_t *impl, const char *s, int c, char *exp_res) + } + } + ++static void ++do_test_bz29234 (void) ++{ ++ size_t i, j; ++ char *ptr_start; ++ char *buf = xmmap (0, 8192, PROT_READ | PROT_WRITE, ++ MAP_PRIVATE | MAP_ANONYMOUS, -1); ++ ++ memset (buf, -1, 8192); ++ ++ ptr_start = buf + 4096 - 8; ++ ++ /* Out of range matches before the start of a page. */ ++ memset (ptr_start - 8, 0x1, 8); ++ ++ for (j = 0; j < 8; ++j) ++ { ++ for (i = 0; i < 128; ++i) ++ { ++ ptr_start[i + j] = 0x1; ++ ++ FOR_EACH_IMPL (impl, 0) ++ do_one_test (impl, (char *) (ptr_start + j), 0x1, ++ ptr_start + i + j); ++ ++ ptr_start[i + j] = 0xff; ++ } ++ } ++ ++ xmunmap (buf, 8192); ++} ++ + static void + do_test (size_t align, size_t pos, size_t len, int seek_char) + { + size_t i; + char *result; + +- align &= 7; ++ align &= getpagesize () - 1; + if (align + len >= page_size) + return; + +@@ -115,6 +148,13 @@ do_random_tests (void) + } + } + ++ if (align) ++ { ++ p[align - 1] = seek_char; ++ if (align > 4) ++ p[align - 4] = seek_char; ++ } ++ + assert (pos < len); + size_t r = random (); + if ((r & 31) == 0) +@@ -130,6 +170,13 @@ do_random_tests (void) + result, p); + ret = 1; + } ++ ++ if (align) ++ { ++ p[align - 1] = seek_char; ++ if (align > 4) ++ p[align - 4] = seek_char; ++ } + } + } + +@@ -151,14 +198,22 @@ test_main (void) + do_test (i, 64, 256, 23); + do_test (0, 16 << i, 2048, 0); + do_test (i, 64, 256, 0); ++ ++ do_test (getpagesize () - i, 64, 256, 23); ++ do_test (getpagesize () - i, 64, 256, 0); + } + for (i = 1; i < 32; ++i) + { + do_test (0, i, i + 1, 23); + do_test (0, i, i + 1, 0); ++ ++ do_test (getpagesize () - 7, i, i + 1, 23); ++ do_test (getpagesize () - i / 2, i, i + 1, 23); ++ do_test (getpagesize () - i, i, i + 1, 23); + } + + do_random_tests (); ++ do_test_bz29234 (); + return ret; + } + +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +index 87b076c7c403ba85..c4d71938c5a3ed24 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +@@ -2,6 +2,7 @@ + # define MEMCHR __memchr_avx2_rtm + #endif + ++#define COND_VZEROUPPER COND_VZEROUPPER_XTEST + #define ZERO_UPPER_VEC_REGISTERS_RETURN \ + 
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST + +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index afdb95650232fdac..9e0b7dd1f4fe9909 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -57,7 +57,7 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits +-ENTRY (MEMCHR) ++ENTRY_P2ALIGN (MEMCHR, 5) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + # ifdef __ILP32__ +@@ -87,12 +87,14 @@ ENTRY (MEMCHR) + # endif + testl %eax, %eax + jz L(aligned_more) +- tzcntl %eax, %eax ++ bsfl %eax, %eax + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ + + # ifndef USE_AS_RAWMEMCHR +- .p2align 5 ++ .p2align 4 + L(first_vec_x0): + /* Check if first match was before length. */ + tzcntl %eax, %eax +@@ -100,58 +102,31 @@ L(first_vec_x0): + /* NB: Multiply length by 4 to get byte count. */ + sall $2, %edx + # endif +- xorl %ecx, %ecx ++ COND_VZEROUPPER ++ /* Use branch instead of cmovcc so L(first_vec_x0) fits in one fetch ++ block. branch here as opposed to cmovcc is not that costly. Common ++ usage of memchr is to check if the return was NULL (if string was ++ known to contain CHAR user would use rawmemchr). This branch will be ++ highly correlated with the user branch and can be used by most ++ modern branch predictors to predict the user branch. */ + cmpl %eax, %edx +- leaq (%rdi, %rax), %rax +- cmovle %rcx, %rax +- VZEROUPPER_RETURN +- +-L(null): +- xorl %eax, %eax +- ret +-# endif +- .p2align 4 +-L(cross_page_boundary): +- /* Save pointer before aligning as its original value is +- necessary for computer return address if byte is found or +- adjusting length if it is not and this is memchr. */ +- movq %rdi, %rcx +- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr +- and rdi for rawmemchr. */ +- orq $(VEC_SIZE - 1), %ALGN_PTR_REG +- VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate length until end of page (length checked for a +- match). */ +- leaq 1(%ALGN_PTR_REG), %rsi +- subq %RRAW_PTR_REG, %rsi +-# ifdef USE_AS_WMEMCHR +- /* NB: Divide bytes by 4 to get wchar_t count. */ +- shrl $2, %esi +-# endif +-# endif +- /* Remove the leading bytes. */ +- sarxl %ERAW_PTR_REG, %eax, %eax +-# ifndef USE_AS_RAWMEMCHR +- /* Check the end of data. */ +- cmpq %rsi, %rdx +- jbe L(first_vec_x0) ++ jle L(null) ++ addq %rdi, %rax ++ ret + # endif +- testl %eax, %eax +- jz L(cross_page_continue) +- tzcntl %eax, %eax +- addq %RRAW_PTR_REG, %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x1): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + incq %rdi + addq %rdi, %rax + VZEROUPPER_RETURN +- ++# ifndef USE_AS_RAWMEMCHR ++ /* First in aligning bytes here. */ ++L(null): ++ xorl %eax, %eax ++ ret ++# endif + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax +@@ -340,7 +315,7 @@ L(first_vec_x1_check): + incq %rdi + addq %rdi, %rax + VZEROUPPER_RETURN +- .p2align 4 ++ .p2align 4,, 6 + L(set_zero_end): + xorl %eax, %eax + VZEROUPPER_RETURN +@@ -428,5 +403,39 @@ L(last_vec_x3): + VZEROUPPER_RETURN + # endif + ++ .p2align 4 ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is necessary for ++ computer return address if byte is found or adjusting length if it ++ is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE - 1. 
ALGN_PTR_REG is rcx for memchr ++ and rdi for rawmemchr. */ ++ orq $(VEC_SIZE - 1), %ALGN_PTR_REG ++ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate length until end of page (length checked for a match). */ ++ leaq 1(%ALGN_PTR_REG), %rsi ++ subq %RRAW_PTR_REG, %rsi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %esi ++# endif ++# endif ++ /* Remove the leading bytes. */ ++ sarxl %ERAW_PTR_REG, %eax, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. */ ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ bsfl %eax, %eax ++ addq %RRAW_PTR_REG, %rax ++ VZEROUPPER_RETURN ++ ++ + END (MEMCHR) + #endif diff --git a/glibc-upstream-2.34-287.patch b/glibc-upstream-2.34-287.patch new file mode 100644 index 0000000..083be9d --- /dev/null +++ b/glibc-upstream-2.34-287.patch @@ -0,0 +1,130 @@ +commit 3c87383a20daff9a230439e31b778716bfed4d8b +Author: Noah Goldstein +Date: Mon Jun 6 21:11:34 2022 -0700 + + x86: Shrink code size of memchr-evex.S + + This is not meant as a performance optimization. The previous code was + far to liberal in aligning targets and wasted code size unnecissarily. + + The total code size saving is: 64 bytes + + There are no non-negligible changes in the benchmarks. + Geometric Mean of all benchmarks New / Old: 1.000 + + Full xcheck passes on x86_64. + Reviewed-by: H.J. Lu + + (cherry picked from commit 56da3fe1dd075285fa8186d44b3c28e68c687e62) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 4d0ed6d136f099e1..68381c99a4948134 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -88,7 +88,7 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (MEMCHR) ++ENTRY_P2ALIGN (MEMCHR, 6) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ + test %RDX_LP, %RDX_LP +@@ -131,22 +131,24 @@ L(zero): + xorl %eax, %eax + ret + +- .p2align 5 ++ .p2align 4 + L(first_vec_x0): +- /* Check if first match was before length. */ +- tzcntl %eax, %eax +- xorl %ecx, %ecx +- cmpl %eax, %edx +- leaq (%rdi, %rax, CHAR_SIZE), %rax +- cmovle %rcx, %rax ++ /* Check if first match was before length. NB: tzcnt has false data- ++ dependency on destination. eax already had a data-dependency on esi ++ so this should have no affect here. */ ++ tzcntl %eax, %esi ++# ifdef USE_AS_WMEMCHR ++ leaq (%rdi, %rsi, CHAR_SIZE), %rdi ++# else ++ addq %rsi, %rdi ++# endif ++ xorl %eax, %eax ++ cmpl %esi, %edx ++ cmovg %rdi, %rax + ret +-# else +- /* NB: first_vec_x0 is 17 bytes which will leave +- cross_page_boundary (which is relatively cold) close enough +- to ideal alignment. So only realign L(cross_page_boundary) if +- rawmemchr. */ +- .p2align 4 + # endif ++ ++ .p2align 4 + L(cross_page_boundary): + /* Save pointer before aligning as its original value is + necessary for computer return address if byte is found or +@@ -400,10 +402,14 @@ L(last_2x_vec): + L(zero_end): + ret + ++L(set_zero_end): ++ xorl %eax, %eax ++ ret + + .p2align 4 + L(first_vec_x1_check): +- tzcntl %eax, %eax ++ /* eax must be non-zero. Use bsfl to save code size. */ ++ bsfl %eax, %eax + /* Adjust length. */ + subl $-(CHAR_PER_VEC * 4), %edx + /* Check if match within remaining length. */ +@@ -412,9 +418,6 @@ L(first_vec_x1_check): + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. 
*/ + leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax + ret +-L(set_zero_end): +- xorl %eax, %eax +- ret + + .p2align 4 + L(loop_4x_vec_end): +@@ -464,7 +467,7 @@ L(loop_4x_vec_end): + # endif + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1_return): + tzcntl %eax, %eax + # if defined USE_AS_WMEMCHR || RET_OFFSET != 0 +@@ -496,6 +499,7 @@ L(last_vec_x3_return): + # endif + + # ifndef USE_AS_RAWMEMCHR ++ .p2align 4,, 5 + L(last_4x_vec_or_less_cmpeq): + VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 + kmovd %k0, %eax +@@ -546,7 +550,7 @@ L(last_4x_vec): + # endif + andl %ecx, %eax + jz L(zero_end2) +- tzcntl %eax, %eax ++ bsfl %eax, %eax + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + L(zero_end2): + ret +@@ -562,6 +566,6 @@ L(last_vec_x3): + leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret + # endif +- ++ /* 7 bytes from next cache line. */ + END (MEMCHR) + #endif diff --git a/glibc-upstream-2.34-288.patch b/glibc-upstream-2.34-288.patch new file mode 100644 index 0000000..6565455 --- /dev/null +++ b/glibc-upstream-2.34-288.patch @@ -0,0 +1,33 @@ +commit 820504e3edd7276bf869d543ad5b57187ff9c9b6 +Author: Noah Goldstein +Date: Fri Jun 3 18:52:37 2022 -0500 + + x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions + + Give fall-through path to `vzeroupper` and taken-path to `vzeroall`. + + Generally even on machines with RTM the expectation is the + string-library functions will not be called in transactions. + Reviewed-by: H.J. Lu + + (cherry picked from commit c28db9cb29a7d6cf3ce08fd8445e6b7dea03f35b) + +diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h +index 93e44be22e2275f1..04478b097cdffe20 100644 +--- a/sysdeps/x86_64/sysdep.h ++++ b/sysdeps/x86_64/sysdep.h +@@ -99,11 +99,11 @@ lose: \ + to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ + #define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ + xtest; \ +- jz 1f; \ +- vzeroall; \ ++ jnz 1f; \ ++ vzeroupper; \ + ret; \ + 1: \ +- vzeroupper; \ ++ vzeroall; \ + ret + + /* Can be used to replace vzeroupper that is not directly before a diff --git a/glibc-upstream-2.34-289.patch b/glibc-upstream-2.34-289.patch new file mode 100644 index 0000000..fe9c335 --- /dev/null +++ b/glibc-upstream-2.34-289.patch @@ -0,0 +1,41 @@ +commit fc54e1fae854e6ee6361cd4ddf900c36fce8158e +Author: Noah Goldstein +Date: Wed Jun 8 21:16:51 2022 -0700 + + x86: Align varshift table to 32-bytes + + This ensures the load will never split a cache line. + + (cherry picked from commit 0f91811333f23b61cf681cab2704b35a0a073b97) + +diff --git a/sysdeps/x86_64/multiarch/varshift.c b/sysdeps/x86_64/multiarch/varshift.c +index 45267b0a6823459a..1f563542666bc4f1 100644 +--- a/sysdeps/x86_64/multiarch/varshift.c ++++ b/sysdeps/x86_64/multiarch/varshift.c +@@ -16,9 +16,10 @@ + License along with the GNU C Library; if not, see + . 
*/ + +-#include "varshift.h" ++#include + +-const int8_t ___m128i_shift_right[31] attribute_hidden = ++const int8_t ___m128i_shift_right[31] attribute_hidden ++ __attribute__((aligned(32))) = + { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 +diff --git a/sysdeps/x86_64/multiarch/varshift.h b/sysdeps/x86_64/multiarch/varshift.h +index 32f2173dd2a95b3a..745d48fa7c775136 100644 +--- a/sysdeps/x86_64/multiarch/varshift.h ++++ b/sysdeps/x86_64/multiarch/varshift.h +@@ -19,7 +19,8 @@ + #include + #include + +-extern const int8_t ___m128i_shift_right[31] attribute_hidden; ++extern const int8_t ___m128i_shift_right[31] attribute_hidden ++ __attribute__ ((aligned (32))); + + static __inline__ __m128i + __m128i_shift_right (__m128i value, unsigned long int offset) diff --git a/glibc-upstream-2.34-290.patch b/glibc-upstream-2.34-290.patch new file mode 100644 index 0000000..326109c --- /dev/null +++ b/glibc-upstream-2.34-290.patch @@ -0,0 +1,56 @@ +commit 6e008c884dad5a25f91085c68d044bb5e2d63761 +Author: Noah Goldstein +Date: Tue Jun 14 13:50:11 2022 -0700 + + x86: Fix misordered logic for setting `rep_movsb_stop_threshold` + + Move the setting of `rep_movsb_stop_threshold` to after the tunables + have been collected so that the `rep_movsb_stop_threshold` (which + is used to redirect control flow to the non_temporal case) will + use any user value for `non_temporal_threshold` (set using + glibc.cpu.x86_non_temporal_threshold) + + (cherry picked from commit 035591551400cfc810b07244a015c9411e8bff7c) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 2e43e67e4f4037d3..560bf260e8fbd7bf 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -898,18 +898,6 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) + rep_movsb_threshold = 2112; + +- unsigned long int rep_movsb_stop_threshold; +- /* ERMS feature is implemented from AMD Zen3 architecture and it is +- performing poorly for data above L2 cache size. Henceforth, adding +- an upper bound threshold parameter to limit the usage of Enhanced +- REP MOVSB operations and setting its value to L2 cache size. */ +- if (cpu_features->basic.kind == arch_kind_amd) +- rep_movsb_stop_threshold = core; +- /* Setting the upper bound of ERMS to the computed value of +- non-temporal threshold for architectures other than AMD. */ +- else +- rep_movsb_stop_threshold = non_temporal_threshold; +- + /* The default threshold to use Enhanced REP STOSB. */ + unsigned long int rep_stosb_threshold = 2048; + +@@ -951,6 +939,18 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + SIZE_MAX); + #endif + ++ unsigned long int rep_movsb_stop_threshold; ++ /* ERMS feature is implemented from AMD Zen3 architecture and it is ++ performing poorly for data above L2 cache size. Henceforth, adding ++ an upper bound threshold parameter to limit the usage of Enhanced ++ REP MOVSB operations and setting its value to L2 cache size. */ ++ if (cpu_features->basic.kind == arch_kind_amd) ++ rep_movsb_stop_threshold = core; ++ /* Setting the upper bound of ERMS to the computed value of ++ non-temporal threshold for architectures other than AMD. 
*/
++  else
++    rep_movsb_stop_threshold = non_temporal_threshold;
++
+   cpu_features->data_cache_size = data;
+   cpu_features->shared_cache_size = shared;
+   cpu_features->non_temporal_threshold = non_temporal_threshold;
diff --git a/glibc-upstream-2.34-291.patch b/glibc-upstream-2.34-291.patch
new file mode 100644
index 0000000..849476f
--- /dev/null
+++ b/glibc-upstream-2.34-291.patch
@@ -0,0 +1,38 @@
+commit 9d50e162eef88e1f870a941b0a973060e984e7ca
+Author: Noah Goldstein
+Date:   Tue Jun 14 15:37:28 2022 -0700
+
+    x86: Add sse42 implementation to strcmp's ifunc
+
+    This has been missing since the ifuncs were added.
+
+    The performance of SSE4.2 is preferable to SSE2.
+
+    Measured on Tigerlake with N = 20 runs.
+    Geometric Mean of all benchmarks SSE4.2 / SSE2: 0.906
+
+    (cherry picked from commit ff439c47173565fbff4f0f78d07b0f14e4a7db05)
+
+diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
+index 7c2901bf44456259..b457fb4c150e4407 100644
+--- a/sysdeps/x86_64/multiarch/strcmp.c
++++ b/sysdeps/x86_64/multiarch/strcmp.c
+@@ -29,6 +29,7 @@
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+ extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+@@ -53,6 +54,10 @@ IFUNC_SELECTOR (void)
+       return OPTIMIZE (avx2);
+     }
+
++  if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
++      && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
++    return OPTIMIZE (sse42);
++
+   if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
+     return OPTIMIZE (sse2_unaligned);
+
diff --git a/glibc-upstream-2.34-292.patch b/glibc-upstream-2.34-292.patch
new file mode 100644
index 0000000..492f541
--- /dev/null
+++ b/glibc-upstream-2.34-292.patch
@@ -0,0 +1,54 @@
+commit 94b0dc9419bd038cb85b364a7556569386c31741
+Author: Noah Goldstein
+Date:   Wed Jun 15 10:41:29 2022 -0700
+
+    x86: Add bounds `x86_non_temporal_threshold`
+
+    The lower-bound (16448) and upper-bound (SIZE_MAX / 16) are assumed
+    by memmove-vec-unaligned-erms.
+
+    The lower-bound is needed because memmove-vec-unaligned-erms unrolls
+    the loop aggressively in the L(large_memset_4x) case.
+
+    The upper-bound is needed because memmove-vec-unaligned-erms
+    right-shifts the value of `x86_non_temporal_threshold` by
+    LOG_4X_MEMCPY_THRESH (4) which without a bound may overflow.
+
+    The lack of lower-bound can be a correctness issue. The lack of
+    upper-bound cannot.
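+
+    As a rough, illustrative sketch (ours, not part of the upstream
+    change), both bounds can be sanity-checked in a few lines of C;
+    the constants mirror the values used in dl-cacheinfo.h:
+
+      #include <assert.h>
+      #include <stddef.h>
+      #include <stdint.h>
+
+      int
+      main (void)
+      {
+        /* Lower bound 0x4040: 64 bytes of slack to reach cache
+           alignment plus one iteration of the 4x PAGE_SIZE unrolled
+           loop.  */
+        assert (0x4040 == 64 + 4 * 4096);
+        /* Upper bound SIZE_MAX >> 4: scaling the threshold by
+           2**LOG_4X_MEMCPY_THRESH (a shift by 4) must not wrap.  */
+        size_t threshold = SIZE_MAX >> 4;
+        assert ((threshold << 4) >> 4 == threshold);
+        return 0;
+      }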
+ + (cherry picked from commit b446822b6ae4e8149902a78cdd4a886634ad6321) + +diff --git a/manual/tunables.texi b/manual/tunables.texi +index 28ff502990c2a10f..5ab3212f34e3dc37 100644 +--- a/manual/tunables.texi ++++ b/manual/tunables.texi +@@ -47,7 +47,7 @@ glibc.malloc.mxfast: 0x0 (min: 0x0, max: 0xffffffffffffffff) + glibc.elision.skip_lock_busy: 3 (min: -2147483648, max: 2147483647) + glibc.malloc.top_pad: 0x0 (min: 0x0, max: 0xffffffffffffffff) + glibc.cpu.x86_rep_stosb_threshold: 0x800 (min: 0x1, max: 0xffffffffffffffff) +-glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x0, max: 0xffffffffffffffff) ++glibc.cpu.x86_non_temporal_threshold: 0xc0000 (min: 0x4040, max: 0x0fffffffffffffff) + glibc.cpu.x86_shstk: + glibc.cpu.hwcap_mask: 0x6 (min: 0x0, max: 0xffffffffffffffff) + glibc.malloc.mmap_max: 0 (min: -2147483648, max: 2147483647) +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 560bf260e8fbd7bf..8f85f70858413ebe 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -931,8 +931,14 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); ++ /* SIZE_MAX >> 4 because memmove-vec-unaligned-erms right-shifts the value of ++ 'x86_non_temporal_threshold' by `LOG_4X_MEMCPY_THRESH` (4) and it is best ++ if that operation cannot overflow. Minimum of 0x4040 (16448) because the ++ L(large_memset_4x) loops need 64-byte to cache align and enough space for ++ at least 1 iteration of 4x PAGE_SIZE unrolled loop. Both values are ++ reflected in the manual. */ + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, +- 0, SIZE_MAX); ++ 0x4040, SIZE_MAX >> 4); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, diff --git a/glibc-upstream-2.34-293.patch b/glibc-upstream-2.34-293.patch new file mode 100644 index 0000000..81a4e0e --- /dev/null +++ b/glibc-upstream-2.34-293.patch @@ -0,0 +1,88 @@ +commit ba1c3f23d9ba63c38333116eec6043c471c378c4 +Author: Noah Goldstein +Date: Wed Jun 15 10:41:28 2022 -0700 + + x86: Cleanup bounds checking in large memcpy case + + 1. Fix incorrect lower-bound threshold in L(large_memcpy_2x). + Previously was using `__x86_rep_movsb_threshold` and should + have been using `__x86_shared_non_temporal_threshold`. + + 2. Avoid reloading __x86_shared_non_temporal_threshold before + the L(large_memcpy_4x) bounds check. + + 3. Document the second bounds check for L(large_memcpy_4x) + more clearly. + + (cherry picked from commit 89a25c6f64746732b87eaf433af0964b564d4a92) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 7b27cbdda5fb99f7..618d46d8ce28828c 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -118,7 +118,13 @@ + # define LARGE_LOAD_SIZE (VEC_SIZE * 4) + #endif + +-/* Amount to shift rdx by to compare for memcpy_large_4x. */ ++/* Amount to shift __x86_shared_non_temporal_threshold by for ++ bound for memcpy_large_4x. This is essentially use to to ++ indicate that the copy is far beyond the scope of L3 ++ (assuming no user config x86_non_temporal_threshold) and to ++ use a more aggressively unrolled loop. 
NB: before ++ increasing the value also update initialization of ++ x86_non_temporal_threshold. */ + #ifndef LOG_4X_MEMCPY_THRESH + # define LOG_4X_MEMCPY_THRESH 4 + #endif +@@ -724,9 +730,14 @@ L(skip_short_movsb_check): + .p2align 4,, 10 + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + L(large_memcpy_2x_check): +- cmp __x86_rep_movsb_threshold(%rip), %RDX_LP +- jb L(more_8x_vec_check) ++ /* Entry from L(large_memcpy_2x) has a redundant load of ++ __x86_shared_non_temporal_threshold(%rip). L(large_memcpy_2x) ++ is only use for the non-erms memmove which is generally less ++ common. */ + L(large_memcpy_2x): ++ mov __x86_shared_non_temporal_threshold(%rip), %R11_LP ++ cmp %R11_LP, %RDX_LP ++ jb L(more_8x_vec_check) + /* To reach this point it is impossible for dst > src and + overlap. Remaining to check is src > dst and overlap. rcx + already contains dst - src. Negate rcx to get src - dst. If +@@ -774,18 +785,21 @@ L(large_memcpy_2x): + /* ecx contains -(dst - src). not ecx will return dst - src - 1 + which works for testing aliasing. */ + notl %ecx ++ movq %rdx, %r10 + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx + jz L(large_memcpy_4x) + +- movq %rdx, %r10 +- shrq $LOG_4X_MEMCPY_THRESH, %r10 +- cmp __x86_shared_non_temporal_threshold(%rip), %r10 ++ /* r11 has __x86_shared_non_temporal_threshold. Shift it left ++ by LOG_4X_MEMCPY_THRESH to get L(large_memcpy_4x) threshold. ++ */ ++ shlq $LOG_4X_MEMCPY_THRESH, %r11 ++ cmp %r11, %rdx + jae L(large_memcpy_4x) + + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 2 - 1), %edx + /* r10 stores outer loop counter. */ +- shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 ++ shrq $(LOG_PAGE_SIZE + 1), %r10 + /* Copy 4x VEC at a time from 2 pages. */ + .p2align 4 + L(loop_large_memcpy_2x_outer): +@@ -850,7 +864,6 @@ L(large_memcpy_2x_end): + + .p2align 4 + L(large_memcpy_4x): +- movq %rdx, %r10 + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 4 - 1), %edx + /* r10 stores outer loop counter. */ diff --git a/glibc-upstream-2.34-294.patch b/glibc-upstream-2.34-294.patch new file mode 100644 index 0000000..a7bd391 --- /dev/null +++ b/glibc-upstream-2.34-294.patch @@ -0,0 +1,27 @@ +commit c51d8d383cfb92142b86d8d1822159f3bea10d16 +Author: Noah Goldstein +Date: Thu Jun 16 15:01:08 2022 -0700 + + x86: Add BMI1/BMI2 checks for ISA_V3 check + + BMI1/BMI2 are part of the ISA V3 requirements: + https://en.wikipedia.org/wiki/X86-64 + + And defined by GCC when building with `-march=x86-64-v3` + + (cherry picked from commit 8da9f346cb2051844348785b8a932ec44489e0b7) + +diff --git a/sysdeps/x86/isa-level.c b/sysdeps/x86/isa-level.c +index 49ef4aa6122072cf..07815381122c94c3 100644 +--- a/sysdeps/x86/isa-level.c ++++ b/sysdeps/x86/isa-level.c +@@ -47,7 +47,8 @@ + # endif + + # if ISA_V2 && defined __AVX__ && defined __AVX2__ && defined __F16C__ \ +- && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE ++ && defined __FMA__ && defined __LZCNT__ && defined HAVE_X86_MOVBE \ ++ && defined __BMI__ && defined __BMI2__ + /* NB: ISAs in x86-64 ISA level v3 are used. */ + # define ISA_V3 GNU_PROPERTY_X86_ISA_1_V3 + # else diff --git a/glibc-upstream-2.34-295.patch b/glibc-upstream-2.34-295.patch new file mode 100644 index 0000000..88d89e8 --- /dev/null +++ b/glibc-upstream-2.34-295.patch @@ -0,0 +1,28 @@ +commit d201c59177b98946d7f80145e7b4d02991d04805 +Author: Noah Goldstein +Date: Fri Jun 24 09:42:12 2022 -0700 + + x86: Align entry for memrchr to 64-bytes. 
+
+    The function was tuned around 64-byte entry alignment and performs
+    better for all sizes with it.
+
+    As well, different code paths were explicitly written to touch the
+    minimum number of cache lines, i.e. sizes <= 32 touch only the
+    entry cache line.
+
+    (cherry picked from commit 227afaa67213efcdce6a870ef5086200f1076438)
+
+diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+index 5f8e0be18cfe4fad..edd8180ba1ede9a5 100644
+--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
+@@ -35,7 +35,7 @@
+ # define VEC_SIZE 32
+ # define PAGE_SIZE 4096
+ 	.section SECTION(.text), "ax", @progbits
+-ENTRY(MEMRCHR)
++ENTRY_P2ALIGN(MEMRCHR, 6)
+ # ifdef __ILP32__
+ 	/* Clear upper bits. */
+ 	and	%RDX_LP, %RDX_LP
diff --git a/glibc-upstream-2.34-296.patch b/glibc-upstream-2.34-296.patch
new file mode 100644
index 0000000..eecd005
--- /dev/null
+++ b/glibc-upstream-2.34-296.patch
@@ -0,0 +1,56 @@
+commit aadd0a1c7c89d016e1186c81c0efcafa36bf84fc
+Author: Noah Goldstein
+Date:   Fri Jun 24 09:42:15 2022 -0700
+
+    x86: Put wcs{n}len-sse4.1 in the sse4.1 text section
+
+    This was previously missing, but the two implementations shouldn't
+    go in the sse2 (generic) text section.
+
+    (cherry picked from commit afc6e4328ff80973bde50d5401691b4c4b2e522c)
+
+diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
+index 031753a91763b351..762f4755020c35f9 100644
+--- a/sysdeps/x86_64/multiarch/strlen-vec.S
++++ b/sysdeps/x86_64/multiarch/strlen-vec.S
+@@ -28,6 +28,10 @@
+ # define SHIFT_RETURN
+ #endif
+
++#ifndef SECTION
++# define SECTION(p) p
++#endif
++
+ /* Long lived register in strlen(s), strnlen(s, n) are:
+
+	%xmm3 - zero
+ */
+
+
+-.text
++	.section SECTION(.text),"ax",@progbits
+ ENTRY(strlen)
+
+ /* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+index 7e62621afc729492..e306a77f51e650d1 100644
+--- a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
+@@ -1,4 +1,5 @@
+ #define AS_WCSLEN
+ #define strlen __wcslen_sse4_1
++#define SECTION(p) p##.sse4.1
+
+ #include "strlen-vec.S"
+diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+index 5fa51fe07cbbdf5c..d2f7dd6e2254736c 100644
+--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+@@ -1,5 +1,6 @@
+ #define AS_WCSLEN
+ #define AS_STRNLEN
+ #define strlen __wcsnlen_sse4_1
++#define SECTION(p) p##.sse4.1
+
+ #include "strlen-vec.S"
diff --git a/glibc-upstream-2.34-297.patch b/glibc-upstream-2.34-297.patch
new file mode 100644
index 0000000..1bc2f3c
--- /dev/null
+++ b/glibc-upstream-2.34-297.patch
@@ -0,0 +1,25 @@
+commit f4598f0351559f1a4176d7ce0154423d98bcfb0d
+Author: Noah Goldstein
+Date:   Wed Jun 29 16:07:04 2022 -0700
+
+    x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list
+
+    This was simply missing and meant we weren't testing it properly.
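+
+    As a rough sketch (ours, with a hypothetical table; the real list
+    lives in ifunc-impl-list.c), the test drivers only ever see the
+    implementations that are registered, so an unlisted variant is
+    silently skipped:
+
+      #include <stdio.h>
+
+      struct impl { const char *name; };
+
+      static const struct impl impls[] = {
+        { "__wmemset_chk_avx2_unaligned" },
+        { "__wmemset_chk_avx2_unaligned_rtm" },  /* newly listed */
+      };
+
+      int
+      main (void)
+      {
+        for (unsigned i = 0; i < sizeof impls / sizeof impls[0]; i++)
+          printf ("testing %s\n", impls[i].name);
+        return 0;
+      }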
+ + (cherry picked from commit 2a1099020cdc1e4c9c928156aa85c8cf9d540291) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 043821278fdb6d8f..8d649e263eb24b8a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -1032,6 +1032,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, __wmemset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemset_chk_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) diff --git a/glibc-upstream-2.34-298.patch b/glibc-upstream-2.34-298.patch new file mode 100644 index 0000000..ae07d14 --- /dev/null +++ b/glibc-upstream-2.34-298.patch @@ -0,0 +1,124 @@ +commit 7079931c51547854323fe2ed6fdccf2a1b8b04d7 +Author: Noah Goldstein +Date: Wed Jun 29 16:07:05 2022 -0700 + + x86: Move and slightly improve memset_erms + + Implementation wise: + 1. Remove the VZEROUPPER as memset_{impl}_unaligned_erms does not + use the L(stosb) label that was previously defined. + + 2. Don't give the hotpath (fallthrough) to zero size. + + Code positioning wise: + + Move memset_{chk}_erms to its own file. Leaving it in between the + memset_{impl}_unaligned both adds unnecessary complexity to the + file and wastes space in a relatively hot cache section. + + (cherry picked from commit 4a3f29e7e475dd4e7cce2a24c187e6fb7b5b0a05) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 0e39e63ef6be6a86..da9f16286a763556 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -29,6 +29,7 @@ sysdep_routines += \ + memset-avx2-unaligned-erms-rtm \ + memset-avx512-no-vzeroupper \ + memset-avx512-unaligned-erms \ ++ memset-erms \ + memset-evex-unaligned-erms \ + memset-sse2-unaligned-erms \ + rawmemchr-avx2 \ +diff --git a/sysdeps/x86_64/multiarch/memset-erms.S b/sysdeps/x86_64/multiarch/memset-erms.S +new file mode 100644 +index 0000000000000000..e83cccc731f0a7ea +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-erms.S +@@ -0,0 +1,44 @@ ++/* memset implement with rep stosb ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++ ++#include ++ ++#if defined USE_MULTIARCH && IS_IN (libc) ++ .text ++ENTRY (__memset_chk_erms) ++ cmp %RDX_LP, %RCX_LP ++ jb HIDDEN_JUMPTARGET (__chk_fail) ++END (__memset_chk_erms) ++ ++/* Only used to measure performance of REP STOSB. */ ++ENTRY (__memset_erms) ++ /* Skip zero length. 
*/ ++ test %RDX_LP, %RDX_LP ++ jz L(stosb_return_zero) ++ mov %RDX_LP, %RCX_LP ++ movzbl %sil, %eax ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ ret ++L(stosb_return_zero): ++ movq %rdi, %rax ++ ret ++END (__memset_erms) ++#endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index abc12d9cda1b3843..905d0fa4643d5768 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -156,37 +156,6 @@ L(entry_from_wmemset): + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +-# if VEC_SIZE == 16 +-ENTRY (__memset_chk_erms) +- cmp %RDX_LP, %RCX_LP +- jb HIDDEN_JUMPTARGET (__chk_fail) +-END (__memset_chk_erms) +- +-/* Only used to measure performance of REP STOSB. */ +-ENTRY (__memset_erms) +- /* Skip zero length. */ +- test %RDX_LP, %RDX_LP +- jnz L(stosb) +- movq %rdi, %rax +- ret +-# else +-/* Provide a hidden symbol to debugger. */ +- .hidden MEMSET_SYMBOL (__memset, erms) +-ENTRY (MEMSET_SYMBOL (__memset, erms)) +-# endif +-L(stosb): +- mov %RDX_LP, %RCX_LP +- movzbl %sil, %eax +- mov %RDI_LP, %RDX_LP +- rep stosb +- mov %RDX_LP, %RAX_LP +- VZEROUPPER_RETURN +-# if VEC_SIZE == 16 +-END (__memset_erms) +-# else +-END (MEMSET_SYMBOL (__memset, erms)) +-# endif +- + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP diff --git a/glibc-upstream-2.34-299.patch b/glibc-upstream-2.34-299.patch new file mode 100644 index 0000000..afa05e9 --- /dev/null +++ b/glibc-upstream-2.34-299.patch @@ -0,0 +1,163 @@ +commit 35f9c72c8bd7bc30deb412e966e2f548241b15d2 +Author: Noah Goldstein +Date: Wed Jun 29 16:07:15 2022 -0700 + + x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file + + The primary memmove_{impl}_unaligned_erms implementations don't + interact with this function. Putting them in same file both + wastes space and unnecessarily bloats a hot code section. + + (cherry picked from commit 21925f64730d52eb7d8b2fb62b412f8ab92b0caf) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index da9f16286a763556..b9ea5b60c2be1b0a 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -17,6 +17,7 @@ sysdep_routines += \ + memmove-avx-unaligned-erms-rtm \ + memmove-avx512-no-vzeroupper \ + memmove-avx512-unaligned-erms \ ++ memmove-erms \ + memmove-evex-unaligned-erms \ + memmove-sse2-unaligned-erms \ + memmove-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/memmove-erms.S b/sysdeps/x86_64/multiarch/memmove-erms.S +new file mode 100644 +index 0000000000000000..2d3a6ccb76d77052 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-erms.S +@@ -0,0 +1,72 @@ ++/* memcpy/mempcpy/memmove implement with rep movsb ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++
++#include <sysdep.h>
++
++#if defined USE_MULTIARCH && IS_IN (libc)
++	.text
++ENTRY (__mempcpy_chk_erms)
++	cmp	%RDX_LP, %RCX_LP
++	jb	HIDDEN_JUMPTARGET (__chk_fail)
++END (__mempcpy_chk_erms)
++
++/* Only used to measure performance of REP MOVSB. */
++ENTRY (__mempcpy_erms)
++	mov	%RDI_LP, %RAX_LP
++	/* Skip zero length. */
++	test	%RDX_LP, %RDX_LP
++	jz	2f
++	add	%RDX_LP, %RAX_LP
++	jmp	L(start_movsb)
++END (__mempcpy_erms)
++
++ENTRY (__memmove_chk_erms)
++	cmp	%RDX_LP, %RCX_LP
++	jb	HIDDEN_JUMPTARGET (__chk_fail)
++END (__memmove_chk_erms)
++
++ENTRY (__memmove_erms)
++	movq	%rdi, %rax
++	/* Skip zero length. */
++	test	%RDX_LP, %RDX_LP
++	jz	2f
++L(start_movsb):
++	mov	%RDX_LP, %RCX_LP
++	cmp	%RSI_LP, %RDI_LP
++	jb	1f
++	/* Source == destination is less common. */
++	je	2f
++	lea	(%rsi,%rcx), %RDX_LP
++	cmp	%RDX_LP, %RDI_LP
++	jb	L(movsb_backward)
++1:
++	rep movsb
++2:
++	ret
++L(movsb_backward):
++	leaq	-1(%rdi,%rcx), %rdi
++	leaq	-1(%rsi,%rcx), %rsi
++	std
++	rep movsb
++	cld
++	ret
++END (__memmove_erms)
++strong_alias (__memmove_erms, __memcpy_erms)
++strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
++#endif
+diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+index 618d46d8ce28828c..93c7e6883a254434 100644
+--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+@@ -239,56 +239,6 @@ L(start):
+ #endif
+ #if defined USE_MULTIARCH && IS_IN (libc)
+ END (MEMMOVE_SYMBOL (__memmove, unaligned))
+-# if VEC_SIZE == 16
+-ENTRY (__mempcpy_chk_erms)
+-	cmp	%RDX_LP, %RCX_LP
+-	jb	HIDDEN_JUMPTARGET (__chk_fail)
+-END (__mempcpy_chk_erms)
+-
+-/* Only used to measure performance of REP MOVSB. */
+-ENTRY (__mempcpy_erms)
+-	mov	%RDI_LP, %RAX_LP
+-	/* Skip zero length. */
+-	test	%RDX_LP, %RDX_LP
+-	jz	2f
+-	add	%RDX_LP, %RAX_LP
+-	jmp	L(start_movsb)
+-END (__mempcpy_erms)
+-
+-ENTRY (__memmove_chk_erms)
+-	cmp	%RDX_LP, %RCX_LP
+-	jb	HIDDEN_JUMPTARGET (__chk_fail)
+-END (__memmove_chk_erms)
+-
+-ENTRY (__memmove_erms)
+-	movq	%rdi, %rax
+-	/* Skip zero length. */
+-	test	%RDX_LP, %RDX_LP
+-	jz	2f
+-L(start_movsb):
+-	mov	%RDX_LP, %RCX_LP
+-	cmp	%RSI_LP, %RDI_LP
+-	jb	1f
+-	/* Source == destination is less common. */
+-	je	2f
+-	lea	(%rsi,%rcx), %RDX_LP
+-	cmp	%RDX_LP, %RDI_LP
+-	jb	L(movsb_backward)
+-1:
+-	rep movsb
+-2:
+-	ret
+-L(movsb_backward):
+-	leaq	-1(%rdi,%rcx), %rdi
+-	leaq	-1(%rsi,%rcx), %rsi
+-	std
+-	rep movsb
+-	cld
+-	ret
+-END (__memmove_erms)
+-strong_alias (__memmove_erms, __memcpy_erms)
+-strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
+-# endif
+
+ # ifdef SHARED
+ ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
diff --git a/glibc-upstream-2.34-300.patch b/glibc-upstream-2.34-300.patch
new file mode 100644
index 0000000..326109c
--- /dev/null
+++ b/glibc-upstream-2.34-300.patch
@@ -0,0 +1,38 @@
+commit ccc54bd61c768b6a27f9305a0831b76a7b6d706f
+Author: Noah Goldstein
+Date:   Wed Jun 29 18:56:18 2022 -0700
+
+    x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S
+
+    The check was missing, so for the multiarch build
+    rtld-strncmp-sse4_2.os was being built and exporting symbols:
+
+    build/glibc/string/rtld-strncmp-sse4_2.os:
+    0000000000000000 T __strncmp_sse42
+
+    Introduced in:
+
+    commit 11ffcacb64a939c10cfc713746b8ec88837f5c4a
+    Author: H.J.
Lu + Date: Wed Jun 21 12:10:50 2017 -0700 + + x86-64: Implement strcmp family IFUNC selectors in C + + (cherry picked from commit 96ac447d915ea5ecef3f9168cc13f4e731349a3b) + +diff --git a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S +index 22f51a0dfd2770c9..85dc363bf9d6273d 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-sse4_2.S ++++ b/sysdeps/x86_64/multiarch/strncmp-sse4_2.S +@@ -16,6 +16,8 @@ + License along with the GNU C Library; if not, see + . */ + +-#define STRCMP_SSE42 __strncmp_sse42 +-#define USE_AS_STRNCMP +-#include "strcmp-sse42.S" ++#if IS_IN (libc) ++# define STRCMP_SSE42 __strncmp_sse42 ++# define USE_AS_STRNCMP ++# include "strcmp-sse42.S" ++#endif diff --git a/glibc-upstream-2.34-301.patch b/glibc-upstream-2.34-301.patch new file mode 100644 index 0000000..4d8b2fb --- /dev/null +++ b/glibc-upstream-2.34-301.patch @@ -0,0 +1,28 @@ +commit b991af50632a2da262b00b2375d9120f23b25525 +Author: Joseph Myers +Date: Wed May 25 14:37:28 2022 +0000 + + Update syscall-names.list for Linux 5.18 + + Linux 5.18 has no new syscalls. Update the version number in + syscall-names.list to reflect that it is still current for 5.18. + + Tested with build-many-glibcs.py. + + (cherry picked from commit 3d9926663cba19f40d26d8a8ab3b2a7cc09ffb13) + +diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list +index e2743c649586d97a..95370e2ec5dbc4a7 100644 +--- a/sysdeps/unix/sysv/linux/syscall-names.list ++++ b/sysdeps/unix/sysv/linux/syscall-names.list +@@ -21,8 +21,8 @@ + # This file can list all potential system calls. The names are only + # used if the installed kernel headers also provide them. + +-# The list of system calls is current as of Linux 5.17. +-kernel 5.17 ++# The list of system calls is current as of Linux 5.18. ++kernel 5.18 + + FAST_atomic_update + FAST_cmpxchg diff --git a/glibc-upstream-2.34-302.patch b/glibc-upstream-2.34-302.patch new file mode 100644 index 0000000..3c37bca --- /dev/null +++ b/glibc-upstream-2.34-302.patch @@ -0,0 +1,44 @@ +commit b2f32e746492615a6eb3e66fac1e766e32e8deb1 +Author: Florian Weimer +Date: Thu Jul 21 12:12:08 2022 +0200 + + malloc: Simplify implementation of __malloc_assert + + It is prudent not to run too much code after detecting heap + corruption, and __fxprintf is really complex. The line number + and file name do not carry much information, so it is not included + in the error message. (__libc_message only supports %s formatting.) + The function name and assertion should provide some context. + + Reviewed-by: Siddhesh Poyarekar + (cherry picked from commit ac8047cdf326504f652f7db97ec96c0e0cee052f) + +diff --git a/malloc/malloc.c b/malloc/malloc.c +index 7882c70f0a0312d1..d31e985ecce968fe 100644 +--- a/malloc/malloc.c ++++ b/malloc/malloc.c +@@ -294,19 +294,14 @@ + # define __assert_fail(assertion, file, line, function) \ + __malloc_assert(assertion, file, line, function) + +-extern const char *__progname; +- +-static void ++_Noreturn static void + __malloc_assert (const char *assertion, const char *file, unsigned int line, + const char *function) + { +- (void) __fxprintf (NULL, "%s%s%s:%u: %s%sAssertion `%s' failed.\n", +- __progname, __progname[0] ? ": " : "", +- file, line, +- function ? function : "", function ? 
": " : "", +- assertion); +- fflush (stderr); +- abort (); ++ __libc_message (do_abort, "\ ++Fatal glibc error: malloc assertion failure in %s: %s\n", ++ function, assertion); ++ __builtin_unreachable (); + } + #endif + #endif diff --git a/glibc.spec b/glibc.spec index 2b4c989..dcfb89f 100644 --- a/glibc.spec +++ b/glibc.spec @@ -148,7 +148,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 39%{?dist} +Release: 40%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -550,6 +550,35 @@ Patch342: glibc-upstream-2.34-272.patch Patch343: glibc-upstream-2.34-273.patch Patch344: glibc-rh2096191-1.patch Patch345: glibc-rh2096191-2.patch +Patch346: glibc-upstream-2.34-274.patch +Patch347: glibc-upstream-2.34-275.patch +Patch348: glibc-upstream-2.34-276.patch +Patch349: glibc-upstream-2.34-277.patch +Patch350: glibc-upstream-2.34-278.patch +Patch351: glibc-upstream-2.34-279.patch +Patch352: glibc-upstream-2.34-280.patch +Patch353: glibc-upstream-2.34-281.patch +Patch354: glibc-upstream-2.34-282.patch +Patch355: glibc-upstream-2.34-283.patch +Patch356: glibc-upstream-2.34-284.patch +Patch357: glibc-upstream-2.34-285.patch +Patch358: glibc-upstream-2.34-286.patch +Patch359: glibc-upstream-2.34-287.patch +Patch360: glibc-upstream-2.34-288.patch +Patch361: glibc-upstream-2.34-289.patch +Patch362: glibc-upstream-2.34-290.patch +Patch363: glibc-upstream-2.34-291.patch +Patch364: glibc-upstream-2.34-292.patch +Patch365: glibc-upstream-2.34-293.patch +Patch366: glibc-upstream-2.34-294.patch +Patch367: glibc-upstream-2.34-295.patch +Patch368: glibc-upstream-2.34-296.patch +Patch369: glibc-upstream-2.34-297.patch +Patch370: glibc-upstream-2.34-298.patch +Patch371: glibc-upstream-2.34-299.patch +Patch372: glibc-upstream-2.34-300.patch +Patch373: glibc-upstream-2.34-301.patch +Patch374: glibc-upstream-2.34-302.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2606,6 +2635,39 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Fri Jul 22 2022 Arjun Shankar - 2.34-40 +- Sync with upstream branch release/2.34/master, + commit b2f32e746492615a6eb3e66fac1e766e32e8deb1: +- malloc: Simplify implementation of __malloc_assert +- Update syscall-names.list for Linux 5.18 +- x86: Add missing IS_IN (libc) check to strncmp-sse4_2.S +- x86: Move mem{p}{mov|cpy}_{chk_}erms to its own file +- x86: Move and slightly improve memset_erms +- x86: Add definition for __wmemset_chk AVX2 RTM in ifunc impl list +- x86: Put wcs{n}len-sse4.1 in the sse4.1 text section +- x86: Align entry for memrchr to 64-bytes. 
+- x86: Add BMI1/BMI2 checks for ISA_V3 check
+- x86: Cleanup bounds checking in large memcpy case
+- x86: Add bounds `x86_non_temporal_threshold`
+- x86: Add sse42 implementation to strcmp's ifunc
+- x86: Fix misordered logic for setting `rep_movsb_stop_threshold`
+- x86: Align varshift table to 32-bytes
+- x86: ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST expect no transactions
+- x86: Shrink code size of memchr-evex.S
+- x86: Shrink code size of memchr-avx2.S
+- x86: Optimize memrchr-avx2.S
+- x86: Optimize memrchr-evex.S
+- x86: Optimize memrchr-sse2.S
+- x86: Add COND_VZEROUPPER that can replace vzeroupper if no `ret`
+- x86: Create header for VEC classes in x86 strings library
+- x86_64: Add strstr function with 512-bit EVEX
+- x86-64: Ignore r_addend for R_X86_64_GLOB_DAT/R_X86_64_JUMP_SLOT
+- x86_64: Implement evex512 version of strlen, strnlen, wcslen and wcsnlen
+- x86_64: Remove bzero optimization
+- x86_64: Remove end of line trailing spaces
+- nptl: Fix ___pthread_unregister_cancel_restore asynchronous restore
+- linux: Fix mq_timereceive check for 32 bit fallback code (BZ 29304)
+
 * Fri Jun 24 2022 Florian Weimer - 2.34-39
 - Add the no-aaaa DNS stub resolver option (#2096191)