diff --git a/glibc-RHEL-174869-1.patch b/glibc-RHEL-174869-1.patch new file mode 100644 index 0000000..82eed26 --- /dev/null +++ b/glibc-RHEL-174869-1.patch @@ -0,0 +1,199 @@ +commit f446d90fe6605ac473aaa6cd17a1800e72dcc1a2 +Author: Noah Goldstein +Date: Wed Aug 14 14:37:31 2024 +0800 + + x86: Add `Avoid_STOSB` tunable to allow NT memset without ERMS + + The goal of this flag is to allow targets which don't prefer/have ERMS + to still access the non-temporal memset implementation. + + There are 4 cases for tuning memset: + 1) `Avoid_STOSB && Avoid_Non_Temporal_Memset` + - Memset with temporal stores + 2) `Avoid_STOSB && !Avoid_Non_Temporal_Memset` + - Memset with temporal/non-temporal stores. Non-temporal path + goes through `rep stosb` path. We accomplish this by setting + `x86_rep_stosb_threshold` to + `x86_memset_non_temporal_threshold`. + 3) `!Avoid_STOSB && Avoid_Non_Temporal_Memset` + - Memset with temporal stores/`rep stosb` + 4) `!Avoid_STOSB && !Avoid_Non_Temporal_Memset` + - Memset with temporal stores/`rep stosb`/non-temporal stores. + Reviewed-by: H.J. Lu + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index f87d6d354924f3d9..e0728cb010aae637 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -1103,6 +1103,10 @@ disable_tsx: + if (CPU_FEATURES_CPU_P (cpu_features, CMOV)) + cpu_features->preferred[index_arch_I686] |= bit_arch_I686; + ++ /* No ERMS, we want to avoid stosb for memset. 
*/ ++ if (!CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ cpu_features->preferred[index_arch_Avoid_STOSB] |= bit_arch_Avoid_STOSB; ++ + #if !HAS_CPUID + no_cpuid: + #endif +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index a0b31d80f64127c5..98da2c54a5c58851 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -195,6 +195,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + 11); + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Prefer_FSRM, + 11); ++ CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, Avoid_STOSB, ++ 11); + CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH (n, cpu_features, + Slow_SSE4_2, + SSE4_2, +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index ebfea0a32ce0cff6..e21592e166a041dd 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -1039,18 +1039,42 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + slightly better than ERMS. */ + rep_stosb_threshold = SIZE_MAX; + ++ /* ++ For memset, the non-temporal implementation is only accessed through the ++ stosb code. ie: ++ ``` ++ if (size >= rep_stosb_thresh) ++ { ++ if (size >= non_temporal_thresh) ++ { ++ do_non_temporal (); ++ } ++ do_stosb (); ++ } ++ do_normal_vec_loop (); ++ ``` ++ So if we prefer non-temporal, set `rep_stosb_thresh = non_temporal_thresh` ++ to enable the implementation. If `rep_stosb_thresh = non_temporal_thresh`, ++ `rep stosb` will never be used. ++ */ ++ TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, ++ memset_non_temporal_threshold, ++ minimum_non_temporal_threshold, SIZE_MAX); ++ /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the ++ final value of `x86_memset_non_temporal_threshold`. In some cases this can ++ be a matter of correctness. 
*/ ++ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB)) ++ rep_stosb_threshold ++ = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); ++ TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, ++ SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_data_cache_size, data, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_shared_cache_size, shared, 0, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_non_temporal_threshold, non_temporal_threshold, + minimum_non_temporal_threshold, + maximum_non_temporal_threshold); +- TUNABLE_SET_WITH_BOUNDS (x86_memset_non_temporal_threshold, +- memset_non_temporal_threshold, +- minimum_non_temporal_threshold, SIZE_MAX); + TUNABLE_SET_WITH_BOUNDS (x86_rep_movsb_threshold, rep_movsb_threshold, + minimum_rep_movsb_threshold, SIZE_MAX); +- TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1, +- SIZE_MAX); + + unsigned long int rep_movsb_stop_threshold; + /* Setting the upper bound of ERMS to the computed value of +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 61bbbc2e8983482e..2a58000147d22ddb 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -34,3 +34,4 @@ BIT (MathVec_Prefer_No_AVX512) + BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) + BIT (Avoid_Non_Temporal_Memset) ++BIT (Avoid_STOSB) +diff --git a/sysdeps/x86/tst-hwcap-tunables.c b/sysdeps/x86/tst-hwcap-tunables.c +index 94307283d7cdbdc7..1920f5057e69c48a 100644 +--- a/sysdeps/x86/tst-hwcap-tunables.c ++++ b/sysdeps/x86/tst-hwcap-tunables.c +@@ -60,7 +60,8 @@ static const struct test_t + /* Disable everything. 
*/ + "-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,-ERMS," +- "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset", ++ "-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB", + test_1, + array_length (test_1) + }, +@@ -68,7 +69,8 @@ static const struct test_t + /* Same as before, but with some empty suboptions. */ + ",-,-Prefer_ERMS,-Prefer_FSRM,-AVX,-AVX2,-AVX512F,-AVX512VL," + "-SSE4_1,-SSE4_2,-SSSE3,-Fast_Unaligned_Load,,-," +- "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset,-,", ++ "-ERMS,-AVX_Fast_Unaligned_Load,-Avoid_Non_Temporal_Memset," ++ "-Avoid_STOSB,-,", + test_1, + array_length (test_1) + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 7a637ef7ca286694..8dc3d7ab5abaaecb 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -46,6 +46,13 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + attribute_hidden; + ++static inline int ++prefer_erms_nt_impl (const struct cpu_features *cpu_features) ++{ ++ return CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset); ++} ++ + static inline void * + IFUNC_SELECTOR (void) + { +@@ -61,7 +68,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx512_unaligned_erms); + + return OPTIMIZE (avx512_unaligned); +@@ -76,7 +83,7 @@ IFUNC_SELECTOR (void) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE 
(evex_unaligned_erms); + + return OPTIMIZE (evex_unaligned); +@@ -84,7 +91,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms_rtm); + + return OPTIMIZE (avx2_unaligned_rtm); +@@ -93,14 +100,15 @@ IFUNC_SELECTOR (void) + if (X86_ISA_CPU_FEATURES_ARCH_P (cpu_features, + Prefer_No_VZEROUPPER, !)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (prefer_erms_nt_impl (cpu_features)) + return OPTIMIZE (avx2_unaligned_erms); + + return OPTIMIZE (avx2_unaligned); + } + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS) ++ || !CPU_FEATURES_ARCH_P (cpu_features, Avoid_Non_Temporal_Memset)) + return OPTIMIZE (sse2_unaligned_erms); + + return OPTIMIZE (sse2_unaligned); diff --git a/glibc-RHEL-174869-2.patch b/glibc-RHEL-174869-2.patch new file mode 100644 index 0000000..33ebed5 --- /dev/null +++ b/glibc-RHEL-174869-2.patch @@ -0,0 +1,246 @@ +commit cd5fda114ece002945ace3d54a8f80a4f67d1fbb +Author: Sajan Karumanchi +Date: Thu Mar 26 09:21:30 2026 +0000 + + x86_64: Prefer EVEX512 code-path on AMD Zen5 CPUs + + Introduced a synthetic architecture preference flag (Prefer_EVEX512) + and enabled it for AMD Zen5 (CPUID Family 0x1A) when AVX-512 is supported. + + This flag modifies IFUNC dispatch to prefer 512-bit EVEX variants over + 256-bit EVEX variants for string and memory functions on Zen5 processors, + leveraging their native 512-bit execution units for improved throughput. + When Prefer_EVEX512 is set, the dispatcher selects evex512 implementations; + otherwise, it falls back to evex (256-bit) variants. + + The implementation updates the IFUNC selection logic in ifunc-avx2.h and + ifunc-evex.h to check for the Prefer_EVEX512 flag before dispatching to + EVEX512 implementations. 
This change affects six string/memory functions: + + - strchr + - strlen + - strnlen + - strrchr + - strchrnul + - memchr + + Benchmarks conducted on AMD Zen5 hardware demonstrate significant + performance improvements across all affected functions: + + Function Baseline Patched Avg Avg Avg Max + Variant Variant Baseline Patched Change Improve + (ns) (ns) % % + ------------+----------+----------+-----------+----------+--------+-------- + STRCHR evex evex512 16.408 12.293 25.08% 37.69% + STRLEN evex evex512 16.862 11.436 32.18% 56.74% + STRNLEN evex evex512 18.493 11.762 36.40% 64.40% + STRRCHR evex evex512 15.154 10.874 28.24% 44.38% + STRCHRNUL evex evex512 16.464 12.605 23.44% 45.56% + MEMCHR evex evex512 9.984 8.268 17.19% 39.99% + + Additionally, a tunable option (glibc.cpu.x86_cpu_features.preferred) + is provided to allow runtime control of the Prefer_EVEX512 flag for testing + and compatibility. + + Reviewed-by: Ganesh Gopalasubramanian + Reviewed-by: H.J. Lu + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index e0728cb010aae637..55c952dded4c1030 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -1013,6 +1013,12 @@ disable_tsx: + cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset] + &= ~bit_arch_Avoid_Non_Temporal_Memset; + ++ /* Prefer EVEX512 string/memory variants on AMD Zen5 (Family 0x1A) ++ when AVX-512 is usable. 
*/ ++ if (family == 0x1A && CPU_FEATURE_USABLE_P (cpu_features, AVX512F)) ++ cpu_features->preferred[index_arch_Prefer_EVEX512] ++ |= bit_arch_Prefer_EVEX512; ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + { + /* Since the FMA4 bit is in CPUID_INDEX_80000001 and +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 98da2c54a5c58851..4627748670f6da84 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -203,6 +203,12 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + 11); + } + break; ++ case 14: ++ { ++ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH ++ (n, cpu_features, Prefer_EVEX512, AVX512F, 14); ++ } ++ break; + case 15: + { + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 2a58000147d22ddb..25e535af62615449 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -35,3 +35,4 @@ BIT (Prefer_FSRM) + BIT (Avoid_Short_Distance_REP_MOVSB) + BIT (Avoid_Non_Temporal_Memset) + BIT (Avoid_STOSB) ++BIT (Prefer_EVEX512) +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 4174928dab666878..5566e9760933ad2d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -1,4 +1,4 @@ +-/* Common definition for ifunc selections optimized with SSE2 and AVX2. ++/* Common definition for ifunc selections optimized with SSE2, AVX2 and EVEX512. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+@@ -25,6 +25,10 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + ++#ifdef USE_EVEX512 ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden; ++#endif ++ + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + +@@ -44,8 +48,13 @@ IFUNC_SELECTOR (void) + { + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) +- return OPTIMIZE (evex); +- ++ { ++#ifdef USE_EVEX512 ++ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512)) ++ return OPTIMIZE (evex512); ++#endif ++ return OPTIMIZE (evex); ++ } + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h +index bbd1e3115f2e3a7c..643817a515b3fd71 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-evex.h ++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h +@@ -1,4 +1,4 @@ +-/* Common definition for ifunc selection optimized with EVEX. ++/* Common definition for ifunc selection optimized with EVEX and EVEX512. + All versions must be listed in ifunc-impl-list.c. + Copyright (C) 2017-2024 Free Software Foundation, Inc. + This file is part of the GNU C Library. 
+@@ -22,6 +22,10 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; + ++#ifdef USE_EVEX512 ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden; ++#endif ++ + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + +@@ -42,6 +46,11 @@ IFUNC_SELECTOR (void) + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + { ++#ifdef USE_EVEX512 ++ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512)) ++ return OPTIMIZE (evex512); ++#endif ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (evex_rtm); + +diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c +index 2c7754e759d2f1dd..9f915861a40498d8 100644 +--- a/sysdeps/x86_64/multiarch/memchr.c ++++ b/sysdeps/x86_64/multiarch/memchr.c +@@ -24,6 +24,7 @@ + # undef memchr + + # define SYMBOL_NAME memchr ++# define USE_EVEX512 1 + # include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 4b15d53e97e682db..2d3d084aa8d3bdd1 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -27,6 +27,7 @@ + # include + + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex512) attribute_hidden; + + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +@@ -46,7 +47,12 @@ IFUNC_SELECTOR (void) + { + if (X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) + && X86_ISA_CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) +- return OPTIMIZE (evex); ++ { ++ if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_EVEX512)) ++ return OPTIMIZE 
(evex512); ++ ++ return OPTIMIZE (evex); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); +diff --git a/sysdeps/x86_64/multiarch/strchrnul.c b/sysdeps/x86_64/multiarch/strchrnul.c +index 663819918e103083..e3fb2503eade9764 100644 +--- a/sysdeps/x86_64/multiarch/strchrnul.c ++++ b/sysdeps/x86_64/multiarch/strchrnul.c +@@ -26,6 +26,7 @@ + # undef strchrnul + + # define SYMBOL_NAME strchrnul ++# define USE_EVEX512 1 + # include "ifunc-avx2.h" + + libc_ifunc_redirected (__redirect_strchrnul, __strchrnul, +diff --git a/sysdeps/x86_64/multiarch/strlen.c b/sysdeps/x86_64/multiarch/strlen.c +index a362c2bf8bce9dcf..9b39da5c760a9fa4 100644 +--- a/sysdeps/x86_64/multiarch/strlen.c ++++ b/sysdeps/x86_64/multiarch/strlen.c +@@ -24,6 +24,7 @@ + # undef strlen + + # define SYMBOL_NAME strlen ++# define USE_EVEX512 1 + # include "ifunc-avx2.h" + + libc_ifunc_redirected (__redirect_strlen, strlen, IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/strnlen.c b/sysdeps/x86_64/multiarch/strnlen.c +index d1537e039052551d..a09ff4bea54bb0d1 100644 +--- a/sysdeps/x86_64/multiarch/strnlen.c ++++ b/sysdeps/x86_64/multiarch/strnlen.c +@@ -26,6 +26,7 @@ + # undef strnlen + + # define SYMBOL_NAME strnlen ++# define USE_EVEX512 1 + # include "ifunc-avx2.h" + + libc_ifunc_redirected (__redirect_strnlen, __strnlen, IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/strrchr.c b/sysdeps/x86_64/multiarch/strrchr.c +index f14237d1ffeb8e11..2ad1192d3ec013d8 100644 +--- a/sysdeps/x86_64/multiarch/strrchr.c ++++ b/sysdeps/x86_64/multiarch/strrchr.c +@@ -23,6 +23,7 @@ + # undef strrchr + + # define SYMBOL_NAME strrchr ++# define USE_EVEX512 1 + # include "ifunc-avx2.h" + + libc_ifunc_redirected (__redirect_strrchr, strrchr, IFUNC_SELECTOR ()); diff --git a/glibc-RHEL-174869-3.patch b/glibc-RHEL-174869-3.patch new file mode 100644 index 0000000..6350257 --- /dev/null +++ b/glibc-RHEL-174869-3.patch @@ -0,0 +1,50 @@ +commit 
54abc8566fea592e795cb443949266ef206462a8 +Author: zombie12138 +Date: Tue May 5 22:38:01 2026 -0700 + + x86: Fix non-temporal memset unreachable on AMD Zen 3/4/5 + + On AMD Zen 3/4/5 with ERMS, the non-temporal memset path is unreachable + because rep_stosb_threshold is set to SIZE_MAX (vectorized loop is faster + than ERMS on these CPUs), but the non-temporal code path is nested inside + the rep_stosb branch. + + The existing rescue logic at the Avoid_STOSB check only covers the case + where the CPU lacks ERMS hardware support. It does not cover AMD Zen 3+ + where ERMS is supported but deliberately unused for performance reasons. + + Extend the condition to also lower rep_stosb_threshold when: + - The user has not explicitly set x86_rep_stosb_threshold (respect tunables) + - rep_stosb_threshold is higher than memset_non_temporal_threshold (NT gated) + + This makes the non-temporal path reachable for large memset operations, + providing ~2x speedup on pre-faulted buffers larger than L3 cache. + + Tested on AMD Ryzen 7 8745HS (Zen 4): + - Pre-faulted 64MB memset: 2.02 ms -> 0.94 ms (2.15x faster) + - First-touch 64MB memset: 19.3 ms -> 21.3 ms (11% regression, expected: + kernel clear_page cache warming bypassed by NT stores) + + * sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Extend + rep_stosb_threshold lowering condition to cover AMD Zen 3/4/5 + where ERMS is supported but stosb is disabled via threshold. + + Signed-off-by: zombie12138 + Bug: https://sourceware.org/bugzilla/show_bug.cgi?id=34129 + Reviewed-by: Adhemerval Zanella + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index e21592e166a041dd..f3477a1c5e190dc9 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -1063,7 +1063,9 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + /* Do `rep_stosb_thresh = non_temporal_thresh` after setting/getting the + final value of `x86_memset_non_temporal_threshold`. 
In some cases this can + be a matter of correctness. */ +- if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB)) ++ if (CPU_FEATURES_ARCH_P (cpu_features, Avoid_STOSB) ++ || (!TUNABLE_IS_INITIALIZED (x86_rep_stosb_threshold) ++ && rep_stosb_threshold > memset_non_temporal_threshold)) + rep_stosb_threshold + = TUNABLE_GET (x86_memset_non_temporal_threshold, long int, NULL); + TUNABLE_SET_WITH_BOUNDS (x86_rep_stosb_threshold, rep_stosb_threshold, 1,