From 7e7241f6206d2f68286aa56853db7594e749c4f6 Mon Sep 17 00:00:00 2001 From: Florian Weimer Date: Thu, 23 Jan 2025 09:08:39 +0100 Subject: [PATCH] Sync with upstream branch release/2.39/master (CVE-2025-0395) Upstream commit: 808a84a8b81468b517a4d721fdc62069cb8c211f - Fix underallocation of abort_msg_s struct (CVE-2025-0395) - x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212] - x86: Improve large memset perf with non-temporal stores [RHEL-29312] - x86: Avoid integer truncation with large cache sizes (bug 32470) - math: Exclude internal math symbols for tests [BZ #32414] - malloc: add indirection for malloc(-like) functions in tests [BZ #32366] - Pass -nostdlib -nostartfiles together with -r [BZ #31753] - nptl: initialize cpu_id_start prior to rseq registration - nptl: initialize rseq area prior to registration --- glibc-upstream-2.39-138.patch | 55 ++++++++ glibc-upstream-2.39-139.patch | 29 ++++ glibc-upstream-2.39-140.patch | 28 ++++ glibc-upstream-2.39-141.patch | 172 +++++++++++++++++++++++ glibc-upstream-2.39-142.patch | 44 ++++++ glibc-upstream-2.39-143.patch | 30 ++++ glibc-upstream-2.39-144.patch | 250 ++++++++++++++++++++++++++++++++++ glibc-upstream-2.39-145.patch | 143 +++++++++++++++++++ glibc-upstream-2.39-146.patch | 57 ++++++++ glibc.spec | 24 +++- 10 files changed, 831 insertions(+), 1 deletion(-) create mode 100644 glibc-upstream-2.39-138.patch create mode 100644 glibc-upstream-2.39-139.patch create mode 100644 glibc-upstream-2.39-140.patch create mode 100644 glibc-upstream-2.39-141.patch create mode 100644 glibc-upstream-2.39-142.patch create mode 100644 glibc-upstream-2.39-143.patch create mode 100644 glibc-upstream-2.39-144.patch create mode 100644 glibc-upstream-2.39-145.patch create mode 100644 glibc-upstream-2.39-146.patch diff --git a/glibc-upstream-2.39-138.patch b/glibc-upstream-2.39-138.patch new file mode 100644 index 0000000..c2ed486 --- /dev/null +++ b/glibc-upstream-2.39-138.patch @@ -0,0 +1,55 @@ +commit 
9a0e174a39a3a65f628c6a55e29fe35f6d67bf42 +Author: Michael Jeanson +Date: Thu Nov 7 22:23:49 2024 +0100 + + nptl: initialize rseq area prior to registration + + Per the rseq syscall documentation, 3 fields are required to be + initialized by userspace prior to registration, they are 'cpu_id', + 'rseq_cs' and 'flags'. Since we have no guarantee that 'struct pthread' + is cleared on all architectures, explicitly set those 3 fields prior to + registration. + + Signed-off-by: Michael Jeanson + Reviewed-by: Florian Weimer + (cherry picked from commit 97f60abd25628425971f07e9b0e7f8eec0741235) + +diff --git a/nptl/descr.h b/nptl/descr.h +index 4697f633e16c7359..a83df327e4bcba2e 100644 +--- a/nptl/descr.h ++++ b/nptl/descr.h +@@ -417,6 +417,8 @@ struct pthread + { + uint32_t cpu_id_start; + uint32_t cpu_id; ++ uint64_t rseq_cs; ++ uint32_t flags; + }; + char pad[32]; /* Original rseq area size. */ + } rseq_area __attribute__ ((aligned (32))); +diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h +index 7ea935b4adab8c20..37a8f630b6519ff0 100644 +--- a/sysdeps/unix/sysv/linux/rseq-internal.h ++++ b/sysdeps/unix/sysv/linux/rseq-internal.h +@@ -51,11 +51,21 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) + /* The initial implementation used only 20 bytes out of 32, + but still expected size 32. */ + size = RSEQ_AREA_SIZE_INITIAL; ++ ++ /* Initialize the rseq fields that are read by the kernel on ++ registration, there is no guarantee that struct pthread is ++ cleared on all architectures. 
*/ ++ THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); ++ THREAD_SETMEM (self, rseq_area.rseq_cs, 0); ++ THREAD_SETMEM (self, rseq_area.flags, 0); ++ + int ret = INTERNAL_SYSCALL_CALL (rseq, &self->rseq_area, + size, 0, RSEQ_SIG); + if (!INTERNAL_SYSCALL_ERROR_P (ret)) + return true; + } ++ /* When rseq is disabled by tunables or the registration fails, inform ++ userspace by setting 'cpu_id' to RSEQ_CPU_ID_REGISTRATION_FAILED. */ + THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_REGISTRATION_FAILED); + return false; + } diff --git a/glibc-upstream-2.39-139.patch b/glibc-upstream-2.39-139.patch new file mode 100644 index 0000000..20d6367 --- /dev/null +++ b/glibc-upstream-2.39-139.patch @@ -0,0 +1,29 @@ +commit 350db2839387659e1500a54d276e401c9c6b2dee +Author: Michael Jeanson +Date: Wed Nov 20 14:15:42 2024 -0500 + + nptl: initialize cpu_id_start prior to rseq registration + + When adding explicit initialization of rseq fields prior to + registration, I glossed over the fact that 'cpu_id_start' is also + documented as initialized by user-space. + + While current kernels don't validate the content of this field on + registration, future ones could. + + Signed-off-by: Michael Jeanson + Reviewed-by: Mathieu Desnoyers + (cherry picked from commit d9f40387d3305d97e30a8cf8724218c42a63680a) + +diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h +index 37a8f630b6519ff0..ef3eab1fefd4d90d 100644 +--- a/sysdeps/unix/sysv/linux/rseq-internal.h ++++ b/sysdeps/unix/sysv/linux/rseq-internal.h +@@ -56,6 +56,7 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq) + registration, there is no guarantee that struct pthread is + cleared on all architectures. 
*/ + THREAD_SETMEM (self, rseq_area.cpu_id, RSEQ_CPU_ID_UNINITIALIZED); ++ THREAD_SETMEM (self, rseq_area.cpu_id_start, 0); + THREAD_SETMEM (self, rseq_area.rseq_cs, 0); + THREAD_SETMEM (self, rseq_area.flags, 0); + diff --git a/glibc-upstream-2.39-140.patch b/glibc-upstream-2.39-140.patch new file mode 100644 index 0000000..aa67ae2 --- /dev/null +++ b/glibc-upstream-2.39-140.patch @@ -0,0 +1,28 @@ +commit aa8768999e94fcee1695feb766c69dd8a93b706b +Author: H.J. Lu +Date: Fri May 17 20:00:38 2024 -0700 + + Pass -nostdlib -nostartfiles together with -r [BZ #31753] + + Since -r in GCC 6/7/8 doesn't imply -nostdlib -nostartfiles, update the + link-static-libc.out rule to also pass -nostdlib -nostartfiles. This + fixes BZ #31753. + + Signed-off-by: H.J. Lu + Reviewed-by: Florian Weimer + (cherry picked from commit 2be3352f0b1ebaa39596393fffe1062275186669) + +diff --git a/Makefile b/Makefile +index 37bf70aa4ad4403f..ae9bc09327dd2d5b 100644 +--- a/Makefile ++++ b/Makefile +@@ -581,7 +581,8 @@ $(objpfx)lint-makefiles.out: scripts/lint-makefiles.sh + # definitions of any symbols. + tests-special += $(objpfx)link-static-libc.out + $(objpfx)link-static-libc.out: +- $(LINK.o) $(whole-archive) -r $(objpfx)libc.a -o /dev/null > $@ 2>&1; \ ++ $(LINK.o) $(whole-archive) -nostdlib -nostartfiles -r \ ++ $(objpfx)libc.a -o /dev/null > $@ 2>&1; \ + $(evaluate-test) + + # Print test summary for tests in $1 .sum file; diff --git a/glibc-upstream-2.39-141.patch b/glibc-upstream-2.39-141.patch new file mode 100644 index 0000000..2801ac0 --- /dev/null +++ b/glibc-upstream-2.39-141.patch @@ -0,0 +1,172 @@ +commit 51da74a97e0f024fd89b57304b3ab010a3cfaef1 +Author: Sam James +Date: Mon Dec 9 23:11:25 2024 +0000 + + malloc: add indirection for malloc(-like) functions in tests [BZ #32366] + + GCC 15 introduces allocation dead code removal (DCE) for PR117370 in + r15-5255-g7828dc070510f8. 
This breaks various glibc tests which want + to assert various properties of the allocator without doing anything + obviously useful with the allocated memory. + + Alexander Monakov rightly pointed out that we can and should do better + than passing -fno-malloc-dce to paper over the problem. Not least because + GCC 14 already does such DCE where there's no testing of malloc's return + value against NULL, and LLVM has such optimisations too. + + Handle this by providing malloc (and friends) wrappers with a volatile + function pointer to obscure that we're calling malloc (et. al) from the + compiler. + + Reviewed-by: Paul Eggert + (cherry picked from commit a9944a52c967ce76a5894c30d0274b824df43c7a) + +diff --git a/malloc/tst-aligned-alloc.c b/malloc/tst-aligned-alloc.c +index 91167d1392c0e626..b0f05a8fec78d5e8 100644 +--- a/malloc/tst-aligned-alloc.c ++++ b/malloc/tst-aligned-alloc.c +@@ -25,6 +25,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int + do_test (void) + { +diff --git a/malloc/tst-compathooks-off.c b/malloc/tst-compathooks-off.c +index d0106f3fb74ff3b1..4cce6e5a8076f6b6 100644 +--- a/malloc/tst-compathooks-off.c ++++ b/malloc/tst-compathooks-off.c +@@ -25,6 +25,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + extern void (*volatile __free_hook) (void *, const void *); + extern void *(*volatile __malloc_hook)(size_t, const void *); + extern void *(*volatile __realloc_hook)(void *, size_t, const void *); +diff --git a/malloc/tst-malloc-aux.h b/malloc/tst-malloc-aux.h +new file mode 100644 +index 0000000000000000..54908b4a2464d510 +--- /dev/null ++++ b/malloc/tst-malloc-aux.h +@@ -0,0 +1,41 @@ ++/* Wrappers for malloc-like functions to allow testing the implementation ++ without optimization. ++ Copyright (C) 2024 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public License as ++ published by the Free Software Foundation; either version 2.1 of the ++ License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; see the file COPYING.LIB. If ++ not, see . */ ++ ++#ifndef TST_MALLOC_AUX_H ++#define TST_MALLOC_AUX_H ++ ++#include ++#include ++ ++static void *(*volatile aligned_alloc_indirect)(size_t, size_t) = aligned_alloc; ++static void *(*volatile calloc_indirect)(size_t, size_t) = calloc; ++static void *(*volatile malloc_indirect)(size_t) = malloc; ++static void *(*volatile realloc_indirect)(void*, size_t) = realloc; ++ ++#undef aligned_alloc ++#undef calloc ++#undef malloc ++#undef realloc ++ ++#define aligned_alloc aligned_alloc_indirect ++#define calloc calloc_indirect ++#define malloc malloc_indirect ++#define realloc realloc_indirect ++ ++#endif /* TST_MALLOC_AUX_H */ +diff --git a/malloc/tst-malloc-check.c b/malloc/tst-malloc-check.c +index fde8863ad7561a71..cc88bff3b39a421c 100644 +--- a/malloc/tst-malloc-check.c ++++ b/malloc/tst-malloc-check.c +@@ -20,6 +20,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void +diff --git a/malloc/tst-malloc-too-large.c b/malloc/tst-malloc-too-large.c +index 8e9e0d5fa2b4b907..2b91377e54cdc485 100644 +--- a/malloc/tst-malloc-too-large.c ++++ b/malloc/tst-malloc-too-large.c +@@ -43,6 +43,7 @@ + #include + #include + ++#include "tst-malloc-aux.h" + + /* This function prepares for each 'too-large memory allocation' test by + performing a small successful 
malloc/free and resetting errno prior to +diff --git a/malloc/tst-malloc.c b/malloc/tst-malloc.c +index f7a6e4654c374d01..68af399022543111 100644 +--- a/malloc/tst-malloc.c ++++ b/malloc/tst-malloc.c +@@ -22,6 +22,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int errors = 0; + + static void +diff --git a/malloc/tst-realloc.c b/malloc/tst-realloc.c +index f50499ecb114d574..74a28fb45ed80bf5 100644 +--- a/malloc/tst-realloc.c ++++ b/malloc/tst-realloc.c +@@ -23,6 +23,8 @@ + #include + #include + ++#include "tst-malloc-aux.h" ++ + static int + do_test (void) + { +diff --git a/support/support.h b/support/support.h +index ba21ec9b5add7c02..1a77f7979330d60c 100644 +--- a/support/support.h ++++ b/support/support.h +@@ -113,7 +113,7 @@ void *xposix_memalign (size_t alignment, size_t n) + __attribute_malloc__ __attribute_alloc_align__ ((1)) + __attribute_alloc_size__ ((2)) __attr_dealloc_free __returns_nonnull; + char *xasprintf (const char *format, ...) +- __attribute__ ((format (printf, 1, 2), malloc)) __attr_dealloc_free ++ __attribute__ ((format (printf, 1, 2), __malloc__)) __attr_dealloc_free + __returns_nonnull; + char *xstrdup (const char *) __attr_dealloc_free __returns_nonnull; + char *xstrndup (const char *, size_t) __attr_dealloc_free __returns_nonnull; +diff --git a/test-skeleton.c b/test-skeleton.c +index ae185a4f2821de00..690f26e7cf229622 100644 +--- a/test-skeleton.c ++++ b/test-skeleton.c +@@ -27,7 +27,6 @@ + #include + #include + #include +-#include + #include + #include + #include diff --git a/glibc-upstream-2.39-142.patch b/glibc-upstream-2.39-142.patch new file mode 100644 index 0000000..72c8751 --- /dev/null +++ b/glibc-upstream-2.39-142.patch @@ -0,0 +1,44 @@ +commit 2c882bf9c15d206aaf04766d1b8e3ae5b1002cc2 +Author: H.J. Lu +Date: Thu Dec 5 08:39:44 2024 +0800 + + math: Exclude internal math symbols for tests [BZ #32414] + + Since internal tests don't have access to internal symbols in libm, + exclude them for internal tests. 
Also make tst-strtod5 and tst-strtod5i + depend on $(libm) to support older versions of GCC which can't inline + copysign family functions. This fixes BZ #32414. + + Signed-off-by: H.J. Lu + Reviewed-by: Sunil K Pandey + (cherry picked from commit 5df09b444835fca6e64b3d4b4a5beb19b3b2ba21) + +diff --git a/include/math.h b/include/math.h +index fa11a710a6c152a4..035fd160ffb9e032 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -130,7 +130,10 @@ fabsf128 (_Float128 x) + } + # endif + +-# if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) ++ ++/* NB: Internal tests don't have access to internal symbols. */ ++# if !IS_IN (testsuite_internal) \ ++ && !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) + # ifndef NO_MATH_REDIRECT + /* Declare some functions for use within GLIBC. Compilers typically + inline those functions as a single instruction. Use an asm to +diff --git a/stdlib/Makefile b/stdlib/Makefile +index 70d7291c6e3454a8..ff1418f5bb2ea5c9 100644 +--- a/stdlib/Makefile ++++ b/stdlib/Makefile +@@ -607,6 +607,8 @@ $(objpfx)bug-strtod2: $(libm) + $(objpfx)tst-strtod-round: $(libm) + $(objpfx)tst-tininess: $(libm) + $(objpfx)tst-strtod-underflow: $(libm) ++$(objpfx)tst-strtod5: $(libm) ++$(objpfx)tst-strtod5i: $(libm) + $(objpfx)tst-strtod6: $(libm) + $(objpfx)tst-strtod-nan-locale: $(libm) + $(objpfx)tst-strtod-nan-sign: $(libm) diff --git a/glibc-upstream-2.39-143.patch b/glibc-upstream-2.39-143.patch new file mode 100644 index 0000000..ade64c9 --- /dev/null +++ b/glibc-upstream-2.39-143.patch @@ -0,0 +1,30 @@ +commit 2c8a7f14fac3628b6a06cc76cdfda54a7ac20386 +Author: Florian Weimer +Date: Tue Dec 17 18:12:03 2024 +0100 + + x86: Avoid integer truncation with large cache sizes (bug 32470) + + Some hypervisors report 1 TiB L3 cache size. This results + in some variables incorrectly getting zeroed, causing crashes + in memcpy/memmove because invariants are violated. 
+ + (cherry picked from commit 61c3450db96dce96ad2b24b4f0b548e6a46d68e5) + +diff --git a/sysdeps/x86/dl-cacheinfo.h b/sysdeps/x86/dl-cacheinfo.h +index 5a98f70364220da4..1f68968a9a457586 100644 +--- a/sysdeps/x86/dl-cacheinfo.h ++++ b/sysdeps/x86/dl-cacheinfo.h +@@ -959,11 +959,11 @@ dl_init_cacheinfo (struct cpu_features *cpu_features) + non_temporal_threshold = maximum_non_temporal_threshold; + + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ +- unsigned int minimum_rep_movsb_threshold; ++ unsigned long int minimum_rep_movsb_threshold; + /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for + VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB + threshold is 2048 * (VEC_SIZE / 16). */ +- unsigned int rep_movsb_threshold; ++ unsigned long int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { diff --git a/glibc-upstream-2.39-144.patch b/glibc-upstream-2.39-144.patch new file mode 100644 index 0000000..17ca3b3 --- /dev/null +++ b/glibc-upstream-2.39-144.patch @@ -0,0 +1,250 @@ +commit 61daaa76390e0ff73eade3a688d3626b7e7e0c20 +Author: Noah Goldstein +Date: Fri May 24 12:38:50 2024 -0500 + + x86: Improve large memset perf with non-temporal stores [RHEL-29312] + + Previously we use `rep stosb` for all medium/large memsets. This is + notably worse than non-temporal stores for large (above a + few MBs) memsets. + See: + https://docs.google.com/spreadsheets/d/1opzukzvum4n6-RUVHTGddV6RjAEil4P2uMjjQGLbLcU/edit?usp=sharing + For data using different strategies for large memset on ICX and SKX. + + Using non-temporal stores can be up to 3x faster on ICX and 2x faster + on SKX. Historically, these numbers would not have been so good + because of the zero-over-zero writeback optimization that `rep stosb` + is able to do.
But, the zero-over-zero writeback optimization has been + removed as a potential side-channel attack, so there is no longer any + good reason to only rely on `rep stosb` for large memsets. On the flip + side, non-temporal writes can avoid data in their RFO requests saving + memory bandwidth. + + All of the other changes to the file are to re-organize the + code-blocks to maintain "good" alignment given the new code added in + the `L(stosb_local)` case. + + The results from running the GLIBC memset benchmarks on TGL-client for + N=20 runs: + + Geometric Mean across the suite New / Old EVEX256: 0.979 + Geometric Mean across the suite New / Old EVEX512: 0.979 + Geometric Mean across the suite New / Old AVX2 : 0.986 + Geometric Mean across the suite New / Old SSE2 : 0.979 + + Most of the cases are essentially unchanged, this is mostly to show + that adding the non-temporal case didn't add any regressions to the + other cases. + + The results on the memset-large benchmark suite on TGL-client for N=20 + runs: + + Geometric Mean across the suite New / Old EVEX256: 0.926 + Geometric Mean across the suite New / Old EVEX512: 0.925 + Geometric Mean across the suite New / Old AVX2 : 0.928 + Geometric Mean across the suite New / Old SSE2 : 0.924 + + So roughly a 7.5% speedup. This is lower than what we see on servers + (likely because clients typically have faster single-core bandwidth so + saving bandwidth on RFOs is less impactful), but still advantageous. + + Full test-suite passes on x86_64 w/ and w/o multiarch. + Reviewed-by: H.J. Lu + + (cherry picked from commit 5bf0ab80573d66e4ae5d94b094659094336da90f) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 97839a22483b0613..637caadb406b2544 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -21,10 +21,13 @@ + 2. If size is less than VEC, use integer register stores.
+ 3. If size is from VEC_SIZE to 2 * VEC_SIZE, use 2 VEC stores. + 4. If size is from 2 * VEC_SIZE to 4 * VEC_SIZE, use 4 VEC stores. +- 5. On machines ERMS feature, if size is greater or equal than +- __x86_rep_stosb_threshold then REP STOSB will be used. +- 6. If size is more to 4 * VEC_SIZE, align to 4 * VEC_SIZE with +- 4 VEC stores and store 4 * VEC at a time until done. */ ++ 5. If size is more to 4 * VEC_SIZE, align to 1 * VEC_SIZE with ++ 4 VEC stores and store 4 * VEC at a time until done. ++ 6. On machines ERMS feature, if size is range ++ [__x86_rep_stosb_threshold, __x86_shared_non_temporal_threshold) ++ then REP STOSB will be used. ++ 7. If size >= __x86_shared_non_temporal_threshold, use a ++ non-temporal stores. */ + + #include + +@@ -147,6 +150,41 @@ L(entry_from_wmemset): + VMOVU %VMM(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VMM(0), (%rdi) + VZEROUPPER_RETURN ++ ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ /* Align to ensure the L(less_vec) logic all fits in 1x cache lines. */ ++ .p2align 6,, 47 ++ .p2align 4 ++L(less_vec): ++L(less_vec_from_wmemset): ++ /* Less than 1 VEC. */ ++# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 ++# error Unsupported VEC_SIZE! ++# endif ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault suppress. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. 
*/ ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VMM(0), (%rax){%k1} ++ VZEROUPPER_RETURN ++#endif ++ + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -185,54 +223,6 @@ L(last_2x_vec): + #endif + VZEROUPPER_RETURN + +- /* If have AVX512 mask instructions put L(less_vec) close to +- entry as it doesn't take much space and is likely a hot target. +- */ +-#ifdef USE_LESS_VEC_MASK_STORE +- .p2align 4,, 10 +-L(less_vec): +-L(less_vec_from_wmemset): +- /* Less than 1 VEC. */ +-# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +-# error Unsupported VEC_SIZE! +-# endif +- /* Clear high bits from edi. Only keeping bits relevant to page +- cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ +- andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer +- serious performance degradation when it has to fault suppress. +- */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %edi +- /* This is generally considered a cold target. */ +- ja L(cross_page) +-# if VEC_SIZE > 32 +- movq $-1, %rcx +- bzhiq %rdx, %rcx, %rcx +- kmovq %rcx, %k1 +-# else +- movl $-1, %ecx +- bzhil %edx, %ecx, %ecx +- kmovd %ecx, %k1 +-# endif +- vmovdqu8 %VMM(0), (%rax){%k1} +- VZEROUPPER_RETURN +- +-# if defined USE_MULTIARCH && IS_IN (libc) +- /* Include L(stosb_local) here if including L(less_vec) between +- L(stosb_more_2x_vec) and ENTRY. This is to cache align the +- L(stosb_more_2x_vec) target. 
*/ +- .p2align 4,, 10 +-L(stosb_local): +- movzbl %sil, %eax +- mov %RDX_LP, %RCX_LP +- mov %RDI_LP, %RDX_LP +- rep stosb +- mov %RDX_LP, %RAX_LP +- VZEROUPPER_RETURN +-# endif +-#endif +- + #if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 + L(stosb_more_2x_vec): +@@ -318,21 +308,33 @@ L(return_vzeroupper): + ret + #endif + +- .p2align 4,, 10 +-#ifndef USE_LESS_VEC_MASK_STORE +-# if defined USE_MULTIARCH && IS_IN (libc) ++#ifdef USE_WITH_AVX2 ++ .p2align 4 ++#else ++ .p2align 4,, 4 ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in + range for 2-byte jump encoding. */ + L(stosb_local): ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ jae L(nt_memset) + movzbl %sil, %eax + mov %RDX_LP, %RCX_LP + mov %RDI_LP, %RDX_LP + rep stosb ++# if (defined USE_WITH_SSE2) || (defined USE_WITH_AVX512) ++ /* Use xchg to save 1-byte (this helps align targets below). */ ++ xchg %RDX_LP, %RAX_LP ++# else + mov %RDX_LP, %RAX_LP +- VZEROUPPER_RETURN + # endif ++ VZEROUPPER_RETURN ++#endif ++#ifndef USE_LESS_VEC_MASK_STORE + /* Define L(less_vec) only if not otherwise defined. */ +- .p2align 4 ++ .p2align 4,, 12 + L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ +@@ -423,4 +425,35 @@ L(between_2_3): + movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret +-END (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ ++#if defined USE_MULTIARCH && IS_IN (libc) ++# ifdef USE_WITH_AVX512 ++ /* Force align so the loop doesn't cross a cache-line. */ ++ .p2align 4 ++# endif ++ .p2align 4,, 7 ++ /* Memset using non-temporal stores. */ ++L(nt_memset): ++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdi) ++ leaq (VEC_SIZE * -4)(%rdi, %rdx), %rdx ++ /* Align DST. 
*/ ++ orq $(VEC_SIZE * 1 - 1), %rdi ++ incq %rdi ++ .p2align 4,, 7 ++L(nt_loop): ++ VMOVNT %VMM(0), (VEC_SIZE * 0)(%rdi) ++ VMOVNT %VMM(0), (VEC_SIZE * 1)(%rdi) ++ VMOVNT %VMM(0), (VEC_SIZE * 2)(%rdi) ++ VMOVNT %VMM(0), (VEC_SIZE * 3)(%rdi) ++ subq $(VEC_SIZE * -4), %rdi ++ cmpq %rdx, %rdi ++ jb L(nt_loop) ++ sfence ++ VMOVU %VMM(0), (VEC_SIZE * 0)(%rdx) ++ VMOVU %VMM(0), (VEC_SIZE * 1)(%rdx) ++ VMOVU %VMM(0), (VEC_SIZE * 2)(%rdx) ++ VMOVU %VMM(0), (VEC_SIZE * 3)(%rdx) ++ VZEROUPPER_RETURN ++#endif ++ ++END(MEMSET_SYMBOL(__memset, unaligned_erms)) diff --git a/glibc-upstream-2.39-145.patch b/glibc-upstream-2.39-145.patch new file mode 100644 index 0000000..1248613 --- /dev/null +++ b/glibc-upstream-2.39-145.patch @@ -0,0 +1,143 @@ +commit 994b129a35ca5218ecddd1add74aea68f1314560 +Author: Noah Goldstein +Date: Fri Sep 27 15:50:10 2024 -0700 + + x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212] + + The loop should be aligned to 32-bytes so that it can ideally run out + the DSB. This is particularly important on Skylake-Server where + deficiencies in its DSB implementation make it prone to not being + able to run loops out of the DSB. + + For example running strcmp-evex on 200Mb string: + + 32-byte aligned loop: + - 43,399,578,766 idq.dsb_uops + not 32-byte aligned loop: + - 6,060,139,704 idq.dsb_uops + + This results in a 25% performance degradation for the non-aligned + version. + + The fix is to just ensure the code layout is such that the loop is + aligned. (Which was previously the case but was accidentally dropped + in 84e7c46df). + + NB: The fix was actually 64-byte alignment. This is because 64-byte + alignment generally produces more stable performance than 32-byte + aligned code (cache line crosses can affect perf), so if we are going + past 16-byte alignment, might as well go to 64. 64-byte alignment + also matches most other functions we over-align, so it creates a + common point of optimization.
+ + Times are reported as ratio of Time_With_Patch / + Time_Without_Patch. Lower is better. + + The values being reported is the geometric mean of the ratio across + all tests in bench-strcmp and bench-strncmp. + + Note this patch is only attempting to improve the Skylake-Server + strcmp for long strings. The rest of the numbers are only to test for + regressions. + + Tigerlake Results Strings <= 512: + strcmp : 1.026 + strncmp: 0.949 + + Tigerlake Results Strings > 512: + strcmp : 0.994 + strncmp: 0.998 + + Skylake-Server Results Strings <= 512: + strcmp : 0.945 + strncmp: 0.943 + + Skylake-Server Results Strings > 512: + strcmp : 0.778 + strncmp: 1.000 + + The 2.6% regression on TGL-strcmp is due to slowdowns caused by + changes in alignment of code handling small sizes (most on the + page-cross logic). These should be safe to ignore because 1) We + previously only 16-byte aligned the function so this behavior is not + new and was essentially up to chance before this patch and 2) this + type of alignment related regression on small sizes really only comes + up in tight micro-benchmark loops and is unlikely to have any effect + on real-world performance. + + Reviewed-by: H.J. Lu + (cherry picked from commit 483443d3211532903d7e790211af5a1d55fdb1f3) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 06730ab2a18f72a0..cea034f394ab45e2 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -209,7 +209,9 @@ + returned. */ + + .section SECTION(.text), "ax", @progbits +- .align 16 ++ /* Align 64 bytes here. This is to get the L(loop) block ideally ++ aligned for the DSB. */ ++ .align 64 + .type STRCMP, @function + .globl STRCMP + # ifdef USE_AS_STRCASECMP_L +@@ -509,9 +511,7 @@ L(ret4): + ret + # endif + +- /* 32 byte align here ensures the main loop is ideally aligned +- for DSB. */ +- .p2align 5 ++ .p2align 4,, 4 + L(more_3x_vec): + /* Safe to compare 4x vectors.
*/ + VMOVU (VEC_SIZE)(%rdi), %VMM(0) +@@ -1426,10 +1426,9 @@ L(less_32_till_page): + L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax + ret +-# endif +- +- ++# else + .p2align 4,, 10 ++# endif + L(less_16_till_page): + cmpl $((VEC_SIZE - 8) / SIZE_OF_CHAR), %eax + ja L(less_8_till_page) +@@ -1482,8 +1481,12 @@ L(less_16_till_page): + # endif + jmp L(prepare_loop_aligned) + +- +- ++# ifndef USE_AS_STRNCMP ++ /* Fits in aligning bytes. */ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++# endif + + .p2align 4,, 10 + L(less_8_till_page): +@@ -1554,6 +1557,7 @@ L(ret_less_8_wcs): + + # ifdef USE_AS_STRNCMP + .p2align 4,, 2 ++L(ret_zero_4_loop): + L(ret_zero_page_cross_slow_case1): + xorl %eax, %eax + ret +@@ -1586,10 +1590,6 @@ L(less_4_loop): + subq $-(CHAR_PER_VEC * 4), %rdx + # endif + jmp L(prepare_loop_aligned) +- +-L(ret_zero_4_loop): +- xorl %eax, %eax +- ret + L(ret_less_4_loop): + xorl %r8d, %eax + subl %r8d, %eax diff --git a/glibc-upstream-2.39-146.patch b/glibc-upstream-2.39-146.patch new file mode 100644 index 0000000..9d16b94 --- /dev/null +++ b/glibc-upstream-2.39-146.patch @@ -0,0 +1,57 @@ +commit 808a84a8b81468b517a4d721fdc62069cb8c211f +Author: Siddhesh Poyarekar +Date: Tue Jan 21 16:11:06 2025 -0500 + + Fix underallocation of abort_msg_s struct (CVE-2025-0395) + + Include the space needed to store the length of the message itself, in + addition to the message string. This resolves BZ #32582. 
+ + Signed-off-by: Siddhesh Poyarekar + Reviewed: Adhemerval Zanella + (cherry picked from commit 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578) + +diff --git a/assert/assert.c b/assert/assert.c +index c29629f5f68921a0..b6e37d694cf4b779 100644 +--- a/assert/assert.c ++++ b/assert/assert.c +@@ -18,6 +18,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -65,7 +66,8 @@ __assert_fail_base (const char *fmt, const char *assertion, const char *file, + (void) __fxprintf (NULL, "%s", str); + (void) fflush (stderr); + +- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1); ++ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1, ++ GLRO(dl_pagesize)); + struct abort_msg_s *buf = __mmap (NULL, total, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + if (__glibc_likely (buf != MAP_FAILED)) +diff --git a/sysdeps/posix/libc_fatal.c b/sysdeps/posix/libc_fatal.c +index f9e3425e04496a26..089c47b04b8af049 100644 +--- a/sysdeps/posix/libc_fatal.c ++++ b/sysdeps/posix/libc_fatal.c +@@ -20,6 +20,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -105,7 +106,8 @@ __libc_message_impl (const char *fmt, ...) + { + WRITEV_FOR_FATAL (fd, iov, iovcnt, total); + +- total = (total + 1 + GLRO(dl_pagesize) - 1) & ~(GLRO(dl_pagesize) - 1); ++ total = ALIGN_UP (total + sizeof (struct abort_msg_s) + 1, ++ GLRO(dl_pagesize)); + struct abort_msg_s *buf = __mmap (NULL, total, + PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); diff --git a/glibc.spec b/glibc.spec index 66c4e1f..ef89355 100644 --- a/glibc.spec +++ b/glibc.spec @@ -145,7 +145,7 @@ Version: %{glibcversion} # - It allows using the Release number without the %%dist tag in the dependency # generator to make the generated requires interchangeable between Rawhide # and ELN (.elnYY < .fcXX). 
-%global baserelease 33 +%global baserelease 34 Release: %{baserelease}%{?dist} # Licenses: @@ -495,6 +495,15 @@ Patch177: glibc-RHEL-71530-7.patch Patch178: glibc-RHEL-71530-8.patch Patch179: glibc-RHEL-71530-9.patch Patch180: glibc-RHEL-71530-10.patch +Patch181: glibc-upstream-2.39-138.patch +Patch182: glibc-upstream-2.39-139.patch +Patch183: glibc-upstream-2.39-140.patch +Patch184: glibc-upstream-2.39-141.patch +Patch185: glibc-upstream-2.39-142.patch +Patch186: glibc-upstream-2.39-143.patch +Patch187: glibc-upstream-2.39-144.patch +Patch188: glibc-upstream-2.39-145.patch +Patch189: glibc-upstream-2.39-146.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2490,6 +2499,19 @@ update_gconv_modules_cache () %endif %changelog +* Thu Jan 23 2025 Florian Weimer - 2.39-34 +- Sync with upstream branch release/2.39/master, + commit 808a84a8b81468b517a4d721fdc62069cb8c211f: +- Fix underallocation of abort_msg_s struct (CVE-2025-0395) +- x86/string: Fixup alignment of main loop in str{n}cmp-evex [BZ #32212] +- x86: Improve large memset perf with non-temporal stores [RHEL-29312] +- x86: Avoid integer truncation with large cache sizes (bug 32470) +- math: Exclude internal math symbols for tests [BZ #32414] +- malloc: add indirection for malloc(-like) functions in tests [BZ #32366] +- Pass -nostdlib -nostartfiles together with -r [BZ #31753] +- nptl: initialize cpu_id_start prior to rseq registration +- nptl: initialize rseq area prior to registration + * Mon Dec 23 2024 Florian Weimer - 2.39-33 - Support in-place file conversion in the iconv tool (RHEL-71530)