diff --git a/glibc-upstream-2.34-191.patch b/glibc-upstream-2.34-191.patch new file mode 100644 index 0000000..55b6a81 --- /dev/null +++ b/glibc-upstream-2.34-191.patch @@ -0,0 +1,35 @@ +commit bc6fba3c8048b11c9f73db03339c97a2fec3f0cf +Author: Joseph Myers +Date: Wed Nov 17 14:25:16 2021 +0000 + + Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h + + Linux 5.15 adds a new address / protocol family PF_MCTP / AF_MCTP; add + these constants to bits/socket.h. + + Tested for x86_64. + + (cherry picked from commit bdeb7a8fa9989d18dab6310753d04d908125dc1d) + +diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h +index a011a8c0959b9970..7bb9e863d7329da9 100644 +--- a/sysdeps/unix/sysv/linux/bits/socket.h ++++ b/sysdeps/unix/sysv/linux/bits/socket.h +@@ -86,7 +86,8 @@ typedef __socklen_t socklen_t; + #define PF_QIPCRTR 42 /* Qualcomm IPC Router. */ + #define PF_SMC 43 /* SMC sockets. */ + #define PF_XDP 44 /* XDP sockets. */ +-#define PF_MAX 45 /* For now.. */ ++#define PF_MCTP 45 /* Management component transport protocol. */ ++#define PF_MAX 46 /* For now.. */ + + /* Address families. */ + #define AF_UNSPEC PF_UNSPEC +@@ -137,6 +138,7 @@ typedef __socklen_t socklen_t; + #define AF_QIPCRTR PF_QIPCRTR + #define AF_SMC PF_SMC + #define AF_XDP PF_XDP ++#define AF_MCTP PF_MCTP + #define AF_MAX PF_MAX + + /* Socket level values. Others are defined in the appropriate headers. diff --git a/glibc-upstream-2.34-192.patch b/glibc-upstream-2.34-192.patch new file mode 100644 index 0000000..5a89460 --- /dev/null +++ b/glibc-upstream-2.34-192.patch @@ -0,0 +1,27 @@ +commit fd5dbfd1cd98cb2f12f9e9f7004a4d25ab0c977f +Author: Joseph Myers +Date: Mon Nov 22 15:30:12 2021 +0000 + + Update kernel version to 5.15 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.15. (There are no new MAP_* constants covered by this test in + 5.15 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 5c3ece451d46a7d8721311609bfcb6faafacb39e) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index 810433c238f31c25..eeccdfd04dae57ab 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 14) ++ linux_version_glibc = (5, 15) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-193.patch b/glibc-upstream-2.34-193.patch new file mode 100644 index 0000000..d056d36 --- /dev/null +++ b/glibc-upstream-2.34-193.patch @@ -0,0 +1,28 @@ +commit 5146b73d72ced9bab125e986aa99ef5fe2f88475 +Author: Joseph Myers +Date: Mon Dec 20 15:38:32 2021 +0000 + + Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h + + Add the constant ARPHRD_MCTP, from Linux 5.15, to net/if_arp.h, along + with ARPHRD_CAN which was added to Linux in version 2.6.25 (commit + cd05acfe65ed2cf2db683fa9a6adb8d35635263b, "[CAN]: Allocate protocol + numbers for PF_CAN") but apparently missed for glibc at the time. + + Tested for x86_64. + + (cherry picked from commit a94d9659cd69dbc70d3494b1cbbbb5a1551675c5) + +diff --git a/sysdeps/unix/sysv/linux/net/if_arp.h b/sysdeps/unix/sysv/linux/net/if_arp.h +index 2a8933cde7cf236d..42910b776660def1 100644 +--- a/sysdeps/unix/sysv/linux/net/if_arp.h ++++ b/sysdeps/unix/sysv/linux/net/if_arp.h +@@ -95,6 +95,8 @@ struct arphdr + #define ARPHRD_ROSE 270 + #define ARPHRD_X25 271 /* CCITT X.25. */ + #define ARPHRD_HWX25 272 /* Boards with X.25 in firmware. */ ++#define ARPHRD_CAN 280 /* Controller Area Network. */ ++#define ARPHRD_MCTP 290 + #define ARPHRD_PPP 512 + #define ARPHRD_CISCO 513 /* Cisco HDLC. */ + #define ARPHRD_HDLC ARPHRD_CISCO diff --git a/glibc-upstream-2.34-194.patch b/glibc-upstream-2.34-194.patch new file mode 100644 index 0000000..0437f53 --- /dev/null +++ b/glibc-upstream-2.34-194.patch @@ -0,0 +1,337 @@ +commit 6af165658d0999ac2c4e9ce88bee020fbc2ee49f +Author: Joseph Myers +Date: Wed Mar 23 17:11:56 2022 +0000 + + Update syscall lists for Linux 5.17 + + Linux 5.17 has one new syscall, set_mempolicy_home_node. Update + syscall-names.list and regenerate the arch-syscall.h headers with + build-many-glibcs.py update-syscalls. + + Tested with build-many-glibcs.py. + + (cherry picked from commit 8ef9196b26793830515402ea95aca2629f7721ec) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h +index 9905ebedf298954c..4fcb6da80af37e9e 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/aarch64/arch-syscall.h +@@ -236,6 +236,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h +index ee8085be69958b25..0cf74c1a96bb1235 100644 +--- a/sysdeps/unix/sysv/linux/alpha/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/alpha/arch-syscall.h +@@ -391,6 +391,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 431 ++#define __NR_set_mempolicy_home_node 560 + #define __NR_set_robust_list 466 + #define __NR_set_tid_address 411 + #define __NR_setdomainname 166 +diff --git a/sysdeps/unix/sysv/linux/arc/arch-syscall.h b/sysdeps/unix/sysv/linux/arc/arch-syscall.h +index 1b626d97705d545a..c1207aaa12be6a51 100644 +--- a/sysdeps/unix/sysv/linux/arc/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/arc/arch-syscall.h +@@ -238,6 +238,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/arm/arch-syscall.h b/sysdeps/unix/sysv/linux/arm/arch-syscall.h +index 96ef8db9368e7de4..e7ba04c106d8af7d 100644 +--- a/sysdeps/unix/sysv/linux/arm/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/arm/arch-syscall.h +@@ -302,6 +302,7 @@ + #define __NR_sendmsg 296 + #define __NR_sendto 290 + #define __NR_set_mempolicy 321 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 338 + #define __NR_set_tid_address 256 + #define __NR_set_tls 983045 +diff --git a/sysdeps/unix/sysv/linux/csky/arch-syscall.h b/sysdeps/unix/sysv/linux/csky/arch-syscall.h +index 96910154ed6a5c1b..dc9383758ebc641b 100644 +--- a/sysdeps/unix/sysv/linux/csky/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/csky/arch-syscall.h +@@ -250,6 +250,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_thread_area 244 + #define __NR_set_tid_address 96 +diff --git a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h +index 36675fd48e6f50c5..767f1287a30b473e 100644 +--- a/sysdeps/unix/sysv/linux/hppa/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/hppa/arch-syscall.h +@@ -289,6 +289,7 @@ + #define __NR_sendmsg 183 + #define __NR_sendto 82 + #define __NR_set_mempolicy 262 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 289 + #define __NR_set_tid_address 237 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/i386/arch-syscall.h b/sysdeps/unix/sysv/linux/i386/arch-syscall.h +index c86ccbda4681066c..1998f0d76a444cac 100644 +--- a/sysdeps/unix/sysv/linux/i386/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/i386/arch-syscall.h +@@ -323,6 +323,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_thread_area 243 + #define __NR_set_tid_address 258 +diff --git a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h +index d898bce404955ef0..b2eab1b93d70b9de 100644 +--- a/sysdeps/unix/sysv/linux/ia64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/ia64/arch-syscall.h +@@ -272,6 +272,7 @@ + #define __NR_sendmsg 1205 + #define __NR_sendto 1199 + #define __NR_set_mempolicy 1261 ++#define __NR_set_mempolicy_home_node 1474 + #define __NR_set_robust_list 1298 + #define __NR_set_tid_address 1233 + #define __NR_setdomainname 1129 +diff --git a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h +index fe721b809076abeb..5fc3723772f92516 100644 +--- a/sysdeps/unix/sysv/linux/m68k/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/m68k/arch-syscall.h +@@ -310,6 +310,7 @@ + #define __NR_sendmsg 367 + #define __NR_sendto 366 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_thread_area 334 + #define __NR_set_tid_address 253 +diff --git a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h +index 6e10c3661db96a1e..b6e9b007e496cd80 100644 +--- a/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/microblaze/arch-syscall.h +@@ -326,6 +326,7 @@ + #define __NR_sendmsg 360 + #define __NR_sendto 353 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_thread_area 243 + #define __NR_set_tid_address 258 +diff --git a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h +index 26a6d594a2222f15..b3a3871f8ab8a23e 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips32/arch-syscall.h +@@ -308,6 +308,7 @@ + #define __NR_sendmsg 4179 + #define __NR_sendto 4180 + #define __NR_set_mempolicy 4270 ++#define __NR_set_mempolicy_home_node 4450 + #define __NR_set_robust_list 4309 + #define __NR_set_thread_area 4283 + #define __NR_set_tid_address 4252 +diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h +index 83e0d49c5e3ca1bc..b462182723aff286 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips64/n32/arch-syscall.h +@@ -288,6 +288,7 @@ + #define __NR_sendmsg 6045 + #define __NR_sendto 6043 + #define __NR_set_mempolicy 6233 ++#define __NR_set_mempolicy_home_node 6450 + #define __NR_set_robust_list 6272 + #define __NR_set_thread_area 6246 + #define __NR_set_tid_address 6213 +diff --git a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h +index d6747c542f63202b..a9d6b94572e93001 100644 +--- a/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/mips/mips64/n64/arch-syscall.h +@@ -270,6 +270,7 @@ + #define __NR_sendmsg 5045 + #define __NR_sendto 5043 + #define __NR_set_mempolicy 5229 ++#define __NR_set_mempolicy_home_node 5450 + #define __NR_set_robust_list 5268 + #define __NR_set_thread_area 5242 + #define __NR_set_tid_address 5212 +diff --git a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h +index 4ee209bc4475ea7d..809a219ef32a45ef 100644 +--- a/sysdeps/unix/sysv/linux/nios2/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/nios2/arch-syscall.h +@@ -250,6 +250,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h +index 497299fbc47a708c..627831ebae1b9e90 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc32/arch-syscall.h +@@ -319,6 +319,7 @@ + #define __NR_sendmsg 341 + #define __NR_sendto 335 + #define __NR_set_mempolicy 261 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 232 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h +index e840279f171b10b9..bae597199d79eaad 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/powerpc/powerpc64/arch-syscall.h +@@ -298,6 +298,7 @@ + #define __NR_sendmsg 341 + #define __NR_sendto 335 + #define __NR_set_mempolicy 261 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 232 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h +index 73ef74c005e5a2bb..bf4be80f8d380963 100644 +--- a/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/riscv/rv32/arch-syscall.h +@@ -228,6 +228,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h +index 919a79ee91177459..d656aedcc2be6009 100644 +--- a/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/riscv/rv64/arch-syscall.h +@@ -235,6 +235,7 @@ + #define __NR_sendmsg 211 + #define __NR_sendto 206 + #define __NR_set_mempolicy 237 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 99 + #define __NR_set_tid_address 96 + #define __NR_setdomainname 162 +diff --git a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h +index 005c0ada7aab85a1..57025107e82c9439 100644 +--- a/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/s390/s390-32/arch-syscall.h +@@ -311,6 +311,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_tid_address 252 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h +index 9131fddcc16116e4..72e19c6d569fbf9b 100644 +--- a/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/s390/s390-64/arch-syscall.h +@@ -278,6 +278,7 @@ + #define __NR_sendmsg 370 + #define __NR_sendto 369 + #define __NR_set_mempolicy 270 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 304 + #define __NR_set_tid_address 252 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/sh/arch-syscall.h b/sysdeps/unix/sysv/linux/sh/arch-syscall.h +index d8fb041568ecb4da..d52b522d9cac87ef 100644 +--- a/sysdeps/unix/sysv/linux/sh/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sh/arch-syscall.h +@@ -303,6 +303,7 @@ + #define __NR_sendmsg 355 + #define __NR_sendto 349 + #define __NR_set_mempolicy 276 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 311 + #define __NR_set_tid_address 258 + #define __NR_setdomainname 121 +diff --git a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h +index 2bc014fe6a1a1f4a..d3f4d8aa3edb4795 100644 +--- a/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sparc/sparc32/arch-syscall.h +@@ -310,6 +310,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 305 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 166 + #define __NR_setdomainname 163 +diff --git a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h +index 76dbbe595ffe868f..2cc03d7a24453335 100644 +--- a/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/sparc/sparc64/arch-syscall.h +@@ -286,6 +286,7 @@ + #define __NR_sendmsg 114 + #define __NR_sendto 133 + #define __NR_set_mempolicy 305 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 300 + #define __NR_set_tid_address 166 + #define __NR_setdomainname 163 +diff --git a/sysdeps/unix/sysv/linux/syscall-names.list b/sysdeps/unix/sysv/linux/syscall-names.list +index 0bc2af37dfa1eeb5..e2743c649586d97a 100644 +--- a/sysdeps/unix/sysv/linux/syscall-names.list ++++ b/sysdeps/unix/sysv/linux/syscall-names.list +@@ -21,8 +21,8 @@ + # This file can list all potential system calls. The names are only + # used if the installed kernel headers also provide them. + +-# The list of system calls is current as of Linux 5.16. +-kernel 5.16 ++# The list of system calls is current as of Linux 5.17. ++kernel 5.17 + + FAST_atomic_update + FAST_cmpxchg +@@ -523,6 +523,7 @@ sendmmsg + sendmsg + sendto + set_mempolicy ++set_mempolicy_home_node + set_robust_list + set_thread_area + set_tid_address +diff --git a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h +index 28558279b48a1ef4..b4ab892ec183e32d 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/x86_64/64/arch-syscall.h +@@ -278,6 +278,7 @@ + #define __NR_sendmsg 46 + #define __NR_sendto 44 + #define __NR_set_mempolicy 238 ++#define __NR_set_mempolicy_home_node 450 + #define __NR_set_robust_list 273 + #define __NR_set_thread_area 205 + #define __NR_set_tid_address 218 +diff --git a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h +index c1ab8ec45e8b8fd3..772559c87b3625b8 100644 +--- a/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h ++++ b/sysdeps/unix/sysv/linux/x86_64/x32/arch-syscall.h +@@ -270,6 +270,7 @@ + #define __NR_sendmsg 1073742342 + #define __NR_sendto 1073741868 + #define __NR_set_mempolicy 1073742062 ++#define __NR_set_mempolicy_home_node 1073742274 + #define __NR_set_robust_list 1073742354 + #define __NR_set_thread_area 1073742029 + #define __NR_set_tid_address 1073742042 diff --git a/glibc-upstream-2.34-195.patch b/glibc-upstream-2.34-195.patch new file mode 100644 index 0000000..d2b7afb --- /dev/null +++ b/glibc-upstream-2.34-195.patch @@ -0,0 +1,27 @@ +commit 81181ba5d916fc49bd737f603e28a3c2dc8430b4 +Author: Joseph Myers +Date: Wed Feb 16 14:19:24 2022 +0000 + + Update kernel version to 5.16 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.16. (There are no new MAP_* constants covered by this test in + 5.16 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 790a607e234aa10d4b977a1b80aebe8a2acac970) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index eeccdfd04dae57ab..8102d80b6660e523 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 15) ++ linux_version_glibc = (5, 16) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-196.patch b/glibc-upstream-2.34-196.patch new file mode 100644 index 0000000..5294eea --- /dev/null +++ b/glibc-upstream-2.34-196.patch @@ -0,0 +1,27 @@ +commit 0499c3a95fb864284fef36d3e9c5a54f6646b2db +Author: Joseph Myers +Date: Thu Mar 24 15:35:27 2022 +0000 + + Update kernel version to 5.17 in tst-mman-consts.py + + This patch updates the kernel version in the test tst-mman-consts.py + to 5.17. (There are no new MAP_* constants covered by this test in + 5.17 that need any other header changes.) + + Tested with build-many-glibcs.py. + + (cherry picked from commit 23808a422e6036accaba7236fd3b9a0d7ab7e8ee) + +diff --git a/sysdeps/unix/sysv/linux/tst-mman-consts.py b/sysdeps/unix/sysv/linux/tst-mman-consts.py +index 8102d80b6660e523..724c7375c3a1623b 100644 +--- a/sysdeps/unix/sysv/linux/tst-mman-consts.py ++++ b/sysdeps/unix/sysv/linux/tst-mman-consts.py +@@ -33,7 +33,7 @@ def main(): + help='C compiler (including options) to use') + args = parser.parse_args() + linux_version_headers = glibcsyscalls.linux_kernel_version(args.cc) +- linux_version_glibc = (5, 16) ++ linux_version_glibc = (5, 17) + sys.exit(glibcextract.compare_macro_consts( + '#define _GNU_SOURCE 1\n' + '#include \n', diff --git a/glibc-upstream-2.34-197.patch b/glibc-upstream-2.34-197.patch new file mode 100644 index 0000000..afe47ec --- /dev/null +++ b/glibc-upstream-2.34-197.patch @@ -0,0 +1,26 @@ +commit f858bc309315a03ff6b1a048f59405c159d23430 +Author: Joseph Myers +Date: Mon Feb 21 22:49:36 2022 +0000 + + Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h + + Linux 5.16 adds constants SOL_MPTCP and SOL_MCTP to the getsockopt / + setsockopt levels; add these constants to bits/socket.h. + + Tested for x86_64. + + (cherry picked from commit fdc1ae67fef27eea1445bab4bdfe2f0fb3bc7aa1) + +diff --git a/sysdeps/unix/sysv/linux/bits/socket.h b/sysdeps/unix/sysv/linux/bits/socket.h +index 7bb9e863d7329da9..c81fab840918924e 100644 +--- a/sysdeps/unix/sysv/linux/bits/socket.h ++++ b/sysdeps/unix/sysv/linux/bits/socket.h +@@ -169,6 +169,8 @@ typedef __socklen_t socklen_t; + #define SOL_KCM 281 + #define SOL_TLS 282 + #define SOL_XDP 283 ++#define SOL_MPTCP 284 ++#define SOL_MCTP 285 + + /* Maximum queue length specifiable by listen. */ + #define SOMAXCONN 4096 diff --git a/glibc-upstream-2.34-198.patch b/glibc-upstream-2.34-198.patch new file mode 100644 index 0000000..67ab10c --- /dev/null +++ b/glibc-upstream-2.34-198.patch @@ -0,0 +1,21 @@ +commit c108e87026d61d6744e3e55704e0bea937243f5a +Author: Szabolcs Nagy +Date: Tue Dec 14 11:15:07 2021 +0000 + + aarch64: Add HWCAP2_ECV from Linux 5.16 + + Indicates the availability of enhanced counter virtualization extension + of armv8.6-a with self-synchronized virtual counter CNTVCTSS_EL0 usable + in userspace. + + (cherry picked from commit 5a1be8ebdf6f02d4efec6e5f12ad06db17511f90) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +index 30fda0a4a347695e..04cc762015a7230a 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h ++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +@@ -74,3 +74,4 @@ + #define HWCAP2_RNG (1 << 16) + #define HWCAP2_BTI (1 << 17) + #define HWCAP2_MTE (1 << 18) ++#define HWCAP2_ECV (1 << 19) diff --git a/glibc-upstream-2.34-199.patch b/glibc-upstream-2.34-199.patch new file mode 100644 index 0000000..02675fc --- /dev/null +++ b/glibc-upstream-2.34-199.patch @@ -0,0 +1,21 @@ +commit 97cb8227b864b8ea0d99a4a50e4163baad3e1c72 +Author: Joseph Myers +Date: Mon Mar 28 13:16:48 2022 +0000 + + Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h + + Add the new HWCAP2_AFP and HWCAP2_RPRES constants from Linux 5.17. + Tested with build-many-glibcs.py for aarch64-linux-gnu. + + (cherry picked from commit 866c599182e87f116440b5d854f9e99533c48eb3) + +diff --git a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +index 04cc762015a7230a..9a5c4116b3fe9903 100644 +--- a/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h ++++ b/sysdeps/unix/sysv/linux/aarch64/bits/hwcap.h +@@ -75,3 +75,5 @@ + #define HWCAP2_BTI (1 << 17) + #define HWCAP2_MTE (1 << 18) + #define HWCAP2_ECV (1 << 19) ++#define HWCAP2_AFP (1 << 20) ++#define HWCAP2_RPRES (1 << 21) diff --git a/glibc-upstream-2.34-200.patch b/glibc-upstream-2.34-200.patch new file mode 100644 index 0000000..7ad14c9 --- /dev/null +++ b/glibc-upstream-2.34-200.patch @@ -0,0 +1,29 @@ +commit 31af92b9c8cf753992d45c801a855a02060afc08 +Author: Siddhesh Poyarekar +Date: Wed May 4 15:56:47 2022 +0530 + + manual: Clarify that abbreviations of long options are allowed + + The man page and code comments clearly state that abbreviations of long + option names are recognized correctly as long as they are unique. + Document this fact in the glibc manual as well. + + Signed-off-by: Siddhesh Poyarekar + Reviewed-by: Florian Weimer + Reviewed-by: Andreas Schwab + (cherry picked from commit db1efe02c9f15affc3908d6ae73875b82898a489) + +diff --git a/manual/getopt.texi b/manual/getopt.texi +index 5485fc46946631f7..b4c0b15ac2060560 100644 +--- a/manual/getopt.texi ++++ b/manual/getopt.texi +@@ -250,7 +250,8 @@ option, and stores the option's argument (if it has one) in @code{optarg}. + + When @code{getopt_long} encounters a long option, it takes actions based + on the @code{flag} and @code{val} fields of the definition of that +-option. ++option. The option name may be abbreviated as long as the abbreviation is ++unique. + + If @code{flag} is a null pointer, then @code{getopt_long} returns the + contents of @code{val} to indicate which option it found. You should diff --git a/glibc-upstream-2.34-201.patch b/glibc-upstream-2.34-201.patch new file mode 100644 index 0000000..68ca969 --- /dev/null +++ b/glibc-upstream-2.34-201.patch @@ -0,0 +1,1789 @@ +commit 0d5b36c8cc15f064e302d29692853f8a760e1547 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S + + Optimization are primarily to the loop logic and how the page cross + logic interacts with the loop. + + The page cross logic is at times more expensive for short strings near + the end of a page but not crossing the page. This is done to retest + the page cross conditions with a non-faulty check and to improve the + logic for entering the loop afterwards. This is only particular cases, + however, and is general made up for by more than 10x improvements on + the transition from the page cross -> loop case. + + The non-page cross cases are improved most for smaller sizes [0, 128] + and go about even for (128, 4096]. The loop page cross logic is + improved so some more significant speedup is seen there as well. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + + Signed-off-by: Noah Goldstein + (cherry picked from commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index fa70c994fc25dfd8..a0d1c65db11028bc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. */ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. */ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. */ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. */ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. */ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. */ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif diff --git a/glibc-upstream-2.34-202.patch b/glibc-upstream-2.34-202.patch new file mode 100644 index 0000000..9357b6f --- /dev/null +++ b/glibc-upstream-2.34-202.patch @@ -0,0 +1,1987 @@ +commit c41a66767d23b7f219fb943be6fab5ddf822d7da +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + + Optimization are primarily to the loop logic and how the page cross + logic interacts with the loop. + + The page cross logic is at times more expensive for short strings near + the end of a page but not crossing the page. This is done to retest + the page cross conditions with a non-faulty check and to improve the + logic for entering the loop afterwards. This is only particular cases, + however, and is general made up for by more than 10x improvements on + the transition from the page cross -> loop case. + + The non-page cross cases as well are nearly universally improved. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + + Signed-off-by: Noah Goldstein + (cherry picked from commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf984da2b80..99d8409af27327ad 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. */ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. */ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. */ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. */ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. */ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. */ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. */ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. */ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. */ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. */ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. */ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif diff --git a/glibc-upstream-2.34-203.patch b/glibc-upstream-2.34-203.patch new file mode 100644 index 0000000..e45b588 --- /dev/null +++ b/glibc-upstream-2.34-203.patch @@ -0,0 +1,29 @@ +commit d299032743e05571ef326c838a5ecf6ef5b3e9c3 +Author: H.J. Lu +Date: Fri Feb 4 11:09:10 2022 -0800 + + x86-64: Fix strcmp-avx2.S + + Change "movl %edx, %rdx" to "movl %edx, %edx" in: + + commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 + Author: Noah Goldstein + Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S + + (cherry picked from commit c15efd011cea3d8f0494269eb539583215a1feed) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index a0d1c65db11028bc..cdded412a70bad10 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -106,7 +106,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also diff --git a/glibc-upstream-2.34-204.patch b/glibc-upstream-2.34-204.patch new file mode 100644 index 0000000..4250493 --- /dev/null +++ b/glibc-upstream-2.34-204.patch @@ -0,0 +1,29 @@ +commit 53ddafe917a8af17b16beb794c29e5b09b86d534 +Author: H.J. Lu +Date: Fri Feb 4 11:11:08 2022 -0800 + + x86-64: Fix strcmp-evex.S + + Change "movl %edx, %rdx" to "movl %edx, %edx" in: + + commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 + Author: Noah Goldstein + Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S + + (cherry picked from commit 0e0199a9e02ebe42e2b36958964d63f03573c382) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409af27327ad..ed56af8ecdad48b2 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also diff --git a/glibc-upstream-2.34-205.patch b/glibc-upstream-2.34-205.patch new file mode 100644 index 0000000..6cf18b8 --- /dev/null +++ b/glibc-upstream-2.34-205.patch @@ -0,0 +1,451 @@ +commit ea19c490a3f5628d55ded271cbb753e66b2f05e8 +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + + No bug. + + Split vec generation into multiple steps. This allows the + broadcast in AVX2 to use 'xmm' registers for the L(less_vec) + case. This saves an expensive lane-cross instruction and removes + the need for 'vzeroupper'. + + For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for + byte broadcast. + + Results for memset-avx2 small (geomean of N = 20 benchset runs). + + size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + + All relevant string/wcsmbs tests are passing. + Reviewed-by: H.J. Lu + + (cherry picked from commit b62ace2740a106222e124cc86956448fa07abf4d) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 0137eba4cdd9f830..34ee0bfdcb81fb39 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af0aeda59e..c0bf2875d03d51ab 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f8493c21a36..5241216a77bf72b7 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77cc20cc42..637002150659123c 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index e723413a664c088f..c8db87dcbf69f0d8 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. */ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. */ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-206.patch b/glibc-upstream-2.34-206.patch new file mode 100644 index 0000000..ed9f37b --- /dev/null +++ b/glibc-upstream-2.34-206.patch @@ -0,0 +1,35 @@ +commit 190ea5f7e4e7e98b9b6e3f29835ae8b1f6a5442e +Author: Noah Goldstein +Date: Mon Feb 7 00:32:23 2022 -0600 + + x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) + + commit b62ace2740a106222e124cc86956448fa07abf4d + Author: Noah Goldstein + Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + + Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 + instruction and memset.S is restricted to only SSE2 instructions. + + (cherry picked from commit 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 34ee0bfdcb81fb39..954471e5a5bf225b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ diff --git a/glibc-upstream-2.34-207.patch b/glibc-upstream-2.34-207.patch new file mode 100644 index 0000000..9818f5d --- /dev/null +++ b/glibc-upstream-2.34-207.patch @@ -0,0 +1,719 @@ +commit 5cb6329652696e79d6d576165ea87e332c9de106 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + + memset with zero as the value to set is by far the majority value (99%+ + for Python3 and GCC). + + bzero can be slightly more optimized for this case by using a zero-idiom + xor for broadcasting the set value to a register (vector or GPR). + + Co-developed-by: Noah Goldstein + (cherry picked from commit 3d9f171bfb5325bd5f427e9fc386453358c6e840) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 954471e5a5bf225b..0358210c7ff3a976 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be40959ce62895..37d8d6f0bd2d10cc 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ +- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ ++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 0000000000000000..13e399a9a1fbdeb2 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,108 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. */ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++/* OPTIMIZE1 definition required for bzero patch. */ ++# define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 39ab10613bb0ffea..4992d7bd3206a7c0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. */ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479bba488be..5a5ee6f67299400b 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875d03d51ab..a093a2831f3dfa0d 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a77bf72b7..727c92133a15900f 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 637002150659123c..5d8fa78f05476b10 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index e4e95fc19fe48d2d..bac74ac37fd3c144 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index c8db87dcbf69f0d8..39a096a594ccb5b6 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define SET_REG32 esi ++# define SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. */ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) diff --git a/glibc-upstream-2.34-208.patch b/glibc-upstream-2.34-208.patch new file mode 100644 index 0000000..d4d9b52 --- /dev/null +++ b/glibc-upstream-2.34-208.patch @@ -0,0 +1,29 @@ +commit 70509f9b4807295b2b4b43bffe110580fc0381ef +Author: Noah Goldstein +Date: Sat Feb 12 00:45:00 2022 -0600 + + x86: Set .text section in memset-vec-unaligned-erms + + commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 + Author: H.J. Lu + Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + + Remove setting the .text section for the code. This commit + adds that back. + + (cherry picked from commit 7912236f4a597deb092650ca79f33504ddb4af28) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 39a096a594ccb5b6..d9c577fb5ff9700f 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! + #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) diff --git a/glibc-upstream-2.34-209.patch b/glibc-upstream-2.34-209.patch new file mode 100644 index 0000000..4874143 --- /dev/null +++ b/glibc-upstream-2.34-209.patch @@ -0,0 +1,76 @@ +commit 5373c90f2ea3c3fa9931a684c9b81c648dfbe8d7 +Author: Noah Goldstein +Date: Tue Feb 15 20:27:21 2022 -0600 + + x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] + + Logic can read before the start of `s1` / `s2` if both `s1` and `s2` + are near the start of a page. To avoid having the result contimated by + these comparisons the `strcmp` variants would mask off these + comparisons. This was missing in the `strncmp` variants causing + the bug. This commit adds the masking to `strncmp` so that out of + range comparisons don't affect the result. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as + well a full xcheck on x86_64 linux. + Reviewed-by: H.J. Lu + + (cherry picked from commit e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 97e831d88fd24316..56e23670ae7f90e4 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -438,13 +438,23 @@ check3 (void) + static void + check4 (void) + { +- const CHAR *s1 = L ("abc"); +- CHAR *s2 = STRDUP (s1); ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. */ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; + ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); + FOR_EACH_IMPL (impl, 0) +- check_result (impl, s1, s2, SIZE_MAX, 0); +- +- free (s2); ++ check_result (impl, s1, s2, size, exp_result); + } + + int +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index cdded412a70bad10..f9bdc5ccd03aa1f9 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8ecdad48b2..0dfa62bd149c02b4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP diff --git a/glibc-upstream-2.34-210.patch b/glibc-upstream-2.34-210.patch new file mode 100644 index 0000000..4898d45 --- /dev/null +++ b/glibc-upstream-2.34-210.patch @@ -0,0 +1,71 @@ +commit e123f08ad5ea4691bc37430ce536988c221332d6 +Author: Noah Goldstein +Date: Thu Mar 24 15:50:33 2022 -0500 + + x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] + + Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not + __wcscmp_avx2. + + commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 + Author: Noah Goldstein + Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + + Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set + to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which + can cause spurious aborts. + + This change will need to be backported. + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 9fef7039a7d04947bc89296ee0d187bc8d89b772) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866cf2fbe774..ba6543be8ce13927 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index f9bdc5ccd03aa1f9..09a73942086f9c9f 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif diff --git a/glibc-upstream-2.34-211.patch b/glibc-upstream-2.34-211.patch new file mode 100644 index 0000000..1221458 --- /dev/null +++ b/glibc-upstream-2.34-211.patch @@ -0,0 +1,170 @@ +commit e4a2fb76efb45210c541ee3f8ef32f317783c3a8 +Author: Florian Weimer +Date: Wed May 11 20:30:49 2022 +0200 + + manual: Document the dlinfo function + + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + (cherry picked from commit 93804a1ee084d4bdc620b2b9f91615c7da0fabe1) + + Also includes partial backport of commit 5d28a8962dcb6ec056b81d730e + (the addition of manual/dynlink.texi). + +diff --git a/manual/Makefile b/manual/Makefile +index e83444341e282916..31678681ef059e0f 100644 +--- a/manual/Makefile ++++ b/manual/Makefile +@@ -39,7 +39,7 @@ chapters = $(addsuffix .texi, \ + pipe socket terminal syslog math arith time \ + resource setjmp signal startup process ipc job \ + nss users sysinfo conf crypt debug threads \ +- probes tunables) ++ dynlink probes tunables) + appendices = lang.texi header.texi install.texi maint.texi platform.texi \ + contrib.texi + licenses = freemanuals.texi lgpl-2.1.texi fdl-1.3.texi +diff --git a/manual/dynlink.texi b/manual/dynlink.texi +new file mode 100644 +index 0000000000000000..dbf3de11769d8e57 +--- /dev/null ++++ b/manual/dynlink.texi +@@ -0,0 +1,100 @@ ++@node Dynamic Linker ++@c @node Dynamic Linker, Internal Probes, Threads, Top ++@c %MENU% Loading programs and shared objects. ++@chapter Dynamic Linker ++@cindex dynamic linker ++@cindex dynamic loader ++ ++The @dfn{dynamic linker} is responsible for loading dynamically linked ++programs and their dependencies (in the form of shared objects). The ++dynamic linker in @theglibc{} also supports loading shared objects (such ++as plugins) later at run time. ++ ++Dynamic linkers are sometimes called @dfn{dynamic loaders}. ++ ++@menu ++* Dynamic Linker Introspection:: Interfaces for querying mapping information. ++@end menu ++ ++@node Dynamic Linker Introspection ++@section Dynamic Linker Introspection ++ ++@Theglibc{} provides various functions for querying information from the ++dynamic linker. ++ ++@deftypefun {int} dlinfo (void *@var{handle}, int @var{request}, void *@var{arg}) ++@safety{@mtsafe{}@asunsafe{@asucorrupt{}}@acunsafe{@acucorrupt{}}} ++@standards{GNU, dlfcn.h} ++This function returns information about @var{handle} in the memory ++location @var{arg}, based on @var{request}. The @var{handle} argument ++must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must ++not have been closed by @code{dlclose}. ++ ++On success, @code{dlinfo} returns 0. If there is an error, the function ++returns @math{-1}, and @code{dlerror} can be used to obtain a ++corresponding error message. ++ ++The following operations are defined for use with @var{request}: ++ ++@vtable @code ++@item RTLD_DI_LINKMAP ++The corresponding @code{struct link_map} pointer for @var{handle} is ++written to @code{*@var{arg}}. The @var{arg} argument must be the ++address of an object of type @code{struct link_map *}. ++ ++@item RTLD_DI_LMID ++The namespace identifier of @var{handle} is written to ++@code{*@var{arg}}. The @var{arg} argument must be the address of an ++object of type @code{Lmid_t}. ++ ++@item RTLD_DI_ORIGIN ++The value of the @code{$ORIGIN} dynamic string token for @var{handle} is ++written to the character array starting at @var{arg} as a ++null-terminated string. ++ ++This request type should not be used because it is prone to buffer ++overflows. ++ ++@item RTLD_DI_SERINFO ++@itemx RTLD_DI_SERINFOSIZE ++These requests can be used to obtain search path information for ++@var{handle}. For both requests, @var{arg} must point to a ++@code{Dl_serinfo} object. The @code{RTLD_DI_SERINFOSIZE} request must ++be made first; it updates the @code{dls_size} and @code{dls_cnt} members ++of the @code{Dl_serinfo} object. The caller should then allocate memory ++to store at least @code{dls_size} bytes and pass that buffer to a ++@code{RTLD_DI_SERINFO} request. This second request fills the ++@code{dls_serpath} array. The number of array elements was returned in ++the @code{dls_cnt} member in the initial @code{RTLD_DI_SERINFOSIZE} ++request. The caller is responsible for freeing the allocated buffer. ++ ++This interface is prone to buffer overflows in multi-threaded processes ++because the required size can change between the ++@code{RTLD_DI_SERINFOSIZE} and @code{RTLD_DI_SERINFO} requests. ++ ++@item RTLD_DI_TLS_DATA ++This request writes the address of the TLS block (in the current thread) ++for the shared object identified by @var{handle} to @code{*@var{arg}}. ++The argument @var{arg} must be the address of an object of type ++@code{void *}. A null pointer is written if the object does not have ++any associated TLS block. ++ ++@item RTLD_DI_TLS_MODID ++This request writes the TLS module ID for the shared object @var{handle} ++to @code{*@var{arg}}. The argument @var{arg} must be the address of an ++object of type @code{size_t}. The module ID is zero if the object ++does not have an associated TLS block. ++@end vtable ++ ++The @code{dlinfo} function is a GNU extension. ++@end deftypefun ++ ++@c FIXME these are undocumented: ++@c dladdr ++@c dladdr1 ++@c dlclose ++@c dlerror ++@c dlmopen ++@c dlopen ++@c dlsym ++@c dlvsym +diff --git a/manual/libdl.texi b/manual/libdl.texi +deleted file mode 100644 +index e3fe0452d9f41d47..0000000000000000 +--- a/manual/libdl.texi ++++ /dev/null +@@ -1,10 +0,0 @@ +-@c FIXME these are undocumented: +-@c dladdr +-@c dladdr1 +-@c dlclose +-@c dlerror +-@c dlinfo +-@c dlmopen +-@c dlopen +-@c dlsym +-@c dlvsym +diff --git a/manual/probes.texi b/manual/probes.texi +index 4aae76b81921f347..ee019e651706f492 100644 +--- a/manual/probes.texi ++++ b/manual/probes.texi +@@ -1,5 +1,5 @@ + @node Internal Probes +-@c @node Internal Probes, Tunables, Threads, Top ++@c @node Internal Probes, Tunables, Dynamic Linker, Top + @c %MENU% Probes to monitor libc internal behavior + @chapter Internal probes + +diff --git a/manual/threads.texi b/manual/threads.texi +index 06b6b277a1228af1..7f166bfa87e88c36 100644 +--- a/manual/threads.texi ++++ b/manual/threads.texi +@@ -1,5 +1,5 @@ + @node Threads +-@c @node Threads, Internal Probes, Debugging Support, Top ++@c @node Threads, Dynamic Linker, Debugging Support, Top + @c %MENU% Functions, constants, and data types for working with threads + @chapter Threads + @cindex threads diff --git a/glibc-upstream-2.34-212.patch b/glibc-upstream-2.34-212.patch new file mode 100644 index 0000000..000023f --- /dev/null +++ b/glibc-upstream-2.34-212.patch @@ -0,0 +1,256 @@ +commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23 +Author: Florian Weimer +Date: Fri Apr 29 17:00:53 2022 +0200 + + dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo + + The information is theoretically available via dl_iterate_phdr as + well, but that approach is very slow if there are many shared + objects. + + Reviewed-by: Carlos O'Donell + Tested-by: Carlos O'Donell + (cherry picked from commit d056c212130280c0a54d9a4f72170ec621b70ce5) + +diff --git a/dlfcn/Makefile b/dlfcn/Makefile +index 6bbfbb8344da05cb..d3965427dabed898 100644 +--- a/dlfcn/Makefile ++++ b/dlfcn/Makefile +@@ -73,6 +73,10 @@ tststatic3-ENV = $(tststatic-ENV) + tststatic4-ENV = $(tststatic-ENV) + tststatic5-ENV = $(tststatic-ENV) + ++tests-internal += \ ++ tst-dlinfo-phdr \ ++ # tests-internal ++ + ifneq (,$(CXX)) + modules-names += bug-atexit3-lib + else +diff --git a/dlfcn/dlfcn.h b/dlfcn/dlfcn.h +index 4a3b870a487ea789..24388cfedae4dd67 100644 +--- a/dlfcn/dlfcn.h ++++ b/dlfcn/dlfcn.h +@@ -162,7 +162,12 @@ enum + segment, or if the calling thread has not allocated a block for it. */ + RTLD_DI_TLS_DATA = 10, + +- RTLD_DI_MAX = 10 ++ /* Treat ARG as const ElfW(Phdr) **, and store the address of the ++ program header array at that location. The dlinfo call returns ++ the number of program headers in the array. */ ++ RTLD_DI_PHDR = 11, ++ ++ RTLD_DI_MAX = 11 + }; + + +diff --git a/dlfcn/dlinfo.c b/dlfcn/dlinfo.c +index 47d2daa96fa5986f..1842925fb7c594dd 100644 +--- a/dlfcn/dlinfo.c ++++ b/dlfcn/dlinfo.c +@@ -28,6 +28,10 @@ struct dlinfo_args + void *handle; + int request; + void *arg; ++ ++ /* This is the value that is returned from dlinfo if no error is ++ signaled. */ ++ int result; + }; + + static void +@@ -40,6 +44,7 @@ dlinfo_doit (void *argsblock) + { + case RTLD_DI_CONFIGADDR: + default: ++ args->result = -1; + _dl_signal_error (0, NULL, NULL, N_("unsupported dlinfo request")); + break; + +@@ -75,6 +80,11 @@ dlinfo_doit (void *argsblock) + *(void **) args->arg = data; + break; + } ++ ++ case RTLD_DI_PHDR: ++ *(const ElfW(Phdr) **) args->arg = l->l_phdr; ++ args->result = l->l_phnum; ++ break; + } + } + +@@ -82,7 +92,8 @@ static int + dlinfo_implementation (void *handle, int request, void *arg) + { + struct dlinfo_args args = { handle, request, arg }; +- return _dlerror_run (&dlinfo_doit, &args) ? -1 : 0; ++ _dlerror_run (&dlinfo_doit, &args); ++ return args.result; + } + + #ifdef SHARED +diff --git a/dlfcn/tst-dlinfo-phdr.c b/dlfcn/tst-dlinfo-phdr.c +new file mode 100644 +index 0000000000000000..a15a7d48ebd3b976 +--- /dev/null ++++ b/dlfcn/tst-dlinfo-phdr.c +@@ -0,0 +1,125 @@ ++/* Test for dlinfo (RTLD_DI_PHDR). ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* Used to verify that the program header array appears as expected ++ among the dl_iterate_phdr callback invocations. */ ++ ++struct dlip_callback_args ++{ ++ struct link_map *l; /* l->l_addr is used to find the object. */ ++ const ElfW(Phdr) *phdr; /* Expected program header pointed. */ ++ int phnum; /* Expected program header count. */ ++ bool found; /* True if l->l_addr has been found. */ ++}; ++ ++static int ++dlip_callback (struct dl_phdr_info *dlpi, size_t size, void *closure) ++{ ++ TEST_COMPARE (sizeof (*dlpi), size); ++ struct dlip_callback_args *args = closure; ++ ++ if (dlpi->dlpi_addr == args->l->l_addr) ++ { ++ TEST_VERIFY (!args->found); ++ args->found = true; ++ TEST_VERIFY (args->phdr == dlpi->dlpi_phdr); ++ TEST_COMPARE (args->phnum, dlpi->dlpi_phnum); ++ } ++ ++ return 0; ++} ++ ++static int ++do_test (void) ++{ ++ /* Avoid a copy relocation. */ ++ struct r_debug *debug = xdlsym (RTLD_DEFAULT, "_r_debug"); ++ struct link_map *l = (struct link_map *) debug->r_map; ++ TEST_VERIFY_EXIT (l != NULL); ++ ++ do ++ { ++ printf ("info: checking link map %p (%p) for \"%s\"\n", ++ l, l->l_phdr, l->l_name); ++ ++ /* Cause dlerror () to return an error message. */ ++ dlsym (RTLD_DEFAULT, "does-not-exist"); ++ ++ /* Use the extension that link maps are valid dlopen handles. */ ++ const ElfW(Phdr) *phdr; ++ int phnum = dlinfo (l, RTLD_DI_PHDR, &phdr); ++ TEST_VERIFY (phnum >= 0); ++ /* Verify that the error message has been cleared. */ ++ TEST_COMPARE_STRING (dlerror (), NULL); ++ ++ TEST_VERIFY (phdr == l->l_phdr); ++ TEST_COMPARE (phnum, l->l_phnum); ++ ++ /* Check that we can find PT_DYNAMIC among the array. */ ++ { ++ bool dynamic_found = false; ++ for (int i = 0; i < phnum; ++i) ++ if (phdr[i].p_type == PT_DYNAMIC) ++ { ++ dynamic_found = true; ++ TEST_COMPARE ((ElfW(Addr)) l->l_ld, l->l_addr + phdr[i].p_vaddr); ++ } ++ TEST_VERIFY (dynamic_found); ++ } ++ ++ /* Check that dl_iterate_phdr finds the link map with the same ++ program headers. */ ++ { ++ struct dlip_callback_args args = ++ { ++ .l = l, ++ .phdr = phdr, ++ .phnum = phnum, ++ .found = false, ++ }; ++ TEST_COMPARE (dl_iterate_phdr (dlip_callback, &args), 0); ++ TEST_VERIFY (args.found); ++ } ++ ++ if (l->l_prev == NULL) ++ { ++ /* This is the executable, so the information is also ++ available via getauxval. */ ++ TEST_COMPARE_STRING (l->l_name, ""); ++ TEST_VERIFY (phdr == (const ElfW(Phdr) *) getauxval (AT_PHDR)); ++ TEST_COMPARE (phnum, getauxval (AT_PHNUM)); ++ } ++ ++ l = l->l_next; ++ } ++ while (l != NULL); ++ ++ return 0; ++} ++ ++#include +diff --git a/manual/dynlink.texi b/manual/dynlink.texi +index dbf3de11769d8e57..7dcac64889e389fd 100644 +--- a/manual/dynlink.texi ++++ b/manual/dynlink.texi +@@ -30,9 +30,9 @@ location @var{arg}, based on @var{request}. The @var{handle} argument + must be a pointer returned by @code{dlopen} or @code{dlmopen}; it must + not have been closed by @code{dlclose}. + +-On success, @code{dlinfo} returns 0. If there is an error, the function +-returns @math{-1}, and @code{dlerror} can be used to obtain a +-corresponding error message. ++On success, @code{dlinfo} returns 0 for most request types; exceptions ++are noted below. If there is an error, the function returns @math{-1}, ++and @code{dlerror} can be used to obtain a corresponding error message. + + The following operations are defined for use with @var{request}: + +@@ -84,6 +84,15 @@ This request writes the TLS module ID for the shared object @var{handle} + to @code{*@var{arg}}. The argument @var{arg} must be the address of an + object of type @code{size_t}. The module ID is zero if the object + does not have an associated TLS block. ++ ++@item RTLD_DI_PHDR ++This request writes the address of the program header array to ++@code{*@var{arg}}. The argument @var{arg} must be the address of an ++object of type @code{const ElfW(Phdr) *} (that is, ++@code{const Elf32_Phdr *} or @code{const Elf64_Phdr *}, as appropriate ++for the current architecture). For this request, the value returned by ++@code{dlinfo} is the number of program headers in the program header ++array. + @end vtable + + The @code{dlinfo} function is a GNU extension. diff --git a/glibc-upstream-2.34-213.patch b/glibc-upstream-2.34-213.patch new file mode 100644 index 0000000..544f599 --- /dev/null +++ b/glibc-upstream-2.34-213.patch @@ -0,0 +1,31 @@ +commit b72bbba23687ed67887d1d18c51cce5cc9c575ca +Author: Siddhesh Poyarekar +Date: Fri May 13 10:01:47 2022 +0530 + + fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] + + The fix c8ee1c85 introduced a -1 check for object size without also + checking that object size is a constant. Because of this, the tree + optimizer passes in gcc fail to fold away one of the branches in + __glibc_fortify and trips on a spurious Wstringop-overflow. The warning + itself is incorrect and the branch does go away eventually in DCE in the + rtl passes in gcc, but the constant check is a helpful hint to simplify + code early, so add it in. + + Resolves: BZ #29141 + Signed-off-by: Siddhesh Poyarekar + (cherry picked from commit 61a87530108ec9181e1b18a9b727ec3cc3ba7532) + +diff --git a/misc/sys/cdefs.h b/misc/sys/cdefs.h +index b36013b9a6b4d9c3..e0ecd9147ee3ce48 100644 +--- a/misc/sys/cdefs.h ++++ b/misc/sys/cdefs.h +@@ -163,7 +163,7 @@ + /* Length is known to be safe at compile time if the __L * __S <= __OBJSZ + condition can be folded to a constant and if it is true, or unknown (-1) */ + #define __glibc_safe_or_unknown_len(__l, __s, __osz) \ +- ((__osz) == (__SIZE_TYPE__) -1 \ ++ ((__builtin_constant_p (__osz) && (__osz) == (__SIZE_TYPE__) -1) \ + || (__glibc_unsigned_or_positive (__l) \ + && __builtin_constant_p (__glibc_safe_len_cond ((__SIZE_TYPE__) (__l), \ + (__s), (__osz))) \ diff --git a/glibc-upstream-2.34-214.patch b/glibc-upstream-2.34-214.patch new file mode 100644 index 0000000..d51a006 --- /dev/null +++ b/glibc-upstream-2.34-214.patch @@ -0,0 +1,22 @@ +commit 8de6e4a199ba6cc8aaeb43924b974eed67164bd6 +Author: H.J. Lu +Date: Sat Feb 5 11:06:01 2022 -0800 + + x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) + + (cherry picked from commit 1283948f236f209b7d3f44b69a42b96806fa6da0) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 937180c1bd791570..deda1c4e492f6176 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" diff --git a/glibc-upstream-2.34-215.patch b/glibc-upstream-2.34-215.patch new file mode 100644 index 0000000..d33cace --- /dev/null +++ b/glibc-upstream-2.34-215.patch @@ -0,0 +1,98 @@ +commit 6cba46c85804988f4fd41ef03e8a170a4c987a86 +Author: H.J. Lu +Date: Sat Feb 5 11:52:33 2022 -0800 + + x86_64/multiarch: Sort sysdep_routines and put one entry per line + + (cherry picked from commit c328d0152d4b14cca58407ec68143894c8863004) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0bd2d10cc..8c9e7812c6af10b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared ++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif diff --git a/glibc-upstream-2.34-216.patch b/glibc-upstream-2.34-216.patch new file mode 100644 index 0000000..b1e36ab --- /dev/null +++ b/glibc-upstream-2.34-216.patch @@ -0,0 +1,32 @@ +commit 37f373e33496ea437cc7e375cc835c20d4b35fb2 +Author: H.J. Lu +Date: Thu Feb 10 11:52:50 2022 -0800 + + x86-64: Remove bzero weak alias in SS2 memset + + commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 + Author: H.J. Lu + Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + + added the optimized bzero. Remove bzero weak alias in SS2 memset to + avoid undefined __bzero in memset-sse2-unaligned-erms. + + (cherry picked from commit 0fb8800029d230b3711bf722b2a47db92d0e273f) + +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index bac74ac37fd3c144..2951f7f5f70e274a 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -31,9 +31,7 @@ + # endif + + # undef weak_alias +-# define weak_alias(original, alias) \ +- .weak bzero; bzero = __bzero +- ++# define weak_alias(original, alias) + # undef strong_alias + # define strong_alias(ignored1, ignored2) + #endif diff --git a/glibc-upstream-2.34-217.patch b/glibc-upstream-2.34-217.patch new file mode 100644 index 0000000..8f92420 --- /dev/null +++ b/glibc-upstream-2.34-217.patch @@ -0,0 +1,24 @@ +commit dd457606ca4583b4a5e83d4e8956e6f9db61df6d +Author: Adhemerval Zanella +Date: Thu Feb 10 11:23:24 2022 -0300 + + x86_64: Remove bcopy optimizations + + The symbols is not present in current POSIX specification and compiler + already generates memmove call. + + (cherry picked from commit bf92893a14ebc161b08b28acc24fa06ae6be19cb) + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bde3ac3ed1..0000000000000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. */ +-END(bcopy) diff --git a/glibc-upstream-2.34-218.patch b/glibc-upstream-2.34-218.patch new file mode 100644 index 0000000..312016b --- /dev/null +++ b/glibc-upstream-2.34-218.patch @@ -0,0 +1,367 @@ +commit 3c55c207564c0ae30d78d01689b4ae16bf38dd63 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:16 2022 -0500 + + x86: Code cleanup in strchr-avx2 and comment justifying branch + + Small code cleanup for size: -53 bytes. + + Add comment justifying using a branch to do NULL/non-null return. + + All string/memory tests pass and no regressions in benchtests. + + geometric_mean(N=20) of all benchmarks Original / New: 1.00 + Reviewed-by: H.J. Lu + + (cherry picked from commit a6fbf4d51e9ba8063c4f8331564892ead9c67344) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 413942b96a835c4a..ef4ce0f3677e30c8 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. */ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. */ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif diff --git a/glibc-upstream-2.34-219.patch b/glibc-upstream-2.34-219.patch new file mode 100644 index 0000000..654fb28 --- /dev/null +++ b/glibc-upstream-2.34-219.patch @@ -0,0 +1,338 @@ +commit dd6d3a0bbcc67cb2b50b0add0c599f9f99491d8b +Author: Noah Goldstein +Date: Wed Mar 23 16:57:18 2022 -0500 + + x86: Code cleanup in strchr-evex and comment justifying branch + + Small code cleanup for size: -81 bytes. + + Add comment justifying using a branch to do NULL/non-null return. + + All string/memory tests pass and no regressions in benchtests. + + geometric_mean(N=20) of all benchmarks New / Original: .985 + Reviewed-by: H.J. Lu + + (cherry picked from commit ec285ea90415458225623ddc0492ae3f705af043) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee48ddaa998..0b49e0ac54e7b0dd 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif diff --git a/glibc-upstream-2.34-220.patch b/glibc-upstream-2.34-220.patch new file mode 100644 index 0000000..5f77e5c --- /dev/null +++ b/glibc-upstream-2.34-220.patch @@ -0,0 +1,143 @@ +commit 0ae1006967eef11909fbed0f6ecef2f260b133d3 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:22 2022 -0500 + + x86: Optimize strcspn and strpbrk in strcspn-c.c + + Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of + _mm_cmpistri. Also change offset to unsigned to avoid unnecessary + sign extensions. + + geometric_mean(N=20) of all benchmarks that dont fallback on + sse2/strlen; New / Original: .928 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 30d627d477d7255345a4b713cf352ac32d644d61) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index c56ddbd22f014653..2436b6dcd90d8efe 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. */ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) diff --git a/glibc-upstream-2.34-221.patch b/glibc-upstream-2.34-221.patch new file mode 100644 index 0000000..c4b411b --- /dev/null +++ b/glibc-upstream-2.34-221.patch @@ -0,0 +1,143 @@ +commit 0a2da0111037b1cc214f8f40ca5bdebf36f35cbd +Author: Noah Goldstein +Date: Wed Mar 23 16:57:24 2022 -0500 + + x86: Optimize strspn in strspn-c.c + + Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of + _mm_cmpistri. Also change offset to unsigned to avoid unnecessary + sign extensions. + + geometric_mean(N=20) of all benchmarks that dont fallback on + sse2; New / Original: .901 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 412d10343168b05b8cf6c3683457cf9711d28046) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index a17196296b9ebe52..3bcc479f1b52ff6a 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; diff --git a/glibc-upstream-2.34-222.patch b/glibc-upstream-2.34-222.patch new file mode 100644 index 0000000..4b54799 --- /dev/null +++ b/glibc-upstream-2.34-222.patch @@ -0,0 +1,164 @@ +commit 0dafa75e3c42994d0f23db62651d1802577272f2 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:26 2022 -0500 + + x86: Remove strcspn-sse2.S and use the generic implementation + + The generic implementation is faster. + + geometric_mean(N=20) of all benchmarks New / Original: .678 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit fe28e7d9d9535ebab4081d195c553b4fbf39d9ae) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 63b260a9ed265230..9bd3dac82d90b3a5 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 6035a274c87bafb0..0000000000000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2021 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? */ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) diff --git a/glibc-upstream-2.34-223.patch b/glibc-upstream-2.34-223.patch new file mode 100644 index 0000000..42accca --- /dev/null +++ b/glibc-upstream-2.34-223.patch @@ -0,0 +1,44 @@ +commit 38115446558e6d0976299eb592ba7266681c27d5 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:27 2022 -0500 + + x86: Remove strpbrk-sse2.S and use the generic implementation + + The generic implementation is faster (see strcspn commit). + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 653358535280a599382cb6c77538a187dac6a87f) + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index c5b95d08ff09cb27..8a58f051c35163dd 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b923974f9..0000000000000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 +0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include diff --git a/glibc-upstream-2.34-224.patch b/glibc-upstream-2.34-224.patch new file mode 100644 index 0000000..764dd6d --- /dev/null +++ b/glibc-upstream-2.34-224.patch @@ -0,0 +1,157 @@ +commit a4b1cae068d4d6e3117dd49e7d0599e4c62ac39f +Author: Noah Goldstein +Date: Wed Mar 23 16:57:29 2022 -0500 + + x86: Remove strspn-sse2.S and use the generic implementation + + The generic implementation is faster. + + geometric_mean(N=20) of all benchmarks New / Original: .710 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 9c8a6ad620b49a27120ecdd7049c26bf05900397) + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index e919fe492cc15151..f5e5686db1037740 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index e878f328852792db..0000000000000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2021 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) diff --git a/glibc-upstream-2.34-225.patch b/glibc-upstream-2.34-225.patch new file mode 100644 index 0000000..61ccb20 --- /dev/null +++ b/glibc-upstream-2.34-225.patch @@ -0,0 +1,118 @@ +commit 5997011826b7bbb7015f56bf143a6e4fd0f5a7df +Author: Noah Goldstein +Date: Wed Mar 23 16:57:36 2022 -0500 + + x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S + + Slightly faster method of doing TOLOWER that saves an + instruction. + + Also replace the hard coded 5-byte no with .p2align 4. On builds with + CET enabled this misaligned entry to strcasecmp. + + geometric_mean(N=40) of all benchmarks New / Original: .894 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 670b54bc585ea4a94f3b2e9272ba44aa6b730b73) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index 7f8a1bc756f86aee..ca70b540eb2dd190 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif diff --git a/glibc-upstream-2.34-226.patch b/glibc-upstream-2.34-226.patch new file mode 100644 index 0000000..fcadc66 --- /dev/null +++ b/glibc-upstream-2.34-226.patch @@ -0,0 +1,139 @@ +commit 3605c744078bb048d876298aaf12a2869e8071b8 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:38 2022 -0500 + + x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S + + Slightly faster method of doing TOLOWER that saves an + instruction. + + Also replace the hard coded 5-byte no with .p2align 4. On builds with + CET enabled this misaligned entry to strcasecmp. + + geometric_mean(N=40) of all benchmarks New / Original: .920 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit d154758e618ec9324f5d339c46db0aa27e8b1226) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 6197a723b9e0606e..a6825de8195ad8c6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else diff --git a/glibc-upstream-2.34-227.patch b/glibc-upstream-2.34-227.patch new file mode 100644 index 0000000..9dd23aa --- /dev/null +++ b/glibc-upstream-2.34-227.patch @@ -0,0 +1,744 @@ +commit 3051cf3e745015a9106cf71be7f7adbb2f83fcac +Author: Noah Goldstein +Date: Thu Mar 24 18:56:12 2022 -0500 + + x86: Add AVX2 optimized str{n}casecmp + + geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit bbf81222343fed5cd704001a2ae0d86c71544151) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812c6af10b8..711ecf2ee45d61b9 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 4992d7bd3206a7c0..a687b387c91aa9ae 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 931770e079fcc69f..64d0cd6ef25f73c0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 0000000000000000..09957fc3c543b40c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 0000000000000000..e2762f2a222b2a65 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 09a73942086f9c9f..aa91f6e48a0e1ce5 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. */ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. */ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. */ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. */ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. */ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 0000000000000000..58c05dcfb8643791 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 0000000000000000..48c0aa21f84ad32c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" diff --git a/glibc-upstream-2.34-228.patch b/glibc-upstream-2.34-228.patch new file mode 100644 index 0000000..dee6598 --- /dev/null +++ b/glibc-upstream-2.34-228.patch @@ -0,0 +1,803 @@ +commit b13a2e68eb3b84f2a7b587132ec2ea813815febf +Author: Noah Goldstein +Date: Thu Mar 24 18:56:13 2022 -0500 + + x86: Add EVEX optimized str{n}casecmp + + geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 84e7c46df4086873eae28a1fb87d2cf5388b1e16) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2ee45d61b9..359712c1491a2431 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index a687b387c91aa9ae..f6994e5406933d53 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 64d0cd6ef25f73c0..488e99e4997f379b 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 0000000000000000..58642db748e3db71 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd149c02b4..b81b57753c38db1f 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. */ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. */ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. */ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. */ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. */ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. */ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. */ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). */ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. */ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 0000000000000000..8a5af3695cb8cfff +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" diff --git a/glibc-upstream-2.34-229.patch b/glibc-upstream-2.34-229.patch new file mode 100644 index 0000000..97f6bbd --- /dev/null +++ b/glibc-upstream-2.34-229.patch @@ -0,0 +1,902 @@ +commit 80883f43545f4f9afcb26beef9358dfdcd021bd6 +Author: Noah Goldstein +Date: Wed Mar 23 16:57:46 2022 -0500 + + x86: Remove AVX str{n}casecmp + + The rational is: + + 1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). + 2. AVX2 version covers the majority of targets that previously + prefered it. + 3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + + All in all the saving the code size is worth it. + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1491a2431..bca82e38d86cc440 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index f6994e5406933d53..4c7834dd0b951fa4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 488e99e4997f379b..40819caf5ab10337 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 647aa05714d7a36c..0000000000000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2021 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index a6825de8195ad8c6..466c6a92a612ebcb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index f1d3fefdd94674b8..0000000000000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2021 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#define STRCMP_SSE42 __strncasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRNCASECMP_L +-#include "strcmp-sse42.S" diff --git a/glibc-upstream-2.34-230.patch b/glibc-upstream-2.34-230.patch new file mode 100644 index 0000000..b7eb594 --- /dev/null +++ b/glibc-upstream-2.34-230.patch @@ -0,0 +1,253 @@ +commit 4ff6ae069b7caacd5f99088abd755717b994f660 +Author: Noah Goldstein +Date: Fri Mar 25 17:13:33 2022 -0500 + + x86: Small improvements for wcslen + + Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. Reduce code size by removing gratuitous padding bytes (-90 + bytes). + + geometric_mean(N=20) of all benchmarks New / Original: 0.959 + + All string/memory tests pass. + Reviewed-by: H.J. Lu + + (cherry picked from commit 244b415d386487521882debb845a040a4758cb18) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 61edea1d14d454c6..ad066863a44ea0a5 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. */ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) diff --git a/glibc-upstream-2.34-231.patch b/glibc-upstream-2.34-231.patch new file mode 100644 index 0000000..3c928b8 --- /dev/null +++ b/glibc-upstream-2.34-231.patch @@ -0,0 +1,956 @@ +commit ffe75982cc0bb2d25d55ed566a3731b9c3017e6f +Author: Noah Goldstein +Date: Fri Apr 15 12:28:00 2022 -0500 + + x86: Remove memcmp-sse4.S + + Code didn't actually use any sse4 instructions since `ptest` was + removed in: + + commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 + Author: Noah Goldstein + Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + + The new memcmp-sse2 implementation is also faster. + + geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + + Note there are two regressions preferring SSE2 for Size = 1 and Size = + 65. + + Size = 1: + size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + + This is intentional. Size == 1 is significantly less hot based on + profiles of GCC11 and Python3 than sizes [4, 8] (which is made + hotter). + + Python3 Size = 1 -> 13.64% + Python3 Size = [4, 8] -> 60.92% + + GCC11 Size = 1 -> 1.29% + GCC11 Size = [4, 8] -> 33.86% + + size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + + Size = 65: + size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + + This is because A) the checks in range [65, 96] are now unrolled 2x + and B) because smaller values <= 16 are now given a hotter path. By + contrast the SSE4 version has a branch for Size = 80. The unrolled + version has get better performance for returns which need both + comparisons. + + size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + + As well, out of microbenchmark environments that are not full + predictable the branch will have a real-cost. + Reviewed-by: H.J. Lu + + (cherry picked from commit 7cbc03d03091d5664060924789afe46d30a5477e) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38d86cc440..b503e4b81e92a11c 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 4c7834dd0b951fa4..e5e48b36c3175e68 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 89e2129968e1e49c..5b92594093c1e0bb 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 97c102a9c5ab2b91..0000000000000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2021 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. */ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx +- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp $256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. */ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif diff --git a/glibc-upstream-2.34-232.patch b/glibc-upstream-2.34-232.patch new file mode 100644 index 0000000..03ca852 --- /dev/null +++ b/glibc-upstream-2.34-232.patch @@ -0,0 +1,259 @@ +commit df5de87260dba479873b2850bbe5c0b81c2376f6 +Author: Noah Goldstein +Date: Fri Apr 15 12:28:01 2022 -0500 + + x86: Cleanup page cross code in memcmp-avx2-movbe.S + + Old code was both inefficient and wasted code size. New code (-62 + bytes) and comparable or better performance in the page cross case. + + geometric_mean(N=20) of page cross cases New / Original: 0.960 + + size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 + Reviewed-by: H.J. Lu + + (cherry picked from commit 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 2621ec907aedb781..ec9cf0852edf216d 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. */ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. */ + ret + # endif diff --git a/glibc-upstream-2.34-233.patch b/glibc-upstream-2.34-233.patch new file mode 100644 index 0000000..30c79de --- /dev/null +++ b/glibc-upstream-2.34-233.patch @@ -0,0 +1,865 @@ +commit 0a11305416e287d85c64f04337cfd64b6b350e0c +Author: Noah Goldstein +Date: Thu Apr 21 20:52:28 2022 -0500 + + x86: Optimize {str|wcs}rchr-sse2 + + The new code unrolls the main loop slightly without adding too much + overhead and minimizes the comparisons for the search CHAR. + + Geometric Mean of all benchmarks New / Old: 0.741 + See email for all results. + + Full xcheck passes on x86_64 with and without multiarch enabled. + Reviewed-by: H.J. Lu + + (cherry picked from commit 5307aa9c1800f36a64c183c091c9af392c1fa75c) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 67c30d0260cef8a3..a56300bc1830dedd 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index a36034b40afe8d3d..00f69f2be77a43a0 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . */ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index dfd09fe9508cb5bc..fc1598bb11417fd5 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. */ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. */ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, %ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 6b318d3f29de9a9e..9006f2220963d76c 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, %rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" diff --git a/glibc-upstream-2.34-234.patch b/glibc-upstream-2.34-234.patch new file mode 100644 index 0000000..4b8b07d --- /dev/null +++ b/glibc-upstream-2.34-234.patch @@ -0,0 +1,497 @@ +commit 00f09a14d2818f438959e764834abb3913f2b20a +Author: Noah Goldstein +Date: Thu Apr 21 20:52:29 2022 -0500 + + x86: Optimize {str|wcs}rchr-avx2 + + The new code unrolls the main loop slightly without adding too much + overhead and minimizes the comparisons for the search CHAR. + + Geometric Mean of all benchmarks New / Old: 0.832 + See email for all results. + + Full xcheck passes on x86_64 with and without multiarch enabled. + Reviewed-by: H.J. Lu + + (cherry picked from commit df7e295d18ffa34f629578c0017a9881af7620f6) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 0deba97114d3b83d..b8dec737d5213b25 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). */ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. */ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. */ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. */ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif diff --git a/glibc-upstream-2.34-235.patch b/glibc-upstream-2.34-235.patch new file mode 100644 index 0000000..c3ca959 --- /dev/null +++ b/glibc-upstream-2.34-235.patch @@ -0,0 +1,554 @@ +commit 596c9a32cc5d5eb82587e92d1e66c9ecb7668456 +Author: Noah Goldstein +Date: Thu Apr 21 20:52:30 2022 -0500 + + x86: Optimize {str|wcs}rchr-evex + + The new code unrolls the main loop slightly without adding too much + overhead and minimizes the comparisons for the search CHAR. + + Geometric Mean of all benchmarks New / Old: 0.755 + See email for all results. + + Full xcheck passes on x86_64 with and without multiarch enabled. + Reviewed-by: H.J. Lu + + (cherry picked from commit c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a584edd293..f5b6d755ceb85ae2 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). ++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. */ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. If no match try YMM2/YMM1. ++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. */ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. */ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif diff --git a/glibc-upstream-2.34-236.patch b/glibc-upstream-2.34-236.patch new file mode 100644 index 0000000..46f4449 --- /dev/null +++ b/glibc-upstream-2.34-236.patch @@ -0,0 +1,35 @@ +commit 1f83d40dfab15a6888759552f24d1b5c0907408b +Author: Florian Weimer +Date: Thu Dec 23 12:24:30 2021 +0100 + + elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit cd0c333d2ea82d0ae14719bdbef86d99615bdb00) + +diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c +index 4dc366eea445e974..1c78dc89c9cbe54d 100644 +--- a/elf/dl-sysdep.c ++++ b/elf/dl-sysdep.c +@@ -54,9 +54,6 @@ extern char _end[] attribute_hidden; + /* Protect SUID program against misuse of file descriptors. */ + extern void __libc_check_standard_fds (void); + +-#ifdef NEED_DL_BASE_ADDR +-ElfW(Addr) _dl_base_addr; +-#endif + int __libc_enable_secure attribute_relro = 0; + rtld_hidden_data_def (__libc_enable_secure) + /* This variable contains the lowest stack address ever used. */ +@@ -136,11 +133,6 @@ _dl_sysdep_start (void **start_argptr, + case AT_ENTRY: + user_entry = av->a_un.a_val; + break; +-#ifdef NEED_DL_BASE_ADDR +- case AT_BASE: +- _dl_base_addr = av->a_un.a_val; +- break; +-#endif + #ifndef HAVE_AUX_SECURE + case AT_UID: + case AT_EUID: diff --git a/glibc-upstream-2.34-237.patch b/glibc-upstream-2.34-237.patch new file mode 100644 index 0000000..1ea756f --- /dev/null +++ b/glibc-upstream-2.34-237.patch @@ -0,0 +1,751 @@ +commit b0bd6a1323c3eccd16c45bae359a76877fa75639 +Author: Florian Weimer +Date: Thu May 19 11:43:53 2022 +0200 + + elf: Merge dl-sysdep.c into the Linux version + + The generic version is the de-facto Linux implementation. It + requires an auxiliary vector, so Hurd does not use it. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 91c0a47ffb66e7cd802de870686465db3b3976a0) + +Conflicts: + elf/dl-sysdep.c + (missing ld.so dependency sorting optimization upstream) + +diff --git a/elf/dl-sysdep.c b/elf/dl-sysdep.c +index 1c78dc89c9cbe54d..7aa90ad6eeb35cad 100644 +--- a/elf/dl-sysdep.c ++++ b/elf/dl-sysdep.c +@@ -1,5 +1,5 @@ +-/* Operating system support for run-time dynamic linker. Generic Unix version. +- Copyright (C) 1995-2021 Free Software Foundation, Inc. ++/* Operating system support for run-time dynamic linker. Stub version. ++ Copyright (C) 1995-2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,352 +16,4 @@ + License along with the GNU C Library; if not, see + . */ + +-/* We conditionalize the whole of this file rather than simply eliding it +- from the static build, because other sysdeps/ versions of this file +- might define things needed by a static build. */ +- +-#ifdef SHARED +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include <_itoa.h> +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +- +-extern char **_environ attribute_hidden; +-extern char _end[] attribute_hidden; +- +-/* Protect SUID program against misuse of file descriptors. */ +-extern void __libc_check_standard_fds (void); +- +-int __libc_enable_secure attribute_relro = 0; +-rtld_hidden_data_def (__libc_enable_secure) +-/* This variable contains the lowest stack address ever used. */ +-void *__libc_stack_end attribute_relro = NULL; +-rtld_hidden_data_def(__libc_stack_end) +-void *_dl_random attribute_relro = NULL; +- +-#ifndef DL_FIND_ARG_COMPONENTS +-# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp) \ +- do { \ +- void **_tmp; \ +- (argc) = *(long int *) cookie; \ +- (argv) = (char **) ((long int *) cookie + 1); \ +- (envp) = (argv) + (argc) + 1; \ +- for (_tmp = (void **) (envp); *_tmp; ++_tmp) \ +- continue; \ +- (auxp) = (void *) ++_tmp; \ +- } while (0) +-#endif +- +-#ifndef DL_STACK_END +-# define DL_STACK_END(cookie) ((void *) (cookie)) +-#endif +- +-ElfW(Addr) +-_dl_sysdep_start (void **start_argptr, +- void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, +- ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) +-{ +- const ElfW(Phdr) *phdr = NULL; +- ElfW(Word) phnum = 0; +- ElfW(Addr) user_entry; +- ElfW(auxv_t) *av; +-#ifdef HAVE_AUX_SECURE +-# define set_seen(tag) (tag) /* Evaluate for the side effects. */ +-# define set_seen_secure() ((void) 0) +-#else +- uid_t uid = 0; +- gid_t gid = 0; +- unsigned int seen = 0; +-# define set_seen_secure() (seen = -1) +-# ifdef HAVE_AUX_XID +-# define set_seen(tag) (tag) /* Evaluate for the side effects. */ +-# else +-# define M(type) (1 << (type)) +-# define set_seen(tag) seen |= M ((tag)->a_type) +-# endif +-#endif +-#ifdef NEED_DL_SYSINFO +- uintptr_t new_sysinfo = 0; +-#endif +- +- __libc_stack_end = DL_STACK_END (start_argptr); +- DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ, +- GLRO(dl_auxv)); +- +- user_entry = (ElfW(Addr)) ENTRY_POINT; +- GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ +- +- /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ +- _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), +- "CONSTANT_MINSIGSTKSZ is constant"); +- GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; +- +- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++)) +- switch (av->a_type) +- { +- case AT_PHDR: +- phdr = (void *) av->a_un.a_val; +- break; +- case AT_PHNUM: +- phnum = av->a_un.a_val; +- break; +- case AT_PAGESZ: +- GLRO(dl_pagesize) = av->a_un.a_val; +- break; +- case AT_ENTRY: +- user_entry = av->a_un.a_val; +- break; +-#ifndef HAVE_AUX_SECURE +- case AT_UID: +- case AT_EUID: +- uid ^= av->a_un.a_val; +- break; +- case AT_GID: +- case AT_EGID: +- gid ^= av->a_un.a_val; +- break; +-#endif +- case AT_SECURE: +-#ifndef HAVE_AUX_SECURE +- seen = -1; +-#endif +- __libc_enable_secure = av->a_un.a_val; +- break; +- case AT_PLATFORM: +- GLRO(dl_platform) = (void *) av->a_un.a_val; +- break; +- case AT_HWCAP: +- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_HWCAP2: +- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_CLKTCK: +- GLRO(dl_clktck) = av->a_un.a_val; +- break; +- case AT_FPUCW: +- GLRO(dl_fpu_control) = av->a_un.a_val; +- break; +-#ifdef NEED_DL_SYSINFO +- case AT_SYSINFO: +- new_sysinfo = av->a_un.a_val; +- break; +-#endif +-#ifdef NEED_DL_SYSINFO_DSO +- case AT_SYSINFO_EHDR: +- GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; +- break; +-#endif +- case AT_RANDOM: +- _dl_random = (void *) av->a_un.a_val; +- break; +- case AT_MINSIGSTKSZ: +- GLRO(dl_minsigstacksize) = av->a_un.a_val; +- break; +- DL_PLATFORM_AUXV +- } +- +- dl_hwcap_check (); +- +-#ifndef HAVE_AUX_SECURE +- if (seen != -1) +- { +- /* Fill in the values we have not gotten from the kernel through the +- auxiliary vector. */ +-# ifndef HAVE_AUX_XID +-# define SEE(UID, var, uid) \ +- if ((seen & M (AT_##UID)) == 0) var ^= __get##uid () +- SEE (UID, uid, uid); +- SEE (EUID, uid, euid); +- SEE (GID, gid, gid); +- SEE (EGID, gid, egid); +-# endif +- +- /* If one of the two pairs of IDs does not match this is a setuid +- or setgid run. */ +- __libc_enable_secure = uid | gid; +- } +-#endif +- +-#ifndef HAVE_AUX_PAGESIZE +- if (GLRO(dl_pagesize) == 0) +- GLRO(dl_pagesize) = __getpagesize (); +-#endif +- +-#ifdef NEED_DL_SYSINFO +- if (new_sysinfo != 0) +- { +-# ifdef NEED_DL_SYSINFO_DSO +- /* Only set the sysinfo value if we also have the vsyscall DSO. */ +- if (GLRO(dl_sysinfo_dso) != 0) +-# endif +- GLRO(dl_sysinfo) = new_sysinfo; +- } +-#endif +- +- __tunables_init (_environ); +- +- /* Initialize DSO sorting algorithm after tunables. */ +- _dl_sort_maps_init (); +- +-#ifdef DL_SYSDEP_INIT +- DL_SYSDEP_INIT; +-#endif +- +-#ifdef DL_PLATFORM_INIT +- DL_PLATFORM_INIT; +-#endif +- +- /* Determine the length of the platform name. */ +- if (GLRO(dl_platform) != NULL) +- GLRO(dl_platformlen) = strlen (GLRO(dl_platform)); +- +- if (__sbrk (0) == _end) +- /* The dynamic linker was run as a program, and so the initial break +- starts just after our bss, at &_end. The malloc in dl-minimal.c +- will consume the rest of this page, so tell the kernel to move the +- break up that far. When the user program examines its break, it +- will see this new value and not clobber our data. */ +- __sbrk (GLRO(dl_pagesize) +- - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1))); +- +- /* If this is a SUID program we make sure that FDs 0, 1, and 2 are +- allocated. If necessary we are doing it ourself. If it is not +- possible we stop the program. */ +- if (__builtin_expect (__libc_enable_secure, 0)) +- __libc_check_standard_fds (); +- +- (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); +- return user_entry; +-} +- +-void +-_dl_sysdep_start_cleanup (void) +-{ +-} +- +-void +-_dl_show_auxv (void) +-{ +- char buf[64]; +- ElfW(auxv_t) *av; +- +- /* Terminate string. */ +- buf[63] = '\0'; +- +- /* The following code assumes that the AT_* values are encoded +- starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values +- close by (otherwise the array will be too large). In case we have +- to support a platform where these requirements are not fulfilled +- some alternative implementation has to be used. */ +- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av) +- { +- static const struct +- { +- const char label[22]; +- enum { unknown = 0, dec, hex, str, ignore } form : 8; +- } auxvars[] = +- { +- [AT_EXECFD - 2] = { "EXECFD: ", dec }, +- [AT_EXECFN - 2] = { "EXECFN: ", str }, +- [AT_PHDR - 2] = { "PHDR: 0x", hex }, +- [AT_PHENT - 2] = { "PHENT: ", dec }, +- [AT_PHNUM - 2] = { "PHNUM: ", dec }, +- [AT_PAGESZ - 2] = { "PAGESZ: ", dec }, +- [AT_BASE - 2] = { "BASE: 0x", hex }, +- [AT_FLAGS - 2] = { "FLAGS: 0x", hex }, +- [AT_ENTRY - 2] = { "ENTRY: 0x", hex }, +- [AT_NOTELF - 2] = { "NOTELF: ", hex }, +- [AT_UID - 2] = { "UID: ", dec }, +- [AT_EUID - 2] = { "EUID: ", dec }, +- [AT_GID - 2] = { "GID: ", dec }, +- [AT_EGID - 2] = { "EGID: ", dec }, +- [AT_PLATFORM - 2] = { "PLATFORM: ", str }, +- [AT_HWCAP - 2] = { "HWCAP: ", hex }, +- [AT_CLKTCK - 2] = { "CLKTCK: ", dec }, +- [AT_FPUCW - 2] = { "FPUCW: ", hex }, +- [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex }, +- [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex }, +- [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex }, +- [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore }, +- [AT_SECURE - 2] = { "SECURE: ", dec }, +- [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str }, +- [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex }, +- [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex }, +- [AT_RANDOM - 2] = { "RANDOM: 0x", hex }, +- [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex }, +- [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec }, +- [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec }, +- [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex }, +- [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec }, +- [AT_L1D_CACHEGEOMETRY - 2] = { "L1D_CACHEGEOMETRY: 0x", hex }, +- [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec }, +- [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex }, +- [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec }, +- [AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex }, +- }; +- unsigned int idx = (unsigned int) (av->a_type - 2); +- +- if ((unsigned int) av->a_type < 2u +- || (idx < sizeof (auxvars) / sizeof (auxvars[0]) +- && auxvars[idx].form == ignore)) +- continue; +- +- assert (AT_NULL == 0); +- assert (AT_IGNORE == 1); +- +- /* Some entries are handled in a special way per platform. */ +- if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0) +- continue; +- +- if (idx < sizeof (auxvars) / sizeof (auxvars[0]) +- && auxvars[idx].form != unknown) +- { +- const char *val = (char *) av->a_un.a_val; +- +- if (__builtin_expect (auxvars[idx].form, dec) == dec) +- val = _itoa ((unsigned long int) av->a_un.a_val, +- buf + sizeof buf - 1, 10, 0); +- else if (__builtin_expect (auxvars[idx].form, hex) == hex) +- val = _itoa ((unsigned long int) av->a_un.a_val, +- buf + sizeof buf - 1, 16, 0); +- +- _dl_printf ("AT_%s%s\n", auxvars[idx].label, val); +- +- continue; +- } +- +- /* Unknown value: print a generic line. */ +- char buf2[17]; +- buf2[sizeof (buf2) - 1] = '\0'; +- const char *val2 = _itoa ((unsigned long int) av->a_un.a_val, +- buf2 + sizeof buf2 - 1, 16, 0); +- const char *val = _itoa ((unsigned long int) av->a_type, +- buf + sizeof buf - 1, 16, 0); +- _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2); +- } +-} +- +-#endif ++#error dl-sysdep support missing. +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index 144dc5ce5a1bba17..3e41469bcc395179 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -16,29 +16,352 @@ + License along with the GNU C Library; if not, see + . */ + +-/* Linux needs some special initialization, but otherwise uses +- the generic dynamic linker system interface code. */ +- +-#include ++#include <_itoa.h> ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include + #include +-#include +-#include +-#include ++#include + #include ++#include ++#include + #include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include + + #ifdef SHARED +-# define DL_SYSDEP_INIT frob_brk () ++extern char **_environ attribute_hidden; ++extern char _end[] attribute_hidden; ++ ++/* Protect SUID program against misuse of file descriptors. */ ++extern void __libc_check_standard_fds (void); + +-static inline void +-frob_brk (void) ++int __libc_enable_secure attribute_relro = 0; ++rtld_hidden_data_def (__libc_enable_secure) ++/* This variable contains the lowest stack address ever used. */ ++void *__libc_stack_end attribute_relro = NULL; ++rtld_hidden_data_def(__libc_stack_end) ++void *_dl_random attribute_relro = NULL; ++ ++#ifndef DL_FIND_ARG_COMPONENTS ++# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp) \ ++ do { \ ++ void **_tmp; \ ++ (argc) = *(long int *) cookie; \ ++ (argv) = (char **) ((long int *) cookie + 1); \ ++ (envp) = (argv) + (argc) + 1; \ ++ for (_tmp = (void **) (envp); *_tmp; ++_tmp) \ ++ continue; \ ++ (auxp) = (void *) ++_tmp; \ ++ } while (0) ++#endif ++ ++#ifndef DL_STACK_END ++# define DL_STACK_END(cookie) ((void *) (cookie)) ++#endif ++ ++ElfW(Addr) ++_dl_sysdep_start (void **start_argptr, ++ void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, ++ ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) + { ++ const ElfW(Phdr) *phdr = NULL; ++ ElfW(Word) phnum = 0; ++ ElfW(Addr) user_entry; ++ ElfW(auxv_t) *av; ++#ifdef HAVE_AUX_SECURE ++# define set_seen(tag) (tag) /* Evaluate for the side effects. */ ++# define set_seen_secure() ((void) 0) ++#else ++ uid_t uid = 0; ++ gid_t gid = 0; ++ unsigned int seen = 0; ++# define set_seen_secure() (seen = -1) ++# ifdef HAVE_AUX_XID ++# define set_seen(tag) (tag) /* Evaluate for the side effects. */ ++# else ++# define M(type) (1 << (type)) ++# define set_seen(tag) seen |= M ((tag)->a_type) ++# endif ++#endif ++#ifdef NEED_DL_SYSINFO ++ uintptr_t new_sysinfo = 0; ++#endif ++ ++ __libc_stack_end = DL_STACK_END (start_argptr); ++ DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ, ++ GLRO(dl_auxv)); ++ ++ user_entry = (ElfW(Addr)) ENTRY_POINT; ++ GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ ++ ++ /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ ++ _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), ++ "CONSTANT_MINSIGSTKSZ is constant"); ++ GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; ++ ++ for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++)) ++ switch (av->a_type) ++ { ++ case AT_PHDR: ++ phdr = (void *) av->a_un.a_val; ++ break; ++ case AT_PHNUM: ++ phnum = av->a_un.a_val; ++ break; ++ case AT_PAGESZ: ++ GLRO(dl_pagesize) = av->a_un.a_val; ++ break; ++ case AT_ENTRY: ++ user_entry = av->a_un.a_val; ++ break; ++#ifndef HAVE_AUX_SECURE ++ case AT_UID: ++ case AT_EUID: ++ uid ^= av->a_un.a_val; ++ break; ++ case AT_GID: ++ case AT_EGID: ++ gid ^= av->a_un.a_val; ++ break; ++#endif ++ case AT_SECURE: ++#ifndef HAVE_AUX_SECURE ++ seen = -1; ++#endif ++ __libc_enable_secure = av->a_un.a_val; ++ break; ++ case AT_PLATFORM: ++ GLRO(dl_platform) = (void *) av->a_un.a_val; ++ break; ++ case AT_HWCAP: ++ GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_HWCAP2: ++ GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_CLKTCK: ++ GLRO(dl_clktck) = av->a_un.a_val; ++ break; ++ case AT_FPUCW: ++ GLRO(dl_fpu_control) = av->a_un.a_val; ++ break; ++#ifdef NEED_DL_SYSINFO ++ case AT_SYSINFO: ++ new_sysinfo = av->a_un.a_val; ++ break; ++#endif ++#ifdef NEED_DL_SYSINFO_DSO ++ case AT_SYSINFO_EHDR: ++ GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; ++ break; ++#endif ++ case AT_RANDOM: ++ _dl_random = (void *) av->a_un.a_val; ++ break; ++ case AT_MINSIGSTKSZ: ++ GLRO(dl_minsigstacksize) = av->a_un.a_val; ++ break; ++ DL_PLATFORM_AUXV ++ } ++ ++ dl_hwcap_check (); ++ ++#ifndef HAVE_AUX_SECURE ++ if (seen != -1) ++ { ++ /* Fill in the values we have not gotten from the kernel through the ++ auxiliary vector. */ ++# ifndef HAVE_AUX_XID ++# define SEE(UID, var, uid) \ ++ if ((seen & M (AT_##UID)) == 0) var ^= __get##uid () ++ SEE (UID, uid, uid); ++ SEE (EUID, uid, euid); ++ SEE (GID, gid, gid); ++ SEE (EGID, gid, egid); ++# endif ++ ++ /* If one of the two pairs of IDs does not match this is a setuid ++ or setgid run. */ ++ __libc_enable_secure = uid | gid; ++ } ++#endif ++ ++#ifndef HAVE_AUX_PAGESIZE ++ if (GLRO(dl_pagesize) == 0) ++ GLRO(dl_pagesize) = __getpagesize (); ++#endif ++ ++#ifdef NEED_DL_SYSINFO ++ if (new_sysinfo != 0) ++ { ++# ifdef NEED_DL_SYSINFO_DSO ++ /* Only set the sysinfo value if we also have the vsyscall DSO. */ ++ if (GLRO(dl_sysinfo_dso) != 0) ++# endif ++ GLRO(dl_sysinfo) = new_sysinfo; ++ } ++#endif ++ ++ __tunables_init (_environ); ++ ++ /* Initialize DSO sorting algorithm after tunables. */ ++ _dl_sort_maps_init (); ++ + __brk (0); /* Initialize the break. */ +-} + +-# include ++#ifdef DL_PLATFORM_INIT ++ DL_PLATFORM_INIT; + #endif + ++ /* Determine the length of the platform name. */ ++ if (GLRO(dl_platform) != NULL) ++ GLRO(dl_platformlen) = strlen (GLRO(dl_platform)); ++ ++ if (__sbrk (0) == _end) ++ /* The dynamic linker was run as a program, and so the initial break ++ starts just after our bss, at &_end. The malloc in dl-minimal.c ++ will consume the rest of this page, so tell the kernel to move the ++ break up that far. When the user program examines its break, it ++ will see this new value and not clobber our data. */ ++ __sbrk (GLRO(dl_pagesize) ++ - ((_end - (char *) 0) & (GLRO(dl_pagesize) - 1))); ++ ++ /* If this is a SUID program we make sure that FDs 0, 1, and 2 are ++ allocated. If necessary we are doing it ourself. If it is not ++ possible we stop the program. */ ++ if (__builtin_expect (__libc_enable_secure, 0)) ++ __libc_check_standard_fds (); ++ ++ (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); ++ return user_entry; ++} ++ ++void ++_dl_sysdep_start_cleanup (void) ++{ ++} ++ ++void ++_dl_show_auxv (void) ++{ ++ char buf[64]; ++ ElfW(auxv_t) *av; ++ ++ /* Terminate string. */ ++ buf[63] = '\0'; ++ ++ /* The following code assumes that the AT_* values are encoded ++ starting from 0 with AT_NULL, 1 for AT_IGNORE, and all other values ++ close by (otherwise the array will be too large). In case we have ++ to support a platform where these requirements are not fulfilled ++ some alternative implementation has to be used. */ ++ for (av = GLRO(dl_auxv); av->a_type != AT_NULL; ++av) ++ { ++ static const struct ++ { ++ const char label[22]; ++ enum { unknown = 0, dec, hex, str, ignore } form : 8; ++ } auxvars[] = ++ { ++ [AT_EXECFD - 2] = { "EXECFD: ", dec }, ++ [AT_EXECFN - 2] = { "EXECFN: ", str }, ++ [AT_PHDR - 2] = { "PHDR: 0x", hex }, ++ [AT_PHENT - 2] = { "PHENT: ", dec }, ++ [AT_PHNUM - 2] = { "PHNUM: ", dec }, ++ [AT_PAGESZ - 2] = { "PAGESZ: ", dec }, ++ [AT_BASE - 2] = { "BASE: 0x", hex }, ++ [AT_FLAGS - 2] = { "FLAGS: 0x", hex }, ++ [AT_ENTRY - 2] = { "ENTRY: 0x", hex }, ++ [AT_NOTELF - 2] = { "NOTELF: ", hex }, ++ [AT_UID - 2] = { "UID: ", dec }, ++ [AT_EUID - 2] = { "EUID: ", dec }, ++ [AT_GID - 2] = { "GID: ", dec }, ++ [AT_EGID - 2] = { "EGID: ", dec }, ++ [AT_PLATFORM - 2] = { "PLATFORM: ", str }, ++ [AT_HWCAP - 2] = { "HWCAP: ", hex }, ++ [AT_CLKTCK - 2] = { "CLKTCK: ", dec }, ++ [AT_FPUCW - 2] = { "FPUCW: ", hex }, ++ [AT_DCACHEBSIZE - 2] = { "DCACHEBSIZE: 0x", hex }, ++ [AT_ICACHEBSIZE - 2] = { "ICACHEBSIZE: 0x", hex }, ++ [AT_UCACHEBSIZE - 2] = { "UCACHEBSIZE: 0x", hex }, ++ [AT_IGNOREPPC - 2] = { "IGNOREPPC", ignore }, ++ [AT_SECURE - 2] = { "SECURE: ", dec }, ++ [AT_BASE_PLATFORM - 2] = { "BASE_PLATFORM: ", str }, ++ [AT_SYSINFO - 2] = { "SYSINFO: 0x", hex }, ++ [AT_SYSINFO_EHDR - 2] = { "SYSINFO_EHDR: 0x", hex }, ++ [AT_RANDOM - 2] = { "RANDOM: 0x", hex }, ++ [AT_HWCAP2 - 2] = { "HWCAP2: 0x", hex }, ++ [AT_MINSIGSTKSZ - 2] = { "MINSIGSTKSZ: ", dec }, ++ [AT_L1I_CACHESIZE - 2] = { "L1I_CACHESIZE: ", dec }, ++ [AT_L1I_CACHEGEOMETRY - 2] = { "L1I_CACHEGEOMETRY: 0x", hex }, ++ [AT_L1D_CACHESIZE - 2] = { "L1D_CACHESIZE: ", dec }, ++ [AT_L1D_CACHEGEOMETRY - 2] = { "L1D_CACHEGEOMETRY: 0x", hex }, ++ [AT_L2_CACHESIZE - 2] = { "L2_CACHESIZE: ", dec }, ++ [AT_L2_CACHEGEOMETRY - 2] = { "L2_CACHEGEOMETRY: 0x", hex }, ++ [AT_L3_CACHESIZE - 2] = { "L3_CACHESIZE: ", dec }, ++ [AT_L3_CACHEGEOMETRY - 2] = { "L3_CACHEGEOMETRY: 0x", hex }, ++ }; ++ unsigned int idx = (unsigned int) (av->a_type - 2); ++ ++ if ((unsigned int) av->a_type < 2u ++ || (idx < sizeof (auxvars) / sizeof (auxvars[0]) ++ && auxvars[idx].form == ignore)) ++ continue; ++ ++ assert (AT_NULL == 0); ++ assert (AT_IGNORE == 1); ++ ++ /* Some entries are handled in a special way per platform. */ ++ if (_dl_procinfo (av->a_type, av->a_un.a_val) == 0) ++ continue; ++ ++ if (idx < sizeof (auxvars) / sizeof (auxvars[0]) ++ && auxvars[idx].form != unknown) ++ { ++ const char *val = (char *) av->a_un.a_val; ++ ++ if (__builtin_expect (auxvars[idx].form, dec) == dec) ++ val = _itoa ((unsigned long int) av->a_un.a_val, ++ buf + sizeof buf - 1, 10, 0); ++ else if (__builtin_expect (auxvars[idx].form, hex) == hex) ++ val = _itoa ((unsigned long int) av->a_un.a_val, ++ buf + sizeof buf - 1, 16, 0); ++ ++ _dl_printf ("AT_%s%s\n", auxvars[idx].label, val); ++ ++ continue; ++ } ++ ++ /* Unknown value: print a generic line. */ ++ char buf2[17]; ++ buf2[sizeof (buf2) - 1] = '\0'; ++ const char *val2 = _itoa ((unsigned long int) av->a_un.a_val, ++ buf2 + sizeof buf2 - 1, 16, 0); ++ const char *val = _itoa ((unsigned long int) av->a_type, ++ buf + sizeof buf - 1, 16, 0); ++ _dl_printf ("AT_??? (0x%s): 0x%s\n", val, val2); ++ } ++} ++ ++#endif /* SHARED */ ++ + + int + attribute_hidden diff --git a/glibc-upstream-2.34-238.patch b/glibc-upstream-2.34-238.patch new file mode 100644 index 0000000..4d07a8a --- /dev/null +++ b/glibc-upstream-2.34-238.patch @@ -0,0 +1,120 @@ +commit 2139b1848e3e0a960ccc615fe1fd78b5d10b1411 +Author: Florian Weimer +Date: Thu Feb 3 10:58:59 2022 +0100 + + Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE + + They are always defined. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit b9c3d3382f6f50e9723002deb2dc8127de720fa6) + +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index 3e41469bcc395179..aae983777ba15fae 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -85,21 +85,6 @@ _dl_sysdep_start (void **start_argptr, + ElfW(Word) phnum = 0; + ElfW(Addr) user_entry; + ElfW(auxv_t) *av; +-#ifdef HAVE_AUX_SECURE +-# define set_seen(tag) (tag) /* Evaluate for the side effects. */ +-# define set_seen_secure() ((void) 0) +-#else +- uid_t uid = 0; +- gid_t gid = 0; +- unsigned int seen = 0; +-# define set_seen_secure() (seen = -1) +-# ifdef HAVE_AUX_XID +-# define set_seen(tag) (tag) /* Evaluate for the side effects. */ +-# else +-# define M(type) (1 << (type)) +-# define set_seen(tag) seen |= M ((tag)->a_type) +-# endif +-#endif + #ifdef NEED_DL_SYSINFO + uintptr_t new_sysinfo = 0; + #endif +@@ -116,7 +101,7 @@ _dl_sysdep_start (void **start_argptr, + "CONSTANT_MINSIGSTKSZ is constant"); + GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; + +- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; set_seen (av++)) ++ for (av = GLRO(dl_auxv); av->a_type != AT_NULL; av++) + switch (av->a_type) + { + case AT_PHDR: +@@ -131,20 +116,7 @@ _dl_sysdep_start (void **start_argptr, + case AT_ENTRY: + user_entry = av->a_un.a_val; + break; +-#ifndef HAVE_AUX_SECURE +- case AT_UID: +- case AT_EUID: +- uid ^= av->a_un.a_val; +- break; +- case AT_GID: +- case AT_EGID: +- gid ^= av->a_un.a_val; +- break; +-#endif + case AT_SECURE: +-#ifndef HAVE_AUX_SECURE +- seen = -1; +-#endif + __libc_enable_secure = av->a_un.a_val; + break; + case AT_PLATFORM: +@@ -183,31 +155,6 @@ _dl_sysdep_start (void **start_argptr, + + dl_hwcap_check (); + +-#ifndef HAVE_AUX_SECURE +- if (seen != -1) +- { +- /* Fill in the values we have not gotten from the kernel through the +- auxiliary vector. */ +-# ifndef HAVE_AUX_XID +-# define SEE(UID, var, uid) \ +- if ((seen & M (AT_##UID)) == 0) var ^= __get##uid () +- SEE (UID, uid, uid); +- SEE (EUID, uid, euid); +- SEE (GID, gid, gid); +- SEE (EGID, gid, egid); +-# endif +- +- /* If one of the two pairs of IDs does not match this is a setuid +- or setgid run. */ +- __libc_enable_secure = uid | gid; +- } +-#endif +- +-#ifndef HAVE_AUX_PAGESIZE +- if (GLRO(dl_pagesize) == 0) +- GLRO(dl_pagesize) = __getpagesize (); +-#endif +- + #ifdef NEED_DL_SYSINFO + if (new_sysinfo != 0) + { +diff --git a/sysdeps/unix/sysv/linux/ldsodefs.h b/sysdeps/unix/sysv/linux/ldsodefs.h +index 7e01f685b03b984d..0f152c592c2a9b04 100644 +--- a/sysdeps/unix/sysv/linux/ldsodefs.h ++++ b/sysdeps/unix/sysv/linux/ldsodefs.h +@@ -24,16 +24,4 @@ + /* Get the real definitions. */ + #include_next + +-/* We can assume that the kernel always provides the AT_UID, AT_EUID, +- AT_GID, and AT_EGID values in the auxiliary vector from 2.4.0 or so on. */ +-#define HAVE_AUX_XID +- +-/* We can assume that the kernel always provides the AT_SECURE value +- in the auxiliary vector from 2.5.74 or so on. */ +-#define HAVE_AUX_SECURE +- +-/* Starting with one of the 2.4.0 pre-releases the Linux kernel passes +- up the page size information. */ +-#define HAVE_AUX_PAGESIZE +- + #endif /* ldsodefs.h */ diff --git a/glibc-upstream-2.34-239.patch b/glibc-upstream-2.34-239.patch new file mode 100644 index 0000000..ef06d23 --- /dev/null +++ b/glibc-upstream-2.34-239.patch @@ -0,0 +1,55 @@ +commit 458733fffe2c410418b5f633ffd6ed65efd2aac0 +Author: Florian Weimer +Date: Thu Feb 3 10:58:59 2022 +0100 + + Linux: Remove DL_FIND_ARG_COMPONENTS + + The generic definition is always used since the Native Client + port has been removed. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 2d47fa68628e831a692cba8fc9050cef435afc5e) + +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index aae983777ba15fae..e36b3e6b63b1aa7e 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -59,19 +59,6 @@ void *__libc_stack_end attribute_relro = NULL; + rtld_hidden_data_def(__libc_stack_end) + void *_dl_random attribute_relro = NULL; + +-#ifndef DL_FIND_ARG_COMPONENTS +-# define DL_FIND_ARG_COMPONENTS(cookie, argc, argv, envp, auxp) \ +- do { \ +- void **_tmp; \ +- (argc) = *(long int *) cookie; \ +- (argv) = (char **) ((long int *) cookie + 1); \ +- (envp) = (argv) + (argc) + 1; \ +- for (_tmp = (void **) (envp); *_tmp; ++_tmp) \ +- continue; \ +- (auxp) = (void *) ++_tmp; \ +- } while (0) +-#endif +- + #ifndef DL_STACK_END + # define DL_STACK_END(cookie) ((void *) (cookie)) + #endif +@@ -90,8 +77,16 @@ _dl_sysdep_start (void **start_argptr, + #endif + + __libc_stack_end = DL_STACK_END (start_argptr); +- DL_FIND_ARG_COMPONENTS (start_argptr, _dl_argc, _dl_argv, _environ, +- GLRO(dl_auxv)); ++ _dl_argc = (intptr_t) *start_argptr; ++ _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */ ++ _environ = _dl_argv + _dl_argc + 1; ++ for (char **tmp = _environ + 1; ; ++tmp) ++ if (*tmp == NULL) ++ { ++ /* Another necessary aliasing violation. */ ++ GLRO(dl_auxv) = (ElfW(auxv_t) *) (tmp + 1); ++ break; ++ } + + user_entry = (ElfW(Addr)) ENTRY_POINT; + GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ diff --git a/glibc-upstream-2.34-240.patch b/glibc-upstream-2.34-240.patch new file mode 100644 index 0000000..d5cec58 --- /dev/null +++ b/glibc-upstream-2.34-240.patch @@ -0,0 +1,70 @@ +commit 08728256faf69b159b9ecd64f7f8b734f5f456e4 +Author: Florian Weimer +Date: Thu Feb 3 10:58:59 2022 +0100 + + Linux: Assume that NEED_DL_SYSINFO_DSO is always defined + + The definition itself is still needed for generic code. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit f19fc997a5754a6c0bb9e43618f0597e878061f7) + +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index e36b3e6b63b1aa7e..1829dab4f38b560c 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -134,11 +134,9 @@ _dl_sysdep_start (void **start_argptr, + new_sysinfo = av->a_un.a_val; + break; + #endif +-#ifdef NEED_DL_SYSINFO_DSO + case AT_SYSINFO_EHDR: + GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; + break; +-#endif + case AT_RANDOM: + _dl_random = (void *) av->a_un.a_val; + break; +@@ -153,10 +151,8 @@ _dl_sysdep_start (void **start_argptr, + #ifdef NEED_DL_SYSINFO + if (new_sysinfo != 0) + { +-# ifdef NEED_DL_SYSINFO_DSO + /* Only set the sysinfo value if we also have the vsyscall DSO. */ + if (GLRO(dl_sysinfo_dso) != 0) +-# endif + GLRO(dl_sysinfo) = new_sysinfo; + } + #endif +@@ -309,7 +305,7 @@ int + attribute_hidden + _dl_discover_osversion (void) + { +-#if defined NEED_DL_SYSINFO_DSO && defined SHARED ++#ifdef SHARED + if (GLRO(dl_sysinfo_map) != NULL) + { + /* If the kernel-supplied DSO contains a note indicating the kernel's +@@ -340,7 +336,7 @@ _dl_discover_osversion (void) + } + } + } +-#endif ++#endif /* SHARED */ + + char bufmem[64]; + char *buf = bufmem; +diff --git a/sysdeps/unix/sysv/linux/m68k/sysdep.h b/sysdeps/unix/sysv/linux/m68k/sysdep.h +index b29986339a7e6cc0..11b93f2fa0af0e71 100644 +--- a/sysdeps/unix/sysv/linux/m68k/sysdep.h ++++ b/sysdeps/unix/sysv/linux/m68k/sysdep.h +@@ -301,8 +301,6 @@ SYSCALL_ERROR_LABEL: \ + #define PTR_MANGLE(var) (void) (var) + #define PTR_DEMANGLE(var) (void) (var) + +-#if defined NEED_DL_SYSINFO || defined NEED_DL_SYSINFO_DSO + /* M68K needs system-supplied DSO to access TLS helpers + even when statically linked. */ +-# define NEED_STATIC_SYSINFO_DSO 1 +-#endif ++#define NEED_STATIC_SYSINFO_DSO 1 diff --git a/glibc-upstream-2.34-241.patch b/glibc-upstream-2.34-241.patch new file mode 100644 index 0000000..0d67f1c --- /dev/null +++ b/glibc-upstream-2.34-241.patch @@ -0,0 +1,410 @@ +commit 4b9cd5465d5158dad7b4f0762bc70a3a1209b481 +Author: Florian Weimer +Date: Thu Feb 3 10:58:59 2022 +0100 + + Linux: Consolidate auxiliary vector parsing + + And optimize it slightly. + + The large switch statement in _dl_sysdep_start can be replaced with + a large array. This reduces source code and binary size. On + i686-linux-gnu: + + Before: + + text data bss dec hex filename + 7791 12 0 7803 1e7b elf/dl-sysdep.os + + After: + + text data bss dec hex filename + 7135 12 0 7147 1beb elf/dl-sysdep.os + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 8c8510ab2790039e58995ef3a22309582413d3ff) + +diff --git a/elf/dl-support.c b/elf/dl-support.c +index f29dc965f4d10648..40ef07521336857d 100644 +--- a/elf/dl-support.c ++++ b/elf/dl-support.c +@@ -241,93 +241,21 @@ __rtld_lock_define_initialized_recursive (, _dl_load_tls_lock) + + + #ifdef HAVE_AUX_VECTOR ++#include ++ + int _dl_clktck; + + void + _dl_aux_init (ElfW(auxv_t) *av) + { +- int seen = 0; +- uid_t uid = 0; +- gid_t gid = 0; +- + #ifdef NEED_DL_SYSINFO + /* NB: Avoid RELATIVE relocation in static PIE. */ + GL(dl_sysinfo) = DL_SYSINFO_DEFAULT; + #endif + + _dl_auxv = av; +- for (; av->a_type != AT_NULL; ++av) +- switch (av->a_type) +- { +- case AT_PAGESZ: +- if (av->a_un.a_val != 0) +- GLRO(dl_pagesize) = av->a_un.a_val; +- break; +- case AT_CLKTCK: +- GLRO(dl_clktck) = av->a_un.a_val; +- break; +- case AT_PHDR: +- GL(dl_phdr) = (const void *) av->a_un.a_val; +- break; +- case AT_PHNUM: +- GL(dl_phnum) = av->a_un.a_val; +- break; +- case AT_PLATFORM: +- GLRO(dl_platform) = (void *) av->a_un.a_val; +- break; +- case AT_HWCAP: +- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_HWCAP2: +- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_FPUCW: +- GLRO(dl_fpu_control) = av->a_un.a_val; +- break; +-#ifdef NEED_DL_SYSINFO +- case AT_SYSINFO: +- GL(dl_sysinfo) = av->a_un.a_val; +- break; +-#endif +-#ifdef NEED_DL_SYSINFO_DSO +- case AT_SYSINFO_EHDR: +- GL(dl_sysinfo_dso) = (void *) av->a_un.a_val; +- break; +-#endif +- case AT_UID: +- uid ^= av->a_un.a_val; +- seen |= 1; +- break; +- case AT_EUID: +- uid ^= av->a_un.a_val; +- seen |= 2; +- break; +- case AT_GID: +- gid ^= av->a_un.a_val; +- seen |= 4; +- break; +- case AT_EGID: +- gid ^= av->a_un.a_val; +- seen |= 8; +- break; +- case AT_SECURE: +- seen = -1; +- __libc_enable_secure = av->a_un.a_val; +- __libc_enable_secure_decided = 1; +- break; +- case AT_RANDOM: +- _dl_random = (void *) av->a_un.a_val; +- break; +- case AT_MINSIGSTKSZ: +- _dl_minsigstacksize = av->a_un.a_val; +- break; +- DL_PLATFORM_AUXV +- } +- if (seen == 0xf) +- { +- __libc_enable_secure = uid != 0 || gid != 0; +- __libc_enable_secure_decided = 1; +- } ++ dl_parse_auxv_t auxv_values = { 0, }; ++ _dl_parse_auxv (av, auxv_values); + } + #endif + +diff --git a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +index 1aa9dca80d189ebe..8c99e776a0af9cef 100644 +--- a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +@@ -20,16 +20,8 @@ + + extern long __libc_alpha_cache_shape[4]; + +-#define DL_PLATFORM_AUXV \ +- case AT_L1I_CACHESHAPE: \ +- __libc_alpha_cache_shape[0] = av->a_un.a_val; \ +- break; \ +- case AT_L1D_CACHESHAPE: \ +- __libc_alpha_cache_shape[1] = av->a_un.a_val; \ +- break; \ +- case AT_L2_CACHESHAPE: \ +- __libc_alpha_cache_shape[2] = av->a_un.a_val; \ +- break; \ +- case AT_L3_CACHESHAPE: \ +- __libc_alpha_cache_shape[3] = av->a_un.a_val; \ +- break; ++#define DL_PLATFORM_AUXV \ ++ __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE]; +diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +new file mode 100644 +index 0000000000000000..b3d82f69946d6d2c +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +@@ -0,0 +1,61 @@ ++/* Parse the Linux auxiliary vector. ++ Copyright (C) 1995-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1]; ++ ++/* Copy the auxiliary vector into AUX_VALUES and set up GLRO ++ variables. */ ++static inline ++void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) ++{ ++ auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT; ++ auxv_values[AT_PAGESZ] = EXEC_PAGESIZE; ++ auxv_values[AT_FPUCW] = _FPU_DEFAULT; ++ ++ /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ ++ _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), ++ "CONSTANT_MINSIGSTKSZ is constant"); ++ auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ; ++ ++ for (; av->a_type != AT_NULL; av++) ++ if (av->a_type <= AT_MINSIGSTKSZ) ++ auxv_values[av->a_type] = av->a_un.a_val; ++ ++ GLRO(dl_pagesize) = auxv_values[AT_PAGESZ]; ++ __libc_enable_secure = auxv_values[AT_SECURE]; ++ GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM]; ++ GLRO(dl_hwcap) = auxv_values[AT_HWCAP]; ++ GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2]; ++ GLRO(dl_clktck) = auxv_values[AT_CLKTCK]; ++ GLRO(dl_fpu_control) = auxv_values[AT_FPUCW]; ++ _dl_random = (void *) auxv_values[AT_RANDOM]; ++ GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ]; ++ GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR]; ++#ifdef NEED_DL_SYSINFO ++ if (GLRO(dl_sysinfo_dso) != NULL) ++ GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO]; ++#endif ++ ++ DL_PLATFORM_AUXV ++} +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index 1829dab4f38b560c..80aa9f6f4acb7e3c 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -21,13 +21,12 @@ + #include + #include + #include ++#include + #include + #include + #include +-#include + #include + #include +-#include + #include + #include + #include +@@ -63,24 +62,24 @@ void *_dl_random attribute_relro = NULL; + # define DL_STACK_END(cookie) ((void *) (cookie)) + #endif + +-ElfW(Addr) +-_dl_sysdep_start (void **start_argptr, +- void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, +- ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) ++/* Arguments passed to dl_main. */ ++struct dl_main_arguments + { +- const ElfW(Phdr) *phdr = NULL; +- ElfW(Word) phnum = 0; ++ const ElfW(Phdr) *phdr; ++ ElfW(Word) phnum; + ElfW(Addr) user_entry; +- ElfW(auxv_t) *av; +-#ifdef NEED_DL_SYSINFO +- uintptr_t new_sysinfo = 0; +-#endif ++}; + +- __libc_stack_end = DL_STACK_END (start_argptr); ++/* Separate function, so that dl_main can be called without the large ++ array on the stack. */ ++static void ++_dl_sysdep_parse_arguments (void **start_argptr, ++ struct dl_main_arguments *args) ++{ + _dl_argc = (intptr_t) *start_argptr; + _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */ + _environ = _dl_argv + _dl_argc + 1; +- for (char **tmp = _environ + 1; ; ++tmp) ++ for (char **tmp = _environ; ; ++tmp) + if (*tmp == NULL) + { + /* Another necessary aliasing violation. */ +@@ -88,74 +87,25 @@ _dl_sysdep_start (void **start_argptr, + break; + } + +- user_entry = (ElfW(Addr)) ENTRY_POINT; +- GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ ++ dl_parse_auxv_t auxv_values = { 0, }; ++ _dl_parse_auxv (GLRO(dl_auxv), auxv_values); + +- /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ +- _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), +- "CONSTANT_MINSIGSTKSZ is constant"); +- GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; ++ args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR]; ++ args->phnum = auxv_values[AT_PHNUM]; ++ args->user_entry = auxv_values[AT_ENTRY]; ++} + +- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; av++) +- switch (av->a_type) +- { +- case AT_PHDR: +- phdr = (void *) av->a_un.a_val; +- break; +- case AT_PHNUM: +- phnum = av->a_un.a_val; +- break; +- case AT_PAGESZ: +- GLRO(dl_pagesize) = av->a_un.a_val; +- break; +- case AT_ENTRY: +- user_entry = av->a_un.a_val; +- break; +- case AT_SECURE: +- __libc_enable_secure = av->a_un.a_val; +- break; +- case AT_PLATFORM: +- GLRO(dl_platform) = (void *) av->a_un.a_val; +- break; +- case AT_HWCAP: +- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_HWCAP2: +- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_CLKTCK: +- GLRO(dl_clktck) = av->a_un.a_val; +- break; +- case AT_FPUCW: +- GLRO(dl_fpu_control) = av->a_un.a_val; +- break; +-#ifdef NEED_DL_SYSINFO +- case AT_SYSINFO: +- new_sysinfo = av->a_un.a_val; +- break; +-#endif +- case AT_SYSINFO_EHDR: +- GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; +- break; +- case AT_RANDOM: +- _dl_random = (void *) av->a_un.a_val; +- break; +- case AT_MINSIGSTKSZ: +- GLRO(dl_minsigstacksize) = av->a_un.a_val; +- break; +- DL_PLATFORM_AUXV +- } ++ElfW(Addr) ++_dl_sysdep_start (void **start_argptr, ++ void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, ++ ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) ++{ ++ __libc_stack_end = DL_STACK_END (start_argptr); + +- dl_hwcap_check (); ++ struct dl_main_arguments dl_main_args; ++ _dl_sysdep_parse_arguments (start_argptr, &dl_main_args); + +-#ifdef NEED_DL_SYSINFO +- if (new_sysinfo != 0) +- { +- /* Only set the sysinfo value if we also have the vsyscall DSO. */ +- if (GLRO(dl_sysinfo_dso) != 0) +- GLRO(dl_sysinfo) = new_sysinfo; +- } +-#endif ++ dl_hwcap_check (); + + __tunables_init (_environ); + +@@ -187,8 +137,9 @@ _dl_sysdep_start (void **start_argptr, + if (__builtin_expect (__libc_enable_secure, 0)) + __libc_check_standard_fds (); + +- (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); +- return user_entry; ++ (*dl_main) (dl_main_args.phdr, dl_main_args.phnum, ++ &dl_main_args.user_entry, GLRO(dl_auxv)); ++ return dl_main_args.user_entry; + } + + void +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +index 36ba0f3e9e45f3e2..7f35fb531ba22098 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +@@ -16,15 +16,5 @@ + License along with the GNU C Library; if not, see + . */ + +-#include +- +-#if IS_IN (libc) && !defined SHARED +-int GLRO(dl_cache_line_size); +-#endif +- +-/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it +- to dl_cache_line_size. */ +-#define DL_PLATFORM_AUXV \ +- case AT_DCACHEBSIZE: \ +- GLRO(dl_cache_line_size) = av->a_un.a_val; \ +- break; ++#define DL_PLATFORM_AUXV \ ++ GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE]; +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-support.c b/sysdeps/unix/sysv/linux/powerpc/dl-support.c +new file mode 100644 +index 0000000000000000..abe68a704946b90f +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/powerpc/dl-support.c +@@ -0,0 +1,4 @@ ++#include ++ ++/* Populated from the auxiliary vector. */ ++int _dl_cache_line_size; diff --git a/glibc-upstream-2.34-242.patch b/glibc-upstream-2.34-242.patch new file mode 100644 index 0000000..a120d5c --- /dev/null +++ b/glibc-upstream-2.34-242.patch @@ -0,0 +1,399 @@ +commit 1cc4ddfeebdb68e0b6de7e4878eef94d3438706f +Author: Florian Weimer +Date: Fri Feb 11 16:01:19 2022 +0100 + + Revert "Linux: Consolidate auxiliary vector parsing" + + This reverts commit 8c8510ab2790039e58995ef3a22309582413d3ff. The + revert is not perfect because the commit included a bug fix for + _dl_sysdep_start with an empty argv, introduced in commit + 2d47fa68628e831a692cba8fc9050cef435afc5e ("Linux: Remove + DL_FIND_ARG_COMPONENTS"), and this bug fix is kept. + + The revert is necessary because the reverted commit introduced an + early memset call on aarch64, which leads to crash due to lack of TCB + initialization. + + (cherry picked from commit d96d2995c1121d3310102afda2deb1f35761b5e6) + +diff --git a/elf/dl-support.c b/elf/dl-support.c +index 40ef07521336857d..f29dc965f4d10648 100644 +--- a/elf/dl-support.c ++++ b/elf/dl-support.c +@@ -241,21 +241,93 @@ __rtld_lock_define_initialized_recursive (, _dl_load_tls_lock) + + + #ifdef HAVE_AUX_VECTOR +-#include +- + int _dl_clktck; + + void + _dl_aux_init (ElfW(auxv_t) *av) + { ++ int seen = 0; ++ uid_t uid = 0; ++ gid_t gid = 0; ++ + #ifdef NEED_DL_SYSINFO + /* NB: Avoid RELATIVE relocation in static PIE. */ + GL(dl_sysinfo) = DL_SYSINFO_DEFAULT; + #endif + + _dl_auxv = av; +- dl_parse_auxv_t auxv_values = { 0, }; +- _dl_parse_auxv (av, auxv_values); ++ for (; av->a_type != AT_NULL; ++av) ++ switch (av->a_type) ++ { ++ case AT_PAGESZ: ++ if (av->a_un.a_val != 0) ++ GLRO(dl_pagesize) = av->a_un.a_val; ++ break; ++ case AT_CLKTCK: ++ GLRO(dl_clktck) = av->a_un.a_val; ++ break; ++ case AT_PHDR: ++ GL(dl_phdr) = (const void *) av->a_un.a_val; ++ break; ++ case AT_PHNUM: ++ GL(dl_phnum) = av->a_un.a_val; ++ break; ++ case AT_PLATFORM: ++ GLRO(dl_platform) = (void *) av->a_un.a_val; ++ break; ++ case AT_HWCAP: ++ GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_HWCAP2: ++ GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_FPUCW: ++ GLRO(dl_fpu_control) = av->a_un.a_val; ++ break; ++#ifdef NEED_DL_SYSINFO ++ case AT_SYSINFO: ++ GL(dl_sysinfo) = av->a_un.a_val; ++ break; ++#endif ++#ifdef NEED_DL_SYSINFO_DSO ++ case AT_SYSINFO_EHDR: ++ GL(dl_sysinfo_dso) = (void *) av->a_un.a_val; ++ break; ++#endif ++ case AT_UID: ++ uid ^= av->a_un.a_val; ++ seen |= 1; ++ break; ++ case AT_EUID: ++ uid ^= av->a_un.a_val; ++ seen |= 2; ++ break; ++ case AT_GID: ++ gid ^= av->a_un.a_val; ++ seen |= 4; ++ break; ++ case AT_EGID: ++ gid ^= av->a_un.a_val; ++ seen |= 8; ++ break; ++ case AT_SECURE: ++ seen = -1; ++ __libc_enable_secure = av->a_un.a_val; ++ __libc_enable_secure_decided = 1; ++ break; ++ case AT_RANDOM: ++ _dl_random = (void *) av->a_un.a_val; ++ break; ++ case AT_MINSIGSTKSZ: ++ _dl_minsigstacksize = av->a_un.a_val; ++ break; ++ DL_PLATFORM_AUXV ++ } ++ if (seen == 0xf) ++ { ++ __libc_enable_secure = uid != 0 || gid != 0; ++ __libc_enable_secure_decided = 1; ++ } + } + #endif + +diff --git a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +index 8c99e776a0af9cef..1aa9dca80d189ebe 100644 +--- a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +@@ -20,8 +20,16 @@ + + extern long __libc_alpha_cache_shape[4]; + +-#define DL_PLATFORM_AUXV \ +- __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \ +- __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \ +- __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE]; \ +- __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE]; ++#define DL_PLATFORM_AUXV \ ++ case AT_L1I_CACHESHAPE: \ ++ __libc_alpha_cache_shape[0] = av->a_un.a_val; \ ++ break; \ ++ case AT_L1D_CACHESHAPE: \ ++ __libc_alpha_cache_shape[1] = av->a_un.a_val; \ ++ break; \ ++ case AT_L2_CACHESHAPE: \ ++ __libc_alpha_cache_shape[2] = av->a_un.a_val; \ ++ break; \ ++ case AT_L3_CACHESHAPE: \ ++ __libc_alpha_cache_shape[3] = av->a_un.a_val; \ ++ break; +diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +deleted file mode 100644 +index b3d82f69946d6d2c..0000000000000000 +--- a/sysdeps/unix/sysv/linux/dl-parse_auxv.h ++++ /dev/null +@@ -1,61 +0,0 @@ +-/* Parse the Linux auxiliary vector. +- Copyright (C) 1995-2022 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include +-#include +-#include +-#include +- +-typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1]; +- +-/* Copy the auxiliary vector into AUX_VALUES and set up GLRO +- variables. */ +-static inline +-void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) +-{ +- auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT; +- auxv_values[AT_PAGESZ] = EXEC_PAGESIZE; +- auxv_values[AT_FPUCW] = _FPU_DEFAULT; +- +- /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ +- _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), +- "CONSTANT_MINSIGSTKSZ is constant"); +- auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ; +- +- for (; av->a_type != AT_NULL; av++) +- if (av->a_type <= AT_MINSIGSTKSZ) +- auxv_values[av->a_type] = av->a_un.a_val; +- +- GLRO(dl_pagesize) = auxv_values[AT_PAGESZ]; +- __libc_enable_secure = auxv_values[AT_SECURE]; +- GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM]; +- GLRO(dl_hwcap) = auxv_values[AT_HWCAP]; +- GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2]; +- GLRO(dl_clktck) = auxv_values[AT_CLKTCK]; +- GLRO(dl_fpu_control) = auxv_values[AT_FPUCW]; +- _dl_random = (void *) auxv_values[AT_RANDOM]; +- GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ]; +- GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR]; +-#ifdef NEED_DL_SYSINFO +- if (GLRO(dl_sysinfo_dso) != NULL) +- GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO]; +-#endif +- +- DL_PLATFORM_AUXV +-} +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index 80aa9f6f4acb7e3c..facaaba3b9d091b3 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -21,12 +21,13 @@ + #include + #include + #include +-#include + #include + #include + #include ++#include + #include + #include ++#include + #include + #include + #include +@@ -62,20 +63,20 @@ void *_dl_random attribute_relro = NULL; + # define DL_STACK_END(cookie) ((void *) (cookie)) + #endif + +-/* Arguments passed to dl_main. */ +-struct dl_main_arguments ++ElfW(Addr) ++_dl_sysdep_start (void **start_argptr, ++ void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, ++ ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) + { +- const ElfW(Phdr) *phdr; +- ElfW(Word) phnum; ++ const ElfW(Phdr) *phdr = NULL; ++ ElfW(Word) phnum = 0; + ElfW(Addr) user_entry; +-}; ++ ElfW(auxv_t) *av; ++#ifdef NEED_DL_SYSINFO ++ uintptr_t new_sysinfo = 0; ++#endif + +-/* Separate function, so that dl_main can be called without the large +- array on the stack. */ +-static void +-_dl_sysdep_parse_arguments (void **start_argptr, +- struct dl_main_arguments *args) +-{ ++ __libc_stack_end = DL_STACK_END (start_argptr); + _dl_argc = (intptr_t) *start_argptr; + _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */ + _environ = _dl_argv + _dl_argc + 1; +@@ -87,26 +88,75 @@ _dl_sysdep_parse_arguments (void **start_argptr, + break; + } + +- dl_parse_auxv_t auxv_values = { 0, }; +- _dl_parse_auxv (GLRO(dl_auxv), auxv_values); ++ user_entry = (ElfW(Addr)) ENTRY_POINT; ++ GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ + +- args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR]; +- args->phnum = auxv_values[AT_PHNUM]; +- args->user_entry = auxv_values[AT_ENTRY]; +-} ++ /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ ++ _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), ++ "CONSTANT_MINSIGSTKSZ is constant"); ++ GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; + +-ElfW(Addr) +-_dl_sysdep_start (void **start_argptr, +- void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, +- ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) +-{ +- __libc_stack_end = DL_STACK_END (start_argptr); +- +- struct dl_main_arguments dl_main_args; +- _dl_sysdep_parse_arguments (start_argptr, &dl_main_args); ++ for (av = GLRO(dl_auxv); av->a_type != AT_NULL; av++) ++ switch (av->a_type) ++ { ++ case AT_PHDR: ++ phdr = (void *) av->a_un.a_val; ++ break; ++ case AT_PHNUM: ++ phnum = av->a_un.a_val; ++ break; ++ case AT_PAGESZ: ++ GLRO(dl_pagesize) = av->a_un.a_val; ++ break; ++ case AT_ENTRY: ++ user_entry = av->a_un.a_val; ++ break; ++ case AT_SECURE: ++ __libc_enable_secure = av->a_un.a_val; ++ break; ++ case AT_PLATFORM: ++ GLRO(dl_platform) = (void *) av->a_un.a_val; ++ break; ++ case AT_HWCAP: ++ GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_HWCAP2: ++ GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; ++ break; ++ case AT_CLKTCK: ++ GLRO(dl_clktck) = av->a_un.a_val; ++ break; ++ case AT_FPUCW: ++ GLRO(dl_fpu_control) = av->a_un.a_val; ++ break; ++#ifdef NEED_DL_SYSINFO ++ case AT_SYSINFO: ++ new_sysinfo = av->a_un.a_val; ++ break; ++#endif ++ case AT_SYSINFO_EHDR: ++ GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; ++ break; ++ case AT_RANDOM: ++ _dl_random = (void *) av->a_un.a_val; ++ break; ++ case AT_MINSIGSTKSZ: ++ GLRO(dl_minsigstacksize) = av->a_un.a_val; ++ break; ++ DL_PLATFORM_AUXV ++ } + + dl_hwcap_check (); + ++#ifdef NEED_DL_SYSINFO ++ if (new_sysinfo != 0) ++ { ++ /* Only set the sysinfo value if we also have the vsyscall DSO. */ ++ if (GLRO(dl_sysinfo_dso) != 0) ++ GLRO(dl_sysinfo) = new_sysinfo; ++ } ++#endif ++ + __tunables_init (_environ); + + /* Initialize DSO sorting algorithm after tunables. */ +@@ -137,9 +187,8 @@ _dl_sysdep_start (void **start_argptr, + if (__builtin_expect (__libc_enable_secure, 0)) + __libc_check_standard_fds (); + +- (*dl_main) (dl_main_args.phdr, dl_main_args.phnum, +- &dl_main_args.user_entry, GLRO(dl_auxv)); +- return dl_main_args.user_entry; ++ (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); ++ return user_entry; + } + + void +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +index 7f35fb531ba22098..36ba0f3e9e45f3e2 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +@@ -16,5 +16,15 @@ + License along with the GNU C Library; if not, see + . */ + +-#define DL_PLATFORM_AUXV \ +- GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE]; ++#include ++ ++#if IS_IN (libc) && !defined SHARED ++int GLRO(dl_cache_line_size); ++#endif ++ ++/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it ++ to dl_cache_line_size. */ ++#define DL_PLATFORM_AUXV \ ++ case AT_DCACHEBSIZE: \ ++ GLRO(dl_cache_line_size) = av->a_un.a_val; \ ++ break; +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-support.c b/sysdeps/unix/sysv/linux/powerpc/dl-support.c +deleted file mode 100644 +index abe68a704946b90f..0000000000000000 +--- a/sysdeps/unix/sysv/linux/powerpc/dl-support.c ++++ /dev/null +@@ -1,4 +0,0 @@ +-#include +- +-/* Populated from the auxiliary vector. */ +-int _dl_cache_line_size; diff --git a/glibc-upstream-2.34-243.patch b/glibc-upstream-2.34-243.patch new file mode 100644 index 0000000..a9ae285 --- /dev/null +++ b/glibc-upstream-2.34-243.patch @@ -0,0 +1,36 @@ +commit 28bdb03b1b2bdb2d2dc62a9beeaa7d9bd2b10679 +Author: Florian Weimer +Date: Fri Feb 11 19:03:04 2022 +0100 + + Linux: Include in dl-sysdep.c only for SHARED + + Otherwise, on POWER ends up being included twice, + once in dl-sysdep.c, once in dl-support.c. That leads to a linker + failure due to multiple definitions of _dl_cache_line_size. + + Fixes commit d96d2995c1121d3310102afda2deb1f35761b5e6 + ("Revert "Linux: Consolidate auxiliary vector parsing"). + + (cherry picked from commit 098c795e85fbd05c5ef59c2d0ce59529331bea27) + +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index facaaba3b9d091b3..3487976b06ad7f58 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -18,7 +18,6 @@ + + #include <_itoa.h> + #include +-#include + #include + #include + #include +@@ -46,6 +45,8 @@ + #include + + #ifdef SHARED ++# include ++ + extern char **_environ attribute_hidden; + extern char _end[] attribute_hidden; + diff --git a/glibc-upstream-2.34-244.patch b/glibc-upstream-2.34-244.patch new file mode 100644 index 0000000..0a4325f --- /dev/null +++ b/glibc-upstream-2.34-244.patch @@ -0,0 +1,439 @@ +commit ff900fad89df7fa12750c018993a12cc02474646 +Author: Florian Weimer +Date: Mon Feb 28 11:50:41 2022 +0100 + + Linux: Consolidate auxiliary vector parsing (redo) + + And optimize it slightly. + + This is commit 8c8510ab2790039e58995ef3a22309582413d3ff revised. + + In _dl_aux_init in elf/dl-support.c, use an explicit loop + and -fno-tree-loop-distribute-patterns to avoid memset. + + Reviewed-by: Szabolcs Nagy + (cherry picked from commit 73fc4e28b9464f0e13edc719a5372839970e7ddb) + +diff --git a/elf/Makefile b/elf/Makefile +index c89a6a58690646ee..6423ebbdd7708a14 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -148,6 +148,11 @@ ifeq (yes,$(have-loop-to-function)) + CFLAGS-rtld.c += -fno-tree-loop-distribute-patterns + endif + ++ifeq (yes,$(have-loop-to-function)) ++# Likewise, during static library startup, memset is not yet available. ++CFLAGS-dl-support.c = -fno-tree-loop-distribute-patterns ++endif ++ + # Compile rtld itself without stack protection. + # Also compile all routines in the static library that are elided from + # the shared libc because they are in libc.a in the same way. +diff --git a/elf/dl-support.c b/elf/dl-support.c +index f29dc965f4d10648..a2e45e7b14e3a6b9 100644 +--- a/elf/dl-support.c ++++ b/elf/dl-support.c +@@ -43,6 +43,7 @@ + #include + #include + #include ++#include + + extern char *__progname; + char **_dl_argv = &__progname; /* This is checked for some error messages. */ +@@ -241,93 +242,25 @@ __rtld_lock_define_initialized_recursive (, _dl_load_tls_lock) + + + #ifdef HAVE_AUX_VECTOR ++#include ++ + int _dl_clktck; + + void + _dl_aux_init (ElfW(auxv_t) *av) + { +- int seen = 0; +- uid_t uid = 0; +- gid_t gid = 0; +- + #ifdef NEED_DL_SYSINFO + /* NB: Avoid RELATIVE relocation in static PIE. */ + GL(dl_sysinfo) = DL_SYSINFO_DEFAULT; + #endif + + _dl_auxv = av; +- for (; av->a_type != AT_NULL; ++av) +- switch (av->a_type) +- { +- case AT_PAGESZ: +- if (av->a_un.a_val != 0) +- GLRO(dl_pagesize) = av->a_un.a_val; +- break; +- case AT_CLKTCK: +- GLRO(dl_clktck) = av->a_un.a_val; +- break; +- case AT_PHDR: +- GL(dl_phdr) = (const void *) av->a_un.a_val; +- break; +- case AT_PHNUM: +- GL(dl_phnum) = av->a_un.a_val; +- break; +- case AT_PLATFORM: +- GLRO(dl_platform) = (void *) av->a_un.a_val; +- break; +- case AT_HWCAP: +- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_HWCAP2: +- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_FPUCW: +- GLRO(dl_fpu_control) = av->a_un.a_val; +- break; +-#ifdef NEED_DL_SYSINFO +- case AT_SYSINFO: +- GL(dl_sysinfo) = av->a_un.a_val; +- break; +-#endif +-#ifdef NEED_DL_SYSINFO_DSO +- case AT_SYSINFO_EHDR: +- GL(dl_sysinfo_dso) = (void *) av->a_un.a_val; +- break; +-#endif +- case AT_UID: +- uid ^= av->a_un.a_val; +- seen |= 1; +- break; +- case AT_EUID: +- uid ^= av->a_un.a_val; +- seen |= 2; +- break; +- case AT_GID: +- gid ^= av->a_un.a_val; +- seen |= 4; +- break; +- case AT_EGID: +- gid ^= av->a_un.a_val; +- seen |= 8; +- break; +- case AT_SECURE: +- seen = -1; +- __libc_enable_secure = av->a_un.a_val; +- __libc_enable_secure_decided = 1; +- break; +- case AT_RANDOM: +- _dl_random = (void *) av->a_un.a_val; +- break; +- case AT_MINSIGSTKSZ: +- _dl_minsigstacksize = av->a_un.a_val; +- break; +- DL_PLATFORM_AUXV +- } +- if (seen == 0xf) +- { +- __libc_enable_secure = uid != 0 || gid != 0; +- __libc_enable_secure_decided = 1; +- } ++ dl_parse_auxv_t auxv_values; ++ /* Use an explicit initialization loop here because memset may not ++ be available yet. */ ++ for (int i = 0; i < array_length (auxv_values); ++i) ++ auxv_values[i] = 0; ++ _dl_parse_auxv (av, auxv_values); + } + #endif + +diff --git a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +index 1aa9dca80d189ebe..8c99e776a0af9cef 100644 +--- a/sysdeps/unix/sysv/linux/alpha/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/alpha/dl-auxv.h +@@ -20,16 +20,8 @@ + + extern long __libc_alpha_cache_shape[4]; + +-#define DL_PLATFORM_AUXV \ +- case AT_L1I_CACHESHAPE: \ +- __libc_alpha_cache_shape[0] = av->a_un.a_val; \ +- break; \ +- case AT_L1D_CACHESHAPE: \ +- __libc_alpha_cache_shape[1] = av->a_un.a_val; \ +- break; \ +- case AT_L2_CACHESHAPE: \ +- __libc_alpha_cache_shape[2] = av->a_un.a_val; \ +- break; \ +- case AT_L3_CACHESHAPE: \ +- __libc_alpha_cache_shape[3] = av->a_un.a_val; \ +- break; ++#define DL_PLATFORM_AUXV \ ++ __libc_alpha_cache_shape[0] = auxv_values[AT_L1I_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[1] = auxv_values[AT_L1D_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[2] = auxv_values[AT_L2_CACHESHAPE]; \ ++ __libc_alpha_cache_shape[3] = auxv_values[AT_L3_CACHESHAPE]; +diff --git a/sysdeps/unix/sysv/linux/dl-parse_auxv.h b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +new file mode 100644 +index 0000000000000000..bf9374371eb217fc +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/dl-parse_auxv.h +@@ -0,0 +1,61 @@ ++/* Parse the Linux auxiliary vector. ++ Copyright (C) 1995-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++typedef ElfW(Addr) dl_parse_auxv_t[AT_MINSIGSTKSZ + 1]; ++ ++/* Copy the auxiliary vector into AUXV_VALUES and set up GLRO ++ variables. */ ++static inline ++void _dl_parse_auxv (ElfW(auxv_t) *av, dl_parse_auxv_t auxv_values) ++{ ++ auxv_values[AT_ENTRY] = (ElfW(Addr)) ENTRY_POINT; ++ auxv_values[AT_PAGESZ] = EXEC_PAGESIZE; ++ auxv_values[AT_FPUCW] = _FPU_DEFAULT; ++ ++ /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ ++ _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), ++ "CONSTANT_MINSIGSTKSZ is constant"); ++ auxv_values[AT_MINSIGSTKSZ] = CONSTANT_MINSIGSTKSZ; ++ ++ for (; av->a_type != AT_NULL; av++) ++ if (av->a_type <= AT_MINSIGSTKSZ) ++ auxv_values[av->a_type] = av->a_un.a_val; ++ ++ GLRO(dl_pagesize) = auxv_values[AT_PAGESZ]; ++ __libc_enable_secure = auxv_values[AT_SECURE]; ++ GLRO(dl_platform) = (void *) auxv_values[AT_PLATFORM]; ++ GLRO(dl_hwcap) = auxv_values[AT_HWCAP]; ++ GLRO(dl_hwcap2) = auxv_values[AT_HWCAP2]; ++ GLRO(dl_clktck) = auxv_values[AT_CLKTCK]; ++ GLRO(dl_fpu_control) = auxv_values[AT_FPUCW]; ++ _dl_random = (void *) auxv_values[AT_RANDOM]; ++ GLRO(dl_minsigstacksize) = auxv_values[AT_MINSIGSTKSZ]; ++ GLRO(dl_sysinfo_dso) = (void *) auxv_values[AT_SYSINFO_EHDR]; ++#ifdef NEED_DL_SYSINFO ++ if (GLRO(dl_sysinfo_dso) != NULL) ++ GLRO(dl_sysinfo) = auxv_values[AT_SYSINFO]; ++#endif ++ ++ DL_PLATFORM_AUXV ++} +diff --git a/sysdeps/unix/sysv/linux/dl-sysdep.c b/sysdeps/unix/sysv/linux/dl-sysdep.c +index 3487976b06ad7f58..56db828fc6985de6 100644 +--- a/sysdeps/unix/sysv/linux/dl-sysdep.c ++++ b/sysdeps/unix/sysv/linux/dl-sysdep.c +@@ -18,15 +18,14 @@ + + #include <_itoa.h> + #include +-#include ++#include + #include ++#include + #include + #include + #include +-#include + #include + #include +-#include + #include + #include + #include +@@ -43,10 +42,9 @@ + #include + + #include ++#include + + #ifdef SHARED +-# include +- + extern char **_environ attribute_hidden; + extern char _end[] attribute_hidden; + +@@ -64,20 +62,20 @@ void *_dl_random attribute_relro = NULL; + # define DL_STACK_END(cookie) ((void *) (cookie)) + #endif + +-ElfW(Addr) +-_dl_sysdep_start (void **start_argptr, +- void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, +- ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) ++/* Arguments passed to dl_main. */ ++struct dl_main_arguments + { +- const ElfW(Phdr) *phdr = NULL; +- ElfW(Word) phnum = 0; ++ const ElfW(Phdr) *phdr; ++ ElfW(Word) phnum; + ElfW(Addr) user_entry; +- ElfW(auxv_t) *av; +-#ifdef NEED_DL_SYSINFO +- uintptr_t new_sysinfo = 0; +-#endif ++}; + +- __libc_stack_end = DL_STACK_END (start_argptr); ++/* Separate function, so that dl_main can be called without the large ++ array on the stack. */ ++static void ++_dl_sysdep_parse_arguments (void **start_argptr, ++ struct dl_main_arguments *args) ++{ + _dl_argc = (intptr_t) *start_argptr; + _dl_argv = (char **) (start_argptr + 1); /* Necessary aliasing violation. */ + _environ = _dl_argv + _dl_argc + 1; +@@ -89,74 +87,25 @@ _dl_sysdep_start (void **start_argptr, + break; + } + +- user_entry = (ElfW(Addr)) ENTRY_POINT; +- GLRO(dl_platform) = NULL; /* Default to nothing known about the platform. */ ++ dl_parse_auxv_t auxv_values = { 0, }; ++ _dl_parse_auxv (GLRO(dl_auxv), auxv_values); + +- /* NB: Default to a constant CONSTANT_MINSIGSTKSZ. */ +- _Static_assert (__builtin_constant_p (CONSTANT_MINSIGSTKSZ), +- "CONSTANT_MINSIGSTKSZ is constant"); +- GLRO(dl_minsigstacksize) = CONSTANT_MINSIGSTKSZ; ++ args->phdr = (const ElfW(Phdr) *) auxv_values[AT_PHDR]; ++ args->phnum = auxv_values[AT_PHNUM]; ++ args->user_entry = auxv_values[AT_ENTRY]; ++} + +- for (av = GLRO(dl_auxv); av->a_type != AT_NULL; av++) +- switch (av->a_type) +- { +- case AT_PHDR: +- phdr = (void *) av->a_un.a_val; +- break; +- case AT_PHNUM: +- phnum = av->a_un.a_val; +- break; +- case AT_PAGESZ: +- GLRO(dl_pagesize) = av->a_un.a_val; +- break; +- case AT_ENTRY: +- user_entry = av->a_un.a_val; +- break; +- case AT_SECURE: +- __libc_enable_secure = av->a_un.a_val; +- break; +- case AT_PLATFORM: +- GLRO(dl_platform) = (void *) av->a_un.a_val; +- break; +- case AT_HWCAP: +- GLRO(dl_hwcap) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_HWCAP2: +- GLRO(dl_hwcap2) = (unsigned long int) av->a_un.a_val; +- break; +- case AT_CLKTCK: +- GLRO(dl_clktck) = av->a_un.a_val; +- break; +- case AT_FPUCW: +- GLRO(dl_fpu_control) = av->a_un.a_val; +- break; +-#ifdef NEED_DL_SYSINFO +- case AT_SYSINFO: +- new_sysinfo = av->a_un.a_val; +- break; +-#endif +- case AT_SYSINFO_EHDR: +- GLRO(dl_sysinfo_dso) = (void *) av->a_un.a_val; +- break; +- case AT_RANDOM: +- _dl_random = (void *) av->a_un.a_val; +- break; +- case AT_MINSIGSTKSZ: +- GLRO(dl_minsigstacksize) = av->a_un.a_val; +- break; +- DL_PLATFORM_AUXV +- } ++ElfW(Addr) ++_dl_sysdep_start (void **start_argptr, ++ void (*dl_main) (const ElfW(Phdr) *phdr, ElfW(Word) phnum, ++ ElfW(Addr) *user_entry, ElfW(auxv_t) *auxv)) ++{ ++ __libc_stack_end = DL_STACK_END (start_argptr); + +- dl_hwcap_check (); ++ struct dl_main_arguments dl_main_args; ++ _dl_sysdep_parse_arguments (start_argptr, &dl_main_args); + +-#ifdef NEED_DL_SYSINFO +- if (new_sysinfo != 0) +- { +- /* Only set the sysinfo value if we also have the vsyscall DSO. */ +- if (GLRO(dl_sysinfo_dso) != 0) +- GLRO(dl_sysinfo) = new_sysinfo; +- } +-#endif ++ dl_hwcap_check (); + + __tunables_init (_environ); + +@@ -188,8 +137,9 @@ _dl_sysdep_start (void **start_argptr, + if (__builtin_expect (__libc_enable_secure, 0)) + __libc_check_standard_fds (); + +- (*dl_main) (phdr, phnum, &user_entry, GLRO(dl_auxv)); +- return user_entry; ++ (*dl_main) (dl_main_args.phdr, dl_main_args.phnum, ++ &dl_main_args.user_entry, GLRO(dl_auxv)); ++ return dl_main_args.user_entry; + } + + void +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +index 36ba0f3e9e45f3e2..7f35fb531ba22098 100644 +--- a/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h ++++ b/sysdeps/unix/sysv/linux/powerpc/dl-auxv.h +@@ -16,15 +16,5 @@ + License along with the GNU C Library; if not, see + . */ + +-#include +- +-#if IS_IN (libc) && !defined SHARED +-int GLRO(dl_cache_line_size); +-#endif +- +-/* Scan the Aux Vector for the "Data Cache Block Size" entry and assign it +- to dl_cache_line_size. */ +-#define DL_PLATFORM_AUXV \ +- case AT_DCACHEBSIZE: \ +- GLRO(dl_cache_line_size) = av->a_un.a_val; \ +- break; ++#define DL_PLATFORM_AUXV \ ++ GLRO(dl_cache_line_size) = auxv_values[AT_DCACHEBSIZE]; +diff --git a/sysdeps/unix/sysv/linux/powerpc/dl-support.c b/sysdeps/unix/sysv/linux/powerpc/dl-support.c +new file mode 100644 +index 0000000000000000..abe68a704946b90f +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/powerpc/dl-support.c +@@ -0,0 +1,4 @@ ++#include ++ ++/* Populated from the auxiliary vector. */ ++int _dl_cache_line_size; diff --git a/glibc-upstream-2.34-245.patch b/glibc-upstream-2.34-245.patch new file mode 100644 index 0000000..5ba00fd --- /dev/null +++ b/glibc-upstream-2.34-245.patch @@ -0,0 +1,197 @@ +commit be9240c84c67de44959905a829141576965a0588 +Author: Fangrui Song +Date: Tue Apr 19 15:52:27 2022 -0700 + + elf: Remove __libc_init_secure + + After 73fc4e28b9464f0e13edc719a5372839970e7ddb, + __libc_enable_secure_decided is always 0 and a statically linked + executable may overwrite __libc_enable_secure without considering + AT_SECURE. + + The __libc_enable_secure has been correctly initialized in _dl_aux_init, + so just remove __libc_enable_secure_decided and __libc_init_secure. + This allows us to remove some startup_get*id functions from + 22b79ed7f413cd980a7af0cf258da5bf82b6d5e5. + + Reviewed-by: Florian Weimer + (cherry picked from commit 3e9acce8c50883b6cd8a3fb653363d9fa21e1608) + +diff --git a/csu/libc-start.c b/csu/libc-start.c +index d01e57ea59ceb880..a2fc2f6f9665a48f 100644 +--- a/csu/libc-start.c ++++ b/csu/libc-start.c +@@ -285,9 +285,6 @@ LIBC_START_MAIN (int (*main) (int, char **, char ** MAIN_AUXVEC_DECL), + } + } + +- /* Initialize very early so that tunables can use it. */ +- __libc_init_secure (); +- + __tunables_init (__environ); + + ARCH_INIT_CPU_FEATURES (); +diff --git a/elf/enbl-secure.c b/elf/enbl-secure.c +index 9e47526bd3e444e1..1208610bd0670c74 100644 +--- a/elf/enbl-secure.c ++++ b/elf/enbl-secure.c +@@ -26,15 +26,5 @@ + #include + #include + +-/* If nonzero __libc_enable_secure is already set. */ +-int __libc_enable_secure_decided; + /* Safest assumption, if somehow the initializer isn't run. */ + int __libc_enable_secure = 1; +- +-void +-__libc_init_secure (void) +-{ +- if (__libc_enable_secure_decided == 0) +- __libc_enable_secure = (startup_geteuid () != startup_getuid () +- || startup_getegid () != startup_getgid ()); +-} +diff --git a/include/libc-internal.h b/include/libc-internal.h +index 749dfb919ce4a62d..44fcb6bdf8751c1c 100644 +--- a/include/libc-internal.h ++++ b/include/libc-internal.h +@@ -21,9 +21,6 @@ + + #include + +-/* Initialize the `__libc_enable_secure' flag. */ +-extern void __libc_init_secure (void); +- + /* Discover the tick frequency of the machine if something goes wrong, + we return 0, an impossible hertz. */ + extern int __profile_frequency (void); +diff --git a/include/unistd.h b/include/unistd.h +index 7849562c4272e2c9..5824485629793ccb 100644 +--- a/include/unistd.h ++++ b/include/unistd.h +@@ -180,7 +180,6 @@ libc_hidden_proto (__sbrk) + and some functions contained in the C library ignore various + environment variables that normally affect them. */ + extern int __libc_enable_secure attribute_relro; +-extern int __libc_enable_secure_decided; + rtld_hidden_proto (__libc_enable_secure) + + +diff --git a/sysdeps/generic/startup.h b/sysdeps/generic/startup.h +index 04f20cde474cea89..c3be5430bd8bbaa6 100644 +--- a/sysdeps/generic/startup.h ++++ b/sysdeps/generic/startup.h +@@ -23,27 +23,3 @@ + + /* Use macro instead of inline function to avoid including . */ + #define _startup_fatal(message) __libc_fatal ((message)) +- +-static inline uid_t +-startup_getuid (void) +-{ +- return __getuid (); +-} +- +-static inline uid_t +-startup_geteuid (void) +-{ +- return __geteuid (); +-} +- +-static inline gid_t +-startup_getgid (void) +-{ +- return __getgid (); +-} +- +-static inline gid_t +-startup_getegid (void) +-{ +- return __getegid (); +-} +diff --git a/sysdeps/mach/hurd/enbl-secure.c b/sysdeps/mach/hurd/enbl-secure.c +deleted file mode 100644 +index 3e9a6b888d56754b..0000000000000000 +--- a/sysdeps/mach/hurd/enbl-secure.c ++++ /dev/null +@@ -1,30 +0,0 @@ +-/* Define and initialize the `__libc_enable_secure' flag. Hurd version. +- Copyright (C) 1998-2021 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-/* There is no need for this file in the Hurd; it is just a placeholder +- to prevent inclusion of the sysdeps/generic version. +- In the shared library, the `__libc_enable_secure' variable is defined +- by the dynamic linker in dl-sysdep.c and set there. +- In the static library, it is defined in init-first.c and set there. */ +- +-#include +- +-void +-__libc_init_secure (void) +-{ +-} +diff --git a/sysdeps/mach/hurd/i386/init-first.c b/sysdeps/mach/hurd/i386/init-first.c +index a430aae085527163..4dc9017ec8754a1a 100644 +--- a/sysdeps/mach/hurd/i386/init-first.c ++++ b/sysdeps/mach/hurd/i386/init-first.c +@@ -38,10 +38,6 @@ extern void __init_misc (int, char **, char **); + unsigned long int __hurd_threadvar_stack_offset; + unsigned long int __hurd_threadvar_stack_mask; + +-#ifndef SHARED +-int __libc_enable_secure; +-#endif +- + extern int __libc_argc attribute_hidden; + extern char **__libc_argv attribute_hidden; + extern char **_dl_argv; +diff --git a/sysdeps/unix/sysv/linux/i386/startup.h b/sysdeps/unix/sysv/linux/i386/startup.h +index dee7a4f1d3d420be..192c765361c17ed1 100644 +--- a/sysdeps/unix/sysv/linux/i386/startup.h ++++ b/sysdeps/unix/sysv/linux/i386/startup.h +@@ -32,30 +32,6 @@ _startup_fatal (const char *message __attribute__ ((unused))) + ABORT_INSTRUCTION; + __builtin_unreachable (); + } +- +-static inline uid_t +-startup_getuid (void) +-{ +- return (uid_t) INTERNAL_SYSCALL_CALL (getuid32); +-} +- +-static inline uid_t +-startup_geteuid (void) +-{ +- return (uid_t) INTERNAL_SYSCALL_CALL (geteuid32); +-} +- +-static inline gid_t +-startup_getgid (void) +-{ +- return (gid_t) INTERNAL_SYSCALL_CALL (getgid32); +-} +- +-static inline gid_t +-startup_getegid (void) +-{ +- return (gid_t) INTERNAL_SYSCALL_CALL (getegid32); +-} + #else + # include_next + #endif diff --git a/glibc-upstream-2.34-246.patch b/glibc-upstream-2.34-246.patch new file mode 100644 index 0000000..76c7b68 --- /dev/null +++ b/glibc-upstream-2.34-246.patch @@ -0,0 +1,31 @@ +commit 1e7b011f87c653ad109b34e675f64e7a5cc3805a +Author: Florian Weimer +Date: Wed May 4 15:37:21 2022 +0200 + + i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S + + After commit a78e6a10d0b50d0ca80309775980fc99944b1727 + ("i386: Remove broken CAN_USE_REGISTER_ASM_EBP (bug 28771)"), + it is never defined. + + Reviewed-by: H.J. Lu + (cherry picked from commit 6e5c7a1e262961adb52443ab91bd2c9b72316402) + +diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S +index c95f297d6f0217ef..404435f0123b23b3 100644 +--- a/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S ++++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall.S +@@ -18,8 +18,6 @@ + + #include + +-#ifndef OPTIMIZE_FOR_GCC_5 +- + /* %eax, %ecx, %edx and %esi contain the values expected by the kernel. + %edi points to a structure with the values of %ebx, %edi and %ebp. */ + +@@ -50,4 +48,3 @@ ENTRY (__libc_do_syscall) + cfi_restore (ebx) + ret + END (__libc_do_syscall) +-#endif diff --git a/glibc-upstream-2.34-247.patch b/glibc-upstream-2.34-247.patch new file mode 100644 index 0000000..c6b2961 --- /dev/null +++ b/glibc-upstream-2.34-247.patch @@ -0,0 +1,94 @@ +commit 1a5b9d1a231ae788aac3520dab07dc856e404c69 +Author: Florian Weimer +Date: Wed May 4 15:37:21 2022 +0200 + + i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls + + Introduce an int-80h-based version of __libc_do_syscall and use + it if I386_USE_SYSENTER is defined as 0. + + Reviewed-by: H.J. Lu + (cherry picked from commit 60f0f2130d30cfd008ca39743027f1e200592dff) + +diff --git a/sysdeps/unix/sysv/linux/i386/Makefile b/sysdeps/unix/sysv/linux/i386/Makefile +index abd0009d58f06303..e379a2e767d96322 100644 +--- a/sysdeps/unix/sysv/linux/i386/Makefile ++++ b/sysdeps/unix/sysv/linux/i386/Makefile +@@ -14,7 +14,7 @@ install-bin += lddlibc4 + endif + + ifeq ($(subdir),io) +-sysdep_routines += libc-do-syscall ++sysdep_routines += libc-do-syscall libc-do-syscall-int80 + endif + + ifeq ($(subdir),stdlib) +diff --git a/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S +new file mode 100644 +index 0000000000000000..2c472f255734b357 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/i386/libc-do-syscall-int80.S +@@ -0,0 +1,25 @@ ++/* Out-of-line syscall stub for six-argument syscalls from C. For static PIE. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef SHARED ++# define I386_USE_SYSENTER 0 ++# include ++ ++# define __libc_do_syscall __libc_do_syscall_int80 ++# include "libc-do-syscall.S" ++#endif +diff --git a/sysdeps/unix/sysv/linux/i386/sysdep.h b/sysdeps/unix/sysv/linux/i386/sysdep.h +index 39d6a3c13427abb5..4c6358c7fe43fe0b 100644 +--- a/sysdeps/unix/sysv/linux/i386/sysdep.h ++++ b/sysdeps/unix/sysv/linux/i386/sysdep.h +@@ -43,6 +43,15 @@ + # endif + #endif + ++#if !I386_USE_SYSENTER && IS_IN (libc) && !defined SHARED ++/* Inside static libc, we have two versions. For compilation units ++ with !I386_USE_SYSENTER, the vDSO entry mechanism cannot be ++ used. */ ++# define I386_DO_SYSCALL_STRING "__libc_do_syscall_int80" ++#else ++# define I386_DO_SYSCALL_STRING "__libc_do_syscall" ++#endif ++ + #ifdef __ASSEMBLER__ + + /* Linux uses a negative return value to indicate syscall errors, +@@ -302,7 +311,7 @@ struct libc_do_syscall_args + }; \ + asm volatile ( \ + "movl %1, %%eax\n\t" \ +- "call __libc_do_syscall" \ ++ "call " I386_DO_SYSCALL_STRING \ + : "=a" (resultvar) \ + : "i" (__NR_##name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \ + : "memory", "cc") +@@ -316,7 +325,7 @@ struct libc_do_syscall_args + }; \ + asm volatile ( \ + "movl %1, %%eax\n\t" \ +- "call __libc_do_syscall" \ ++ "call " I386_DO_SYSCALL_STRING \ + : "=a" (resultvar) \ + : "a" (name), "c" (arg2), "d" (arg3), "S" (arg4), "D" (&_xv) \ + : "memory", "cc") diff --git a/glibc-upstream-2.34-248.patch b/glibc-upstream-2.34-248.patch new file mode 100644 index 0000000..dda3e73 --- /dev/null +++ b/glibc-upstream-2.34-248.patch @@ -0,0 +1,93 @@ +commit b38c9cdb58061d357cdf9bca4f6967d487becb82 +Author: Florian Weimer +Date: Wed May 4 15:37:21 2022 +0200 + + Linux: Define MMAP_CALL_INTERNAL + + Unlike MMAP_CALL, this avoids a TCB dependency for an errno update + on failure. + + cannot be included as is on several architectures + due to the definition of page_unit, so introduce a separate header + file for the definition of MMAP_CALL and MMAP_CALL_INTERNAL, + . + + Reviewed-by: Stefan Liebler + (cherry picked from commit c1b68685d438373efe64e5f076f4215723004dfb) + +diff --git a/sysdeps/unix/sysv/linux/mmap_call.h b/sysdeps/unix/sysv/linux/mmap_call.h +new file mode 100644 +index 0000000000000000..3547c99e149e5064 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/mmap_call.h +@@ -0,0 +1,22 @@ ++/* Generic definition of MMAP_CALL and MMAP_CALL_INTERNAL. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ ++ INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) ++#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ ++ INTERNAL_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) +diff --git a/sysdeps/unix/sysv/linux/mmap_internal.h b/sysdeps/unix/sysv/linux/mmap_internal.h +index 5ca6976191137f95..989eb0c7c6b57dc1 100644 +--- a/sysdeps/unix/sysv/linux/mmap_internal.h ++++ b/sysdeps/unix/sysv/linux/mmap_internal.h +@@ -40,10 +40,6 @@ static uint64_t page_unit; + /* Do not accept offset not multiple of page size. */ + #define MMAP_OFF_LOW_MASK (MMAP2_PAGE_UNIT - 1) + +-/* An architecture may override this. */ +-#ifndef MMAP_CALL +-# define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ +- INLINE_SYSCALL_CALL (__nr, __addr, __len, __prot, __flags, __fd, __offset) +-#endif ++#include + + #endif /* MMAP_INTERNAL_LINUX_H */ +diff --git a/sysdeps/unix/sysv/linux/s390/mmap_internal.h b/sysdeps/unix/sysv/linux/s390/mmap_call.h +similarity index 78% +rename from sysdeps/unix/sysv/linux/s390/mmap_internal.h +rename to sysdeps/unix/sysv/linux/s390/mmap_call.h +index 46f1c3769d6b586a..bdd30cc83764c2c1 100644 +--- a/sysdeps/unix/sysv/linux/s390/mmap_internal.h ++++ b/sysdeps/unix/sysv/linux/s390/mmap_call.h +@@ -16,9 +16,6 @@ + License along with the GNU C Library; if not, see + . */ + +-#ifndef MMAP_S390_INTERNAL_H +-# define MMAP_S390_INTERNAL_H +- + #define MMAP_CALL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ + ({ \ + long int __args[6] = { (long int) (__addr), (long int) (__len), \ +@@ -26,7 +23,10 @@ + (long int) (__fd), (long int) (__offset) }; \ + INLINE_SYSCALL_CALL (__nr, __args); \ + }) +- +-#include_next +- +-#endif ++#define MMAP_CALL_INTERNAL(__nr, __addr, __len, __prot, __flags, __fd, __offset) \ ++ ({ \ ++ long int __args[6] = { (long int) (__addr), (long int) (__len), \ ++ (long int) (__prot), (long int) (__flags), \ ++ (long int) (__fd), (long int) (__offset) }; \ ++ INTERNAL_SYSCALL_CALL (__nr, __args); \ ++ }) diff --git a/glibc-upstream-2.34-249.patch b/glibc-upstream-2.34-249.patch new file mode 100644 index 0000000..7b48d3f --- /dev/null +++ b/glibc-upstream-2.34-249.patch @@ -0,0 +1,88 @@ +commit b2387bea84560d286613257139aba6787f414594 +Author: Florian Weimer +Date: Mon May 9 18:15:16 2022 +0200 + + ia64: Always define IA64_USE_NEW_STUB as a flag macro + + And keep the previous definition if it exists. This allows + disabling IA64_USE_NEW_STUB while keeping USE_DL_SYSINFO defined. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 18bd9c3d3b1b6a9182698c85354578d1d58e9d64) + +diff --git a/sysdeps/unix/sysv/linux/ia64/brk.c b/sysdeps/unix/sysv/linux/ia64/brk.c +index cf2c5bd667fb4432..61d8fa260eb59d1e 100644 +--- a/sysdeps/unix/sysv/linux/ia64/brk.c ++++ b/sysdeps/unix/sysv/linux/ia64/brk.c +@@ -16,7 +16,6 @@ + License along with the GNU C Library; if not, see + . */ + +-#include +-/* brk is used by statup before TCB is properly set. */ +-#undef USE_DL_SYSINFO ++/* brk is used by startup before TCB is properly set up. */ ++#define IA64_USE_NEW_STUB 0 + #include +diff --git a/sysdeps/unix/sysv/linux/ia64/sysdep.h b/sysdeps/unix/sysv/linux/ia64/sysdep.h +index 7198c192a03b7676..f1c81a66833941cc 100644 +--- a/sysdeps/unix/sysv/linux/ia64/sysdep.h ++++ b/sysdeps/unix/sysv/linux/ia64/sysdep.h +@@ -46,12 +46,15 @@ + #undef SYS_ify + #define SYS_ify(syscall_name) __NR_##syscall_name + +-#if defined USE_DL_SYSINFO \ +- && (IS_IN (libc) \ +- || IS_IN (libpthread) || IS_IN (librt)) +-# define IA64_USE_NEW_STUB +-#else +-# undef IA64_USE_NEW_STUB ++#ifndef IA64_USE_NEW_STUB ++# if defined USE_DL_SYSINFO && IS_IN (libc) ++# define IA64_USE_NEW_STUB 1 ++# else ++# define IA64_USE_NEW_STUB 0 ++# endif ++#endif ++#if IA64_USE_NEW_STUB && !USE_DL_SYSINFO ++# error IA64_USE_NEW_STUB needs USE_DL_SYSINFO + #endif + + #ifdef __ASSEMBLER__ +@@ -103,7 +106,7 @@ + mov r15=num; \ + break __IA64_BREAK_SYSCALL + +-#ifdef IA64_USE_NEW_STUB ++#if IA64_USE_NEW_STUB + # ifdef SHARED + # define DO_CALL(num) \ + .prologue; \ +@@ -187,7 +190,7 @@ + (non-negative) errno on error or the return value on success. + */ + +-#ifdef IA64_USE_NEW_STUB ++#if IA64_USE_NEW_STUB + + # define INTERNAL_SYSCALL_NCS(name, nr, args...) \ + ({ \ +@@ -279,7 +282,7 @@ + #define ASM_OUTARGS_5 ASM_OUTARGS_4, "=r" (_out4) + #define ASM_OUTARGS_6 ASM_OUTARGS_5, "=r" (_out5) + +-#ifdef IA64_USE_NEW_STUB ++#if IA64_USE_NEW_STUB + #define ASM_ARGS_0 + #define ASM_ARGS_1 ASM_ARGS_0, "4" (_out0) + #define ASM_ARGS_2 ASM_ARGS_1, "5" (_out1) +@@ -315,7 +318,7 @@ + /* Branch registers. */ \ + "b6" + +-#ifdef IA64_USE_NEW_STUB ++#if IA64_USE_NEW_STUB + # define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON + #else + # define ASM_CLOBBERS_6 ASM_CLOBBERS_6_COMMON , "b7" diff --git a/glibc-upstream-2.34-250.patch b/glibc-upstream-2.34-250.patch new file mode 100644 index 0000000..f552acc --- /dev/null +++ b/glibc-upstream-2.34-250.patch @@ -0,0 +1,121 @@ +commit e7ca2a475cf2e7ffc987b8d08e1a40337840b500 +Author: Florian Weimer +Date: Mon May 9 18:15:16 2022 +0200 + + Linux: Implement a useful version of _startup_fatal + + On i386 and ia64, the TCB is not available at this point. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit a2a6bce7d7e52c1c34369a7da62c501cc350bc31) + +diff --git a/sysdeps/unix/sysv/linux/i386/startup.h b/sysdeps/unix/sysv/linux/i386/startup.h +index 192c765361c17ed1..213805d7d2d459be 100644 +--- a/sysdeps/unix/sysv/linux/i386/startup.h ++++ b/sysdeps/unix/sysv/linux/i386/startup.h +@@ -1,5 +1,5 @@ + /* Linux/i386 definitions of functions used by static libc main startup. +- Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,22 +16,7 @@ + License along with the GNU C Library; if not, see + . */ + +-#if BUILD_PIE_DEFAULT +-/* Can't use "call *%gs:SYSINFO_OFFSET" during statup in static PIE. */ +-# define I386_USE_SYSENTER 0 ++/* Can't use "call *%gs:SYSINFO_OFFSET" during startup. */ ++#define I386_USE_SYSENTER 0 + +-# include +-# include +- +-__attribute__ ((__noreturn__)) +-static inline void +-_startup_fatal (const char *message __attribute__ ((unused))) +-{ +- /* This is only called very early during startup in static PIE. +- FIXME: How can it be improved? */ +- ABORT_INSTRUCTION; +- __builtin_unreachable (); +-} +-#else +-# include_next +-#endif ++#include_next +diff --git a/sysdeps/unix/sysv/linux/ia64/startup.h b/sysdeps/unix/sysv/linux/ia64/startup.h +new file mode 100644 +index 0000000000000000..77f29f15a2103ed5 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/ia64/startup.h +@@ -0,0 +1,22 @@ ++/* Linux/ia64 definitions of functions used by static libc main startup. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* This code is used before the TCB is set up. */ ++#define IA64_USE_NEW_STUB 0 ++ ++#include_next +diff --git a/sysdeps/unix/sysv/linux/startup.h b/sysdeps/unix/sysv/linux/startup.h +new file mode 100644 +index 0000000000000000..39859b404a84798b +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/startup.h +@@ -0,0 +1,39 @@ ++/* Linux definitions of functions used by static libc main startup. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef SHARED ++# include_next ++#else ++# include ++ ++/* Avoid a run-time invocation of strlen. */ ++#define _startup_fatal(message) \ ++ do \ ++ { \ ++ size_t __message_length = __builtin_strlen (message); \ ++ if (! __builtin_constant_p (__message_length)) \ ++ { \ ++ extern void _startup_fatal_not_constant (void); \ ++ _startup_fatal_not_constant (); \ ++ } \ ++ INTERNAL_SYSCALL_CALL (write, STDERR_FILENO, (message), \ ++ __message_length); \ ++ INTERNAL_SYSCALL_CALL (exit_group, 127); \ ++ } \ ++ while (0) ++#endif /* !SHARED */ diff --git a/glibc-upstream-2.34-251.patch b/glibc-upstream-2.34-251.patch new file mode 100644 index 0000000..9f5a590 --- /dev/null +++ b/glibc-upstream-2.34-251.patch @@ -0,0 +1,150 @@ +commit 43d77ef9b87533221890423e491eed1b8ca81f0c +Author: Florian Weimer +Date: Mon May 16 18:41:43 2022 +0200 + + Linux: Introduce __brk_call for invoking the brk system call + + Alpha and sparc can now use the generic implementation. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit b57ab258c1140bc45464b4b9908713e3e0ee35aa) + +diff --git a/sysdeps/unix/sysv/linux/alpha/brk_call.h b/sysdeps/unix/sysv/linux/alpha/brk_call.h +new file mode 100644 +index 0000000000000000..b8088cf13f938c88 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/alpha/brk_call.h +@@ -0,0 +1,28 @@ ++/* Invoke the brk system call. Alpha version. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++static inline void * ++__brk_call (void *addr) ++{ ++ unsigned long int result = INTERNAL_SYSCALL_CALL (brk, addr); ++ if (result == -ENOMEM) ++ /* Mimic the default error reporting behavior. */ ++ return addr; ++ else ++ return (void *) result; ++} +diff --git a/sysdeps/unix/sysv/linux/brk.c b/sysdeps/unix/sysv/linux/brk.c +index 2d70d824fc72d32d..20b11c15caae148d 100644 +--- a/sysdeps/unix/sysv/linux/brk.c ++++ b/sysdeps/unix/sysv/linux/brk.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + + /* This must be initialized data because commons can't have aliases. */ + void *__curbrk = 0; +@@ -33,7 +34,7 @@ weak_alias (__curbrk, ___brk_addr) + int + __brk (void *addr) + { +- __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr); ++ __curbrk = __brk_call (addr); + if (__curbrk < addr) + { + __set_errno (ENOMEM); +diff --git a/sysdeps/unix/sysv/linux/brk_call.h b/sysdeps/unix/sysv/linux/brk_call.h +new file mode 100644 +index 0000000000000000..72370c25d785a9ab +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/brk_call.h +@@ -0,0 +1,25 @@ ++/* Invoke the brk system call. Generic Linux version. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library. If not, see ++ . */ ++ ++static inline void * ++__brk_call (void *addr) ++{ ++ /* The default implementation reports errors through an unchanged ++ break. */ ++ return (void *) INTERNAL_SYSCALL_CALL (brk, addr); ++} +diff --git a/sysdeps/unix/sysv/linux/alpha/brk.c b/sysdeps/unix/sysv/linux/sparc/brk_call.h +similarity index 61% +rename from sysdeps/unix/sysv/linux/alpha/brk.c +rename to sysdeps/unix/sysv/linux/sparc/brk_call.h +index 074c47e054bfeb11..59ce5216601143fb 100644 +--- a/sysdeps/unix/sysv/linux/alpha/brk.c ++++ b/sysdeps/unix/sysv/linux/sparc/brk_call.h +@@ -1,5 +1,5 @@ +-/* Change data segment size. Linux/Alpha. +- Copyright (C) 2020-2021 Free Software Foundation, Inc. ++/* Invoke the brk system call. Sparc version. ++ Copyright (C) 2022 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,23 +16,20 @@ + License along with the GNU C Library. If not, see + . */ + +-#include +-#include +-#include ++#ifdef __arch64__ ++# define SYSCALL_NUM "0x6d" ++#else ++# define SYSCALL_NUM "0x10" ++#endif + +-void *__curbrk = 0; +- +-int +-__brk (void *addr) ++static inline void * ++__brk_call (void *addr) + { +- /* Alpha brk returns -ENOMEM in case of failure. */ +- __curbrk = (void *) INTERNAL_SYSCALL_CALL (brk, addr); +- if ((unsigned long) __curbrk == -ENOMEM) +- { +- __set_errno (ENOMEM); +- return -1; +- } +- +- return 0; ++ register long int g1 asm ("g1") = __NR_brk; ++ register long int o0 asm ("o0") = (long int) addr; ++ asm volatile ("ta " SYSCALL_NUM ++ : "=r"(o0) ++ : "r"(g1), "0"(o0) ++ : "cc"); ++ return (void *) o0; + } +-weak_alias (__brk, brk) diff --git a/glibc-upstream-2.34-252.patch b/glibc-upstream-2.34-252.patch new file mode 100644 index 0000000..b607fcc --- /dev/null +++ b/glibc-upstream-2.34-252.patch @@ -0,0 +1,510 @@ +commit ede8d94d154157d269b18f3601440ac576c1f96a +Author: Florian Weimer +Date: Mon May 16 18:41:43 2022 +0200 + + csu: Implement and use _dl_early_allocate during static startup + + This implements mmap fallback for a brk failure during TLS + allocation. + + scripts/tls-elf-edit.py is updated to support the new patching method. + The script no longer requires that in the input object is of ET_DYN + type. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit f787e138aa0bf677bf74fa2a08595c446292f3d7) + +Conflicts: + elf/Makefile + (missing ld.so static execve backport upstream) + sysdeps/generic/ldsodefs.h + (missing ld.so dependency sorting optimization upstream) + +diff --git a/csu/libc-tls.c b/csu/libc-tls.c +index d83e69f6257ae981..738f59f46b62c31c 100644 +--- a/csu/libc-tls.c ++++ b/csu/libc-tls.c +@@ -145,11 +145,16 @@ __libc_setup_tls (void) + _dl_allocate_tls_storage (in elf/dl-tls.c) does using __libc_memalign + and dl_tls_static_align. */ + tcb_offset = roundup (memsz + GLRO(dl_tls_static_surplus), max_align); +- tlsblock = __sbrk (tcb_offset + TLS_INIT_TCB_SIZE + max_align); ++ tlsblock = _dl_early_allocate (tcb_offset + TLS_INIT_TCB_SIZE + max_align); ++ if (tlsblock == NULL) ++ _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n"); + #elif TLS_DTV_AT_TP + tcb_offset = roundup (TLS_INIT_TCB_SIZE, align ?: 1); +- tlsblock = __sbrk (tcb_offset + memsz + max_align +- + TLS_PRE_TCB_SIZE + GLRO(dl_tls_static_surplus)); ++ tlsblock = _dl_early_allocate (tcb_offset + memsz + max_align ++ + TLS_PRE_TCB_SIZE ++ + GLRO(dl_tls_static_surplus)); ++ if (tlsblock == NULL) ++ _startup_fatal ("Fatal glibc error: Cannot allocate TLS block\n"); + tlsblock += TLS_PRE_TCB_SIZE; + #else + /* In case a model with a different layout for the TCB and DTV +diff --git a/elf/Makefile b/elf/Makefile +index 6423ebbdd7708a14..ea1512549be3f628 100644 +--- a/elf/Makefile ++++ b/elf/Makefile +@@ -33,6 +33,7 @@ routines = \ + $(all-dl-routines) \ + dl-addr \ + dl-addr-obj \ ++ dl-early_allocate \ + dl-error \ + dl-iteratephdr \ + dl-libc \ +@@ -104,6 +105,7 @@ all-dl-routines = $(dl-routines) $(sysdep-dl-routines) + # But they are absent from the shared libc, because that code is in ld.so. + elide-routines.os = \ + $(all-dl-routines) \ ++ dl-early_allocate \ + dl-exception \ + dl-origin \ + dl-reloc-static-pie \ +@@ -264,6 +266,7 @@ tests-static-normal := \ + tst-linkall-static \ + tst-single_threaded-pthread-static \ + tst-single_threaded-static \ ++ tst-tls-allocation-failure-static \ + tst-tlsalign-extern-static \ + tst-tlsalign-static \ + # tests-static-normal +@@ -1101,6 +1104,10 @@ $(objpfx)tst-glibcelf.out: tst-glibcelf.py elf.h $(..)/scripts/glibcelf.py \ + --cc="$(CC) $(patsubst -DMODULE_NAME=%,-DMODULE_NAME=testsuite,$(CPPFLAGS))" \ + < /dev/null > $@ 2>&1; $(evaluate-test) + ++ifeq ($(run-built-tests),yes) ++tests-special += $(objpfx)tst-tls-allocation-failure-static-patched.out ++endif ++ + # The test requires shared _and_ PIE because the executable + # unit test driver must be able to link with the shared object + # that is going to eventually go into an installed DSO. +@@ -2637,3 +2644,15 @@ $(objpfx)tst-ro-dynamic-mod.so: $(objpfx)tst-ro-dynamic-mod.os \ + $(objpfx)tst-ro-dynamic-mod.os + + $(objpfx)tst-rtld-run-static.out: $(objpfx)/ldconfig ++ ++$(objpfx)tst-tls-allocation-failure-static-patched: \ ++ $(objpfx)tst-tls-allocation-failure-static $(..)scripts/tst-elf-edit.py ++ cp $< $@ ++ $(PYTHON) $(..)scripts/tst-elf-edit.py --maximize-tls-size $@ ++ ++$(objpfx)tst-tls-allocation-failure-static-patched.out: \ ++ $(objpfx)tst-tls-allocation-failure-static-patched ++ $< > $@ 2>&1; echo "status: $$?" >> $@ ++ grep -q '^Fatal glibc error: Cannot allocate TLS block$$' $@ \ ++ && grep -q '^status: 127$$' $@; \ ++ $(evaluate-test) +diff --git a/elf/dl-early_allocate.c b/elf/dl-early_allocate.c +new file mode 100644 +index 0000000000000000..61677aaa0364c209 +--- /dev/null ++++ b/elf/dl-early_allocate.c +@@ -0,0 +1,30 @@ ++/* Early memory allocation for the dynamic loader. Generic version. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++ ++void * ++_dl_early_allocate (size_t size) ++{ ++ void *result = __sbrk (size); ++ if (result == (void *) -1) ++ result = NULL; ++ return result; ++} +diff --git a/elf/tst-tls-allocation-failure-static.c b/elf/tst-tls-allocation-failure-static.c +new file mode 100644 +index 0000000000000000..8de831b2469ba390 +--- /dev/null ++++ b/elf/tst-tls-allocation-failure-static.c +@@ -0,0 +1,31 @@ ++/* Base for test program with impossiblyh large PT_TLS segment. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* The test actual binary is patched using scripts/tst-elf-edit.py ++ --maximize-tls-size, and this introduces the expected test ++ allocation failure due to an excessive PT_LS p_memsz value. ++ ++ Patching the binary is required because on some 64-bit targets, TLS ++ relocations can only cover a 32-bit range, and glibc-internal TLS ++ variables such as errno end up outside that range. */ ++ ++int ++main (void) ++{ ++ return 0; ++} +diff --git a/scripts/tst-elf-edit.py b/scripts/tst-elf-edit.py +new file mode 100644 +index 0000000000000000..0e19ce1e7392f3ca +--- /dev/null ++++ b/scripts/tst-elf-edit.py +@@ -0,0 +1,226 @@ ++#!/usr/bin/python3 ++# ELF editor for load align tests. ++# Copyright (C) 2022 Free Software Foundation, Inc. ++# Copyright The GNU Toolchain Authors. ++# This file is part of the GNU C Library. ++# ++# The GNU C Library is free software; you can redistribute it and/or ++# modify it under the terms of the GNU Lesser General Public ++# License as published by the Free Software Foundation; either ++# version 2.1 of the License, or (at your option) any later version. ++# ++# The GNU C Library is distributed in the hope that it will be useful, ++# but WITHOUT ANY WARRANTY; without even the implied warranty of ++# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++# Lesser General Public License for more details. ++# ++# You should have received a copy of the GNU Lesser General Public ++# License along with the GNU C Library; if not, see ++# . ++ ++import argparse ++import os ++import sys ++import struct ++ ++EI_NIDENT=16 ++ ++EI_MAG0=0 ++ELFMAG0=b'\x7f' ++EI_MAG1=1 ++ELFMAG1=b'E' ++EI_MAG2=2 ++ELFMAG2=b'L' ++EI_MAG3=3 ++ELFMAG3=b'F' ++ ++EI_CLASS=4 ++ELFCLASSNONE=b'0' ++ELFCLASS32=b'\x01' ++ELFCLASS64=b'\x02' ++ ++EI_DATA=5 ++ELFDATA2LSB=b'\x01' ++ELFDATA2MSB=b'\x02' ++ ++ET_EXEC=2 ++ET_DYN=3 ++ ++PT_LOAD=1 ++PT_TLS=7 ++ ++def elf_types_fmts(e_ident): ++ endian = '<' if e_ident[EI_DATA] == ELFDATA2LSB else '>' ++ addr = 'I' if e_ident[EI_CLASS] == ELFCLASS32 else 'Q' ++ off = 'I' if e_ident[EI_CLASS] == ELFCLASS32 else 'Q' ++ return (endian, addr, off) ++ ++class Elf_Ehdr: ++ def __init__(self, e_ident): ++ endian, addr, off = elf_types_fmts(e_ident) ++ self.fmt = '{0}HHI{1}{2}{2}IHHHHHH'.format(endian, addr, off) ++ self.len = struct.calcsize(self.fmt) ++ ++ def read(self, f): ++ buf = f.read(self.len) ++ if not buf: ++ error('{}: header too small'.format(f.name)) ++ data = struct.unpack(self.fmt, buf) ++ self.e_type = data[0] ++ self.e_machine = data[1] ++ self.e_version = data[2] ++ self.e_entry = data[3] ++ self.e_phoff = data[4] ++ self.e_shoff = data[5] ++ self.e_flags = data[6] ++ self.e_ehsize = data[7] ++ self.e_phentsize= data[8] ++ self.e_phnum = data[9] ++ self.e_shstrndx = data[10] ++ ++ ++class Elf_Phdr: ++ def __init__(self, e_ident): ++ endian, addr, off = elf_types_fmts(e_ident) ++ self.ei_class = e_ident[EI_CLASS] ++ if self.ei_class == ELFCLASS32: ++ self.fmt = '{0}I{2}{1}{1}IIII'.format(endian, addr, off) ++ else: ++ self.fmt = '{0}II{2}{1}{1}QQQ'.format(endian, addr, off) ++ self.len = struct.calcsize(self.fmt) ++ ++ def read(self, f): ++ buf = f.read(self.len) ++ if len(buf) < self.len: ++ error('{}: program header too small'.format(f.name)) ++ data = struct.unpack(self.fmt, buf) ++ if self.ei_class == ELFCLASS32: ++ self.p_type = data[0] ++ self.p_offset = data[1] ++ self.p_vaddr = data[2] ++ self.p_paddr = data[3] ++ self.p_filesz = data[4] ++ self.p_memsz = data[5] ++ self.p_flags = data[6] ++ self.p_align = data[7] ++ else: ++ self.p_type = data[0] ++ self.p_flags = data[1] ++ self.p_offset = data[2] ++ self.p_vaddr = data[3] ++ self.p_paddr = data[4] ++ self.p_filesz = data[5] ++ self.p_memsz = data[6] ++ self.p_align = data[7] ++ ++ def write(self, f): ++ if self.ei_class == ELFCLASS32: ++ data = struct.pack(self.fmt, ++ self.p_type, ++ self.p_offset, ++ self.p_vaddr, ++ self.p_paddr, ++ self.p_filesz, ++ self.p_memsz, ++ self.p_flags, ++ self.p_align) ++ else: ++ data = struct.pack(self.fmt, ++ self.p_type, ++ self.p_flags, ++ self.p_offset, ++ self.p_vaddr, ++ self.p_paddr, ++ self.p_filesz, ++ self.p_memsz, ++ self.p_align) ++ f.write(data) ++ ++ ++def error(msg): ++ print(msg, file=sys.stderr) ++ sys.exit(1) ++ ++ ++def elf_edit_align(phdr, align): ++ if align == 'half': ++ phdr.p_align = phdr.p_align >> 1 ++ else: ++ phdr.p_align = int(align) ++ ++def elf_edit_maximize_tls_size(phdr, elfclass): ++ if elfclass == ELFCLASS32: ++ # It is possible that the kernel can allocate half of the ++ # address space, so use something larger. ++ phdr.p_memsz = 0xfff00000 ++ else: ++ phdr.p_memsz = 1 << 63 ++ ++def elf_edit(f, opts): ++ ei_nident_fmt = 'c' * EI_NIDENT ++ ei_nident_len = struct.calcsize(ei_nident_fmt) ++ ++ data = f.read(ei_nident_len) ++ if len(data) < ei_nident_len: ++ error('{}: e_nident too small'.format(f.name)) ++ e_ident = struct.unpack(ei_nident_fmt, data) ++ ++ if e_ident[EI_MAG0] != ELFMAG0 \ ++ or e_ident[EI_MAG1] != ELFMAG1 \ ++ or e_ident[EI_MAG2] != ELFMAG2 \ ++ or e_ident[EI_MAG3] != ELFMAG3: ++ error('{}: bad ELF header'.format(f.name)) ++ ++ if e_ident[EI_CLASS] != ELFCLASS32 \ ++ and e_ident[EI_CLASS] != ELFCLASS64: ++ error('{}: unsupported ELF class: {}'.format(f.name, e_ident[EI_CLASS])) ++ ++ if e_ident[EI_DATA] != ELFDATA2LSB \ ++ and e_ident[EI_DATA] != ELFDATA2MSB: \ ++ error('{}: unsupported ELF data: {}'.format(f.name, e_ident[EI_DATA])) ++ ++ ehdr = Elf_Ehdr(e_ident) ++ ehdr.read(f) ++ if ehdr.e_type not in (ET_EXEC, ET_DYN): ++ error('{}: not an executable or shared library'.format(f.name)) ++ ++ phdr = Elf_Phdr(e_ident) ++ maximize_tls_size_done = False ++ for i in range(0, ehdr.e_phnum): ++ f.seek(ehdr.e_phoff + i * phdr.len) ++ phdr.read(f) ++ if phdr.p_type == PT_LOAD and opts.align is not None: ++ elf_edit_align(phdr, opts.align) ++ f.seek(ehdr.e_phoff + i * phdr.len) ++ phdr.write(f) ++ break ++ if phdr.p_type == PT_TLS and opts.maximize_tls_size: ++ elf_edit_maximize_tls_size(phdr, e_ident[EI_CLASS]) ++ f.seek(ehdr.e_phoff + i * phdr.len) ++ phdr.write(f) ++ maximize_tls_size_done = True ++ break ++ ++ if opts.maximize_tls_size and not maximize_tls_size_done: ++ error('{}: TLS maximum size was not updated'.format(f.name)) ++ ++def get_parser(): ++ parser = argparse.ArgumentParser(description=__doc__) ++ parser.add_argument('-a', dest='align', ++ help='How to set the LOAD alignment') ++ parser.add_argument('--maximize-tls-size', action='store_true', ++ help='Set maximum PT_TLS size') ++ parser.add_argument('output', ++ help='ELF file to edit') ++ return parser ++ ++ ++def main(argv): ++ parser = get_parser() ++ opts = parser.parse_args(argv) ++ with open(opts.output, 'r+b') as fout: ++ elf_edit(fout, opts) ++ ++ ++if __name__ == '__main__': ++ main(sys.argv[1:]) +diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h +index a38de94bf7ea8e93..87ad2f3f4d89eb7d 100644 +--- a/sysdeps/generic/ldsodefs.h ++++ b/sysdeps/generic/ldsodefs.h +@@ -1238,6 +1238,11 @@ extern struct link_map * _dl_get_dl_main_map (void) + /* Initialize the DSO sort algorithm to use. */ + extern void _dl_sort_maps_init (void) attribute_hidden; + ++/* Perform early memory allocation, avoding a TCB dependency. ++ Terminate the process if allocation fails. May attempt to use ++ brk. */ ++void *_dl_early_allocate (size_t size) attribute_hidden; ++ + /* Initialization of libpthread for statically linked applications. + If libpthread is not linked in, this is an empty function. */ + void __pthread_initialize_minimal (void) weak_function; +diff --git a/sysdeps/unix/sysv/linux/dl-early_allocate.c b/sysdeps/unix/sysv/linux/dl-early_allocate.c +new file mode 100644 +index 0000000000000000..52c538e85afa8522 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/dl-early_allocate.c +@@ -0,0 +1,82 @@ ++/* Early memory allocation for the dynamic loader. Generic version. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Mark symbols hidden in static PIE for early self relocation to work. */ ++#if BUILD_PIE_DEFAULT ++# pragma GCC visibility push(hidden) ++#endif ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++ ++/* Defined in brk.c. */ ++extern void *__curbrk; ++ ++void * ++_dl_early_allocate (size_t size) ++{ ++ void *result; ++ ++ if (__curbrk != NULL) ++ /* If the break has been initialized, brk must have run before, ++ so just call it once more. */ ++ { ++ result = __sbrk (size); ++ if (result == (void *) -1) ++ result = NULL; ++ } ++ else ++ { ++ /* If brk has not been invoked, there is no need to update ++ __curbrk. The first call to brk will take care of that. */ ++ void *previous = __brk_call (0); ++ result = __brk_call (previous + size); ++ if (result == previous) ++ result = NULL; ++ else ++ result = previous; ++ } ++ ++ /* If brk fails, fall back to mmap. This can happen due to ++ unfortunate ASLR layout decisions and kernel bugs, particularly ++ for static PIE. */ ++ if (result == NULL) ++ { ++ long int ret; ++ int prot = PROT_READ | PROT_WRITE; ++ int flags = MAP_PRIVATE | MAP_ANONYMOUS; ++#ifdef __NR_mmap2 ++ ret = MMAP_CALL_INTERNAL (mmap2, 0, size, prot, flags, -1, 0); ++#else ++ ret = MMAP_CALL_INTERNAL (mmap, 0, size, prot, flags, -1, 0); ++#endif ++ if (INTERNAL_SYSCALL_ERROR_P (ret)) ++ result = NULL; ++ else ++ result = (void *) ret; ++ } ++ ++ return result; ++} diff --git a/glibc-upstream-2.34-253.patch b/glibc-upstream-2.34-253.patch new file mode 100644 index 0000000..2be9efc --- /dev/null +++ b/glibc-upstream-2.34-253.patch @@ -0,0 +1,350 @@ +commit 89b638f48ac5c9af5b1fe9caa6287d70127b66a5 +Author: Stefan Liebler +Date: Tue May 17 16:12:18 2022 +0200 + + S390: Enable static PIE + + This commit enables static PIE on 64bit. On 31bit, static PIE is + not supported. + + A new configure check in sysdeps/s390/s390-64/configure.ac also performs + a minimal test for requirements in ld: + Ensure you also have those patches for: + - binutils (ld) + - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE" + https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9 + (Tested by configure check above) + Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to + be processed in _dl_relocate_static_pie() as static TLS map is not setup. + - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie" + https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e + (We can't test it in configure as we are not able to link a static PIE + executable if the system glibc lacks static PIE support) + Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries + and the IFUNC symbols are not processed, which leads to crashes. + + - kernel (the mentioned links to the commits belong to 5.19 merge window): + - "s390/mmap: increase stack/mmap gap to 128MB" + https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2 + - "s390/vdso: move vdso mapping to its own function" + https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8 + - "s390/vdso: map vdso above stack" + https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033 + - "s390/vdso: add vdso randomization" + https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25 + (We can't test the kernel of the target system) + Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0), + static PIE executables like ldconfig will crash. While startup sbrk is + used to enlarge the HEAP. Unfortunately the underlying brk syscall fails + as there is not enough space after the HEAP. Then the address of the TLS + image is invalid and the following memcpy in __libc_setup_tls() leads + to a segfault. + If /proc/sys/kernel/randomize_va_space is activated (default: 2), there + is enough space after HEAP. + + - glibc + - "Linux: Define MMAP_CALL_INTERNAL" + https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb + - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S" + https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402 + - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls" + https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff + - "ia64: Always define IA64_USE_NEW_STUB as a flag macro" + https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64 + - "Linux: Implement a useful version of _startup_fatal" + https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31 + - "Linux: Introduce __brk_call for invoking the brk system call" + https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa + - "csu: Implement and use _dl_early_allocate during static startup" + https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7 + The mentioned patch series by Florian Weimer avoids the mentioned failing + sbrk syscall by falling back to mmap. + + This commit also adjusts startup code in start.S to be ready for static PIE. + We have to add a wrapper function for main as we are not allowed to use + GOT relocations before __libc_start_main is called. + (Compare also to: + - commit 14d886edbd3d80b771e1c42fbd9217f9074de9c6 + "aarch64: fix start code for static pie" + - commit 3d1d79283e6de4f7c434cb67fb53a4fd28359669 + "aarch64: fix static pie enabled libc when main is in a shared library" + ) + + (cherry picked from commit 728894dba4a19578bd803906de184a8dd51ed13c) + +diff --git a/sysdeps/s390/s390-64/configure b/sysdeps/s390/s390-64/configure +new file mode 100644 +index 0000000000000000..101c570d2e62da25 +--- /dev/null ++++ b/sysdeps/s390/s390-64/configure +@@ -0,0 +1,122 @@ ++# This file is generated from configure.ac by Autoconf. DO NOT EDIT! ++ # Local configure fragment for sysdeps/s390/s390-64. ++ ++# Minimal checking for static PIE support in ld. ++# Compare to ld testcase/bugzilla: ++# /ld/testsuite/ld-elf/pr22263-1.rd ++{ $as_echo "$as_me:${as_lineno-$LINENO}: checking for s390-specific static PIE requirements" >&5 ++$as_echo_n "checking for s390-specific static PIE requirements... " >&6; } ++if { as_var=\ ++libc_cv_s390x_staticpie_req; eval \${$as_var+:} false; }; then : ++ $as_echo_n "(cached) " >&6 ++else ++ cat > conftest1.c < conftest2.c <&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; } \ ++ && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -fPIE -c conftest2.c -o conftest2.o' ++ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; } \ ++ && { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS -pie -o conftest conftest1.o conftest2.o' ++ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; } \ ++ && { ac_try='! readelf -Wr conftest | grep R_390_TLS_TPOFF' ++ { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5 ++ (eval $ac_try) 2>&5 ++ ac_status=$? ++ $as_echo "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5 ++ test $ac_status = 0; }; } ++ then ++ libc_cv_s390x_staticpie_req=yes ++ fi ++ rm -rf conftest.* ++fi ++eval ac_res=\$\ ++libc_cv_s390x_staticpie_req ++ { $as_echo "$as_me:${as_lineno-$LINENO}: result: $ac_res" >&5 ++$as_echo "$ac_res" >&6; } ++if test $libc_cv_s390x_staticpie_req = yes; then ++ # Static PIE is supported only on 64bit. ++ # Ensure you also have those patches for: ++ # - binutils (ld) ++ # - "[PR ld/22263] s390: Avoid dynamic TLS relocs in PIE" ++ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=26b1426577b5dcb32d149c64cca3e603b81948a9 ++ # (Tested by configure check above) ++ # Otherwise there will be a R_390_TLS_TPOFF relocation, which fails to ++ # be processed in _dl_relocate_static_pie() as static TLS map is not setup. ++ # - "s390: Add DT_JMPREL pointing to .rela.[i]plt with static-pie" ++ # https://sourceware.org/git/?p=binutils-gdb.git;a=commit;h=d942d8db12adf4c9e5c7d9ed6496a779ece7149e ++ # (We can't test it in configure as we are not able to link a static PIE ++ # executable if the system glibc lacks static PIE support) ++ # Otherwise there won't be DT_JMPREL, DT_PLTRELA, DT_PLTRELASZ entries ++ # and the IFUNC symbols are not processed, which leads to crashes. ++ # ++ # - kernel (the mentioned links to the commits belong to 5.19 merge window): ++ # - "s390/mmap: increase stack/mmap gap to 128MB" ++ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=f2f47d0ef72c30622e62471903ea19446ea79ee2 ++ # - "s390/vdso: move vdso mapping to its own function" ++ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=57761da4dc5cd60bed2c81ba0edb7495c3c740b8 ++ # - "s390/vdso: map vdso above stack" ++ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=9e37a2e8546f9e48ea76c839116fa5174d14e033 ++ # - "s390/vdso: add vdso randomization" ++ # https://git.kernel.org/pub/scm/linux/kernel/git/s390/linux.git/commit/?h=features&id=41cd81abafdc4e58a93fcb677712a76885e3ca25 ++ # (We can't test the kernel of the target system) ++ # Otherwise if /proc/sys/kernel/randomize_va_space is turned off (0), ++ # static PIE executables like ldconfig will crash. While startup sbrk is ++ # used to enlarge the HEAP. Unfortunately the underlying brk syscall fails ++ # as there is not enough space after the HEAP. Then the address of the TLS ++ # image is invalid and the following memcpy in __libc_setup_tls() leads ++ # to a segfault. ++ # If /proc/sys/kernel/randomize_va_space is activated (default: 2), there ++ # is enough space after HEAP. ++ # ++ # - glibc ++ # - "Linux: Define MMAP_CALL_INTERNAL" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=c1b68685d438373efe64e5f076f4215723004dfb ++ # - "i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=6e5c7a1e262961adb52443ab91bd2c9b72316402 ++ # - "i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=60f0f2130d30cfd008ca39743027f1e200592dff ++ # - "ia64: Always define IA64_USE_NEW_STUB as a flag macro" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=18bd9c3d3b1b6a9182698c85354578d1d58e9d64 ++ # - "Linux: Implement a useful version of _startup_fatal" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=a2a6bce7d7e52c1c34369a7da62c501cc350bc31 ++ # - "Linux: Introduce __brk_call for invoking the brk system call" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=b57ab258c1140bc45464b4b9908713e3e0ee35aa ++ # - "csu: Implement and use _dl_early_allocate during static startup" ++ # https://sourceware.org/git/?p=glibc.git;a=commit;h=f787e138aa0bf677bf74fa2a08595c446292f3d7 ++ # The mentioned patch series by Florian Weimer avoids the mentioned failing ++ # sbrk syscall by falling back to mmap. ++ $as_echo "#define SUPPORT_STATIC_PIE 1" >>confdefs.h ++ ++fi +diff --git a/sysdeps/s390/s390-64/configure.ac b/sysdeps/s390/s390-64/configure.ac +new file mode 100644 +index 0000000000000000..2583a4a3350ac11f +--- /dev/null ++++ b/sysdeps/s390/s390-64/configure.ac +@@ -0,0 +1,92 @@ ++GLIBC_PROVIDES dnl See aclocal.m4 in the top level source directory. ++# Local configure fragment for sysdeps/s390/s390-64. ++ ++# Minimal checking for static PIE support in ld. ++# Compare to ld testcase/bugzilla: ++# /ld/testsuite/ld-elf/pr22263-1.rd ++AC_CACHE_CHECK([for s390-specific static PIE requirements], \ ++[libc_cv_s390x_staticpie_req], [dnl ++ cat > conftest1.c < conftest2.c < Scrt1.o */ + larl %r2,main@GOTENT # load pointer to main + lg %r2,0(%r2) ++# else ++ /* Used for dynamic linked position dependent executable. ++ => crt1.o (glibc configured without --disable-default-pie: ++ PIC is defined) ++ Or for static linked position independent executable. ++ => rcrt1.o (only available if glibc configured without ++ --disable-default-pie: PIC is defined) */ ++ larl %r2,__wrap_main ++# endif + brasl %r14,__libc_start_main@plt + #else ++ /* Used for dynamic/static linked position dependent executable. ++ => crt1.o (glibc configured with --disable-default-pie: ++ PIC and SHARED are not defined) */ + larl %r2,main # load pointer to main + brasl %r14,__libc_start_main + #endif +@@ -98,6 +113,19 @@ _start: + + cfi_endproc + ++#if defined PIC && !defined SHARED ++ /* When main is not defined in the executable but in a shared library ++ then a wrapper is needed in crt1.o of the static-pie enabled libc, ++ because crt1.o and rcrt1.o share code and the later must avoid the ++ use of GOT relocations before __libc_start_main is called. */ ++__wrap_main: ++ cfi_startproc ++ larl %r1,main@GOTENT # load pointer to main ++ lg %r1,0(%r1) ++ br %r1 ++ cfi_endproc ++#endif ++ + /* Define a symbol for the first piece of initialized data. */ + .data + .globl __data_start diff --git a/glibc-upstream-2.34-254.patch b/glibc-upstream-2.34-254.patch new file mode 100644 index 0000000..be69b2f --- /dev/null +++ b/glibc-upstream-2.34-254.patch @@ -0,0 +1,301 @@ +commit c73c79af7d6f1124fbfa5d935b4f620217d6a2ec +Author: Szabolcs Nagy +Date: Fri Jun 15 16:14:58 2018 +0100 + + rtld: Use generic argv adjustment in ld.so [BZ #23293] + + When an executable is invoked as + + ./ld.so [ld.so-args] ./exe [exe-args] + + then the argv is adujusted in ld.so before calling the entry point of + the executable so ld.so args are not visible to it. On most targets + this requires moving argv, env and auxv on the stack to ensure correct + stack alignment at the entry point. This had several issues: + + - The code for this adjustment on the stack is written in asm as part + of the target specific ld.so _start code which is hard to maintain. + + - The adjustment is done after _dl_start returns, where it's too late + to update GLRO(dl_auxv), as it is already readonly, so it points to + memory that was clobbered by the adjustment. This is bug 23293. + + - _environ is also wrong in ld.so after the adjustment, but it is + likely not used after _dl_start returns so this is not user visible. + + - _dl_argv was updated, but for this it was moved out of relro, which + changes security properties across targets unnecessarily. + + This patch introduces a generic _dl_start_args_adjust function that + handles the argument adjustments after ld.so processed its own args + and before relro protection is applied. + + The same algorithm is used on all targets, _dl_skip_args is now 0, so + existing target specific adjustment code is no longer used. The bug + affects aarch64, alpha, arc, arm, csky, ia64, nios2, s390-32 and sparc, + other targets don't need the change in principle, only for consistency. + + The GNU Hurd start code relied on _dl_skip_args after dl_main returned, + now it checks directly if args were adjusted and fixes the Hurd startup + data accordingly. + + Follow up patches can remove _dl_skip_args and DL_ARGV_NOT_RELRO. + + Tested on aarch64-linux-gnu and cross tested on i686-gnu. + + Reviewed-by: Adhemerval Zanella + (cherry picked from commit ad43cac44a6860eaefcadadfb2acb349921e96bf) + +Conflicts: + elf/rtld.c + (Downstream-only backport of glibc-rh2023422-1.patch) + +diff --git a/elf/rtld.c b/elf/rtld.c +index 434fbeddd5cce74d..9de53ccaed420a57 100644 +--- a/elf/rtld.c ++++ b/elf/rtld.c +@@ -1121,6 +1121,62 @@ rtld_chain_load (struct link_map *main_map, char *argv0) + rtld_soname, pathname, errcode); + } + ++/* Adjusts the contents of the stack and related globals for the user ++ entry point. The ld.so processed skip_args arguments and bumped ++ _dl_argv and _dl_argc accordingly. Those arguments are removed from ++ argv here. */ ++static void ++_dl_start_args_adjust (int skip_args) ++{ ++ void **sp = (void **) (_dl_argv - skip_args - 1); ++ void **p = sp + skip_args; ++ ++ if (skip_args == 0) ++ return; ++ ++ /* Sanity check. */ ++ intptr_t argc = (intptr_t) sp[0] - skip_args; ++ assert (argc == _dl_argc); ++ ++ /* Adjust argc on stack. */ ++ sp[0] = (void *) (intptr_t) _dl_argc; ++ ++ /* Update globals in rtld. */ ++ _dl_argv -= skip_args; ++ _environ -= skip_args; ++ ++ /* Shuffle argv down. */ ++ do ++ *++sp = *++p; ++ while (*p != NULL); ++ ++ assert (_environ == (char **) (sp + 1)); ++ ++ /* Shuffle envp down. */ ++ do ++ *++sp = *++p; ++ while (*p != NULL); ++ ++#ifdef HAVE_AUX_VECTOR ++ void **auxv = (void **) GLRO(dl_auxv) - skip_args; ++ GLRO(dl_auxv) = (ElfW(auxv_t) *) auxv; /* Aliasing violation. */ ++ assert (auxv == sp + 1); ++ ++ /* Shuffle auxv down. */ ++ ElfW(auxv_t) ax; ++ char *oldp = (char *) (p + 1); ++ char *newp = (char *) (sp + 1); ++ do ++ { ++ memcpy (&ax, oldp, sizeof (ax)); ++ memcpy (newp, &ax, sizeof (ax)); ++ oldp += sizeof (ax); ++ newp += sizeof (ax); ++ } ++ while (ax.a_type != AT_NULL); ++#endif ++} ++ + static void + dl_main (const ElfW(Phdr) *phdr, + ElfW(Word) phnum, +@@ -1177,6 +1233,7 @@ dl_main (const ElfW(Phdr) *phdr, + rtld_is_main = true; + + char *argv0 = NULL; ++ char **orig_argv = _dl_argv; + + /* Note the place where the dynamic linker actually came from. */ + GL(dl_rtld_map).l_name = rtld_progname; +@@ -1191,7 +1248,6 @@ dl_main (const ElfW(Phdr) *phdr, + GLRO(dl_lazy) = -1; + } + +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + } +@@ -1200,14 +1256,12 @@ dl_main (const ElfW(Phdr) *phdr, + if (state.mode != rtld_mode_help) + state.mode = rtld_mode_verify; + +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + } + else if (! strcmp (_dl_argv[1], "--inhibit-cache")) + { + GLRO(dl_inhibit_cache) = 1; +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + } +@@ -1217,7 +1271,6 @@ dl_main (const ElfW(Phdr) *phdr, + state.library_path = _dl_argv[2]; + state.library_path_source = "--library-path"; + +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1226,7 +1279,6 @@ dl_main (const ElfW(Phdr) *phdr, + { + GLRO(dl_inhibit_rpath) = _dl_argv[2]; + +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1234,14 +1286,12 @@ dl_main (const ElfW(Phdr) *phdr, + { + audit_list_add_string (&state.audit_list, _dl_argv[2]); + +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } + else if (! strcmp (_dl_argv[1], "--preload") && _dl_argc > 2) + { + state.preloadarg = _dl_argv[2]; +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1249,7 +1299,6 @@ dl_main (const ElfW(Phdr) *phdr, + { + argv0 = _dl_argv[2]; + +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1257,7 +1306,6 @@ dl_main (const ElfW(Phdr) *phdr, + && _dl_argc > 2) + { + state.glibc_hwcaps_prepend = _dl_argv[2]; +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1265,7 +1313,6 @@ dl_main (const ElfW(Phdr) *phdr, + && _dl_argc > 2) + { + state.glibc_hwcaps_mask = _dl_argv[2]; +- _dl_skip_args += 2; + _dl_argc -= 2; + _dl_argv += 2; + } +@@ -1274,7 +1321,6 @@ dl_main (const ElfW(Phdr) *phdr, + { + state.mode = rtld_mode_list_tunables; + +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + } +@@ -1283,7 +1329,6 @@ dl_main (const ElfW(Phdr) *phdr, + { + state.mode = rtld_mode_list_diagnostics; + +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + } +@@ -1329,7 +1374,6 @@ dl_main (const ElfW(Phdr) *phdr, + _dl_usage (ld_so_name, NULL); + } + +- ++_dl_skip_args; + --_dl_argc; + ++_dl_argv; + +@@ -1428,6 +1472,9 @@ dl_main (const ElfW(Phdr) *phdr, + /* Set the argv[0] string now that we've processed the executable. */ + if (argv0 != NULL) + _dl_argv[0] = argv0; ++ ++ /* Adjust arguments for the application entry point. */ ++ _dl_start_args_adjust (_dl_argv - orig_argv); + } + else + { +diff --git a/sysdeps/mach/hurd/dl-sysdep.c b/sysdeps/mach/hurd/dl-sysdep.c +index 4b2072e5d5e3bfd2..5c0f8e46bfbd4753 100644 +--- a/sysdeps/mach/hurd/dl-sysdep.c ++++ b/sysdeps/mach/hurd/dl-sysdep.c +@@ -106,6 +106,7 @@ _dl_sysdep_start (void **start_argptr, + { + void go (intptr_t *argdata) + { ++ char *orig_argv0; + char **p; + + /* Cache the information in various global variables. */ +@@ -114,6 +115,8 @@ _dl_sysdep_start (void **start_argptr, + _environ = &_dl_argv[_dl_argc + 1]; + for (p = _environ; *p++;); /* Skip environ pointers and terminator. */ + ++ orig_argv0 = _dl_argv[0]; ++ + if ((void *) p == _dl_argv[0]) + { + static struct hurd_startup_data nodata; +@@ -204,30 +207,23 @@ unfmh(); /* XXX */ + + /* The call above might screw a few things up. + +- First of all, if _dl_skip_args is nonzero, we are ignoring +- the first few arguments. However, if we have no Hurd startup +- data, it is the magical convention that ARGV[0] == P. The ++ P is the location after the terminating NULL of the list of ++ environment variables. It has to point to the Hurd startup ++ data or if that's missing then P == ARGV[0] must hold. The + startup code in init-first.c will get confused if this is not + the case, so we must rearrange things to make it so. We'll +- overwrite the origional ARGV[0] at P with ARGV[_dl_skip_args]. ++ recompute P and move the Hurd data or the new ARGV[0] there. + +- Secondly, if we need to be secure, it removes some dangerous +- environment variables. If we have no Hurd startup date this +- changes P (since that's the location after the terminating +- NULL in the list of environment variables). We do the same +- thing as in the first case but make sure we recalculate P. +- If we do have Hurd startup data, we have to move the data +- such that it starts just after the terminating NULL in the +- environment list. ++ Note: directly invoked ld.so can move arguments and env vars. + + We use memmove, since the locations might overlap. */ +- if (__libc_enable_secure || _dl_skip_args) +- { +- char **newp; + +- for (newp = _environ; *newp++;); ++ char **newp; ++ for (newp = _environ; *newp++;); + +- if (_dl_argv[-_dl_skip_args] == (char *) p) ++ if (newp != p || _dl_argv[0] != orig_argv0) ++ { ++ if (orig_argv0 == (char *) p) + { + if ((char *) newp != _dl_argv[0]) + { diff --git a/glibc-upstream-2.34-255.patch b/glibc-upstream-2.34-255.patch new file mode 100644 index 0000000..aa679f3 --- /dev/null +++ b/glibc-upstream-2.34-255.patch @@ -0,0 +1,105 @@ +commit b2585cae2854d7d2868fb2e51e2796042c5e0679 +Author: Szabolcs Nagy +Date: Tue May 3 13:18:04 2022 +0100 + + linux: Add a getauxval test [BZ #23293] + + This is for bug 23293 and it relies on the glibc test system running + tests via explicit ld.so invokation by default. + + Reviewed-by: Florian Weimer + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 9faf5262c77487c96da8a3e961b88c0b1879e186) + +diff --git a/sysdeps/unix/sysv/linux/Makefile b/sysdeps/unix/sysv/linux/Makefile +index 0657f4003e7116c6..5c772f69d1b1f1f1 100644 +--- a/sysdeps/unix/sysv/linux/Makefile ++++ b/sysdeps/unix/sysv/linux/Makefile +@@ -123,6 +123,7 @@ tests += tst-clone tst-clone2 tst-clone3 tst-fanotify tst-personality \ + tst-close_range \ + tst-prctl \ + tst-scm_rights \ ++ tst-getauxval \ + # tests + + # Test for the symbol version of fcntl that was replaced in glibc 2.28. +diff --git a/sysdeps/unix/sysv/linux/tst-getauxval.c b/sysdeps/unix/sysv/linux/tst-getauxval.c +new file mode 100644 +index 0000000000000000..c4b619574369f4c5 +--- /dev/null ++++ b/sysdeps/unix/sysv/linux/tst-getauxval.c +@@ -0,0 +1,74 @@ ++/* Basic test for getauxval. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++ ++static int missing; ++static int mismatch; ++ ++static void ++check_nonzero (unsigned long t, const char *s) ++{ ++ unsigned long v = getauxval (t); ++ printf ("%s: %lu (0x%lx)\n", s, v, v); ++ if (v == 0) ++ missing++; ++} ++ ++static void ++check_eq (unsigned long t, const char *s, unsigned long want) ++{ ++ unsigned long v = getauxval (t); ++ printf ("%s: %lu want: %lu\n", s, v, want); ++ if (v != want) ++ mismatch++; ++} ++ ++#define NZ(x) check_nonzero (x, #x) ++#define EQ(x, want) check_eq (x, #x, want) ++ ++static int ++do_test (void) ++{ ++ /* These auxv entries should be non-zero on Linux. */ ++ NZ (AT_PHDR); ++ NZ (AT_PHENT); ++ NZ (AT_PHNUM); ++ NZ (AT_PAGESZ); ++ NZ (AT_ENTRY); ++ NZ (AT_CLKTCK); ++ NZ (AT_RANDOM); ++ NZ (AT_EXECFN); ++ if (missing) ++ FAIL_EXIT1 ("Found %d missing auxv entries.\n", missing); ++ ++ /* Check against syscalls. */ ++ EQ (AT_UID, getuid ()); ++ EQ (AT_EUID, geteuid ()); ++ EQ (AT_GID, getgid ()); ++ EQ (AT_EGID, getegid ()); ++ if (mismatch) ++ FAIL_EXIT1 ("Found %d mismatching auxv entries.\n", mismatch); ++ ++ return 0; ++} ++ ++#include diff --git a/glibc-upstream-2.34-256.patch b/glibc-upstream-2.34-256.patch new file mode 100644 index 0000000..d92a5d0 --- /dev/null +++ b/glibc-upstream-2.34-256.patch @@ -0,0 +1,39 @@ +commit 14770f3e0462721b317f138197e1fbf4db542c94 +Author: Sergei Trofimovich +Date: Mon May 23 13:56:43 2022 +0530 + + string.h: fix __fortified_attr_access macro call [BZ #29162] + + commit e938c0274 "Don't add access size hints to fortifiable functions" + converted a few '__attr_access ((...))' into '__fortified_attr_access (...)' + calls. + + But one of conversions had double parentheses of '__fortified_attr_access (...)'. + + Noticed as a gnat6 build failure: + + /<>-glibc-2.34-210-dev/include/bits/string_fortified.h:110:50: error: macro "__fortified_attr_access" requires 3 arguments, but only 1 given + + The change fixes parentheses. + + This is seen when using compilers that do not support + __builtin___stpncpy_chk, e.g. gcc older than 4.7, clang older than 2.6 + or some compiler not derived from gcc or clang. + + Signed-off-by: Sergei Trofimovich + Reviewed-by: Siddhesh Poyarekar + (cherry picked from commit 5a5f94af0542f9a35aaa7992c18eb4e2403a29b9) + +diff --git a/string/bits/string_fortified.h b/string/bits/string_fortified.h +index 218006c9ba882d9c..4e66e0bd1ebb572a 100644 +--- a/string/bits/string_fortified.h ++++ b/string/bits/string_fortified.h +@@ -107,7 +107,7 @@ __NTH (stpncpy (char *__dest, const char *__src, size_t __n)) + # else + extern char *__stpncpy_chk (char *__dest, const char *__src, size_t __n, + size_t __destlen) __THROW +- __fortified_attr_access ((__write_only__, 1, 3)) ++ __fortified_attr_access (__write_only__, 1, 3) + __attr_access ((__read_only__, 2)); + extern char *__REDIRECT_NTH (__stpncpy_alias, (char *__dest, const char *__src, + size_t __n), stpncpy); diff --git a/glibc-upstream-2.34-257.patch b/glibc-upstream-2.34-257.patch new file mode 100644 index 0000000..c9e1cd5 --- /dev/null +++ b/glibc-upstream-2.34-257.patch @@ -0,0 +1,51 @@ +commit 83ae8287c1c3009459ff29241b647ff61363b22c +Author: Noah Goldstein +Date: Tue Feb 15 08:18:15 2022 -0600 + + x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] + + Re-cherry-pick commit c627209832 for strcmp-avx2.S change which was + omitted in intial cherry pick because at the time this bug was not + present on release branch. + + Fixes BZ #29127. + + In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would + call strcmp-avx2 and wcscmp-avx2 respectively. This would have + not checks around vzeroupper and would trigger spurious + aborts. This commit fixes that. + + test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on + AVX2 machines with and without RTM. + + Co-authored-by: H.J. Lu + (cherry picked from commit c6272098323153db373f2986c67786ea8c85f1cf) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index aa91f6e48a0e1ce5..a9806daadbbfd18b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). */ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) diff --git a/glibc-upstream-2.34-258.patch b/glibc-upstream-2.34-258.patch new file mode 100644 index 0000000..1f04c21 --- /dev/null +++ b/glibc-upstream-2.34-258.patch @@ -0,0 +1,737 @@ +commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31 +Author: Arjun Shankar +Date: Tue May 24 17:57:36 2022 +0200 + + Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose + + In multi-threaded programs, registering via pthread_atfork, + de-registering implicitly via dlclose, or running pthread_atfork + handlers during fork was protected by an internal lock. This meant + that a pthread_atfork handler attempting to register another handler or + dlclose a dynamically loaded library would lead to a deadlock. + + This commit fixes the deadlock in the following way: + + During the execution of handlers at fork time, the atfork lock is + released prior to the execution of each handler and taken again upon its + return. Any handler registrations or de-registrations that occurred + during the execution of the handler are accounted for before proceeding + with further handler execution. + + If a handler that hasn't been executed yet gets de-registered by another + handler during fork, it will not be executed. If a handler gets + registered by another handler during fork, it will not be executed + during that particular fork. + + The possibility that handlers may now be registered or deregistered + during handler execution means that identifying the next handler to be + run after a given handler may register/de-register others requires some + bookkeeping. The fork_handler struct has an additional field, 'id', + which is assigned sequentially during registration. Thus, handlers are + executed in ascending order of 'id' during 'prepare', and descending + order of 'id' during parent/child handler execution after the fork. + + Two tests are included: + + * tst-atfork3: Adhemerval Zanella + This test exercises calling dlclose from prepare, parent, and child + handlers. + + * tst-atfork4: This test exercises calling pthread_atfork and dlclose + from the prepare handler. + + [BZ #24595, BZ #27054] + + Co-authored-by: Adhemerval Zanella + Reviewed-by: Adhemerval Zanella + (cherry picked from commit 52a103e237329b9f88a28513fe7506ffc3bd8ced) + +diff --git a/include/register-atfork.h b/include/register-atfork.h +index fadde14700947ac6..6d7bfd87688d6530 100644 +--- a/include/register-atfork.h ++++ b/include/register-atfork.h +@@ -26,6 +26,7 @@ struct fork_handler + void (*parent_handler) (void); + void (*child_handler) (void); + void *dso_handle; ++ uint64_t id; + }; + + /* Function to call to unregister fork handlers. */ +@@ -39,19 +40,18 @@ enum __run_fork_handler_type + atfork_run_parent + }; + +-/* Run the atfork handlers and lock/unlock the internal lock depending +- of the WHO argument: +- +- - atfork_run_prepare: run all the PREPARE_HANDLER in reverse order of +- insertion and locks the internal lock. +- - atfork_run_child: run all the CHILD_HANDLER and unlocks the internal +- lock. +- - atfork_run_parent: run all the PARENT_HANDLER and unlocks the internal +- lock. +- +- Perform locking only if DO_LOCKING. */ +-extern void __run_fork_handlers (enum __run_fork_handler_type who, +- _Bool do_locking) attribute_hidden; ++/* Run the atfork prepare handlers in the reverse order of registration and ++ return the ID of the last registered handler. If DO_LOCKING is true, the ++ internal lock is held locked upon return. */ ++extern uint64_t __run_prefork_handlers (_Bool do_locking) attribute_hidden; ++ ++/* Given a handler type (parent or child), run all the atfork handlers in ++ the order of registration up to and including the handler with id equal ++ to LASTRUN. If DO_LOCKING is true, the internal lock is unlocked prior ++ to return. */ ++extern void __run_postfork_handlers (enum __run_fork_handler_type who, ++ _Bool do_locking, ++ uint64_t lastrun) attribute_hidden; + + /* C library side function to register new fork handlers. */ + extern int __register_atfork (void (*__prepare) (void), +diff --git a/posix/fork.c b/posix/fork.c +index 021691b9b7441f15..890b806eb48cb75a 100644 +--- a/posix/fork.c ++++ b/posix/fork.c +@@ -46,8 +46,9 @@ __libc_fork (void) + best effort to make is async-signal-safe at least for single-thread + case. */ + bool multiple_threads = __libc_single_threaded == 0; ++ uint64_t lastrun; + +- __run_fork_handlers (atfork_run_prepare, multiple_threads); ++ lastrun = __run_prefork_handlers (multiple_threads); + + struct nss_database_data nss_database_data; + +@@ -105,7 +106,7 @@ __libc_fork (void) + reclaim_stacks (); + + /* Run the handlers registered for the child. */ +- __run_fork_handlers (atfork_run_child, multiple_threads); ++ __run_postfork_handlers (atfork_run_child, multiple_threads, lastrun); + } + else + { +@@ -123,7 +124,7 @@ __libc_fork (void) + } + + /* Run the handlers registered for the parent. */ +- __run_fork_handlers (atfork_run_parent, multiple_threads); ++ __run_postfork_handlers (atfork_run_parent, multiple_threads, lastrun); + + if (pid < 0) + __set_errno (save_errno); +diff --git a/posix/register-atfork.c b/posix/register-atfork.c +index 6fd9e4c56aafd7cc..6370437aa68e039e 100644 +--- a/posix/register-atfork.c ++++ b/posix/register-atfork.c +@@ -19,6 +19,8 @@ + #include + #include + #include ++#include ++#include + + #define DYNARRAY_ELEMENT struct fork_handler + #define DYNARRAY_STRUCT fork_handler_list +@@ -27,7 +29,7 @@ + #include + + static struct fork_handler_list fork_handlers; +-static bool fork_handler_init = false; ++static uint64_t fork_handler_counter; + + static int atfork_lock = LLL_LOCK_INITIALIZER; + +@@ -37,11 +39,8 @@ __register_atfork (void (*prepare) (void), void (*parent) (void), + { + lll_lock (atfork_lock, LLL_PRIVATE); + +- if (!fork_handler_init) +- { +- fork_handler_list_init (&fork_handlers); +- fork_handler_init = true; +- } ++ if (fork_handler_counter == 0) ++ fork_handler_list_init (&fork_handlers); + + struct fork_handler *newp = fork_handler_list_emplace (&fork_handlers); + if (newp != NULL) +@@ -50,6 +49,13 @@ __register_atfork (void (*prepare) (void), void (*parent) (void), + newp->parent_handler = parent; + newp->child_handler = child; + newp->dso_handle = dso_handle; ++ ++ /* IDs assigned to handlers start at 1 and increment with handler ++ registration. Un-registering a handlers discards the corresponding ++ ID. It is not reused in future registrations. */ ++ if (INT_ADD_OVERFLOW (fork_handler_counter, 1)) ++ __libc_fatal ("fork handler counter overflow"); ++ newp->id = ++fork_handler_counter; + } + + /* Release the lock. */ +@@ -104,37 +110,111 @@ __unregister_atfork (void *dso_handle) + lll_unlock (atfork_lock, LLL_PRIVATE); + } + +-void +-__run_fork_handlers (enum __run_fork_handler_type who, _Bool do_locking) ++uint64_t ++__run_prefork_handlers (_Bool do_locking) + { +- struct fork_handler *runp; ++ uint64_t lastrun; + +- if (who == atfork_run_prepare) ++ if (do_locking) ++ lll_lock (atfork_lock, LLL_PRIVATE); ++ ++ /* We run prepare handlers from last to first. After fork, only ++ handlers up to the last handler found here (pre-fork) will be run. ++ Handlers registered during __run_prefork_handlers or ++ __run_postfork_handlers will be positioned after this last handler, and ++ since their prepare handlers won't be run now, their parent/child ++ handlers should also be ignored. */ ++ lastrun = fork_handler_counter; ++ ++ size_t sl = fork_handler_list_size (&fork_handlers); ++ for (size_t i = sl; i > 0;) + { +- if (do_locking) +- lll_lock (atfork_lock, LLL_PRIVATE); +- size_t sl = fork_handler_list_size (&fork_handlers); +- for (size_t i = sl; i > 0; i--) +- { +- runp = fork_handler_list_at (&fork_handlers, i - 1); +- if (runp->prepare_handler != NULL) +- runp->prepare_handler (); +- } ++ struct fork_handler *runp ++ = fork_handler_list_at (&fork_handlers, i - 1); ++ ++ uint64_t id = runp->id; ++ ++ if (runp->prepare_handler != NULL) ++ { ++ if (do_locking) ++ lll_unlock (atfork_lock, LLL_PRIVATE); ++ ++ runp->prepare_handler (); ++ ++ if (do_locking) ++ lll_lock (atfork_lock, LLL_PRIVATE); ++ } ++ ++ /* We unlocked, ran the handler, and locked again. In the ++ meanwhile, one or more deregistrations could have occurred leading ++ to the current (just run) handler being moved up the list or even ++ removed from the list itself. Since handler IDs are guaranteed to ++ to be in increasing order, the next handler has to have: */ ++ ++ /* A. An earlier position than the current one has. */ ++ i--; ++ ++ /* B. A lower ID than the current one does. The code below skips ++ any newly added handlers with higher IDs. */ ++ while (i > 0 ++ && fork_handler_list_at (&fork_handlers, i - 1)->id >= id) ++ i--; + } +- else ++ ++ return lastrun; ++} ++ ++void ++__run_postfork_handlers (enum __run_fork_handler_type who, _Bool do_locking, ++ uint64_t lastrun) ++{ ++ size_t sl = fork_handler_list_size (&fork_handlers); ++ for (size_t i = 0; i < sl;) + { +- size_t sl = fork_handler_list_size (&fork_handlers); +- for (size_t i = 0; i < sl; i++) +- { +- runp = fork_handler_list_at (&fork_handlers, i); +- if (who == atfork_run_child && runp->child_handler) +- runp->child_handler (); +- else if (who == atfork_run_parent && runp->parent_handler) +- runp->parent_handler (); +- } ++ struct fork_handler *runp = fork_handler_list_at (&fork_handlers, i); ++ uint64_t id = runp->id; ++ ++ /* prepare handlers were not run for handlers with ID > LASTRUN. ++ Thus, parent/child handlers will also not be run. */ ++ if (id > lastrun) ++ break; ++ + if (do_locking) +- lll_unlock (atfork_lock, LLL_PRIVATE); ++ lll_unlock (atfork_lock, LLL_PRIVATE); ++ ++ if (who == atfork_run_child && runp->child_handler) ++ runp->child_handler (); ++ else if (who == atfork_run_parent && runp->parent_handler) ++ runp->parent_handler (); ++ ++ if (do_locking) ++ lll_lock (atfork_lock, LLL_PRIVATE); ++ ++ /* We unlocked, ran the handler, and locked again. In the meanwhile, ++ one or more [de]registrations could have occurred. Due to this, ++ the list size must be updated. */ ++ sl = fork_handler_list_size (&fork_handlers); ++ ++ /* The just-run handler could also have moved up the list. */ ++ ++ if (sl > i && fork_handler_list_at (&fork_handlers, i)->id == id) ++ /* The position of the recently run handler hasn't changed. The ++ next handler to be run is an easy increment away. */ ++ i++; ++ else ++ { ++ /* The next handler to be run is the first handler in the list ++ to have an ID higher than the current one. */ ++ for (i = 0; i < sl; i++) ++ { ++ if (fork_handler_list_at (&fork_handlers, i)->id > id) ++ break; ++ } ++ } + } ++ ++ if (do_locking) ++ lll_unlock (atfork_lock, LLL_PRIVATE); + } + + +diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile +index 00419c4d199df912..5147588c130c9415 100644 +--- a/sysdeps/pthread/Makefile ++++ b/sysdeps/pthread/Makefile +@@ -154,16 +154,36 @@ tests += tst-cancelx2 tst-cancelx3 tst-cancelx6 tst-cancelx8 tst-cancelx9 \ + tst-cleanupx0 tst-cleanupx1 tst-cleanupx2 tst-cleanupx3 + + ifeq ($(build-shared),yes) +-tests += tst-atfork2 tst-pt-tls4 tst-_res1 tst-fini1 tst-create1 ++tests += \ ++ tst-atfork2 \ ++ tst-pt-tls4 \ ++ tst-_res1 \ ++ tst-fini1 \ ++ tst-create1 \ ++ tst-atfork3 \ ++ tst-atfork4 \ ++# tests ++ + tests-nolibpthread += tst-fini1 + endif + +-modules-names += tst-atfork2mod tst-tls4moda tst-tls4modb \ +- tst-_res1mod1 tst-_res1mod2 tst-fini1mod \ +- tst-create1mod ++modules-names += \ ++ tst-atfork2mod \ ++ tst-tls4moda \ ++ tst-tls4modb \ ++ tst-_res1mod1 \ ++ tst-_res1mod2 \ ++ tst-fini1mod \ ++ tst-create1mod \ ++ tst-atfork3mod \ ++ tst-atfork4mod \ ++# module-names ++ + test-modules = $(addprefix $(objpfx),$(addsuffix .so,$(modules-names))) + + tst-atfork2mod.so-no-z-defs = yes ++tst-atfork3mod.so-no-z-defs = yes ++tst-atfork4mod.so-no-z-defs = yes + tst-create1mod.so-no-z-defs = yes + + ifeq ($(build-shared),yes) +@@ -226,8 +246,18 @@ tst-atfork2-ENV = MALLOC_TRACE=$(objpfx)tst-atfork2.mtrace \ + LD_PRELOAD=$(common-objpfx)/malloc/libc_malloc_debug.so + $(objpfx)tst-atfork2mod.so: $(shared-thread-library) + ++$(objpfx)tst-atfork3: $(shared-thread-library) ++LDFLAGS-tst-atfork3 = -rdynamic ++$(objpfx)tst-atfork3mod.so: $(shared-thread-library) ++ ++$(objpfx)tst-atfork4: $(shared-thread-library) ++LDFLAGS-tst-atfork4 = -rdynamic ++$(objpfx)tst-atfork4mod.so: $(shared-thread-library) ++ + ifeq ($(build-shared),yes) + $(objpfx)tst-atfork2.out: $(objpfx)tst-atfork2mod.so ++$(objpfx)tst-atfork3.out: $(objpfx)tst-atfork3mod.so ++$(objpfx)tst-atfork4.out: $(objpfx)tst-atfork4mod.so + endif + + ifeq ($(build-shared),yes) +diff --git a/sysdeps/pthread/tst-atfork3.c b/sysdeps/pthread/tst-atfork3.c +new file mode 100644 +index 0000000000000000..bb2250e432ab79ad +--- /dev/null ++++ b/sysdeps/pthread/tst-atfork3.c +@@ -0,0 +1,118 @@ ++/* Check if pthread_atfork handler can call dlclose (BZ#24595). ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++ ++/* Check if pthread_atfork handlers do not deadlock when calling a function ++ that might alter the internal fork handle list, such as dlclose. ++ ++ The test registers a callback set with pthread_atfork(), dlopen() a shared ++ library (nptl/tst-atfork3mod.c), calls an exported symbol from the library ++ (which in turn also registers atfork handlers), and calls fork to trigger ++ the callbacks. */ ++ ++static void *handler; ++static bool run_dlclose_prepare; ++static bool run_dlclose_parent; ++static bool run_dlclose_child; ++ ++static void ++prepare (void) ++{ ++ if (run_dlclose_prepare) ++ xdlclose (handler); ++} ++ ++static void ++parent (void) ++{ ++ if (run_dlclose_parent) ++ xdlclose (handler); ++} ++ ++static void ++child (void) ++{ ++ if (run_dlclose_child) ++ xdlclose (handler); ++} ++ ++static void ++proc_func (void *closure) ++{ ++} ++ ++static void ++do_test_generic (bool dlclose_prepare, bool dlclose_parent, bool dlclose_child) ++{ ++ run_dlclose_prepare = dlclose_prepare; ++ run_dlclose_parent = dlclose_parent; ++ run_dlclose_child = dlclose_child; ++ ++ handler = xdlopen ("tst-atfork3mod.so", RTLD_NOW); ++ ++ int (*atfork3mod_func)(void); ++ atfork3mod_func = xdlsym (handler, "atfork3mod_func"); ++ ++ atfork3mod_func (); ++ ++ struct support_capture_subprocess proc ++ = support_capture_subprocess (proc_func, NULL); ++ support_capture_subprocess_check (&proc, "tst-atfork3", 0, sc_allow_none); ++ ++ handler = atfork3mod_func = NULL; ++ ++ support_capture_subprocess_free (&proc); ++} ++ ++static void * ++thread_func (void *closure) ++{ ++ return NULL; ++} ++ ++static int ++do_test (void) ++{ ++ { ++ /* Make the process acts as multithread. */ ++ pthread_attr_t attr; ++ xpthread_attr_init (&attr); ++ xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); ++ xpthread_create (&attr, thread_func, NULL); ++ } ++ ++ TEST_COMPARE (pthread_atfork (prepare, parent, child), 0); ++ ++ do_test_generic (true /* prepare */, false /* parent */, false /* child */); ++ do_test_generic (false /* prepare */, true /* parent */, false /* child */); ++ do_test_generic (false /* prepare */, false /* parent */, true /* child */); ++ ++ return 0; ++} ++ ++#include +diff --git a/sysdeps/pthread/tst-atfork3mod.c b/sysdeps/pthread/tst-atfork3mod.c +new file mode 100644 +index 0000000000000000..6d0658cb9efdecbc +--- /dev/null ++++ b/sysdeps/pthread/tst-atfork3mod.c +@@ -0,0 +1,44 @@ ++/* Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++ ++#include ++ ++static void ++mod_prepare (void) ++{ ++} ++ ++static void ++mod_parent (void) ++{ ++} ++ ++static void ++mod_child (void) ++{ ++} ++ ++int atfork3mod_func (void) ++{ ++ TEST_COMPARE (pthread_atfork (mod_prepare, mod_parent, mod_child), 0); ++ ++ return 0; ++} +diff --git a/sysdeps/pthread/tst-atfork4.c b/sysdeps/pthread/tst-atfork4.c +new file mode 100644 +index 0000000000000000..52dc87e73b846ab9 +--- /dev/null ++++ b/sysdeps/pthread/tst-atfork4.c +@@ -0,0 +1,128 @@ ++/* pthread_atfork supports handlers that call pthread_atfork or dlclose. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static void * ++thread_func (void *x) ++{ ++ return NULL; ++} ++ ++static unsigned int second_atfork_handler_runcount = 0; ++ ++static void ++second_atfork_handler (void) ++{ ++ second_atfork_handler_runcount++; ++} ++ ++static void *h = NULL; ++ ++static unsigned int atfork_handler_runcount = 0; ++ ++static void ++prepare (void) ++{ ++ /* These atfork handlers are registered while atfork handlers are being ++ executed and thus will not be executed during the corresponding ++ fork. */ ++ TEST_VERIFY_EXIT (pthread_atfork (second_atfork_handler, ++ second_atfork_handler, ++ second_atfork_handler) == 0); ++ ++ /* This will de-register the atfork handlers registered by the dlopen'd ++ library and so they will not be executed. */ ++ if (h != NULL) ++ { ++ xdlclose (h); ++ h = NULL; ++ } ++ ++ atfork_handler_runcount++; ++} ++ ++static void ++after (void) ++{ ++ atfork_handler_runcount++; ++} ++ ++static int ++do_test (void) ++{ ++ /* Make sure __libc_single_threaded is 0. */ ++ pthread_attr_t attr; ++ xpthread_attr_init (&attr); ++ xpthread_attr_setdetachstate (&attr, PTHREAD_CREATE_DETACHED); ++ xpthread_create (&attr, thread_func, NULL); ++ ++ void (*reg_atfork_handlers) (void); ++ ++ h = xdlopen ("tst-atfork4mod.so", RTLD_LAZY); ++ ++ reg_atfork_handlers = xdlsym (h, "reg_atfork_handlers"); ++ ++ reg_atfork_handlers (); ++ ++ /* We register our atfork handlers *after* loading the module so that our ++ prepare handler is called first at fork, where we then dlclose the ++ module before its prepare handler has a chance to be called. */ ++ TEST_VERIFY_EXIT (pthread_atfork (prepare, after, after) == 0); ++ ++ pid_t pid = xfork (); ++ ++ /* Both the parent and the child processes should observe this. */ ++ TEST_VERIFY_EXIT (atfork_handler_runcount == 2); ++ TEST_VERIFY_EXIT (second_atfork_handler_runcount == 0); ++ ++ if (pid > 0) ++ { ++ int childstat; ++ ++ xwaitpid (-1, &childstat, 0); ++ TEST_VERIFY_EXIT (WIFEXITED (childstat) ++ && WEXITSTATUS (childstat) == 0); ++ ++ /* This time, the second set of atfork handlers should also be called ++ since the handlers are already in place before fork is called. */ ++ ++ pid = xfork (); ++ ++ TEST_VERIFY_EXIT (atfork_handler_runcount == 4); ++ TEST_VERIFY_EXIT (second_atfork_handler_runcount == 2); ++ ++ if (pid > 0) ++ { ++ xwaitpid (-1, &childstat, 0); ++ TEST_VERIFY_EXIT (WIFEXITED (childstat) ++ && WEXITSTATUS (childstat) == 0); ++ } ++ } ++ ++ return 0; ++} ++ ++#include +diff --git a/sysdeps/pthread/tst-atfork4mod.c b/sysdeps/pthread/tst-atfork4mod.c +new file mode 100644 +index 0000000000000000..e111efeb185916e0 +--- /dev/null ++++ b/sysdeps/pthread/tst-atfork4mod.c +@@ -0,0 +1,48 @@ ++/* pthread_atfork supports handlers that call pthread_atfork or dlclose. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++#include ++ ++/* This dynamically loaded library simply registers its atfork handlers when ++ asked to. The atfork handlers should never be executed because the ++ library is unloaded before fork is called by the test program. */ ++ ++static void ++prepare (void) ++{ ++ abort (); ++} ++ ++static void ++parent (void) ++{ ++ abort (); ++} ++ ++static void ++child (void) ++{ ++ abort (); ++} ++ ++void ++reg_atfork_handlers (void) ++{ ++ pthread_atfork (prepare, parent, child); ++} diff --git a/glibc.spec b/glibc.spec index 61f2ecc..7911d80 100644 --- a/glibc.spec +++ b/glibc.spec @@ -148,7 +148,7 @@ end \ Summary: The GNU libc libraries Name: glibc Version: %{glibcversion} -Release: 32%{?dist} +Release: 35%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for # libraries. @@ -461,6 +461,74 @@ Patch253: glibc-upstream-2.34-187.patch Patch254: glibc-upstream-2.34-188.patch Patch255: glibc-upstream-2.34-189.patch Patch256: glibc-upstream-2.34-190.patch +Patch257: glibc-upstream-2.34-191.patch +Patch258: glibc-upstream-2.34-192.patch +Patch259: glibc-upstream-2.34-193.patch +Patch260: glibc-upstream-2.34-194.patch +Patch261: glibc-upstream-2.34-195.patch +Patch262: glibc-upstream-2.34-196.patch +Patch263: glibc-upstream-2.34-197.patch +Patch264: glibc-upstream-2.34-198.patch +Patch265: glibc-upstream-2.34-199.patch +Patch266: glibc-upstream-2.34-200.patch +Patch267: glibc-upstream-2.34-201.patch +Patch268: glibc-upstream-2.34-202.patch +Patch269: glibc-upstream-2.34-203.patch +Patch270: glibc-upstream-2.34-204.patch +Patch271: glibc-upstream-2.34-205.patch +Patch272: glibc-upstream-2.34-206.patch +Patch273: glibc-upstream-2.34-207.patch +Patch274: glibc-upstream-2.34-208.patch +Patch275: glibc-upstream-2.34-209.patch +Patch276: glibc-upstream-2.34-210.patch +Patch277: glibc-upstream-2.34-211.patch +Patch278: glibc-upstream-2.34-212.patch +Patch279: glibc-upstream-2.34-213.patch +Patch280: glibc-upstream-2.34-214.patch +Patch281: glibc-upstream-2.34-215.patch +Patch282: glibc-upstream-2.34-216.patch +Patch283: glibc-upstream-2.34-217.patch +Patch284: glibc-upstream-2.34-218.patch +Patch285: glibc-upstream-2.34-219.patch +Patch286: glibc-upstream-2.34-220.patch +Patch287: glibc-upstream-2.34-221.patch +Patch288: glibc-upstream-2.34-222.patch +Patch289: glibc-upstream-2.34-223.patch +Patch290: glibc-upstream-2.34-224.patch +Patch291: glibc-upstream-2.34-225.patch +Patch292: glibc-upstream-2.34-226.patch +Patch293: glibc-upstream-2.34-227.patch +Patch294: glibc-upstream-2.34-228.patch +Patch295: glibc-upstream-2.34-229.patch +Patch296: glibc-upstream-2.34-230.patch +Patch297: glibc-upstream-2.34-231.patch +Patch298: glibc-upstream-2.34-232.patch +Patch299: glibc-upstream-2.34-233.patch +Patch300: glibc-upstream-2.34-234.patch +Patch301: glibc-upstream-2.34-235.patch +Patch302: glibc-upstream-2.34-236.patch +Patch303: glibc-upstream-2.34-237.patch +Patch304: glibc-upstream-2.34-238.patch +Patch305: glibc-upstream-2.34-239.patch +Patch306: glibc-upstream-2.34-240.patch +Patch307: glibc-upstream-2.34-241.patch +Patch308: glibc-upstream-2.34-242.patch +Patch309: glibc-upstream-2.34-243.patch +Patch310: glibc-upstream-2.34-244.patch +Patch311: glibc-upstream-2.34-245.patch +Patch312: glibc-upstream-2.34-246.patch +Patch313: glibc-upstream-2.34-247.patch +Patch314: glibc-upstream-2.34-248.patch +Patch315: glibc-upstream-2.34-249.patch +Patch316: glibc-upstream-2.34-250.patch +Patch317: glibc-upstream-2.34-251.patch +Patch318: glibc-upstream-2.34-252.patch +Patch319: glibc-upstream-2.34-253.patch +Patch320: glibc-upstream-2.34-254.patch +Patch321: glibc-upstream-2.34-255.patch +Patch322: glibc-upstream-2.34-256.patch +Patch323: glibc-upstream-2.34-257.patch +Patch324: glibc-upstream-2.34-258.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2517,6 +2585,86 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Tue May 31 2022 Arjun Shankar - 2.34-35 +- Sync with upstream branch release/2.34/master, + commit ff450cdbdee0b8cb6b9d653d6d2fa892de29be31: +- Fix deadlock when pthread_atfork handler calls pthread_atfork or dlclose +- x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ #29127] +- string.h: fix __fortified_attr_access macro call [BZ #29162] +- linux: Add a getauxval test [BZ #23293] +- rtld: Use generic argv adjustment in ld.so [BZ #23293] +- S390: Enable static PIE + +* Thu May 19 2022 Florian Weimer - 2.34-34 +- Sync with upstream branch release/2.34/master, + commit ede8d94d154157d269b18f3601440ac576c1f96a: +- csu: Implement and use _dl_early_allocate during static startup +- Linux: Introduce __brk_call for invoking the brk system call +- Linux: Implement a useful version of _startup_fatal +- ia64: Always define IA64_USE_NEW_STUB as a flag macro +- Linux: Define MMAP_CALL_INTERNAL +- i386: Honor I386_USE_SYSENTER for 6-argument Linux system calls +- i386: Remove OPTIMIZE_FOR_GCC_5 from Linux libc-do-syscall.S +- elf: Remove __libc_init_secure +- Linux: Consolidate auxiliary vector parsing (redo) +- Linux: Include in dl-sysdep.c only for SHARED +- Revert "Linux: Consolidate auxiliary vector parsing" +- Linux: Consolidate auxiliary vector parsing +- Linux: Assume that NEED_DL_SYSINFO_DSO is always defined +- Linux: Remove DL_FIND_ARG_COMPONENTS +- Linux: Remove HAVE_AUX_SECURE, HAVE_AUX_XID, HAVE_AUX_PAGESIZE +- elf: Merge dl-sysdep.c into the Linux version +- elf: Remove unused NEED_DL_BASE_ADDR and _dl_base_addr +- x86: Optimize {str|wcs}rchr-evex +- x86: Optimize {str|wcs}rchr-avx2 +- x86: Optimize {str|wcs}rchr-sse2 +- x86: Cleanup page cross code in memcmp-avx2-movbe.S +- x86: Remove memcmp-sse4.S +- x86: Small improvements for wcslen +- x86: Remove AVX str{n}casecmp +- x86: Add EVEX optimized str{n}casecmp +- x86: Add AVX2 optimized str{n}casecmp +- x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S +- x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S +- x86: Remove strspn-sse2.S and use the generic implementation +- x86: Remove strpbrk-sse2.S and use the generic implementation +- x86: Remove strcspn-sse2.S and use the generic implementation +- x86: Optimize strspn in strspn-c.c +- x86: Optimize strcspn and strpbrk in strcspn-c.c +- x86: Code cleanup in strchr-evex and comment justifying branch +- x86: Code cleanup in strchr-avx2 and comment justifying branch +- x86_64: Remove bcopy optimizations +- x86-64: Remove bzero weak alias in SS2 memset +- x86_64/multiarch: Sort sysdep_routines and put one entry per line +- x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) +- fortify: Ensure that __glibc_fortify condition is a constant [BZ #29141] + +* Thu May 12 2022 Florian Weimer - 2.34-33 +- Sync with upstream branch release/2.34/master, + commit 91c2e6c3db44297bf4cb3a2e3c40236c5b6a0b23: +- dlfcn: Implement the RTLD_DI_PHDR request type for dlinfo +- manual: Document the dlinfo function +- x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ #28896] +- x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] +- x86: Set .text section in memset-vec-unaligned-erms +- x86-64: Optimize bzero +- x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 Only) +- x86: Improve vec generation in memset-vec-unaligned-erms.S +- x86-64: Fix strcmp-evex.S +- x86-64: Fix strcmp-avx2.S +- x86: Optimize strcmp-evex.S +- x86: Optimize strcmp-avx2.S +- manual: Clarify that abbreviations of long options are allowed +- Add HWCAP2_AFP, HWCAP2_RPRES from Linux 5.17 to AArch64 bits/hwcap.h +- aarch64: Add HWCAP2_ECV from Linux 5.16 +- Add SOL_MPTCP, SOL_MCTP from Linux 5.16 to bits/socket.h +- Update kernel version to 5.17 in tst-mman-consts.py +- Update kernel version to 5.16 in tst-mman-consts.py +- Update syscall lists for Linux 5.17 +- Add ARPHRD_CAN, ARPHRD_MCTP to net/if_arp.h +- Update kernel version to 5.15 in tst-mman-consts.py +- Add PF_MCTP, AF_MCTP from Linux 5.15 to bits/socket.h + * Thu Apr 28 2022 Carlos O'Donell - 2.34-32 - Sync with upstream branch release/2.34/master, commit c66c92181ddbd82306537a608e8c0282587131de: