From 02cfe04e361ac9ecd2663c077179617908a2a2a2 Mon Sep 17 00:00:00 2001 From: DJ Delorie Date: Thu, 14 Dec 2023 17:33:45 -0500 Subject: [PATCH] Import Intel hyperscale improvements (RHEL-15696) Resolves: RHEL-15696 Includes two additional (well, 1.5) upstream patches to resolve roundeven redirects. --- glibc-RHEL-15696-1.patch | 259 +++ glibc-RHEL-15696-10.patch | 41 + glibc-RHEL-15696-100.patch | 257 +++ glibc-RHEL-15696-101.patch | 964 ++++++++++ glibc-RHEL-15696-102.patch | 263 +++ glibc-RHEL-15696-103.patch | 876 +++++++++ glibc-RHEL-15696-104.patch | 501 ++++++ glibc-RHEL-15696-105.patch | 558 ++++++ glibc-RHEL-15696-106.patch | 73 + glibc-RHEL-15696-107.patch | 226 +++ glibc-RHEL-15696-108.patch | 55 + glibc-RHEL-15696-109.patch | 60 + glibc-RHEL-15696-11.patch | 74 + glibc-RHEL-15696-110.patch | 26 + glibc-RHEL-15696-12.patch | 3410 ++++++++++++++++++++++++++++++++++++ glibc-RHEL-15696-13.patch | 1488 ++++++++++++++++ glibc-RHEL-15696-14.patch | 242 +++ glibc-RHEL-15696-15.patch | 254 +++ glibc-RHEL-15696-16.patch | 561 ++++++ glibc-RHEL-15696-17.patch | 2568 +++++++++++++++++++++++++++ glibc-RHEL-15696-18.patch | 735 ++++++++ glibc-RHEL-15696-19.patch | 148 ++ glibc-RHEL-15696-2.patch | 230 +++ glibc-RHEL-15696-20.patch | 164 ++ glibc-RHEL-15696-21.patch | 71 + glibc-RHEL-15696-22.patch | 51 + glibc-RHEL-15696-23.patch | 584 ++++++ glibc-RHEL-15696-24.patch | 388 ++++ glibc-RHEL-15696-25.patch | 767 ++++++++ glibc-RHEL-15696-26.patch | 701 ++++++++ glibc-RHEL-15696-27.patch | 30 + glibc-RHEL-15696-28.patch | 566 ++++++ glibc-RHEL-15696-29.patch | 181 ++ glibc-RHEL-15696-3.patch | 396 +++++ glibc-RHEL-15696-30.patch | 497 ++++++ glibc-RHEL-15696-31.patch | 745 ++++++++ glibc-RHEL-15696-32.patch | 158 ++ glibc-RHEL-15696-33.patch | 51 + glibc-RHEL-15696-34.patch | 135 ++ glibc-RHEL-15696-35.patch | 51 + glibc-RHEL-15696-36.patch | 44 + glibc-RHEL-15696-37.patch | 359 ++++ glibc-RHEL-15696-38.patch | 67 + glibc-RHEL-15696-39.patch | 449 +++++ 
glibc-RHEL-15696-4.patch | 151 ++ glibc-RHEL-15696-40.patch | 92 + glibc-RHEL-15696-41.patch | 265 +++ glibc-RHEL-15696-42.patch | 396 +++++ glibc-RHEL-15696-43.patch | 532 ++++++ glibc-RHEL-15696-44.patch | 536 ++++++ glibc-RHEL-15696-45.patch | 873 +++++++++ glibc-RHEL-15696-46.patch | 851 +++++++++ glibc-RHEL-15696-47.patch | 104 ++ glibc-RHEL-15696-48.patch | 84 + glibc-RHEL-15696-49.patch | 55 + glibc-RHEL-15696-5.patch | 290 +++ glibc-RHEL-15696-50.patch | 43 + glibc-RHEL-15696-51.patch | 118 ++ glibc-RHEL-15696-52.patch | 242 +++ glibc-RHEL-15696-53.patch | 41 + glibc-RHEL-15696-54.patch | 268 +++ glibc-RHEL-15696-55.patch | 48 + glibc-RHEL-15696-56.patch | 658 +++++++ glibc-RHEL-15696-57.patch | 510 ++++++ glibc-RHEL-15696-58.patch | 45 + glibc-RHEL-15696-59.patch | 695 ++++++++ glibc-RHEL-15696-6.patch | 300 ++++ glibc-RHEL-15696-60.patch | 54 + glibc-RHEL-15696-61.patch | 56 + glibc-RHEL-15696-62.patch | 136 ++ glibc-RHEL-15696-63.patch | 2428 +++++++++++++++++++++++++ glibc-RHEL-15696-64.patch | 39 + glibc-RHEL-15696-65.patch | 39 + glibc-RHEL-15696-66.patch | 51 + glibc-RHEL-15696-67.patch | 71 + glibc-RHEL-15696-68.patch | 60 + glibc-RHEL-15696-69.patch | 35 + glibc-RHEL-15696-7.patch | 153 ++ glibc-RHEL-15696-70.patch | 389 ++++ glibc-RHEL-15696-71.patch | 43 + glibc-RHEL-15696-72.patch | 146 ++ glibc-RHEL-15696-73.patch | 37 + glibc-RHEL-15696-74.patch | 1798 +++++++++++++++++++ glibc-RHEL-15696-75.patch | 1992 +++++++++++++++++++++ glibc-RHEL-15696-76.patch | 33 + glibc-RHEL-15696-77.patch | 33 + glibc-RHEL-15696-78.patch | 459 +++++ glibc-RHEL-15696-79.patch | 40 + glibc-RHEL-15696-8.patch | 218 +++ glibc-RHEL-15696-80.patch | 753 ++++++++ glibc-RHEL-15696-81.patch | 33 + glibc-RHEL-15696-82.patch | 90 + glibc-RHEL-15696-83.patch | 77 + glibc-RHEL-15696-84.patch | 27 + glibc-RHEL-15696-85.patch | 108 ++ glibc-RHEL-15696-86.patch | 36 + glibc-RHEL-15696-87.patch | 29 + glibc-RHEL-15696-88.patch | 372 ++++ glibc-RHEL-15696-89.patch | 343 ++++ 
glibc-RHEL-15696-9.patch | 206 +++ glibc-RHEL-15696-90.patch | 147 ++ glibc-RHEL-15696-91.patch | 147 ++ glibc-RHEL-15696-92.patch | 175 ++ glibc-RHEL-15696-93.patch | 55 + glibc-RHEL-15696-94.patch | 168 ++ glibc-RHEL-15696-95.patch | 122 ++ glibc-RHEL-15696-96.patch | 143 ++ glibc-RHEL-15696-97.patch | 759 ++++++++ glibc-RHEL-15696-98.patch | 814 +++++++++ glibc-RHEL-15696-99.patch | 913 ++++++++++ glibc.spec | 115 +- 111 files changed, 41462 insertions(+), 1 deletion(-) create mode 100644 glibc-RHEL-15696-1.patch create mode 100644 glibc-RHEL-15696-10.patch create mode 100644 glibc-RHEL-15696-100.patch create mode 100644 glibc-RHEL-15696-101.patch create mode 100644 glibc-RHEL-15696-102.patch create mode 100644 glibc-RHEL-15696-103.patch create mode 100644 glibc-RHEL-15696-104.patch create mode 100644 glibc-RHEL-15696-105.patch create mode 100644 glibc-RHEL-15696-106.patch create mode 100644 glibc-RHEL-15696-107.patch create mode 100644 glibc-RHEL-15696-108.patch create mode 100644 glibc-RHEL-15696-109.patch create mode 100644 glibc-RHEL-15696-11.patch create mode 100644 glibc-RHEL-15696-110.patch create mode 100644 glibc-RHEL-15696-12.patch create mode 100644 glibc-RHEL-15696-13.patch create mode 100644 glibc-RHEL-15696-14.patch create mode 100644 glibc-RHEL-15696-15.patch create mode 100644 glibc-RHEL-15696-16.patch create mode 100644 glibc-RHEL-15696-17.patch create mode 100644 glibc-RHEL-15696-18.patch create mode 100644 glibc-RHEL-15696-19.patch create mode 100644 glibc-RHEL-15696-2.patch create mode 100644 glibc-RHEL-15696-20.patch create mode 100644 glibc-RHEL-15696-21.patch create mode 100644 glibc-RHEL-15696-22.patch create mode 100644 glibc-RHEL-15696-23.patch create mode 100644 glibc-RHEL-15696-24.patch create mode 100644 glibc-RHEL-15696-25.patch create mode 100644 glibc-RHEL-15696-26.patch create mode 100644 glibc-RHEL-15696-27.patch create mode 100644 glibc-RHEL-15696-28.patch create mode 100644 glibc-RHEL-15696-29.patch create mode 100644 
glibc-RHEL-15696-3.patch create mode 100644 glibc-RHEL-15696-30.patch create mode 100644 glibc-RHEL-15696-31.patch create mode 100644 glibc-RHEL-15696-32.patch create mode 100644 glibc-RHEL-15696-33.patch create mode 100644 glibc-RHEL-15696-34.patch create mode 100644 glibc-RHEL-15696-35.patch create mode 100644 glibc-RHEL-15696-36.patch create mode 100644 glibc-RHEL-15696-37.patch create mode 100644 glibc-RHEL-15696-38.patch create mode 100644 glibc-RHEL-15696-39.patch create mode 100644 glibc-RHEL-15696-4.patch create mode 100644 glibc-RHEL-15696-40.patch create mode 100644 glibc-RHEL-15696-41.patch create mode 100644 glibc-RHEL-15696-42.patch create mode 100644 glibc-RHEL-15696-43.patch create mode 100644 glibc-RHEL-15696-44.patch create mode 100644 glibc-RHEL-15696-45.patch create mode 100644 glibc-RHEL-15696-46.patch create mode 100644 glibc-RHEL-15696-47.patch create mode 100644 glibc-RHEL-15696-48.patch create mode 100644 glibc-RHEL-15696-49.patch create mode 100644 glibc-RHEL-15696-5.patch create mode 100644 glibc-RHEL-15696-50.patch create mode 100644 glibc-RHEL-15696-51.patch create mode 100644 glibc-RHEL-15696-52.patch create mode 100644 glibc-RHEL-15696-53.patch create mode 100644 glibc-RHEL-15696-54.patch create mode 100644 glibc-RHEL-15696-55.patch create mode 100644 glibc-RHEL-15696-56.patch create mode 100644 glibc-RHEL-15696-57.patch create mode 100644 glibc-RHEL-15696-58.patch create mode 100644 glibc-RHEL-15696-59.patch create mode 100644 glibc-RHEL-15696-6.patch create mode 100644 glibc-RHEL-15696-60.patch create mode 100644 glibc-RHEL-15696-61.patch create mode 100644 glibc-RHEL-15696-62.patch create mode 100644 glibc-RHEL-15696-63.patch create mode 100644 glibc-RHEL-15696-64.patch create mode 100644 glibc-RHEL-15696-65.patch create mode 100644 glibc-RHEL-15696-66.patch create mode 100644 glibc-RHEL-15696-67.patch create mode 100644 glibc-RHEL-15696-68.patch create mode 100644 glibc-RHEL-15696-69.patch create mode 100644 
glibc-RHEL-15696-7.patch create mode 100644 glibc-RHEL-15696-70.patch create mode 100644 glibc-RHEL-15696-71.patch create mode 100644 glibc-RHEL-15696-72.patch create mode 100644 glibc-RHEL-15696-73.patch create mode 100644 glibc-RHEL-15696-74.patch create mode 100644 glibc-RHEL-15696-75.patch create mode 100644 glibc-RHEL-15696-76.patch create mode 100644 glibc-RHEL-15696-77.patch create mode 100644 glibc-RHEL-15696-78.patch create mode 100644 glibc-RHEL-15696-79.patch create mode 100644 glibc-RHEL-15696-8.patch create mode 100644 glibc-RHEL-15696-80.patch create mode 100644 glibc-RHEL-15696-81.patch create mode 100644 glibc-RHEL-15696-82.patch create mode 100644 glibc-RHEL-15696-83.patch create mode 100644 glibc-RHEL-15696-84.patch create mode 100644 glibc-RHEL-15696-85.patch create mode 100644 glibc-RHEL-15696-86.patch create mode 100644 glibc-RHEL-15696-87.patch create mode 100644 glibc-RHEL-15696-88.patch create mode 100644 glibc-RHEL-15696-89.patch create mode 100644 glibc-RHEL-15696-9.patch create mode 100644 glibc-RHEL-15696-90.patch create mode 100644 glibc-RHEL-15696-91.patch create mode 100644 glibc-RHEL-15696-92.patch create mode 100644 glibc-RHEL-15696-93.patch create mode 100644 glibc-RHEL-15696-94.patch create mode 100644 glibc-RHEL-15696-95.patch create mode 100644 glibc-RHEL-15696-96.patch create mode 100644 glibc-RHEL-15696-97.patch create mode 100644 glibc-RHEL-15696-98.patch create mode 100644 glibc-RHEL-15696-99.patch diff --git a/glibc-RHEL-15696-1.patch b/glibc-RHEL-15696-1.patch new file mode 100644 index 0000000..804de54 --- /dev/null +++ b/glibc-RHEL-15696-1.patch @@ -0,0 +1,259 @@ +From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Mon, 21 Jan 2019 11:23:59 -0800 +Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the + upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/test-size_t.h: New file. + * sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise. 
+--- + sysdeps/x86_64/memchr.S | 10 ++-- + sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++- + sysdeps/x86_64/x32/Makefile | 8 +++ + sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++ + sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++ + 6 files changed, 148 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/test-size_t.h + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c + +Conflicts: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index feef5d4f..cb320257 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -34,12 +34,16 @@ ENTRY(MEMCHR) + mov %edi, %ecx + + #ifdef USE_AS_WMEMCHR +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %rdx ++ shl $2, %RDX_LP + #else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif + punpcklbw %xmm1, %xmm1 +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(return_null) + punpcklbw %xmm1, %xmm1 + #endif +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index 5f5e7725..c81da19b 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -40,16 +40,20 @@ + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz L(null) + # endif + movl %edi, %ecx + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR +- shl $2, %rdx ++ shl $2, %RDX_LP + vpbroadcastd %xmm0, %ymm0 + # else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif + vpbroadcastb %xmm0, %ymm0 + # endif + /* Check if we may cross page boundary with one vector load. 
*/ +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index f2ebc24f..7d528889 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -4,3 +4,11 @@ ifeq ($(subdir),math) + # 64-bit llround. Add -fno-builtin-lround to silence the compiler. + CFLAGS-s_llround.c += -fno-builtin-lround + endif ++ ++ifeq ($(subdir),string) ++tests += tst-size_t-memchr ++endif ++ ++ifeq ($(subdir),wcsmbs) ++tests += tst-size_t-wmemchr ++endif +diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h +new file mode 100644 +index 00000000..78a94086 +--- /dev/null ++++ b/sysdeps/x86_64/x32/test-size_t.h +@@ -0,0 +1,35 @@ ++/* Test string/memory functions with size_t in the lower 32 bits of ++ 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#include ++ ++/* On x32, parameter_t may be passed in a 64-bit register with the LEN ++ field in the lower 32 bits. When the LEN field of 64-bit register ++ is passed to string/memory function as the size_t parameter, only ++ the lower 32 bits can be used. 
*/ ++typedef struct ++{ ++ union ++ { ++ size_t len; ++ void (*fn) (void); ++ }; ++ void *p; ++} parameter_t; +diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c +new file mode 100644 +index 00000000..29a3daf1 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c +@@ -0,0 +1,72 @@ ++/* Test memchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef WIDE ++# define TEST_NAME "memchr" ++#else ++# define TEST_NAME "wmemchr" ++#endif /* WIDE */ ++#include "test-size_t.h" ++ ++#ifndef WIDE ++# define MEMCHR memchr ++# define CHAR char ++# define UCHAR unsigned char ++#else ++# include ++# define MEMCHR wmemchr ++# define CHAR wchar_t ++# define UCHAR wchar_t ++#endif /* WIDE */ ++ ++IMPL (MEMCHR, 1) ++ ++typedef CHAR * (*proto_t) (const CHAR*, int, size_t); ++ ++static CHAR * ++__attribute__ ((noinline, noclone)) ++do_memchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *res = do_memchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +new file mode 100644 +index 00000000..877801d6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c +@@ -0,0 +1,20 @@ ++/* Test wmemchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memchr.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-10.patch b/glibc-RHEL-15696-10.patch new file mode 100644 index 0000000..10bd49d --- /dev/null +++ b/glibc-RHEL-15696-10.patch @@ -0,0 +1,41 @@ +From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:21 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_avx2. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 156c1949..8fb8eedc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -83,6 +83,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_avx2 ++# endif + /* Convert units: from wide to byte char. 
*/ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-100.patch b/glibc-RHEL-15696-100.patch new file mode 100644 index 0000000..0e779e4 --- /dev/null +++ b/glibc-RHEL-15696-100.patch @@ -0,0 +1,257 @@ +From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 25 Mar 2022 17:13:33 -0500 +Subject: [PATCH] x86: Small improvements for wcslen +Content-type: text/plain; charset=UTF-8 + +Just a few QOL changes. + 1. Prefer `add` > `lea` as it has high execution units it can run + on. + 2. Don't break macro-fusion between `test` and `jcc` + 3. Reduce code size by removing gratuitous padding bytes (-90 + bytes). + +geometric_mean(N=20) of all benchmarks New / Original: 0.959 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++--------------------- + 1 file changed, 41 insertions(+), 45 deletions(-) + +diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S +index 9f5f7232..254bb030 100644 +--- a/sysdeps/x86_64/wcslen.S ++++ b/sysdeps/x86_64/wcslen.S +@@ -41,82 +41,82 @@ ENTRY (__wcslen) + pxor %xmm0, %xmm0 + + lea 32(%rdi), %rax +- lea 16(%rdi), %rcx ++ addq $16, %rdi + and $-16, %rax + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx + pxor %xmm1, %xmm1 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx + pxor %xmm2, %xmm2 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx + pxor %xmm3, %xmm3 ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax 
+ jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm0 + pmovmskb %xmm0, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm1 + pmovmskb %xmm1, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm2 + pmovmskb %xmm2, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + pcmpeqd (%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $16, %rax + test %edx, %edx +- lea 16(%rax), %rax + jnz L(exit) + + and $-0x40, %rax +@@ -133,104 +133,100 @@ L(aligned_64_loop): + pminub %xmm0, %xmm2 + pcmpeqd %xmm3, %xmm2 + pmovmskb %xmm2, %edx ++ addq $64, %rax + test %edx, %edx +- lea 64(%rax), %rax + jz L(aligned_64_loop) + + pcmpeqd -64(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $48, %rdi + test %edx, %edx +- lea 48(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm1, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd -32(%rax), %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx + jnz L(exit) + + pcmpeqd %xmm6, %xmm3 + pmovmskb %xmm3, %edx ++ addq $-16, %rdi + test %edx, %edx +- lea -16(%rcx), %rcx +- jnz L(exit) +- +- jmp L(aligned_64_loop) ++ jz L(aligned_64_loop) + + .p2align 4 + L(exit): +- sub %rcx, %rax ++ sub %rdi, %rax + shr $2, %rax + test %dl, %dl + jz L(exit_high) + +- mov %dl, %cl +- and $15, %cl ++ andl $15, %edx + jz L(exit_1) + ret + +- .p2align 4 ++ /* No align here. Naturally aligned % 16 == 1. 
*/ + L(exit_high): +- mov %dh, %ch +- and $15, %ch ++ andl $(15 << 8), %edx + jz L(exit_3) + add $2, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_1): + add $1, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_3): + add $3, %rax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail0): +- xor %rax, %rax ++ xorl %eax, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail1): +- mov $1, %rax ++ movl $1, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail2): +- mov $2, %rax ++ movl $2, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail3): +- mov $3, %rax ++ movl $3, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail4): +- mov $4, %rax ++ movl $4, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail5): +- mov $5, %rax ++ movl $5, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail6): +- mov $6, %rax ++ movl $6, %eax + ret + +- .p2align 4 ++ .p2align 3 + L(exit_tail7): +- mov $7, %rax ++ movl $7, %eax + ret + + END (__wcslen) +-- +GitLab + diff --git a/glibc-RHEL-15696-101.patch b/glibc-RHEL-15696-101.patch new file mode 100644 index 0000000..131ea5b --- /dev/null +++ b/glibc-RHEL-15696-101.patch @@ -0,0 +1,964 @@ +From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:00 -0500 +Subject: [PATCH] x86: Remove memcmp-sse4.S +Content-type: text/plain; charset=UTF-8 + +Code didn't actually use any sse4 instructions since `ptest` was +removed in: + +commit 2f9062d7171850451e6044ef78d91ff8c017b9c0 +Author: Noah Goldstein +Date: Wed Nov 10 16:18:56 2021 -0600 + + x86: Shrink memcmp-sse4.S code size + +The new memcmp-sse2 implementation is also faster. + +geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905 + +Note there are two regressions preferring SSE2 for Size = 1 and Size = +65. + +Size = 1: +size, align0, align1, ret, New Time/Old Time + 1, 1, 1, 0, 1.2 + 1, 1, 1, 1, 1.197 + 1, 1, 1, -1, 1.2 + +This is intentional. 
Size == 1 is significantly less hot based on +profiles of GCC11 and Python3 than sizes [4, 8] (which is made +hotter). + +Python3 Size = 1 -> 13.64% +Python3 Size = [4, 8] -> 60.92% + +GCC11 Size = 1 -> 1.29% +GCC11 Size = [4, 8] -> 33.86% + +size, align0, align1, ret, New Time/Old Time + 4, 4, 4, 0, 0.622 + 4, 4, 4, 1, 0.797 + 4, 4, 4, -1, 0.805 + 5, 5, 5, 0, 0.623 + 5, 5, 5, 1, 0.777 + 5, 5, 5, -1, 0.802 + 6, 6, 6, 0, 0.625 + 6, 6, 6, 1, 0.813 + 6, 6, 6, -1, 0.788 + 7, 7, 7, 0, 0.625 + 7, 7, 7, 1, 0.799 + 7, 7, 7, -1, 0.795 + 8, 8, 8, 0, 0.625 + 8, 8, 8, 1, 0.848 + 8, 8, 8, -1, 0.914 + 9, 9, 9, 0, 0.625 + +Size = 65: +size, align0, align1, ret, New Time/Old Time + 65, 0, 0, 0, 1.103 + 65, 0, 0, 1, 1.216 + 65, 0, 0, -1, 1.227 + 65, 65, 0, 0, 1.091 + 65, 0, 65, 1, 1.19 + 65, 65, 65, -1, 1.215 + +This is because A) the checks in range [65, 96] are now unrolled 2x +and B) because smaller values <= 16 are now given a hotter path. By +contrast the SSE4 version has a branch for Size = 80. The unrolled +version gets better performance for returns which need both +comparisons. + +size, align0, align1, ret, New Time/Old Time + 128, 4, 8, 0, 0.858 + 128, 4, 8, 1, 0.879 + 128, 4, 8, -1, 0.888 + +As well, outside of microbenchmark environments that are not fully +predictable the branch will have a real cost. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 - + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 - + sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 --------------------- + 4 files changed, 814 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index bca82e38..b503e4b8 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -11,7 +11,6 @@ sysdep_routines += \ + memcmp-avx2-movbe-rtm \ + memcmp-evex-movbe \ + memcmp-sse2 \ +- memcmp-sse4 \ + memcmp-ssse3 \ + memcpy-ssse3 \ + memcpy-ssse3-back \ +@@ -174,7 +173,6 @@ sysdep_routines += \ + wmemcmp-avx2-movbe-rtm \ + wmemcmp-c \ + wmemcmp-evex-movbe \ +- wmemcmp-sse4 \ + wmemcmp-ssse3 \ + # sysdep_routines + endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 14314367..450a2917 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +- __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), + __memcmp_ssse3) + IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2)) +@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) +- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +- __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), + __wmemcmp_ssse3) + IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h 
b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 690dffe8..0bc47a7f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -21,7 +21,6 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; +@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2_movbe); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- + if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3)) + return OPTIMIZE (ssse3); + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +deleted file mode 100644 +index 50060006..00000000 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ /dev/null +@@ -1,804 +0,0 @@ +-/* memcmp with SSE4.1, wmemcmp with SSE4.1 +- Copyright (C) 2010-2018 Free Software Foundation, Inc. +- Contributed by Intel Corporation. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . 
*/ +- +-#if IS_IN (libc) +- +-# include +- +-# ifndef MEMCMP +-# define MEMCMP __memcmp_sse4_1 +-# endif +- +-#ifdef USE_AS_WMEMCMP +-# define CMPEQ pcmpeqd +-# define CHAR_SIZE 4 +-#else +-# define CMPEQ pcmpeqb +-# define CHAR_SIZE 1 +-#endif +- +- +-/* Warning! +- wmemcmp has to use SIGNED comparison for elements. +- memcmp has to use UNSIGNED comparison for elemnts. +-*/ +- +- .section .text.sse4.1,"ax",@progbits +-ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- mov %edx, %edx +-# endif +- cmp $79, %RDX_LP +- ja L(79bytesormore) +- +- cmp $CHAR_SIZE, %RDX_LP +- jbe L(firstbyte) +- +- /* N in (CHAR_SIZE, 79) bytes. */ +- cmpl $32, %edx +- ja L(more_32_bytes) +- +- cmpl $16, %edx +- jae L(16_to_32_bytes) +- +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(8_to_16_bytes) +- +- cmpl $4, %edx +- jb L(2_to_3_bytes) +- +- movl (%rdi), %eax +- movl (%rsi), %ecx +- +- bswap %eax +- bswap %ecx +- +- shlq $32, %rax +- shlq $32, %rcx +- +- movl -4(%rdi, %rdx), %edi +- movl -4(%rsi, %rdx), %esi +- +- bswap %edi +- bswap %esi +- +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(2_to_3_bytes): +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- subl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(8_to_16_bytes): +- movq (%rdi), %rax +- movq (%rsi), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- jne L(8_to_16_bytes_done) +- +- movq -8(%rdi, %rdx), %rax +- movq -8(%rsi, %rdx), %rcx +- +- bswap %rax +- bswap %rcx +- +- subq %rcx, %rax +- +-L(8_to_16_bytes_done): +- cmovne %edx, %eax +- sbbl %ecx, %ecx +- orl %ecx, %eax +- ret +-# else +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl 4(%rdi), %ecx 
+- cmpl 4(%rsi), %ecx +- jne L(8_to_16_bytes_done) +- movl -4(%rdi, %rdx), %ecx +- cmpl -4(%rsi, %rdx), %ecx +- jne L(8_to_16_bytes_done) +- ret +-# endif +- +- .p2align 4,, 3 +-L(ret_zero): +- xorl %eax, %eax +-L(zero): +- ret +- +- .p2align 4,, 8 +-L(firstbyte): +- jb L(ret_zero) +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- je L(zero) +-L(8_to_16_bytes_done): +- setg %al +- leal -1(%rax, %rax), %eax +-# else +- movzbl (%rdi), %eax +- movzbl (%rsi), %ecx +- sub %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_48): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin_32): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl 32(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl 32(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl 32(%rsi, %rax), %ecx +- movzbl 32(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_begin_16): +- addq $16, %rdi +- addq $16, %rsi +-L(vec_return_begin): +- bsfl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(vec_return_end_16): +- subl $16, %edx +-L(vec_return_end): +- bsfl %eax, %eax +- addl %edx, %eax +-# ifdef USE_AS_WMEMCMP +- movl -16(%rdi, %rax), %ecx +- xorl %edx, %edx +- cmpl -16(%rsi, %rax), %ecx +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl -16(%rsi, %rax), %ecx +- movzbl -16(%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4,, 8 +-L(more_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm0 +- movdqu 16(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- cmpl $64, %edx +- jbe 
L(32_to_64_bytes) +- movdqu 32(%rdi), %xmm0 +- movdqu 32(%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- .p2align 4,, 6 +-L(32_to_64_bytes): +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(16_to_32_bytes): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- +- .p2align 4 +-L(79bytesormore): +- movdqu (%rdi), %xmm0 +- movdqu (%rsi), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- +- mov %rsi, %rcx +- and $-16, %rsi +- add $16, %rsi +- sub %rsi, %rcx +- +- sub %rcx, %rdi +- add %rcx, %rdx +- test $0xf, %rdi +- jz L(2aligned) +- +- cmp $128, %rdx +- ja L(128bytesormore) +- +- .p2align 4,, 6 +-L(less128bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(last_64_bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), 
%xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormore): +- cmp $256, %rdx +- ja L(unaligned_loop) +-L(less256bytes): +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqu (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqu 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqu 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytes) +- +- cmp $32, %rdx +- ja L(last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(unaligned_loop): +-# ifdef 
DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_unaligned) +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loop): +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loop) +- +- .p2align 4,, 6 +-L(loop_tail): +- addq %rdx, %rdi +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- addq %rdx, %rsi +- movdqu (%rsi), %xmm4 +- movdqu 16(%rsi), %xmm5 +- movdqu 32(%rsi), %xmm6 +- movdqu 48(%rsi), %xmm7 +- +- CMPEQ %xmm4, %xmm0 +- CMPEQ %xmm5, %xmm1 +- CMPEQ %xmm6, %xmm2 +- CMPEQ %xmm7, %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- ret +- +-L(L2_L3_cache_unaligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_unaligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- +- movdqu (%rdi), %xmm0 +- movdqu 16(%rdi), %xmm1 +- movdqu 32(%rdi), %xmm2 +- movdqu 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(L2_L3_unaligned_128bytes_loop) +- jmp L(loop_tail) +- +- +- /* This case is for machines which are sensitive for unaligned +- * instructions. 
*/ +- .p2align 4 +-L(2aligned): +- cmp $128, %rdx +- ja L(128bytesormorein2aligned) +-L(less128bytesin2aligned): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- cmp $96, %rdx +- jb L(32_to_64_bytes) +- +- addq $64, %rdi +- addq $64, %rsi +- subq $64, %rdx +- +- .p2align 4,, 6 +-L(aligned_last_64_bytes): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(128bytesormorein2aligned): +- cmp $256, %rdx +- ja L(aligned_loop) +-L(less256bytesin2alinged): +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $64, %rdi +- addq $64, %rsi +- +- movdqa (%rdi), %xmm1 +- CMPEQ (%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz 
L(vec_return_begin) +- +- movdqa 16(%rdi), %xmm1 +- CMPEQ 16(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_16) +- +- movdqa 32(%rdi), %xmm1 +- CMPEQ 32(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_32) +- +- movdqa 48(%rdi), %xmm1 +- CMPEQ 48(%rsi), %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_begin_48) +- +- addq $-128, %rdx +- subq $-64, %rsi +- subq $-64, %rdi +- +- cmp $64, %rdx +- ja L(less128bytesin2aligned) +- +- cmp $32, %rdx +- ja L(aligned_last_64_bytes) +- +- movdqu -32(%rdi, %rdx), %xmm0 +- movdqu -32(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end_16) +- +- movdqu -16(%rdi, %rdx), %xmm0 +- movdqu -16(%rsi, %rdx), %xmm1 +- CMPEQ %xmm0, %xmm1 +- pmovmskb %xmm1, %eax +- incw %ax +- jnz L(vec_return_end) +- ret +- +- .p2align 4 +-L(aligned_loop): +-# ifdef DATA_CACHE_SIZE_HALF +- mov $DATA_CACHE_SIZE_HALF, %R8_LP +-# else +- mov __x86_data_cache_size_half(%rip), %R8_LP +-# endif +- movq %r8, %r9 +- addq %r8, %r8 +- addq %r9, %r8 +- cmpq %r8, %rdx +- ja L(L2_L3_cache_aligned) +- +- sub $64, %rdx +- .p2align 4 +-L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 +- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- ja L(64bytesormore_loopin2aligned) +- jmp L(loop_tail) +- +-L(L2_L3_cache_aligned): +- subq $64, %rdx +- .p2align 4 +-L(L2_L3_aligned_128bytes_loop): +- prefetchnta 0x1c0(%rdi) +- prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm0 +- movdqa 16(%rdi), %xmm1 +- movdqa 32(%rdi), %xmm2 +- movdqa 48(%rdi), %xmm3 +- +- CMPEQ (%rsi), %xmm0 +- CMPEQ 16(%rsi), %xmm1 +- CMPEQ 32(%rsi), %xmm2 +- CMPEQ 48(%rsi), %xmm3 
+- +- pand %xmm0, %xmm1 +- pand %xmm2, %xmm3 +- pand %xmm1, %xmm3 +- +- pmovmskb %xmm3, %eax +- incw %ax +- jnz L(64bytesormore_loop_end) +- +- addq $64, %rsi +- addq $64, %rdi +- subq $64, %rdx +- ja L(L2_L3_aligned_128bytes_loop) +- jmp L(loop_tail) +- +- .p2align 4 +-L(64bytesormore_loop_end): +- pmovmskb %xmm0, %ecx +- incw %cx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm1, %ecx +- notw %cx +- sall $16, %ecx +- jnz L(loop_end_ret) +- +- pmovmskb %xmm2, %ecx +- notw %cx +- shlq $32, %rcx +- jnz L(loop_end_ret) +- +- addq $48, %rdi +- addq $48, %rsi +- movq %rax, %rcx +- +- .p2align 4,, 6 +-L(loop_end_ret): +- bsfq %rcx, %rcx +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rcx), %eax +- xorl %edx, %edx +- cmpl (%rsi, %rcx), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +-END (MEMCMP) +-#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-102.patch b/glibc-RHEL-15696-102.patch new file mode 100644 index 0000000..8cb20ad --- /dev/null +++ b/glibc-RHEL-15696-102.patch @@ -0,0 +1,263 @@ +From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 15 Apr 2022 12:28:01 -0500 +Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S +Content-type: text/plain; charset=UTF-8 + +Old code was both inefficient and wasted code size. New code (-62 +bytes) and comparable or better performance in the page cross case. 
+ +geometric_mean(N=20) of page cross cases New / Original: 0.960 + +size, align0, align1, ret, New Time/Old Time + 1, 4095, 0, 0, 1.001 + 1, 4095, 0, 1, 0.999 + 1, 4095, 0, -1, 1.0 + 2, 4094, 0, 0, 1.0 + 2, 4094, 0, 1, 1.0 + 2, 4094, 0, -1, 1.0 + 3, 4093, 0, 0, 1.0 + 3, 4093, 0, 1, 1.0 + 3, 4093, 0, -1, 1.0 + 4, 4092, 0, 0, 0.987 + 4, 4092, 0, 1, 1.0 + 4, 4092, 0, -1, 1.0 + 5, 4091, 0, 0, 0.984 + 5, 4091, 0, 1, 1.002 + 5, 4091, 0, -1, 1.005 + 6, 4090, 0, 0, 0.993 + 6, 4090, 0, 1, 1.001 + 6, 4090, 0, -1, 1.003 + 7, 4089, 0, 0, 0.991 + 7, 4089, 0, 1, 1.0 + 7, 4089, 0, -1, 1.001 + 8, 4088, 0, 0, 0.875 + 8, 4088, 0, 1, 0.881 + 8, 4088, 0, -1, 0.888 + 9, 4087, 0, 0, 0.872 + 9, 4087, 0, 1, 0.879 + 9, 4087, 0, -1, 0.883 + 10, 4086, 0, 0, 0.878 + 10, 4086, 0, 1, 0.886 + 10, 4086, 0, -1, 0.873 + 11, 4085, 0, 0, 0.878 + 11, 4085, 0, 1, 0.881 + 11, 4085, 0, -1, 0.879 + 12, 4084, 0, 0, 0.873 + 12, 4084, 0, 1, 0.889 + 12, 4084, 0, -1, 0.875 + 13, 4083, 0, 0, 0.873 + 13, 4083, 0, 1, 0.863 + 13, 4083, 0, -1, 0.863 + 14, 4082, 0, 0, 0.838 + 14, 4082, 0, 1, 0.869 + 14, 4082, 0, -1, 0.877 + 15, 4081, 0, 0, 0.841 + 15, 4081, 0, 1, 0.869 + 15, 4081, 0, -1, 0.876 + 16, 4080, 0, 0, 0.988 + 16, 4080, 0, 1, 0.99 + 16, 4080, 0, -1, 0.989 + 17, 4079, 0, 0, 0.978 + 17, 4079, 0, 1, 0.981 + 17, 4079, 0, -1, 0.98 + 18, 4078, 0, 0, 0.981 + 18, 4078, 0, 1, 0.98 + 18, 4078, 0, -1, 0.985 + 19, 4077, 0, 0, 0.977 + 19, 4077, 0, 1, 0.979 + 19, 4077, 0, -1, 0.986 + 20, 4076, 0, 0, 0.977 + 20, 4076, 0, 1, 0.986 + 20, 4076, 0, -1, 0.984 + 21, 4075, 0, 0, 0.977 + 21, 4075, 0, 1, 0.983 + 21, 4075, 0, -1, 0.988 + 22, 4074, 0, 0, 0.983 + 22, 4074, 0, 1, 0.994 + 22, 4074, 0, -1, 0.993 + 23, 4073, 0, 0, 0.98 + 23, 4073, 0, 1, 0.992 + 23, 4073, 0, -1, 0.995 + 24, 4072, 0, 0, 0.989 + 24, 4072, 0, 1, 0.989 + 24, 4072, 0, -1, 0.991 + 25, 4071, 0, 0, 0.99 + 25, 4071, 0, 1, 0.999 + 25, 4071, 0, -1, 0.996 + 26, 4070, 0, 0, 0.993 + 26, 4070, 0, 1, 0.995 + 26, 4070, 0, -1, 0.998 + 27, 4069, 0, 0, 0.993 + 27, 4069, 0, 
1, 0.999 + 27, 4069, 0, -1, 1.0 + 28, 4068, 0, 0, 0.997 + 28, 4068, 0, 1, 1.0 + 28, 4068, 0, -1, 0.999 + 29, 4067, 0, 0, 0.996 + 29, 4067, 0, 1, 0.999 + 29, 4067, 0, -1, 0.999 + 30, 4066, 0, 0, 0.991 + 30, 4066, 0, 1, 1.001 + 30, 4066, 0, -1, 0.999 + 31, 4065, 0, 0, 0.988 + 31, 4065, 0, 1, 0.998 + 31, 4065, 0, -1, 0.998 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++-------- + 1 file changed, 61 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 16fc673e..99258cf5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -429,22 +429,21 @@ L(page_cross_less_vec): + # ifndef USE_AS_WMEMCMP + cmpl $8, %edx + jae L(between_8_15) ++ /* Fall through for [4, 7]. */ + cmpl $4, %edx +- jae L(between_4_7) ++ jb L(between_2_3) + +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) + /* No ymm register was touched. */ + ret + +@@ -457,9 +456,33 @@ L(one_or_less): + /* No ymm register was touched. */ + ret + ++ .p2align 4,, 5 ++L(ret_nonzero): ++ sbbl %eax, %eax ++ orl $1, %eax ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ /* No ymm register was touched. 
*/ ++ ret ++ + .p2align 4 + L(between_8_15): +-# endif ++ movbe (%rdi), %rax ++ movbe (%rsi), %rcx ++ subq %rcx, %rax ++ jnz L(ret_nonzero) ++ movbe -8(%rdi, %rdx), %rax ++ movbe -8(%rsi, %rdx), %rcx ++ subq %rcx, %rax ++ /* Fast path for return zero. */ ++ jnz L(ret_nonzero) ++ /* No ymm register was touched. */ ++ ret ++# else + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +@@ -475,16 +498,13 @@ L(between_8_15): + VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret ++# endif + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ .p2align 4,, 10 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + vmovdqu (%rsi), %xmm2 +@@ -501,11 +521,17 @@ L(between_16_31): + VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax + subl $0xffff, %eax ++ /* Fast path for return zero. */ + jnz L(return_vec_0) + /* No ymm register was touched. */ + ret + + # ifdef USE_AS_WMEMCMP ++ .p2align 4,, 2 ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(one_or_less): + jb L(zero) +@@ -520,22 +546,20 @@ L(one_or_less): + # else + + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ bswap %eax ++ bswap %ecx ++ shrl %eax ++ shrl %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper bit is zero. */ ++ subl %ecx, %eax + /* No ymm register was touched. 
*/ + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-103.patch b/glibc-RHEL-15696-103.patch new file mode 100644 index 0000000..c080e54 --- /dev/null +++ b/glibc-RHEL-15696-103.patch @@ -0,0 +1,876 @@ +From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:28 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.741 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +- + sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +- + sysdeps/x86_64/strrchr.S | 510 +++++++++++++++--------- + sysdeps/x86_64/wcsrchr.S | 266 +----------- + 4 files changed, 338 insertions(+), 443 deletions(-) + +Conflicts: + sysdeps/x86_64/wcsrchr.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S +index 0ec76fe9..6bb1284b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S +@@ -17,7 +17,7 @@ + . */ + + #if IS_IN (libc) +-# define strrchr __strrchr_sse2 ++# define STRRCHR __strrchr_sse2 + + # undef weak_alias + # define weak_alias(strrchr, rindex) +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +index d015e953..f26d53b5 100644 +--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S ++++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S +@@ -17,7 +17,6 @@ + . 
*/ + + #if IS_IN (libc) +-# define wcsrchr __wcsrchr_sse2 ++# define STRRCHR __wcsrchr_sse2 + #endif +- + #include "../wcsrchr.S" +diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S +index aca98e7e..a58cc220 100644 +--- a/sysdeps/x86_64/strrchr.S ++++ b/sysdeps/x86_64/strrchr.S +@@ -19,210 +19,360 @@ + + #include + ++#ifndef STRRCHR ++# define STRRCHR strrchr ++#endif ++ ++#ifdef USE_AS_WCSRCHR ++# define PCMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++# define PMINU pminud ++#else ++# define PCMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++# define PMINU pminub ++#endif ++ ++#define PAGE_SIZE 4096 ++#define VEC_SIZE 16 ++ + .text +-ENTRY (strrchr) +- movd %esi, %xmm1 ++ENTRY(STRRCHR) ++ movd %esi, %xmm0 + movq %rdi, %rax +- andl $4095, %eax +- punpcklbw %xmm1, %xmm1 +- cmpq $4032, %rax +- punpcklwd %xmm1, %xmm1 +- pshufd $0, %xmm1, %xmm1 ++ andl $(PAGE_SIZE - 1), %eax ++#ifndef USE_AS_WCSRCHR ++ punpcklbw %xmm0, %xmm0 ++ punpcklwd %xmm0, %xmm0 ++#endif ++ pshufd $0, %xmm0, %xmm0 ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax + ja L(cross_page) +- movdqu (%rdi), %xmm0 ++ ++L(cross_page_continue): ++ movups (%rdi), %xmm1 + pxor %xmm2, %xmm2 +- movdqa %xmm0, %xmm3 +- pcmpeqb %xmm1, %xmm0 +- pcmpeqb %xmm2, %xmm3 +- pmovmskb %xmm0, %ecx +- pmovmskb %xmm3, %edx +- testq %rdx, %rdx +- je L(next_48_bytes) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rcx, %rax +- je L(exit) +- bsrq %rax, %rax ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax + addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret0): + ret + ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. 
*/ + .p2align 4 +-L(next_48_bytes): +- movdqu 16(%rdi), %xmm4 +- movdqa %xmm4, %xmm5 +- movdqu 32(%rdi), %xmm3 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm2, %xmm5 +- movdqu 48(%rdi), %xmm0 +- pmovmskb %xmm5, %edx +- movdqa %xmm3, %xmm5 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm2, %xmm5 +- pcmpeqb %xmm0, %xmm2 +- salq $16, %rdx +- pmovmskb %xmm3, %r8d +- pmovmskb %xmm5, %eax +- pmovmskb %xmm2, %esi +- salq $32, %r8 +- salq $32, %rax +- pcmpeqb %xmm1, %xmm0 +- orq %rdx, %rax +- movq %rsi, %rdx +- pmovmskb %xmm4, %esi +- salq $48, %rdx +- salq $16, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi +- pmovmskb %xmm0, %ecx +- salq $48, %rcx +- orq %rcx, %rsi +- orq %rdx, %rax +- je L(loop_header2) +- leaq -1(%rax), %rcx +- xorq %rax, %rcx +- andq %rcx, %rsi +- je L(exit) +- bsrq %rsi, %rsi +- leaq (%rdi,%rsi), %rax ++L(first_vec_x0_test): ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ testl %eax, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %r8, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 +-L(loop_header2): +- testq %rsi, %rsi +- movq %rdi, %rcx +- je L(no_c_found) +-L(loop_header): +- addq $64, %rdi +- pxor %xmm7, %xmm7 +- andq $-64, %rdi +- jmp L(loop_entry) ++L(first_vec_x1): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret + + .p2align 4 +-L(loop64): +- testq %rdx, %rdx +- cmovne %rdx, %rsi +- cmovne %rdi, %rcx +- addq $64, %rdi +-L(loop_entry): +- movdqa 32(%rdi), %xmm3 +- pxor %xmm6, %xmm6 +- movdqa 48(%rdi), %xmm2 +- movdqa %xmm3, %xmm0 +- movdqa 16(%rdi), %xmm4 +- pminub %xmm2, %xmm0 +- movdqa (%rdi), %xmm5 +- pminub %xmm4, %xmm0 +- pminub %xmm5, %xmm0 +- pcmpeqb %xmm7, %xmm0 +- pmovmskb %xmm0, %eax +- movdqa %xmm5, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb %xmm0, %r9d +- movdqa %xmm4, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- pmovmskb 
%xmm0, %edx +- movdqa %xmm3, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm0, %r10d +- movdqa %xmm2, %xmm0 +- pcmpeqb %xmm1, %xmm0 +- salq $32, %r10 +- orq %r10, %rdx +- pmovmskb %xmm0, %r8d +- orq %r9, %rdx +- salq $48, %r8 +- orq %r8, %rdx ++L(first_vec_x1_test): ++ PCMPEQ %xmm0, %xmm2 ++ pmovmskb %xmm2, %eax + testl %eax, %eax +- je L(loop64) +- pcmpeqb %xmm6, %xmm4 +- pcmpeqb %xmm6, %xmm3 +- pcmpeqb %xmm6, %xmm5 +- pmovmskb %xmm4, %eax +- pmovmskb %xmm3, %r10d +- pcmpeqb %xmm6, %xmm2 +- pmovmskb %xmm5, %r9d +- salq $32, %r10 +- salq $16, %rax +- pmovmskb %xmm2, %r8d +- orq %r10, %rax +- orq %r9, %rax +- salq $48, %r8 +- orq %r8, %rax +- leaq -1(%rax), %r8 +- xorq %rax, %r8 +- andq %r8, %rdx +- cmovne %rdi, %rcx +- cmovne %rdx, %rsi +- bsrq %rsi, %rsi +- leaq (%rcx,%rsi), %rax ++ jz L(first_vec_x0_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm3, %eax ++ leal -1(%rcx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_vec_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ andq $-VEC_SIZE, %rdi ++ ++ movaps VEC_SIZE(%rdi), %xmm2 ++ pxor %xmm3, %xmm3 ++ PCMPEQ %xmm2, %xmm3 ++ pmovmskb %xmm3, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) ++ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm3 ++ pxor %xmm4, %xmm4 ++ PCMPEQ %xmm3, %xmm4 ++ pmovmskb %xmm4, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) ++ ++ addq $VEC_SIZE, %rdi ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ andq $-(VEC_SIZE * 2), %rdi ++ .p2align 4 ++L(first_loop): ++ /* Do 2x VEC at a time. 
*/ ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Use `addl` 1) so we can undo it with `subl` and 2) it can ++ macro-fuse with `jz`. */ ++ addl %ecx, %eax ++ jz L(first_loop) ++ ++ /* Check if there is zero match. */ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ /* Check if there was a match in last iteration. */ ++ subl %ecx, %eax ++ jnz L(new_match) ++ ++L(first_loop_old_match): ++ PCMPEQ %xmm0, %xmm2 ++ PCMPEQ %xmm0, %xmm3 ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ addl %eax, %ecx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ sall $16, %eax ++ orl %ecx, %eax ++ ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4 ++L(new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. 
*/ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(first_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + ++ /* Save minimum state for getting most recent match. We can ++ throw out all previous work. */ + .p2align 4 +-L(no_c_found): +- movl $1, %esi +- xorl %ecx, %ecx +- jmp L(loop_header) ++L(second_loop_match): ++ movq %rdi, %rsi ++ movaps %xmm4, %xmm2 ++ movaps %xmm7, %xmm3 + + .p2align 4 +-L(exit): +- xorl %eax, %eax ++L(second_loop): ++ movaps (VEC_SIZE * 2)(%rdi), %xmm4 ++ movaps (VEC_SIZE * 3)(%rdi), %xmm5 ++ /* Since SSE2 no pminud so wcsrchr needs seperate logic for ++ detecting zero. Note if this is found to be a bottleneck it ++ may be worth adding an SSE4.1 wcsrchr implementation. */ ++#ifdef USE_AS_WCSRCHR ++ movaps %xmm5, %xmm6 ++ pxor %xmm8, %xmm8 ++ ++ PCMPEQ %xmm8, %xmm5 ++ PCMPEQ %xmm4, %xmm8 ++ por %xmm5, %xmm8 ++#else ++ movaps %xmm5, %xmm6 ++ PMINU %xmm4, %xmm5 ++#endif ++ ++ movaps %xmm4, %xmm9 ++ PCMPEQ %xmm0, %xmm4 ++ PCMPEQ %xmm0, %xmm6 ++ movaps %xmm6, %xmm7 ++ por %xmm4, %xmm6 ++#ifndef USE_AS_WCSRCHR ++ pxor %xmm8, %xmm8 ++ PCMPEQ %xmm5, %xmm8 ++#endif ++ ++ pmovmskb %xmm8, %ecx ++ pmovmskb %xmm6, %eax ++ ++ addq $(VEC_SIZE * 2), %rdi ++ /* Either null term or new occurence of CHAR. */ ++ addl %ecx, %eax ++ jz L(second_loop) ++ ++ /* No null term so much be new occurence of CHAR. 
*/ ++ testl %ecx, %ecx ++ jz L(second_loop_match) ++ ++ ++ subl %ecx, %eax ++ jnz L(second_loop_new_match) ++ ++L(second_loop_old_match): ++ pmovmskb %xmm2, %ecx ++ pmovmskb %xmm3, %eax ++ sall $16, %eax ++ orl %ecx, %eax ++ bsrl %eax, %eax ++ addq %rsi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif + ret + + .p2align 4 ++L(second_loop_new_match): ++ pxor %xmm6, %xmm6 ++ PCMPEQ %xmm9, %xmm6 ++ pmovmskb %xmm6, %eax ++ sall $16, %ecx ++ orl %eax, %ecx ++ ++ /* We can't reuse either of the old comparisons as since we mask ++ of zeros after first zero (instead of using the full ++ comparison) we can't gurantee no interference between match ++ after end of string and valid match. */ ++ pmovmskb %xmm4, %eax ++ pmovmskb %xmm7, %edx ++ sall $16, %edx ++ orl %edx, %eax ++ ++ leal -1(%ecx), %edx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(second_loop_old_match) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++ ret ++ ++ .p2align 4,, 4 + L(cross_page): +- movq %rdi, %rax +- pxor %xmm0, %xmm0 +- andq $-64, %rax +- movdqu (%rax), %xmm5 +- movdqa %xmm5, %xmm6 +- movdqu 16(%rax), %xmm4 +- pcmpeqb %xmm1, %xmm5 +- pcmpeqb %xmm0, %xmm6 +- movdqu 32(%rax), %xmm3 +- pmovmskb %xmm6, %esi +- movdqa %xmm4, %xmm6 +- movdqu 48(%rax), %xmm2 +- pcmpeqb %xmm1, %xmm4 +- pcmpeqb %xmm0, %xmm6 +- pmovmskb %xmm6, %edx +- movdqa %xmm3, %xmm6 +- pcmpeqb %xmm1, %xmm3 +- pcmpeqb %xmm0, %xmm6 +- pcmpeqb %xmm2, %xmm0 +- salq $16, %rdx +- pmovmskb %xmm3, %r9d +- pmovmskb %xmm6, %r8d +- pmovmskb %xmm0, %ecx +- salq $32, %r9 +- salq $32, %r8 +- pcmpeqb %xmm1, %xmm2 +- orq %r8, %rdx +- salq $48, %rcx +- pmovmskb %xmm5, %r8d +- orq %rsi, %rdx +- pmovmskb %xmm4, %esi +- orq %rcx, %rdx +- pmovmskb %xmm2, %ecx +- salq $16, %rsi +- salq $48, %rcx +- orq %r9, %rsi +- orq %r8, %rsi +- orq %rcx, %rsi ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ movaps (%rsi), %xmm1 ++ pxor %xmm2, %xmm2 ++ PCMPEQ %xmm1, %xmm2 ++ pmovmskb %xmm2, %edx + movl %edi, 
%ecx +- subl %eax, %ecx +- shrq %cl, %rdx +- shrq %cl, %rsi +- testq %rdx, %rdx +- je L(loop_header2) +- leaq -1(%rdx), %rax +- xorq %rdx, %rax +- andq %rax, %rsi +- je L(exit) +- bsrq %rsi, %rax ++ andl $(VEC_SIZE - 1), %ecx ++ sarl %cl, %edx ++ jz L(cross_page_continue) ++ PCMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ sarl %cl, %eax ++ leal -1(%rdx), %ecx ++ xorl %edx, %ecx ++ andl %ecx, %eax ++ jz L(ret1) ++ bsrl %eax, %eax + addq %rdi, %rax ++#ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++#endif ++L(ret1): + ret +-END (strrchr) ++END(STRRCHR) + +-weak_alias (strrchr, rindex) +-libc_hidden_builtin_def (strrchr) ++#ifndef USE_AS_WCSRCHR ++ weak_alias (STRRCHR, rindex) ++ libc_hidden_builtin_def (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S +index 2f388537..ae3cfa7d 100644 +--- a/sysdeps/x86_64/wcsrchr.S ++++ b/sysdeps/x86_64/wcsrchr.S +@@ -17,266 +17,12 @@ + License along with the GNU C Library; if not, see + . */ + +-#include + +- .text +-ENTRY (wcsrchr) ++#define USE_AS_WCSRCHR 1 ++#define NO_PMINU 1 + +- movd %rsi, %xmm1 +- mov %rdi, %rcx +- punpckldq %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- punpckldq %xmm1, %xmm1 +- and $63, %rcx +- cmp $48, %rcx +- ja L(crosscache) ++#ifndef STRRCHR ++# define STRRCHR wcsrchr ++#endif + +- movdqu (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- add $16, %rdi +- +- test %rax, %rax +- jnz L(unaligned_match1) +- +- test %rcx, %rcx +- jnz L(return_null) +- +- and $-16, %rdi +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match1): +- test %rcx, %rcx +- jnz L(prolog_find_zero_1) +- +- mov %rax, %r8 +- mov %rdi, %rsi +- and $-16, %rdi +- jmp L(loop) +- +- .p2align 4 +-L(crosscache): +- and $15, %rcx +- and $-16, %rdi +- pxor %xmm3, %xmm3 +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm3 +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm3, %rdx +- pmovmskb %xmm0, %rax +- shr %cl, %rdx +- shr %cl, %rax +- add $16, %rdi +- +- test 
%rax, %rax +- jnz L(unaligned_match) +- +- test %rdx, %rdx +- jnz L(return_null) +- +- xor %r8, %r8 +- jmp L(loop) +- +- .p2align 4 +-L(unaligned_match): +- test %rdx, %rdx +- jnz L(prolog_find_zero) +- +- mov %rax, %r8 +- lea (%rdi, %rcx), %rsi +- +-/* Loop start on aligned string. */ +- .p2align 4 +-L(loop): +- movdqa (%rdi), %xmm0 +- pcmpeqd %xmm0, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm0 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm0, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm3 +- pcmpeqd %xmm3, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm3 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm3, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm4 +- pcmpeqd %xmm4, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm4 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm4, %rax +- or %rax, %rcx +- jnz L(matches) +- +- movdqa (%rdi), %xmm5 +- pcmpeqd %xmm5, %xmm2 +- add $16, %rdi +- pcmpeqd %xmm1, %xmm5 +- pmovmskb %xmm2, %rcx +- pmovmskb %xmm5, %rax +- or %rax, %rcx +- jz L(loop) +- +- .p2align 4 +-L(matches): +- test %rax, %rax +- jnz L(match) +-L(return_value): +- test %r8, %r8 +- jz L(return_null) +- mov %r8, %rax +- mov %rsi, %rdi +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match): +- pmovmskb %xmm2, %rcx +- test %rcx, %rcx +- jnz L(find_zero) +- mov %rax, %r8 +- mov %rdi, %rsi +- jmp L(loop) +- +- .p2align 4 +-L(find_zero): +- test $15, %cl +- jnz L(find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(find_zero_in_second_wchar) +- test $15, %ch +- jnz L(find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_value) +- lea 
-16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_value) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_value) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero): +- add %rcx, %rdi +- mov %rdx, %rcx +-L(prolog_find_zero_1): +- test $15, %cl +- jnz L(prolog_find_zero_in_first_wchar) +- test %cl, %cl +- jnz L(prolog_find_zero_in_second_wchar) +- test $15, %ch +- jnz L(prolog_find_zero_in_third_wchar) +- +- and $1 << 13 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %ah +- jnz L(match_fourth_wchar) +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_first_wchar): +- test $1, %rax +- jz L(return_null) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_second_wchar): +- and $1 << 5 - 1, %rax +- jz L(return_null) +- +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(prolog_find_zero_in_third_wchar): +- and $1 << 9 - 1, %rax +- jz L(return_null) +- +- test %ah, %ah +- jnz L(match_third_wchar) +- test $15 << 4, %al +- jnz L(match_second_wchar) +- lea -16(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_second_wchar): +- lea -12(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_third_wchar): +- lea -8(%rdi), %rax +- ret +- +- .p2align 4 +-L(match_fourth_wchar): +- lea -4(%rdi), %rax +- ret +- +- .p2align 4 +-L(return_null): +- xor %rax, %rax +- ret +- +-END (wcsrchr) ++#include "../strrchr.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-104.patch b/glibc-RHEL-15696-104.patch new file mode 100644 index 0000000..1cb312a --- /dev/null +++ b/glibc-RHEL-15696-104.patch @@ -0,0 +1,501 @@ +From 
df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:29 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2 +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. + +Geometric Mean of all benchmarks New / Old: 0.832 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++--------- + 1 file changed, 269 insertions(+), 157 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index c949410b..3d26fad4 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -27,9 +27,13 @@ + # ifdef USE_AS_WCSRCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMIN vpminud ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMIN vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,196 +45,304 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRRCHR) +- movd %esi, %xmm4 +- movl %edi, %ecx ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRRCHR) ++ movd %esi, %xmm7 ++ movl %edi, %eax + /* Broadcast CHAR to YMM4. */ +- VPBROADCAST %xmm4, %ymm4 ++ VPBROADCAST %xmm7, %ymm7 + vpxor %xmm0, %xmm0, %xmm0 + +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ /* Shift here instead of `andl` to save code size (saves a fetch ++ block). 
*/ ++ sall $20, %eax ++ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax ++ ja L(cross_page) + ++L(page_cross_continue): + vmovdqu (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- addq $VEC_SIZE, %rdi ++ /* Check end of string match. */ ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jz L(aligned_more) ++ ++ /* Only check match with search CHAR if needed. */ ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Check if match before first zero. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If ++ search CHAR is zero we are correct. Either way `andq ++ -CHAR_SIZE, %rax` gets the correct result. */ ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret0): ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ /* Returns for first vec x1/x2 have hard coded backward search ++ path for earlier matches. */ ++ .p2align 4,, 10 ++L(first_vec_x1): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ .p2align 4,, 4 ++L(first_vec_x0_test): ++ VPCMPEQ %ymm1, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ testl %eax, %eax ++ jz L(ret1) ++ bsrl %eax, %eax ++ addq %r8, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret1): ++ VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x0_x1_test): ++ VPCMPEQ %ymm2, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ /* Check ymm2 for search CHAR match. If no match then check ymm1 ++ before returning. 
*/ + testl %eax, %eax +- jnz L(first_vec) ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq 1(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + +- testl %ecx, %ecx +- jnz L(return_null) + +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm6 ++ vpmovmskb %ymm6, %eax ++ blsmskl %ecx, %ecx ++ /* If no in-range search CHAR match in ymm3 then need to check ++ ymm1/ymm2 for an earlier match (we delay checking search ++ CHAR matches until needed). */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN ++ + + .p2align 4 +-L(first_vec): +- /* Check if there is a nul CHAR. */ ++L(aligned_more): ++ /* Save original pointer if match was in VEC 0. */ ++ movq %rdi, %r8 ++ ++ /* Align src. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqu 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx + testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) ++ jnz L(first_vec_x1) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) ++ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3 ++ VPCMPEQ %ymm3, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + ++ /* Save pointer again before realigning. */ ++ movq %rdi, %rsi ++ addq $(VEC_SIZE + 1), %rdi ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %edx +- vpmovmskb %ymm3, %eax +- shrl %cl, %edx +- shrl %cl, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ ++L(first_aligned_loop): ++ /* Do 2x VEC at a time. 
Any more and the cost of finding the ++ match outweights loop benefit. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm8 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm8, %ymm0, %ymm8 ++ vpor %ymm5, %ymm8, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi ++ /* No zero or search CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) ++ jz L(first_aligned_loop) + +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) ++ /* If no zero CHAR then go to second loop (this allows us to ++ throw away all prior work). */ ++ vpmovmskb %ymm8, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_prep) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ /* Search char could be zero so we need to get the true match. ++ */ ++ vpmovmskb %ymm5, %eax ++ testl %eax, %eax ++ jnz L(first_aligned_loop_return) + +- .p2align 4 +-L(aligned_loop): +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- add $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx ++ .p2align 4,, 4 ++L(first_vec_x1_or_x2): ++ VPCMPEQ %ymm3, %ymm7, %ymm3 ++ VPCMPEQ %ymm2, %ymm7, %ymm2 + vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) +- +- vmovdqa (%rdi), %ymm1 +- VPCMPEQ %ymm1, %ymm0, %ymm2 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm1, %ymm4, %ymm3 +- vpmovmskb %ymm2, %ecx +- vpmovmskb %ymm3, %eax +- orl %eax, 
%ecx +- jz L(aligned_loop) +- +- .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a nul CHAR in a loop. */ +- testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ vpmovmskb %ymm2, %edx ++ /* Use add for macro-fusion. */ ++ addq %rax, %rdx ++ jz L(first_vec_x0_test) ++ /* NB: We could move this shift to before the branch and save a ++ bit of code size / performance on the fall through. The ++ branch leads to the null case which generally seems hotter ++ than char in first 3x VEC. */ ++ salq $32, %rax ++ addq %rdx, %rax ++ bsrq %rax, %rax ++ leaq 1(%rsi, %rax), %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4,, 8 ++L(first_aligned_loop_return): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(first_vec_x1_or_x2) ++ ++ bsrq %rax, %rax ++ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax + # ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %eax ++ andq $-CHAR_SIZE, %rax + # endif +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + ++ /* Search char cannot be zero. */ + .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a nul CHAR. */ +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++L(second_aligned_loop_set_furthest_match): ++ /* Save VEC and pointer from most recent match. */ ++L(second_aligned_loop_prep): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ vmovdqu %ymm6, %ymm2 ++ vmovdqu %ymm10, %ymm3 + + .p2align 4 +-L(find_nul): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. 
*/ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++L(second_aligned_loop): ++ /* Search 2x at at time. */ ++ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4 ++ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5 ++ ++ VPCMPEQ %ymm4, %ymm7, %ymm6 ++ VPMIN %ymm4, %ymm5, %ymm1 ++ VPCMPEQ %ymm5, %ymm7, %ymm10 ++ vpor %ymm6, %ymm10, %ymm5 ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpor %ymm5, %ymm1, %ymm9 ++ ++ vpmovmskb %ymm9, %eax ++ addq $(VEC_SIZE * 2), %rdi + testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a nul CHAR. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +-# ifdef USE_AS_WCSRCHR +- /* Keep the first bit for each matching CHAR for bsr. */ +- andl $0x11111111, %ecx +- andl $0x11111111, %eax +-# endif +- /* Mask out any matching bits after the nul CHAR. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax ++ jz L(second_aligned_loop) ++ vpmovmskb %ymm1, %ecx ++ testl %ecx, %ecx ++ jz L(second_aligned_loop_set_furthest_match) ++ vpmovmskb %ymm5, %eax + testl %eax, %eax +- /* Return null pointer if the nul CHAR comes first. */ +- jz L(return_null) +- bsrl %eax, %eax +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ jnz L(return_new_match) ++ ++ /* This is the hot patch. We know CHAR is inbounds and that ++ ymm3/ymm2 have latest match. */ ++ .p2align 4,, 4 ++L(return_old_match): ++ vpmovmskb %ymm3, %eax ++ vpmovmskb %ymm2, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. 
*/ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax ++ /* Last iteration also potentially has a match. */ ++ .p2align 4,, 8 ++L(return_new_match): ++ VPCMPEQ %ymm4, %ymm0, %ymm4 ++ vpmovmskb %ymm4, %edx ++ salq $32, %rcx ++ orq %rdx, %rcx ++ ++ vpmovmskb %ymm10, %eax ++ vpmovmskb %ymm6, %edx ++ salq $32, %rax ++ orq %rdx, %rax ++ blsmskq %rcx, %rcx ++ andq %rcx, %rax ++ jz L(return_old_match) ++ bsrq %rax, %rax ++ /* Search char cannot be zero so safe to just use lea for ++ wcsrchr. */ ++ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax + VZEROUPPER_RETURN + +-END (STRRCHR) ++ .p2align 4,, 4 ++L(cross_page): ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rsi ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ %ymm1, %ymm0, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %ecx, %ecx ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ VPCMPEQ %ymm1, %ymm7, %ymm1 ++ vpmovmskb %ymm1, %eax ++ ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). */ ++ shrxl %edi, %eax, %eax ++ blsmskl %ecx, %ecx ++ /* Check if any search CHAR match in range. */ ++ andl %ecx, %eax ++ jz L(ret2) ++ bsrl %eax, %eax ++ addq %rdi, %rax ++# ifdef USE_AS_WCSRCHR ++ andq $-CHAR_SIZE, %rax ++# endif ++L(ret2): ++ VZEROUPPER_RETURN ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-105.patch b/glibc-RHEL-15696-105.patch new file mode 100644 index 0000000..e0a157f --- /dev/null +++ b/glibc-RHEL-15696-105.patch @@ -0,0 +1,558 @@ +From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 21 Apr 2022 20:52:30 -0500 +Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex +Content-type: text/plain; charset=UTF-8 + +The new code unrolls the main loop slightly without adding too much +overhead and minimizes the comparisons for the search CHAR. 
+ +Geometric Mean of all benchmarks New / Old: 0.755 +See email for all results. + +Full xcheck passes on x86_64 with and without multiarch enabled. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++--------- + 1 file changed, 290 insertions(+), 181 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +index f920b5a5..f5b6d755 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -24,242 +24,351 @@ + # define STRRCHR __strrchr_evex + # endif + +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSRCHR ++# define SHIFT_REG esi ++ ++# define kunpck kunpckbw ++# define kmov_2x kmovd ++# define maskz_2x ecx ++# define maskm_2x eax ++# define CHAR_SIZE 4 ++# define VPMIN vpminud ++# define VPTESTN vptestnmd + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPCMP vpcmpd + # else ++# define SHIFT_REG edi ++ ++# define kunpck kunpckdq ++# define kmov_2x kmovq ++# define maskz_2x rcx ++# define maskm_2x rax ++ ++# define CHAR_SIZE 1 ++# define VPMIN vpminub ++# define VPTESTN vptestnmb + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPCMP vpcmpb + # endif + + # define XMMZERO xmm16 + # define YMMZERO ymm16 + # define YMMMATCH ymm17 +-# define YMM1 ymm18 ++# define YMMSAVE ymm18 ++ ++# define YMM1 ymm19 ++# define YMM2 ymm20 ++# define YMM3 ymm21 ++# define YMM4 ymm22 ++# define YMM5 ymm23 ++# define YMM6 ymm24 ++# define YMM7 ymm25 ++# define YMM8 ymm26 + +-# define VEC_SIZE 32 + +- .section .text.evex,"ax",@progbits +-ENTRY (STRRCHR) +- movl %edi, %ecx ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ .section .text.evex, "ax", @progbits ++ENTRY(STRRCHR) ++ movl %edi, %eax + /* Broadcast CHAR to YMMMATCH. 
*/ + VPBROADCAST %esi, %YMMMATCH + +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_boundary) + ++L(page_cross_continue): + VMOVU (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ /* k0 has a 1 for each zero CHAR in YMM1. */ ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- +- addq $VEC_SIZE, %rdi +- +- testl %eax, %eax +- jnz L(first_vec) +- + testl %ecx, %ecx +- jnz L(return_null) +- +- andq $-VEC_SIZE, %rdi +- xorl %edx, %edx +- jmp L(aligned_loop) +- +- .p2align 4 +-L(first_vec): +- /* Check if there is a null byte. */ +- testl %ecx, %ecx +- jnz L(char_and_nul_in_first_vec) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- movq %rdi, %rsi +- andq $-VEC_SIZE, %rdi +- jmp L(aligned_loop) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ jz L(aligned_more) ++ /* fallthrough: zero CHAR in first VEC. */ + ++ /* K1 has a 1 for each search CHAR match in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Build mask up until first zero CHAR (used to mask of ++ potential search CHAR matches past the end of the string). ++ */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret0) ++ /* Get last match (the `andl` removed any out of bounds ++ matches). */ ++ bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. 
*/ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif ++L(ret0): ++ ret + +- VMOVA (%rdi), %YMM1 +- +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ ++ /* Returns for first vec x1/x2/x3 have hard coded backward ++ search path for earlier matches. */ ++ .p2align 4,, 6 ++L(first_vec_x1): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ blsmskl %ecx, %ecx ++ /* eax non-zero if search CHAR in range. */ ++ andl %ecx, %eax ++ jnz L(first_vec_x1_return) ++ ++ /* fallthrough: no match in YMM2 then need to check for earlier ++ matches (in YMM1). */ ++ .p2align 4,, 4 ++L(first_vec_x0_test): + VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %edx + kmovd %k1, %eax +- +- shrxl %SHIFT_REG, %edx, %edx +- shrxl %SHIFT_REG, %eax, %eax +- addq $VEC_SIZE, %rdi +- +- /* Check if there is a CHAR. */ + testl %eax, %eax +- jnz L(found_char) +- +- testl %edx, %edx +- jnz L(return_null) +- +- jmp L(aligned_loop) +- +- .p2align 4 +-L(found_char): +- testl %edx, %edx +- jnz L(char_and_nul) +- +- /* Remember the match and keep searching. */ +- movl %eax, %edx +- leaq (%rdi, %rcx), %rsi ++ jz L(ret1) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ leaq (%rsi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rsi, %rax ++# endif ++L(ret1): ++ ret + +- .p2align 4 +-L(aligned_loop): +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ .p2align 4,, 10 ++L(first_vec_x1_or_x2): ++ VPCMP $0, %YMM3, %YMMMATCH, %k3 ++ VPCMP $0, %YMM2, %YMMMATCH, %k2 ++ /* K2 and K3 have 1 for any search CHAR match. Test if any ++ matches between either of them. Otherwise check YMM1. */ ++ kortestd %k2, %k3 ++ jz L(first_vec_x0_test) ++ ++ /* Guranteed that YMM2 and YMM3 are within range so merge the ++ two bitmasks then get last result. 
*/ ++ kunpck %k2, %k3, %k3 ++ kmovq %k3, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 6 ++L(first_vec_x3): ++ VPCMP $0, %YMMMATCH, %YMM4, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ ++ andl %ecx, %eax ++ jz L(first_vec_x1_or_x2) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- add $VEC_SIZE, %rdi ++ .p2align 4,, 6 ++L(first_vec_x0_x1_test): ++ VPCMP $0, %YMMMATCH, %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check YMM2 for last match first. If no match try YMM1. */ ++ testl %eax, %eax ++ jz L(first_vec_x0_test) ++ .p2align 4,, 4 ++L(first_vec_x1_return): ++ bsrl %eax, %eax ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 +- kmovd %k0, %ecx ++ .p2align 4,, 10 ++L(first_vec_x2): ++ VPCMP $0, %YMMMATCH, %YMM3, %k1 + kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ blsmskl %ecx, %ecx ++ /* Check YMM3 for last match first. If no match try YMM2/YMM1. ++ */ ++ andl %ecx, %eax ++ jz L(first_vec_x0_x1_test) ++ bsrl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ .p2align 4 ++L(aligned_more): ++ /* Need to keep original pointer incase YMM1 has last match. 
*/ ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ VMOVU VEC_SIZE(%rdi), %YMM2 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jnz L(char_nor_null) ++ testl %ecx, %ecx ++ jnz L(first_vec_x1) + +- VMOVA (%rdi), %YMM1 +- addq $VEC_SIZE, %rdi ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3 ++ VPTESTN %YMM3, %YMM3, %k0 ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(first_vec_x2) + +- /* Each bit in K0 represents a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4 ++ VPTESTN %YMM4, %YMM4, %k0 + kmovd %k0, %ecx +- kmovd %k1, %eax +- orl %eax, %ecx +- jz L(aligned_loop) ++ movq %rdi, %r8 ++ testl %ecx, %ecx ++ jnz L(first_vec_x3) + ++ andq $-(VEC_SIZE * 2), %rdi + .p2align 4 +-L(char_nor_null): +- /* Find a CHAR or a null byte in a loop. */ ++L(first_aligned_loop): ++ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee ++ they don't store a match. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6 ++ ++ VPCMP $0, %YMM5, %YMMMATCH, %k2 ++ vpxord %YMM6, %YMMMATCH, %YMM7 ++ ++ VPMIN %YMM5, %YMM6, %YMM8 ++ VPMIN %YMM8, %YMM7, %YMM7 ++ ++ VPTESTN %YMM7, %YMM7, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(first_aligned_loop) ++ ++ VPCMP $0, %YMM6, %YMMMATCH, %k3 ++ VPTESTN %YMM8, %YMM8, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_prep) ++ ++ kortestd %k2, %k3 ++ jnz L(return_first_aligned_loop) ++ ++ .p2align 4,, 6 ++L(first_vec_x1_or_x2_or_x3): ++ VPCMP $0, %YMM4, %YMMMATCH, %k4 ++ kmovd %k4, %eax + testl %eax, %eax +- jnz L(match) +-L(return_value): +- testl %edx, %edx +- jz L(return_null) +- movl %edx, %eax +- movq %rsi, %rdi ++ jz L(first_vec_x1_or_x2) + bsrl %eax, %eax +-# ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax +-# endif ++ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 +-L(match): +- /* Find a CHAR. Check if there is a null byte. */ +- kmovd %k0, %ecx +- testl %ecx, %ecx +- jnz L(find_nul) ++ .p2align 4,, 8 ++L(return_first_aligned_loop): ++ VPTESTN %YMM5, %YMM5, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(first_vec_x1_or_x2_or_x3) + +- /* Remember the match and keep searching. */ +- movl %eax, %edx ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++ /* We can throw away the work done for the first 4x checks here ++ as we have a later match. This is the 'fast' path persay. ++ */ ++L(second_aligned_loop_prep): ++L(second_aligned_loop_set_furthest_match): + movq %rdi, %rsi +- jmp L(aligned_loop) ++ kunpck %k2, %k3, %k4 + + .p2align 4 +-L(find_nul): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* If there is no CHAR here, return the remembered one. */ +- jz L(return_value) +- bsrl %eax, %eax ++L(second_aligned_loop): ++ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2 ++ ++ VPCMP $0, %YMM1, %YMMMATCH, %k2 ++ vpxord %YMM2, %YMMMATCH, %YMM3 ++ ++ VPMIN %YMM1, %YMM2, %YMM4 ++ VPMIN %YMM3, %YMM4, %YMM3 ++ ++ VPTESTN %YMM3, %YMM3, %k1 ++ subq $(VEC_SIZE * -2), %rdi ++ kortestd %k1, %k2 ++ jz L(second_aligned_loop) ++ ++ VPCMP $0, %YMM2, %YMMMATCH, %k3 ++ VPTESTN %YMM4, %YMM4, %k1 ++ ktestd %k1, %k1 ++ jz L(second_aligned_loop_set_furthest_match) ++ ++ kortestd %k2, %k3 ++ /* branch here because there is a significant advantage interms ++ of output dependency chance in using edx. 
*/ ++ jnz L(return_new_match) ++L(return_old_match): ++ kmovq %k4, %rax ++ bsrq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(return_new_match): ++ VPTESTN %YMM1, %YMM1, %k0 ++ kunpck %k0, %k1, %k0 ++ kmov_2x %k0, %maskz_2x ++ ++ blsmsk %maskz_2x, %maskz_2x ++ kunpck %k2, %k3, %k3 ++ kmov_2x %k3, %maskm_2x ++ and %maskz_2x, %maskm_2x ++ jz L(return_old_match) ++ ++ bsr %maskm_2x, %maskm_2x ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++L(cross_page_boundary): ++ /* eax contains all the page offset bits of src (rdi). `xor rdi, ++ rax` sets pointer will all page offset bits cleared so ++ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC ++ before page cross (guranteed to be safe to read). Doing this ++ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves ++ a bit of code size. */ ++ xorq %rdi, %rax ++ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1 ++ VPTESTN %YMM1, %YMM1, %k0 ++ kmovd %k0, %ecx ++ ++ /* Shift out zero CHAR matches that are before the begining of ++ src (rdi). */ + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ movl %edi, %esi ++ andl $(VEC_SIZE - 1), %esi ++ shrl $2, %esi + # endif +- ret ++ shrxl %SHIFT_REG, %ecx, %ecx + +- .p2align 4 +-L(char_and_nul): +- /* Find both a CHAR and a null byte. */ +- addq %rcx, %rdi +- movl %edx, %ecx +-L(char_and_nul_in_first_vec): +- /* Mask out any matching bits after the null byte. */ +- movl %ecx, %r8d +- subl $1, %r8d +- xorl %ecx, %r8d +- andl %r8d, %eax +- testl %eax, %eax +- /* Return null pointer if the null byte comes first. */ +- jz L(return_null) ++ testl %ecx, %ecx ++ jz L(page_cross_continue) ++ ++ /* Found zero CHAR so need to test for search CHAR. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k1, %eax ++ /* Shift out search CHAR matches that are before the begining of ++ src (rdi). 
*/ ++ shrxl %SHIFT_REG, %eax, %eax ++ ++ /* Check if any search CHAR match in range. */ ++ blsmskl %ecx, %ecx ++ andl %ecx, %eax ++ jz L(ret3) + bsrl %eax, %eax + # ifdef USE_AS_WCSRCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq -VEC_SIZE(%rdi, %rax), %rax ++ addq %rdi, %rax + # endif ++L(ret3): + ret + +- .p2align 4 +-L(return_null): +- xorl %eax, %eax +- ret +- +-END (STRRCHR) ++END(STRRCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-106.patch b/glibc-RHEL-15696-106.patch new file mode 100644 index 0000000..f3bdb17 --- /dev/null +++ b/glibc-RHEL-15696-106.patch @@ -0,0 +1,73 @@ +From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 27 Apr 2022 15:13:02 -0500 +Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interface in fast-jitter.h +Content-type: text/plain; charset=UTF-8 + +'get_fast_jitter' is meant to be used purely for performance +purposes. In all cases where it's used it should be acceptable to get no +randomness (see default case). An example use case is in setting +jitter for retries between threads at a lock. There is a +performance benefit to having jitter, but only if the jitter can +be generated very quickly and ultimately there is no serious issue +if no jitter is generated. + +The implementation generally uses 'HP_TIMING_NOW' iff it is +inlined (avoid any potential syscall paths). +Reviewed-by: H.J. Lu +--- + sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++ + 1 file changed, 42 insertions(+) + create mode 100644 sysdeps/generic/fast-jitter.h + +diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h +new file mode 100644 +index 00000000..4dd53e34 +--- /dev/null ++++ b/sysdeps/generic/fast-jitter.h +@@ -0,0 +1,42 @@ ++/* Fallback for fast jitter just return 0. ++ Copyright (C) 2019-2022 Free Software Foundation, Inc.
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#ifndef _FAST_JITTER_H ++# define _FAST_JITTER_H ++ ++# include <stdint.h> ++# include <hp-timing.h> ++ ++/* Baseline just return 0. We could create jitter using a clock or ++ 'random_bits' but that may imply a syscall and the goal of ++ 'get_fast_jitter' is minimal overhead "randomness" when such ++ randomness helps performance. Adding high overhead to the function ++ defeats the purpose. */ ++static inline uint32_t ++get_fast_jitter (void) ++{ ++# if HP_TIMING_INLINE ++ hp_timing_t jitter; ++ HP_TIMING_NOW (jitter); ++ return (uint32_t) jitter; ++# else ++ return 0; ++# endif ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-107.patch b/glibc-RHEL-15696-107.patch new file mode 100644 index 0000000..738cc23 --- /dev/null +++ b/glibc-RHEL-15696-107.patch @@ -0,0 +1,226 @@ +From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001 +From: Wangyang Guo +Date: Fri, 6 May 2022 01:50:10 +0000 +Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop +Content-type: text/plain; charset=UTF-8 + +When multiple threads are waiting for a lock at the same time, once lock owner +releases the lock, waiters will see lock available and all try to lock, +which may cause an expensive CAS storm. + +Binary exponential backoff with random jitter is introduced.
As try-lock +attempt increases, there is more likely that a larger number threads +compete for adaptive mutex lock, so increase wait time in exponential. +A random jitter is also added to avoid synchronous try-lock from other +threads. + +v2: Remove read-check before try-lock for performance. + +v3: +1. Restore read-check since it works well in some platform. +2. Make backoff arch dependent, and enable it for x86_64. +3. Limit max backoff to reduce latency in large critical section. + +v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h + +v5: Commit log updated for regression in large critical section. + +Result of pthread-mutex-locks bench + +Test Platform: Xeon 8280L (2 socket, 112 CPUs in total) +First Row: thread number +First Col: critical section length +Values: backoff vs upstream, time based, low is better + +non-critical-length: 1 + 1 2 4 8 16 32 64 112 140 +0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54 +1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57 +2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61 +4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65 +8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71 +16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80 +32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90 +64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99 +128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02 + +non-critical-length: 32 + 1 2 4 8 16 32 64 112 140 +0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70 +1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72 +2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74 +4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77 +8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80 +16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84 +32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91 +64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99 +128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99 + +non-critical-length: 128 + 1 2 4 8 16 32 64 112 140 +0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73 +1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74 +2 1.00 1.04 1.00 
0.68 0.59 0.61 0.65 0.76 0.76 +4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77 +8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80 +16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84 +32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91 +64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99 +128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98 + +There is regression in large critical section. But adaptive mutex is +aimed for "quick" locks. Small critical section is more common when +users choose to use adaptive pthread_mutex. + +Signed-off-by: Wangyang Guo +Reviewed-by: H.J. Lu + +Conflicts: + pthreadP.h + (had been moved) + nptl/pthread_mutex_lock.c + (max_adaptive_count renamed) + +--- + nptl/pthreadP.h | 1 + + nptl/pthread_mutex_lock.c | 16 +++++++-- + sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++ + sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++ + 4 files changed, 89 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h + create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h + +diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h +index 7ddc166c..1550e3b6 100644 +--- a/nptl/pthreadP.h ++++ b/nptl/pthreadP.h +@@ -33,6 +33,7 @@ + #include + #include + #include ++#include + + + /* Atomic operations on TLS memory. */ +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index d96a9933..c7770fc9 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + int cnt = 0; + int max_cnt = MIN (MAX_ADAPTIVE_COUNT, + mutex->__data.__spins * 2 + 10); ++ int spin_count, exp_backoff = 1; ++ unsigned int jitter = get_jitter (); + do + { +- if (cnt++ >= max_cnt) ++ /* In each loop, spin count is exponential backoff plus ++ random jitter, random range is [0, exp_backoff-1]. 
*/ ++ spin_count = exp_backoff + (jitter & (exp_backoff - 1)); ++ cnt += spin_count; ++ if (cnt >= max_cnt) + { ++ /* If cnt exceeds max spin count, just go to wait ++ queue. */ + LLL_MUTEX_LOCK (mutex); + break; + } +- atomic_spin_nop (); ++ do ++ atomic_spin_nop (); ++ while (--spin_count > 0); ++ /* Prepare for next loop. */ ++ exp_backoff = get_next_backoff (exp_backoff); + } + while (LLL_MUTEX_READ_LOCK (mutex) != 0 + || LLL_MUTEX_TRYLOCK (mutex) != 0); +diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..5b26c22a +--- /dev/null ++++ b/sysdeps/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,35 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ /* Arch dependent random jitter, return 0 disables random. */ ++ return 0; ++} ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Next backoff, return 1 disables mutex backoff. 
*/ ++ return 1; ++} ++ ++#endif +diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +new file mode 100644 +index 00000000..ec74c3d9 +--- /dev/null ++++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h +@@ -0,0 +1,39 @@ ++/* Pthread mutex backoff configuration. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++#ifndef _PTHREAD_MUTEX_BACKOFF_H ++#define _PTHREAD_MUTEX_BACKOFF_H 1 ++ ++#include <fast-jitter.h> ++ ++static inline unsigned int ++get_jitter (void) ++{ ++ return get_fast_jitter (); ++} ++ ++#define MAX_BACKOFF 16 ++ ++static inline int ++get_next_backoff (int backoff) ++{ ++ /* Binary exponential backoff. Limiting max backoff ++ can reduce latency in large critical section. */ ++ return (backoff < MAX_BACKOFF) ?
backoff << 1 : backoff; ++} ++ ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-108.patch b/glibc-RHEL-15696-108.patch new file mode 100644 index 0000000..17bf7d8 --- /dev/null +++ b/glibc-RHEL-15696-108.patch @@ -0,0 +1,55 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +no checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------ + 1 file changed, 2 insertions(+), 6 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 28cc98b6..e267c6cb 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -345,10 +345,10 @@ L(one_or_less): + movq %LOCALE_REG, %rdx + # endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_avx2 ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -357,10 +357,6 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned).
*/ +- +- jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + TOLOWER_gpr (%rax, %eax) +-- +GitLab + diff --git a/glibc-RHEL-15696-109.patch b/glibc-RHEL-15696-109.patch new file mode 100644 index 0000000..8aaa314 --- /dev/null +++ b/glibc-RHEL-15696-109.patch @@ -0,0 +1,60 @@ +From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001 +From: Stefan Liebler +Date: Mon, 28 Jun 2021 13:01:07 +0200 +Subject: s390x: Update math: redirect roundeven function + +After recent commit +447954a206837b5f153869cfeeeab44631c3fac9 +"math: redirect roundeven function", building on +s390x fails with: +Error: symbol `__roundevenl' is already defined + +Similar to aarch64/riscv fix, this patch redirects target +specific functions for s390x: +commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6 +"Update math: redirect roundeven function" + +diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c +index 40b07e054b..0773adfed0 100644 +--- a/sysdeps/s390/fpu/s_roundeven.c ++++ b/sysdeps/s390/fpu/s_roundeven.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +@@ -31,7 +32,6 @@ __roundeven (double x) + __asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x)); + return y; + } +-hidden_def (__roundeven) + libm_alias_double (__roundeven, roundeven) + + #else +diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c +index d2fbf3d2b6..289785bc4a 100644 +--- a/sysdeps/s390/fpu/s_roundevenf.c ++++ b/sysdeps/s390/fpu/s_roundevenf.c +@@ -18,6 +18,7 @@ + . */ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + +diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c +index 29ab7a8616..94b6459ab4 100644 +--- a/sysdeps/s390/fpu/s_roundevenl.c ++++ b/sysdeps/s390/fpu/s_roundevenl.c +@@ -18,6 +18,7 @@ + . 
*/ + + #ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT ++# define NO_MATH_REDIRECT + # include + # include + # include diff --git a/glibc-RHEL-15696-11.patch b/glibc-RHEL-15696-11.patch new file mode 100644 index 0000000..54d7eff --- /dev/null +++ b/glibc-RHEL-15696-11.patch @@ -0,0 +1,74 @@ +From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 26 Feb 2021 05:36:59 -0800 +Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP +Content-type: text/plain; charset=UTF-8 + +1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered +by VZEROUPPER inside a transactionally executing RTM region. +2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2 +loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs, +1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add +Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions. +--- + sysdeps/x86/cpu-features.c | 20 +++++++++++++++++-- + sysdeps/x86/cpu-tunables.c | 2 ++ + ...cpu-features-preferred_feature_index_1.def | 1 + + 3 files changed, 21 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 91042505..3610ee5c 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features) + cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] + |= bit_arch_Prefer_No_VZEROUPPER; + else +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ { ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; ++ ++ /* Avoid RTM abort triggered by VZEROUPPER inside a ++ transactionally executing RTM region. 
*/ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER] ++ |= bit_arch_Prefer_No_VZEROUPPER; ++ ++ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp ++ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp ++ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB, ++ AVX2 strcmp is faster than EVEX strcmp. */ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP] ++ |= bit_arch_Prefer_AVX2_STRCMP; ++ } + } + /* This spells out "AuthenticAMD". */ + else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65) +diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c +index 3173b2b9..73adbaba 100644 +--- a/sysdeps/x86/cpu-tunables.c ++++ b/sysdeps/x86/cpu-tunables.c +@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp) + CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features, + Fast_Copy_Backward, + disable, 18); ++ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH ++ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18); + } + break; + case 19: +diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +index 17a5cc42..4ca70b40 100644 +--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def ++++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def +@@ -32,3 +32,4 @@ BIT (Prefer_ERMS) + BIT (Prefer_FSRM) + BIT (Prefer_No_AVX512) + BIT (MathVec_Prefer_No_AVX512) ++BIT (Prefer_AVX2_STRCMP) +-- +GitLab + diff --git a/glibc-RHEL-15696-110.patch b/glibc-RHEL-15696-110.patch new file mode 100644 index 0000000..c499761 --- /dev/null +++ b/glibc-RHEL-15696-110.patch @@ -0,0 +1,26 @@ +From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Wed, 23 Jun 2021 13:29:41 -0700 +Subject: Update math: redirect roundeven function + +Redirect target specific roundeven functions for aarch64, ldbl-128ibm +and riscv. + +Conflicts: + sysdeps/aarch64/* + (not needed) + sysdeps/riscv/* + (not supported) + +diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +index 6701970f4a..90eecf496b 100644 +--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + diff --git a/glibc-RHEL-15696-12.patch b/glibc-RHEL-15696-12.patch new file mode 100644 index 0000000..85b568e --- /dev/null +++ b/glibc-RHEL-15696-12.patch @@ -0,0 +1,3410 @@ +From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:24:52 -0800 +Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to +select the function optimized with 256-bit EVEX instructions using +YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW +and BMI2 since VZEROUPPER isn't needed at function exit. + +For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP +is set. 
+--- + sysdeps/x86_64/multiarch/Makefile | 21 +- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 14 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 81 ++ + sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++ + sysdeps/x86_64/multiarch/memrchr-evex.S | 337 +++++++ + sysdeps/x86_64/multiarch/rawmemchr-evex.S | 4 + + sysdeps/x86_64/multiarch/strchr-evex.S | 335 +++++++ + sysdeps/x86_64/multiarch/strchr.c | 14 +- + sysdeps/x86_64/multiarch/strchrnul-evex.S | 3 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1043 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp.c | 15 +- + sysdeps/x86_64/multiarch/strlen-evex.S | 436 ++++++++ + sysdeps/x86_64/multiarch/strncmp-evex.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 15 +- + sysdeps/x86_64/multiarch/strnlen-evex.S | 4 + + sysdeps/x86_64/multiarch/strrchr-evex.S | 265 +++++ + sysdeps/x86_64/multiarch/wcschr-evex.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-evex.S | 4 + + sysdeps/x86_64/multiarch/wcslen-evex.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 14 +- + sysdeps/x86_64/multiarch/wcsrchr-evex.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-evex.S | 4 + + 24 files changed, 2996 insertions(+), 17 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S + create 
mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S + +Conflicts: + sysdeps/x86_64/multiarch/wcsnlen.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9477538a..5ce85882 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memmove-avx512-unaligned-erms \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms ++ memset-avx512-unaligned-erms \ ++ memchr-evex \ ++ memrchr-evex \ ++ rawmemchr-evex \ ++ strchr-evex \ ++ strchrnul-evex \ ++ strcmp-evex \ ++ strlen-evex \ ++ strncmp-evex \ ++ strnlen-evex \ ++ strrchr-evex + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 ++ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-evex \ ++ wcscmp-evex \ ++ wcslen-evex \ ++ wcsncmp-evex \ ++ wcsnlen-evex \ ++ wcsrchr-evex \ ++ wmemchr-evex + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 5c88640a..7081b0c9 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,16 +21,24 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const 
struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + return OPTIMIZE (sse2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fe13505c..bd7d9f19 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. 
*/ +@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memrchr_evex) ++ + IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2)) + + #ifdef SHARED +@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. */ +@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ +@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + + /* Support sysdeps/x86_64/multiarch/stpncpy.c. 
*/ +@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchr_evex) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf) + IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2)) + +@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strchrnul_evex) + IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2)) + + /* Support sysdeps/x86_64/multiarch/strrchr.c. */ +@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strrchr_evex) + IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strcmp.c. 
*/ +@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __strcmp_evex) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2), + __strcmp_sse42) + IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcschr_evex) + IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsrchr.c. */ +@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsrchr_evex) + IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscmp.c. */ +@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcscmp_evex) + IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsncmp.c. 
*/ +@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsncmp_evex) + IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcscpy.c. */ +@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcslen_evex) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wcsnlen_evex) + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (SSE4_1), + __wcsnlen_sse4_1) +@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. 
*/ +@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncmp_evex) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2), + __strncmp_sse42) + IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +new file mode 100644 +index 00000000..6dd5d67b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -0,0 +1,381 @@ ++/* memchr/wmemchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef MEMCHR ++# define MEMCHR __memchr_evex ++# endif ++ ++# ifdef USE_AS_WMEMCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMMATCH xmm16 ++# define YMMMATCH ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCHR) ++# ifndef USE_AS_RAWMEMCHR ++ /* Check for zero length. */ ++ test %RDX_LP, %RDX_LP ++ jz L(zero) ++# endif ++ movl %edi, %ecx ++# ifdef USE_AS_WMEMCHR ++ shl $2, %RDX_LP ++# else ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++# endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++# ifndef USE_AS_RAWMEMCHR ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rdx ++ jbe L(zero) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. 
*/ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" ++ instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition ++ overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rdx ++ jbe L(zero) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. */ ++ addq %rcx, %rdx ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward.
*/ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_RAWMEMCHR ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x3_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %edx ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %edx ++ jle L(zero) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rdx ++ jbe L(zero) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ testl %eax, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++# endif ++ ret ++ ++END (MEMCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S +new file mode 100644 +index 00000000..16bf8e02 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-evex.S +@@ -0,0 +1,337 @@ ++/* memrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# define VMOVA vmovdqa64 ++ ++# define YMMMATCH ymm16 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (__memrchr_evex) ++ /* Broadcast CHAR to YMMMATCH. */ ++ vpbroadcastb %esi, %YMMMATCH ++ ++ sub $VEC_SIZE, %RDX_LP ++ jbe L(last_vec_or_less) ++ ++ add %RDX_LP, %RDI_LP ++ ++ /* Check the last VEC_SIZE bytes. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ subq $(VEC_SIZE * 4), %rdi ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(aligned_more) ++ ++ /* Align data for aligned loads in the loop. 
*/ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rdx ++ andq $-VEC_SIZE, %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(aligned_more): ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x0) ++ ++ /* Align data to 4 * VEC_SIZE for loop with fewer branches. ++ There are some overlaps with above if data isn't aligned ++ to 4 * VEC_SIZE. */ ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ jz L(loop_4x_vec) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rdx ++ andq $-(VEC_SIZE * 4), %rdi ++ subq %rcx, %rdx ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time backward. */ ++ subq $(VEC_SIZE * 4), %rdi ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(last_4x_vec_or_less) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 ++ kord %k1, %k2, %k5 ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 ++ ++ kord %k3, %k4, %k6 ++ kortestd %k5, %k6 ++ jz L(loop_4x_vec) ++ ++ /* There is a match.
*/ ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ kmovd %k1, %eax ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_4x_vec_or_less): ++ addl $(VEC_SIZE * 4), %edx ++ cmpl $(VEC_SIZE * 2), %edx ++ jbe L(last_2x_vec) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3 ++ kmovd %k3, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x1_check) ++ cmpl $(VEC_SIZE * 3), %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k4 ++ kmovd %k4, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 4), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3_check) ++ cmpl $VEC_SIZE, %edx ++ jbe L(zero) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jz L(zero) ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 2), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x0): ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ bsrl %eax, %eax ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 2), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x3): ++ bsrl %eax, %eax ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x1_check): ++ bsrl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $VEC_SIZE, %eax ++ addq %rdi, %rax ++ ret ++ ++ 
.p2align 4 ++L(last_vec_x3_check): ++ bsrl %eax, %eax ++ subq $VEC_SIZE, %rdx ++ addq %rax, %rdx ++ jl L(zero) ++ addl $(VEC_SIZE * 3), %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less_aligned): ++ movl %edx, %ecx ++ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ /* Support rdx << 32. */ ++ salq %cl, %rdx ++ subq $1, %rdx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_or_less): ++ addl $VEC_SIZE, %edx ++ ++ /* Check for zero length. */ ++ testl %edx, %edx ++ jz L(zero) ++ ++ movl %edi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ jz L(last_vec_or_less_aligned) ++ ++ movl %ecx, %esi ++ movl %ecx, %r8d ++ addl %edx, %esi ++ andq $-VEC_SIZE, %rdi ++ ++ subl $VEC_SIZE, %esi ++ ja L(last_vec_2x_aligned) ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ kmovd %k1, %eax ++ ++ /* Remove the leading and trailing bytes. */ ++ sarl %cl, %eax ++ movl %edx, %ecx ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ andl %edx, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_2x_aligned): ++ movl %esi, %ecx ++ ++ /* Check the last VEC. */ ++ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 ++ ++ movl $1, %edx ++ sall %cl, %edx ++ subl $1, %edx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the trailing bytes. */ ++ andl %edx, %eax ++ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ /* Check the second last VEC. */ ++ vpcmpb $0, (%rdi), %YMMMATCH, %k1 ++ ++ movl %r8d, %ecx ++ ++ kmovd %k1, %eax ++ ++ /* Remove the leading bytes. Must use unsigned right shift for ++ bsrl below. 
*/ ++ shrl %cl, %eax ++ testl %eax, %eax ++ jz L(zero) ++ ++ bsrl %eax, %eax ++ addq %rdi, %rax ++ addq %r8, %rax ++ ret ++END (__memrchr_evex) ++#endif +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +new file mode 100644 +index 00000000..ec942b77 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_evex ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +new file mode 100644 +index 00000000..ddc86a70 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -0,0 +1,335 @@ ++/* strchr/strchrnul optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCHR ++# define STRCHR __strchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define CHAR_REG esi ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define CHAR_REG sil ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++ ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCHR) ++ movl %edi, %ecx ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ ++ VPBROADCAST %esi, %YMM0 ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we cross page boundary with one vector load. */ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null bytes. */ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(more_vecs) ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. 
*/ ++ andq $-VEC_SIZE, %rdi ++L(aligned_more): ++ ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VMOVA VEC_SIZE(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ ktestd %k0, %k0 ++ jz L(prep_loop_4x) ++ ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq VEC_SIZE(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Found CHAR or the null byte. */ ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM5 ++ vpxorq %YMM2, %YMM0, %YMM6 ++ vpxorq %YMM3, %YMM0, %YMM7 ++ vpxorq %YMM4, %YMM0, %YMM8 ++ ++ VPMINU %YMM5, %YMM1, %YMM5 ++ VPMINU %YMM6, %YMM2, %YMM6 ++ VPMINU %YMM7, %YMM3, %YMM7 ++ VPMINU %YMM8, %YMM4, %YMM8 ++ ++ VPMINU %YMM5, %YMM6, %YMM1 ++ VPMINU %YMM7, %YMM8, %YMM2 ++ ++ VPMINU %YMM1, %YMM2, %YMM1 ++ ++ /* Each bit in K0 represents a CHAR or a null byte. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++ ktestd %k0, %k0 ++ jz L(loop_4x_vec) ++ ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ ++ VPCMP $0, %YMMZERO, %YMM5, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ ++ VPCMP $0, %YMMZERO, %YMM8, %k3 ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Each bit in K2/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k1 ++# else ++ kshiftlq $32, %k3, %k1 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rax ++ ++ tzcntq %rax, %rax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++# else ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_WCSCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ /* Remove the leading bits. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rcx, %rdi ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq (%rdi, %rax, 4), %rax ++# else ++ addq %rdi, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ ret ++ ++END (STRCHR) ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 32954713..be05e197 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF)) + return OPTIMIZE (sse2_no_bsf); +diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S +new file mode 100644 +index 00000000..064fe7ca +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __strchrnul_evex ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +new file mode 100644 +index 00000000..459eeed0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -0,0 +1,1043 @@ ++/* 
strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCMP ++# define STRCMP __strcmp_evex ++# endif ++ ++# define PAGE_SIZE 4096 ++ ++/* VEC_SIZE = Number of bytes in a ymm register */ ++# define VEC_SIZE 32 ++ ++/* Shift for dividing by (VEC_SIZE * 4). */ ++# define DIVIDE_BY_VEC_4_SHIFT 7 ++# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSCMP ++/* Compare packed dwords. */ ++# define VPCMP vpcmpd ++# define SHIFT_REG32 r8d ++# define SHIFT_REG64 r8 ++/* 1 dword char == 4 bytes. */ ++# define SIZE_OF_CHAR 4 ++# else ++/* Compare packed bytes. */ ++# define VPCMP vpcmpb ++# define SHIFT_REG32 ecx ++# define SHIFT_REG64 rcx ++/* 1 byte char == 1 byte. */ ++# define SIZE_OF_CHAR 1 ++# endif ++ ++# define XMMZERO xmm16 ++# define XMM0 xmm17 ++# define XMM1 xmm18 ++ ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++ ++/* Warning! 
++ wcscmp/wcsncmp have to use SIGNED comparison for elements. ++ strcmp/strncmp have to use UNSIGNED comparison for elements. ++*/ ++ ++/* The main idea of the string comparison (byte or dword) using 256-bit ++ EVEX instructions consists of comparing (VPCMP) two ymm vectors. The ++ latter can be on either packed bytes or dwords depending on ++ USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 ++ KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) ++ are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd ++ instructions. Main loop (away from the page boundary) compares 4 ++ vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128 ++ bytes) on each loop. ++ ++ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic ++ is the same as strcmp, except that a maximum offset is tracked. If ++ the maximum offset is reached before a difference is found, zero is ++ returned. */ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCMP) ++# ifdef USE_AS_STRNCMP ++ /* Check for simple cases (0 or 1) in offset. */ ++ cmp $1, %RDX_LP ++ je L(char0) ++ jb L(zero) ++# ifdef USE_AS_WCSCMP ++ /* Convert units: from wide to byte char. */ ++ shl $2, %RDX_LP ++# endif ++ /* Register %r11 tracks the maximum offset. */ ++ mov %RDX_LP, %R11_LP ++# endif ++ movl %edi, %eax ++ xorl %edx, %edx ++ /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax ++ jg L(cross_page) ++ /* Start comparing 4 vectors. */ ++ VMOVU (%rdi), %YMM0 ++ VMOVU (%rsi), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ ++ /* Check for NULL in YMM0. */ ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ /* Check for NULL in YMM1.
*/ ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ ++ /* Each bit in K1 represents: ++ 1. A mismatch in YMM0 and YMM1. Or ++ 2. A NULL in YMM0 or YMM1. ++ */ ++ kord %k0, %k1, %k1 ++ ++ ktestd %k1, %k1 ++ je L(next_3_vectors) ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx) is after the maximum ++ offset (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ je L(return) ++L(wcscmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++L(return): ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after ++ the maximum offset (%r11). */ ++ addq $VEC_SIZE, %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rdx), %ecx ++ cmpl VEC_SIZE(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rdx), %eax ++ movzbl VEC_SIZE(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_2_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 2), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(return_3_vec_size): ++ kmovd %k1, %ecx ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is ++ after the maximum offset (%r11). */ ++ addq $(VEC_SIZE * 3), %rdx ++ cmpq %r11, %rdx ++ jae L(zero) ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx ++ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(next_3_vectors): ++ VMOVU VEC_SIZE(%rdi), %YMM0 ++ VMOVU VEC_SIZE(%rsi), %YMM1 ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. 
*/ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_vec_size) ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 ++ ++ /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ ++ VPCMP $4, %YMM2, %YMM4, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_2_vec_size) ++ ++ /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ ++ VPCMP $4, %YMM3, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM3, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ ktestd %k1, %k1 ++ jne L(return_3_vec_size) ++L(main_loop_header): ++ leaq (VEC_SIZE * 4)(%rdi), %rdx ++ movl $PAGE_SIZE, %ecx ++ /* Align load via RAX. */ ++ andq $-(VEC_SIZE * 4), %rdx ++ subq %rdi, %rdx ++ leaq (%rdi, %rdx), %rax ++# ifdef USE_AS_STRNCMP ++ /* Starting from this point, the maximum offset, or simply the ++ 'offset', DECREASES by the same amount when base pointers are ++ moved forward. Return 0 when: ++ 1) On match: offset <= the matched vector index. ++ 2) On mistmach, offset is before the mistmatched index. ++ */ ++ subq %rdx, %r11 ++ jbe L(zero) ++# endif ++ addq %rsi, %rdx ++ movq %rdx, %rsi ++ andl $(PAGE_SIZE - 1), %esi ++ /* Number of bytes before page crossing. */ ++ subq %rsi, %rcx ++ /* Number of VEC_SIZE * 4 blocks before page crossing. */ ++ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx ++ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. 
*/ ++ movl %ecx, %esi ++ jmp L(loop_start) ++ ++ .p2align 4 ++L(loop): ++# ifdef USE_AS_STRNCMP ++ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease ++ the maximum offset (%r11) by the same amount. */ ++ subq $(VEC_SIZE * 4), %r11 ++ jbe L(zero) ++# endif ++ addq $(VEC_SIZE * 4), %rax ++ addq $(VEC_SIZE * 4), %rdx ++L(loop_start): ++ testl %esi, %esi ++ leal -1(%esi), %esi ++ je L(loop_cross_page) ++L(back_to_loop): ++ /* Main loop, comparing 4 vectors are a time. */ ++ VMOVA (%rax), %YMM0 ++ VMOVA VEC_SIZE(%rax), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ VMOVU (%rdx), %YMM1 ++ VMOVU VEC_SIZE(%rdx), %YMM3 ++ VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 ++ VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 ++ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K4 represents a NULL or a mismatch in YMM0 and ++ YMM1. */ ++ kord %k0, %k1, %k4 ++ ++ VPCMP $4, %YMM2, %YMM3, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM3, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K5 represents a NULL or a mismatch in YMM2 and ++ YMM3. */ ++ kord %k0, %k1, %k5 ++ ++ VPCMP $4, %YMM4, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPCMP $0, %YMMZERO, %YMM5, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K6 represents a NULL or a mismatch in YMM4 and ++ YMM5. */ ++ kord %k0, %k1, %k6 ++ ++ VPCMP $4, %YMM6, %YMM7, %k0 ++ VPCMP $0, %YMMZERO, %YMM6, %k1 ++ VPCMP $0, %YMMZERO, %YMM7, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K7 represents a NULL or a mismatch in YMM6 and ++ YMM7. */ ++ kord %k0, %k1, %k7 ++ ++ kord %k4, %k5, %k0 ++ kord %k6, %k7, %k1 ++ ++ /* Test each mask (32 bits) individually because for VEC_SIZE ++ == 32 is not possible to OR the four masks and keep all bits ++ in a 64-bit integer register, differing from SSE2 strcmp ++ where ORing is possible. 
*/ ++ kortestd %k0, %k1 ++ je L(loop) ++ ktestd %k4, %k4 ++ je L(test_vec) ++ kmovd %k4, %edi ++ tzcntl %edi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first vector matched. Return 0 if the maximum offset ++ (%r11) <= VEC_SIZE. */ ++ cmpq $VEC_SIZE, %r11 ++ jbe L(zero) ++# endif ++ ktestd %k5, %k5 ++ je L(test_2_vec) ++ kmovd %k5, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $VEC_SIZE, %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl VEC_SIZE(%rsi, %rdi), %ecx ++ cmpl VEC_SIZE(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl VEC_SIZE(%rax, %rdi), %eax ++ movzbl VEC_SIZE(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_2_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 2 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 2 * VEC_SIZE. 
*/ ++ cmpq $(VEC_SIZE * 2), %r11 ++ jbe L(zero) ++# endif ++ ktestd %k6, %k6 ++ je L(test_3_vec) ++ kmovd %k6, %ecx ++ tzcntl %ecx, %edi ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edi ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rdi ++ cmpq %rdi, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rdi), %ecx ++ cmpl (%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rdi), %eax ++ movzbl (%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx ++ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(test_3_vec): ++# ifdef USE_AS_STRNCMP ++ /* The first 3 vectors matched. Return 0 if the maximum offset ++ (%r11) <= 3 * VEC_SIZE. */ ++ cmpq $(VEC_SIZE * 3), %r11 ++ jbe L(zero) ++# endif ++ kmovd %k7, %esi ++ tzcntl %esi, %ecx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 3), %rcx ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %esi ++ cmpl (%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi ++ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page): ++ xorl %r10d, %r10d ++ movq %rdx, %rcx ++ /* Align load via RDX. We load the extra ECX bytes which should ++ be ignored. */ ++ andl $((VEC_SIZE * 4) - 1), %ecx ++ /* R10 is -RCX. */ ++ subq %rcx, %r10 ++ ++ /* This works only if VEC_SIZE * 2 == 64. */ ++# if (VEC_SIZE * 2) != 64 ++# error (VEC_SIZE * 2) != 64 ++# endif ++ ++ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ ++ cmpl $(VEC_SIZE * 2), %ecx ++ jge L(loop_cross_page_2_vec) ++ ++ VMOVU (%rax, %r10), %YMM2 ++ VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ VMOVU (%rdx, %r10), %YMM4 ++ VMOVU VEC_SIZE(%rdx, %r10), %YMM5 ++ ++ VPCMP $4, %YMM4, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM2, %k1 ++ VPCMP $0, %YMMZERO, %YMM4, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM2 and ++ YMM4. */ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM5, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM3, %k4 ++ VPCMP $0, %YMMZERO, %YMM5, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM3 and ++ YMM5. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. 
*/ ++ movl %ecx, %SHIFT_REG32 ++ sarl $2, %SHIFT_REG32 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ ++ shrxq %SHIFT_REG64, %rdi, %rdi ++ testq %rdi, %rdi ++ je L(loop_cross_page_2_vec) ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++ .p2align 4 ++L(loop_cross_page_2_vec): ++ /* The first VEC_SIZE * 2 bytes match or are ignored. */ ++ VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 ++ VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++ VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 ++ VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 ++ ++ VPCMP $4, %YMM0, %YMM2, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM2, %k2 ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch in YMM0 and ++ YMM2. */ ++ kord %k0, %k1, %k1 ++ ++ VPCMP $4, %YMM1, %YMM3, %k3 ++ VPCMP $0, %YMMZERO, %YMM1, %k4 ++ VPCMP $0, %YMMZERO, %YMM3, %k5 ++ kord %k4, %k5, %k4 ++ /* Each bit in K3 represents a NULL or a mismatch in YMM1 and ++ YMM3. */ ++ kord %k3, %k4, %k3 ++ ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in K1/K3 represents 4-byte element. */ ++ kshiftlw $8, %k3, %k2 ++# else ++ kshiftlq $32, %k3, %k2 ++# endif ++ ++ /* Each bit in K1 represents a NULL or a mismatch. 
*/ ++ korq %k1, %k2, %k1 ++ kmovq %k1, %rdi ++ ++ xorl %r8d, %r8d ++ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ ++ subl $(VEC_SIZE * 2), %ecx ++ jle 1f ++ /* R8 has number of bytes skipped. */ ++ movl %ecx, %r8d ++# ifdef USE_AS_WCSCMP ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ sarl $2, %ecx ++# endif ++ /* Skip ECX bytes. */ ++ shrq %cl, %rdi ++1: ++ /* Before jumping back to the loop, set ESI to the number of ++ VEC_SIZE * 4 blocks before page crossing. */ ++ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi ++ ++ testq %rdi, %rdi ++# ifdef USE_AS_STRNCMP ++ /* At this point, if %rdi value is 0, it already tested ++ VEC_SIZE*4+%r10 byte starting from %rax. This label ++ checks whether strncmp maximum offset reached or not. */ ++ je L(string_nbyte_offset_check) ++# else ++ je L(back_to_loop) ++# endif ++ tzcntq %rdi, %rcx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %ecx ++# endif ++ addq %r10, %rcx ++ /* Adjust for number of bytes skipped. */ ++ addq %r8, %rcx ++# ifdef USE_AS_STRNCMP ++ addq $(VEC_SIZE * 2), %rcx ++ subq %rcx, %r11 ++ jbe L(zero) ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (%rsi, %rcx), %edi ++ cmpl (%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (%rax, %rcx), %eax ++ movzbl (%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# else ++# ifdef USE_AS_WCSCMP ++ movq %rax, %rsi ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi ++ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi ++ jne L(wcscmp_return) ++# else ++ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx ++ subl %edx, %eax ++# endif ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCMP ++L(string_nbyte_offset_check): ++ leaq (VEC_SIZE * 4)(%r10), %r10 ++ cmpq %r10, %r11 ++ jbe L(zero) ++ jmp L(back_to_loop) ++# endif ++ ++ .p2align 4 ++L(cross_page_loop): ++ /* Check one byte/dword at a time. 
*/ ++# ifdef USE_AS_WCSCMP ++ cmpl %ecx, %eax ++# else ++ subl %ecx, %eax ++# endif ++ jne L(different) ++ addl $SIZE_OF_CHAR, %edx ++ cmpl $(VEC_SIZE * 4), %edx ++ je L(main_loop_header) ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ /* Check null char. */ ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED ++ comparisons. */ ++ subl %ecx, %eax ++# ifndef USE_AS_WCSCMP ++L(different): ++# endif ++ ret ++ ++# ifdef USE_AS_WCSCMP ++ .p2align 4 ++L(different): ++ /* Use movl to avoid modifying EFLAGS. */ ++ movl $0, %eax ++ setl %al ++ negl %eax ++ orl $1, %eax ++ ret ++# endif ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(char0): ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++# endif ++ ret ++# endif ++ ++ .p2align 4 ++L(last_vector): ++ addq %rdx, %rdi ++ addq %rdx, %rsi ++# ifdef USE_AS_STRNCMP ++ subq %rdx, %r11 ++# endif ++ tzcntl %ecx, %edx ++# ifdef USE_AS_WCSCMP ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %edx ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ xorl %eax, %eax ++ movl (%rdi, %rdx), %ecx ++ cmpl (%rsi, %rdx), %ecx ++ jne L(wcscmp_return) ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %edx ++ subl %edx, %eax ++# endif ++ ret ++ ++ /* Comparing on page boundary region requires special treatment: ++ It must done one vector at the time, starting with the wider ++ ymm vector if possible, if not, with xmm. If fetching 16 bytes ++ (xmm) still passes the boundary, byte comparison must be done. 
++ */ ++ .p2align 4 ++L(cross_page): ++ /* Try one ymm vector at a time. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(cross_page_1_vector) ++L(loop_1_vector): ++ VMOVU (%rdi, %rdx), %YMM0 ++ VMOVU (%rsi, %rdx), %YMM1 ++ ++ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ ++ VPCMP $4, %YMM0, %YMM1, %k0 ++ VPCMP $0, %YMMZERO, %YMM0, %k1 ++ VPCMP $0, %YMMZERO, %YMM1, %k2 ++ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $VEC_SIZE, %edx ++ ++ addl $VEC_SIZE, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jle L(loop_1_vector) ++L(cross_page_1_vector): ++ /* Less than 32 bytes to check, try one xmm vector. */ ++ cmpl $(PAGE_SIZE - 16), %eax ++ jg L(cross_page_1_xmm) ++ VMOVU (%rdi, %rdx), %XMM0 ++ VMOVU (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ korw %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ korw %k0, %k1, %k1 ++ kmovw %k1, %ecx ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $16, %edx ++# ifndef USE_AS_WCSCMP ++ addl $16, %eax ++# endif ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_1_xmm): ++# ifndef USE_AS_WCSCMP ++ /* Less than 16 bytes to check, try 8 byte vector. NB: No need ++ for wcscmp nor wcsncmp since wide char is 4 bytes. 
*/ ++ cmpl $(PAGE_SIZE - 8), %eax ++ jg L(cross_page_8bytes) ++ vmovq (%rdi, %rdx), %XMM0 ++ vmovq (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only last 2 bits are valid. */ ++ andl $0x3, %ecx ++# else ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $8, %edx ++ addl $8, %eax ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_8bytes): ++ /* Less than 8 bytes to check, try 4 byte vector. */ ++ cmpl $(PAGE_SIZE - 4), %eax ++ jg L(cross_page_4bytes) ++ vmovd (%rdi, %rdx), %XMM0 ++ vmovd (%rsi, %rdx), %XMM1 ++ ++ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ ++ VPCMP $4, %XMM0, %XMM1, %k0 ++ VPCMP $0, %XMMZERO, %XMM0, %k1 ++ VPCMP $0, %XMMZERO, %XMM1, %k2 ++ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ ++ kord %k1, %k2, %k1 ++ /* Each bit in K1 represents a NULL or a mismatch. */ ++ kord %k0, %k1, %k1 ++ kmovd %k1, %ecx ++ ++# ifdef USE_AS_WCSCMP ++ /* Only the last bit is valid. */ ++ andl $0x1, %ecx ++# else ++ /* Only last 4 bits are valid. */ ++ andl $0xf, %ecx ++# endif ++ ++ testl %ecx, %ecx ++ jne L(last_vector) ++ ++ addl $4, %edx ++# ifdef USE_AS_STRNCMP ++ /* Return 0 if the current offset (%rdx) >= the maximum offset ++ (%r11). */ ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++ ++L(cross_page_4bytes): ++# endif ++ /* Less than 4 bytes to check, try one byte/dword at a time. 
*/ ++# ifdef USE_AS_STRNCMP ++ cmpq %r11, %rdx ++ jae L(zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rdx), %eax ++ movl (%rsi, %rdx), %ecx ++# else ++ movzbl (%rdi, %rdx), %eax ++ movzbl (%rsi, %rdx), %ecx ++# endif ++ testl %eax, %eax ++ jne L(cross_page_loop) ++ subl %ecx, %eax ++ ret ++END (STRCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index 3f433fbc..c5f38510 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +new file mode 100644 +index 00000000..cd022509 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -0,0 +1,436 @@ ++/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit 
EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRLEN ++# define STRLEN __strlen_evex ++# endif ++ ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSLEN ++# define VPCMP vpcmpd ++# define VPMINU vpminud ++# define SHIFT_REG r9d ++# else ++# define VPCMP vpcmpb ++# define VPMINU vpminub ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRLEN) ++# ifdef USE_AS_STRNLEN ++ /* Check for zero length. */ ++ test %RSI_LP, %RSI_LP ++ jz L(zero) ++# ifdef USE_AS_WCSLEN ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %esi, %esi ++# endif ++ mov %RSI_LP, %R8_LP ++# endif ++ movl %edi, %ecx ++ movq %rdi, %rdx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a ++ null byte. 
*/ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ ++# ifdef USE_AS_STRNLEN ++ jnz L(first_vec_x0_check) ++ /* Adjust length and check the end of data. */ ++ subq $VEC_SIZE, %rsi ++ jbe L(max) ++# else ++ jnz L(first_vec_x0) ++# endif ++ ++ /* Align data for aligned loads in the loop. */ ++ addq $VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++ ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ jmp L(more_4x_vec) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ ++ /* Remove the leading bytes. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++# ifdef USE_AS_STRNLEN ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++# endif ++ addq %rdi, %rax ++ addq %rcx, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(aligned_more): ++# ifdef USE_AS_STRNLEN ++ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" ++ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" ++ to void possible addition overflow. */ ++ negq %rcx ++ addq $VEC_SIZE, %rcx ++ ++ /* Check the end of data. */ ++ subq %rcx, %rsi ++ jbe L(max) ++# endif ++ ++ addq $VEC_SIZE, %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++L(more_4x_vec): ++ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ subq $(VEC_SIZE * 4), %rsi ++ jbe L(last_4x_vec_or_less) ++# endif ++ ++ /* Align data to 4 * VEC_SIZE. */ ++ movq %rdi, %rcx ++ andl $(4 * VEC_SIZE - 1), %ecx ++ andq $-(4 * VEC_SIZE), %rdi ++ ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. */ ++ addq %rcx, %rsi ++# endif ++ ++ .p2align 4 ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVA (%rdi), %YMM1 ++ VMOVA VEC_SIZE(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 ++ ++ VPMINU %YMM1, %YMM2, %YMM5 ++ VPMINU %YMM3, %YMM4, %YMM6 ++ ++ VPMINU %YMM5, %YMM6, %YMM5 ++ VPCMP $0, %YMM5, %YMMZERO, %k0 ++ ktestd %k0, %k0 ++ jnz L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ ++# ifndef USE_AS_STRNLEN ++ jmp L(loop_4x_vec) ++# else ++ subq $(VEC_SIZE * 4), %rsi ++ ja L(loop_4x_vec) ++ ++L(last_4x_vec_or_less): ++ /* Less than 4 * VEC and aligned to VEC_SIZE. 
*/ ++ addl $(VEC_SIZE * 2), %esi ++ jle L(last_2x_vec) ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ addl $(VEC_SIZE * 2), %esi ++ ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0_check) ++ subl $VEC_SIZE, %esi ++ jle L(max) ++ ++ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x0_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x3_check): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ /* Check the end of data. */ ++ cmpq %rax, %rsi ++ jbe L(max) ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(max): ++ movq %r8, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(first_vec_x0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $VEC_SIZE, %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 2), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x0) ++ VPCMP $0, %YMM2, %YMMZERO, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ VPCMP $0, %YMM3, %YMMZERO, %k2 ++ kmovd %k2, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ VPCMP $0, %YMM4, %YMMZERO, %k3 ++ kmovd %k3, %eax ++L(first_vec_x3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ sall $2, %eax ++# endif ++ addq $(VEC_SIZE * 3), %rax ++ addq %rdi, %rax ++ subq %rdx, %rax ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rax ++# endif ++ ret ++ ++END (STRLEN) ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S +new file mode 100644 +index 00000000..a1d53e8c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-evex.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_evex ++#define USE_AS_STRNCMP 1 ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 686d654f..4c15542f 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, 
AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) +diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S +new file mode 100644 +index 00000000..722022f3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_evex ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S +new file mode 100644 +index 00000000..f920b5a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-evex.S +@@ -0,0 +1,265 @@ ++/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRRCHR ++# define STRRCHR __strrchr_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++# ifdef USE_AS_WCSRCHR ++# define VPBROADCAST vpbroadcastd ++# define VPCMP vpcmpd ++# define SHIFT_REG r8d ++# else ++# define VPBROADCAST vpbroadcastb ++# define VPCMP vpcmpb ++# define SHIFT_REG ecx ++# endif ++ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMMMATCH ymm17 ++# define YMM1 ymm18 ++ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRRCHR) ++ movl %edi, %ecx ++ /* Broadcast CHAR to YMMMATCH. */ ++ VPBROADCAST %esi, %YMMMATCH ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Check if we may cross page boundary with one vector load. */ ++ andl $(2 * VEC_SIZE - 1), %ecx ++ cmpl $VEC_SIZE, %ecx ++ ja L(cros_page_boundary) ++ ++ VMOVU (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ ++ addq $VEC_SIZE, %rdi ++ ++ testl %eax, %eax ++ jnz L(first_vec) ++ ++ testl %ecx, %ecx ++ jnz L(return_null) ++ ++ andq $-VEC_SIZE, %rdi ++ xorl %edx, %edx ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(first_vec): ++ /* Check if there is a null byte. */ ++ testl %ecx, %ecx ++ jnz L(char_and_nul_in_first_vec) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ andq $-VEC_SIZE, %rdi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(cros_page_boundary): ++ andl $(VEC_SIZE - 1), %ecx ++ andq $-VEC_SIZE, %rdi ++ ++# ifdef USE_AS_WCSRCHR ++ /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ bytes. */ ++ movl %ecx, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++# endif ++ ++ VMOVA (%rdi), %YMM1 ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. 
*/ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %edx ++ kmovd %k1, %eax ++ ++ shrxl %SHIFT_REG, %edx, %edx ++ shrxl %SHIFT_REG, %eax, %eax ++ addq $VEC_SIZE, %rdi ++ ++ /* Check if there is a CHAR. */ ++ testl %eax, %eax ++ jnz L(found_char) ++ ++ testl %edx, %edx ++ jnz L(return_null) ++ ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(found_char): ++ testl %edx, %edx ++ jnz L(char_and_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ leaq (%rdi, %rcx), %rsi ++ ++ .p2align 4 ++L(aligned_loop): ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ add $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jnz L(char_nor_null) ++ ++ VMOVA (%rdi), %YMM1 ++ addq $VEC_SIZE, %rdi ++ ++ /* Each bit in K0 represents a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMMMATCH, %YMM1, %k1 ++ kmovd %k0, %ecx ++ kmovd %k1, %eax ++ orl %eax, %ecx ++ jz L(aligned_loop) ++ ++ .p2align 4 ++L(char_nor_null): ++ /* Find a CHAR or a null byte in a loop. 
*/ ++ testl %eax, %eax ++ jnz L(match) ++L(return_value): ++ testl %edx, %edx ++ jz L(return_null) ++ movl %edx, %eax ++ movq %rsi, %rdi ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(match): ++ /* Find a CHAR. Check if there is a null byte. */ ++ kmovd %k0, %ecx ++ testl %ecx, %ecx ++ jnz L(find_nul) ++ ++ /* Remember the match and keep searching. */ ++ movl %eax, %edx ++ movq %rdi, %rsi ++ jmp L(aligned_loop) ++ ++ .p2align 4 ++L(find_nul): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* If there is no CHAR here, return the remembered one. */ ++ jz L(return_value) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(char_and_nul): ++ /* Find both a CHAR and a null byte. */ ++ addq %rcx, %rdi ++ movl %edx, %ecx ++L(char_and_nul_in_first_vec): ++ /* Mask out any matching bits after the null byte. */ ++ movl %ecx, %r8d ++ subl $1, %r8d ++ xorl %ecx, %r8d ++ andl %r8d, %eax ++ testl %eax, %eax ++ /* Return null pointer if the null byte comes first. */ ++ jz L(return_null) ++ bsrl %eax, %eax ++# ifdef USE_AS_WCSRCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ ++ leaq -VEC_SIZE(%rdi, %rax, 4), %rax ++# else ++ leaq -VEC_SIZE(%rdi, %rax), %rax ++# endif ++ ret ++ ++ .p2align 4 ++L(return_null): ++ xorl %eax, %eax ++ ret ++ ++END (STRRCHR) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S +new file mode 100644 +index 00000000..7cb8f1e4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-evex.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_evex ++#define USE_AS_WCSCHR 1 ++#include "strchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S +new file mode 100644 +index 00000000..42e73e51 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_evex ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S +new file mode 100644 +index 00000000..bdafa83b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-evex.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_evex ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +new file mode 100644 +index 00000000..8a8e3107 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_evex ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +new file mode 100644 +index 00000000..24773bb4 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_evex ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index b3144c93..84254b83 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ 
-29,16 +29,24 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +new file mode 100644 +index 00000000..c64602f7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_evex ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S +new file mode 100644 +index 00000000..06cd0f9f +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __wmemchr_evex ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-13.patch b/glibc-RHEL-15696-13.patch new file mode 100644 index 0000000..a88a3bc --- /dev/null +++ b/glibc-RHEL-15696-13.patch @@ -0,0 +1,1488 @@ +From 525bc2a32c9710df40371f951217c6ae7a923aee Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Fri, 5 Mar 2021 06:36:50 -0800 +Subject: [PATCH] x86-64: Add strcpy family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-strcpy.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL and AVX512BW since VZEROUPPER isn't needed at function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 6 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 + + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 13 +- + sysdeps/x86_64/multiarch/stpcpy-evex.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-evex.S | 4 + + sysdeps/x86_64/multiarch/strcat-evex.S | 283 ++++++ + sysdeps/x86_64/multiarch/strcpy-evex.S | 1003 ++++++++++++++++++++ + sysdeps/x86_64/multiarch/strncat-evex.S | 3 + + sysdeps/x86_64/multiarch/strncpy-evex.S | 3 + + 9 files changed, 1339 insertions(+), 3 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strcpy-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 5ce85882..46783cd1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,11 +43,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memrchr-evex \ + rawmemchr-evex \ ++ stpcpy-evex \ ++ stpncpy-evex \ ++ strcat-evex \ + strchr-evex \ + strchrnul-evex \ + strcmp-evex \ ++ strcpy-evex \ + strlen-evex \ ++ strncat-evex \ + strncmp-evex \ ++ strncpy-evex \ + strnlen-evex \ + strrchr-evex + CFLAGS-varshift.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index bd7d9f19..082e4da3 100644 +--- 
a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -224,6 +224,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpncpy_evex) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, + __stpncpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpncpy, 1, __stpncpy_sse2)) +@@ -234,6 +238,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __stpcpy_evex) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, stpcpy, 1, __stpcpy_sse2)) + +@@ -268,6 +276,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcat_evex) + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (SSSE3), + __strcat_ssse3) + IFUNC_IMPL_ADD (array, i, strcat, 1, __strcat_sse2_unaligned) +@@ -330,6 +342,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcpy_evex) + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (SSSE3), + __strcpy_ssse3) + IFUNC_IMPL_ADD (array, i, strcpy, 1, __strcpy_sse2_unaligned) +@@ -373,6 +389,10 @@ __libc_ifunc_impl_list (const 
char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncat_evex) + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (SSSE3), + __strncat_ssse3) + IFUNC_IMPL_ADD (array, i, strncat, 1, +@@ -383,6 +403,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncpy_evex) + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (SSSE3), + __strncpy_ssse3) + IFUNC_IMPL_ADD (array, i, strncpy, 1, +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index 100dca5c..deae6348 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,16 +25,23 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return 
OPTIMIZE (avx2); ++ } + + if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load)) + return OPTIMIZE (sse2_unaligned); +diff --git a/sysdeps/x86_64/multiarch/stpcpy-evex.S b/sysdeps/x86_64/multiarch/stpcpy-evex.S +new file mode 100644 +index 00000000..7c6f26cd +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_evex ++#include "strcpy-evex.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-evex.S b/sysdeps/x86_64/multiarch/stpncpy-evex.S +new file mode 100644 +index 00000000..1570014d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-evex.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_evex ++#include "strcpy-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-evex.S b/sysdeps/x86_64/multiarch/strcat-evex.S +new file mode 100644 +index 00000000..97c3d85b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-evex.S +@@ -0,0 +1,283 @@ ++/* strcat with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#if IS_IN (libc) ++ ++# include ++ ++# ifndef STRCAT ++# define STRCAT __strcat_evex ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++ ++# define USE_AS_STRCAT ++ ++/* Number of bytes in a vector register */ ++# define VEC_SIZE 32 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCAT) ++ mov %rdi, %r9 ++# ifdef USE_AS_STRNCAT ++ mov %rdx, %r8 ++# endif ++ ++ xor %eax, %eax ++ mov %edi, %ecx ++ and $((VEC_SIZE * 4) - 1), %ecx ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ cmp $(VEC_SIZE * 3), %ecx ++ ja L(fourth_vector_boundary) ++ vpcmpb $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_first_vector) ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ jmp L(align_vec_size_start) ++L(fourth_vector_boundary): ++ mov %rdi, %rax ++ and $-VEC_SIZE, %rax ++ vpcmpb $0, (%rax), %YMMZERO, %k0 ++ mov $-1, %r10d ++ sub %rax, %rcx ++ shl %cl, %r10d ++ kmovd %k0, %edx ++ and %r10d, %edx ++ jnz L(exit) ++ ++L(align_vec_size_start): ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz 
L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ kmovd %k4, %edx ++ add $(VEC_SIZE * 4), %rax ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 4), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fifth_vector) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, (VEC_SIZE * 5)(%rax), %YMMZERO, %k4 ++ add $(VEC_SIZE * 5), %rax ++ kmovd %k4, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k0 ++ add $VEC_SIZE, %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit) ++ ++ test $((VEC_SIZE * 4) - 1), %rax ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, VEC_SIZE(%rax), %YMMZERO, %k1 ++ add $VEC_SIZE, %rax ++ kmovd %k1, %edx ++ test %edx, %edx ++ 
jnz L(exit) ++ ++ add $VEC_SIZE, %rax ++ ++ .p2align 4 ++L(align_four_vec_loop): ++ VMOVA (%rax), %YMM0 ++ VMOVA (VEC_SIZE * 2)(%rax), %YMM1 ++ vpminub VEC_SIZE(%rax), %YMM0, %YMM0 ++ vpminub (VEC_SIZE * 3)(%rax), %YMM1, %YMM1 ++ vpminub %YMM0, %YMM1, %YMM0 ++ /* If K0 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM0, %YMMZERO, %k0 ++ add $(VEC_SIZE * 4), %rax ++ ktestd %k0, %k0 ++ jz L(align_four_vec_loop) ++ ++ vpcmpb $0, -(VEC_SIZE * 4)(%rax), %YMMZERO, %k0 ++ sub $(VEC_SIZE * 5), %rax ++ kmovd %k0, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_second_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 2)(%rax), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_third_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 3)(%rax), %YMMZERO, %k2 ++ kmovd %k2, %edx ++ test %edx, %edx ++ jnz L(exit_null_on_fourth_vector) ++ ++ vpcmpb $0, (VEC_SIZE * 4)(%rax), %YMMZERO, %k3 ++ kmovd %k3, %edx ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit): ++ sub %rdi, %rax ++L(exit_null_on_first_vector): ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_second_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $VEC_SIZE, %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_third_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 2), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fourth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 3), %rax ++ jmp L(StartStrcpyPart) ++ ++ .p2align 4 ++L(exit_null_on_fifth_vector): ++ sub %rdi, %rax ++ bsf %rdx, %rdx ++ add %rdx, %rax ++ add $(VEC_SIZE * 4), %rax ++ ++ .p2align 4 ++L(StartStrcpyPart): ++ lea (%r9, %rax), %rdi ++ mov %rsi, %rcx ++ mov %r9, %rax /* save result */ ++ ++# ifdef USE_AS_STRNCAT ++ test %r8, %r8 ++ jz L(ExitZero) ++# define USE_AS_STRNCPY ++# endif ++ ++# include "strcpy-evex.S" 
++#endif +diff --git a/sysdeps/x86_64/multiarch/strcpy-evex.S b/sysdeps/x86_64/multiarch/strcpy-evex.S +new file mode 100644 +index 00000000..a343a1a6 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-evex.S +@@ -0,0 +1,1003 @@ ++/* strcpy with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#if IS_IN (libc) ++ ++# ifndef USE_AS_STRCAT ++# include ++ ++# ifndef STRCPY ++# define STRCPY __strcpy_evex ++# endif ++ ++# endif ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ ++/* Number of bytes in a vector register */ ++# ifndef VEC_SIZE ++# define VEC_SIZE 32 ++# endif ++ ++# define XMM2 xmm18 ++# define XMM3 xmm19 ++ ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++# define YMM7 ymm23 ++ ++# ifndef USE_AS_STRCAT ++ ++/* zero register */ ++# define XMMZERO xmm16 ++# define YMMZERO ymm16 ++# define YMM1 ymm17 ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (STRCPY) ++# ifdef USE_AS_STRNCPY ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP ++ jz L(ExitZero) ++# endif ++ mov %rsi, %rcx ++# ifndef USE_AS_STPCPY ++ mov %rdi, %rax /* save result */ ++# endif ++ ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# endif ++ ++ and $((VEC_SIZE * 4) - 1), %ecx ++ cmp $(VEC_SIZE * 2), %ecx ++ jbe L(SourceStringAlignmentLessTwoVecSize) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), %ecx ++ ++ vpcmpb $0, (%rsi), %YMMZERO, %k0 ++ kmovd %k0, %edx ++ shr %cl, %rdx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ mov $VEC_SIZE, %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# else ++ mov $(VEC_SIZE + 1), %r10 ++ sub %rcx, %r10 ++ cmp %r10, %r8 ++# endif ++ jbe L(CopyVecSizeTailCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail) ++ ++ vpcmpb $0, VEC_SIZE(%rsi), %YMMZERO, %k1 ++ kmovd %k1, %edx ++ ++# ifdef USE_AS_STRNCPY ++ add $VEC_SIZE, %r10 ++ cmp %r10, %r8 ++ jbe L(CopyTwoVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize) ++ ++ VMOVU (%rsi, %rcx), %YMM2 /* copy VEC_SIZE bytes */ ++ VMOVU %YMM2, (%rdi) ++ ++/* If source address alignment != destination address alignment */ ++ .p2align 4 ++L(UnalignVecSizeBoth): ++ sub %rcx, %rdi ++# ifdef USE_AS_STRNCPY ++ add %rcx, %r8 ++ sbb %rcx, %rcx ++ or %rcx, %r8 ++# endif ++ mov 
$VEC_SIZE, %rcx ++ VMOVA (%rsi, %rcx), %YMM2 ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 3), %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM3 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM4 ++ vpcmpb $0, %YMM4, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM4, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM2, (%rdi, %rcx) ++ VMOVA VEC_SIZE(%rsi, %rcx), %YMM2 ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec2) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVA 
VEC_SIZE(%rsi, %rcx), %YMM3 ++ VMOVU %YMM2, (%rdi, %rcx) ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ add $VEC_SIZE, %rcx ++# ifdef USE_AS_STRNCPY ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++# endif ++ test %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec3) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ VMOVU %YMM3, (%rdi, %rcx) ++ mov %rsi, %rdx ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ and $-(VEC_SIZE * 4), %rsi ++ sub %rsi, %rdx ++ sub %rdx, %rdi ++# ifdef USE_AS_STRNCPY ++ lea (VEC_SIZE * 8)(%r8, %rdx), %r8 ++# endif ++L(UnalignedFourVecSizeLoop): ++ VMOVA (%rsi), %YMM4 ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM5, %YMM4, %YMM2 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. */ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(UnalignedFourVecSizeLeave) ++ ++L(UnalignedFourVecSizeLoop_start): ++ add $(VEC_SIZE * 4), %rdi ++ add $(VEC_SIZE * 4), %rsi ++ VMOVU %YMM4, -(VEC_SIZE * 4)(%rdi) ++ VMOVA (%rsi), %YMM4 ++ VMOVU %YMM5, -(VEC_SIZE * 3)(%rdi) ++ VMOVA VEC_SIZE(%rsi), %YMM5 ++ vpminub %YMM5, %YMM4, %YMM2 ++ VMOVU %YMM6, -(VEC_SIZE * 2)(%rdi) ++ VMOVA (VEC_SIZE * 2)(%rsi), %YMM6 ++ VMOVU %YMM7, -VEC_SIZE(%rdi) ++ VMOVA (VEC_SIZE * 3)(%rsi), %YMM7 ++ vpminub %YMM7, %YMM6, %YMM3 ++ vpminub %YMM2, %YMM3, %YMM2 ++ /* If K7 != 0, there is a null byte. 
*/ ++ vpcmpb $0, %YMM2, %YMMZERO, %k7 ++ kmovd %k7, %edx ++# ifdef USE_AS_STRNCPY ++ sub $(VEC_SIZE * 4), %r8 ++ jbe L(UnalignedLeaveCase2OrCase3) ++# endif ++ test %edx, %edx ++ jz L(UnalignedFourVecSizeLoop_start) ++ ++L(UnalignedFourVecSizeLeave): ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_0) ++ ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %ecx ++ test %ecx, %ecx ++ jnz L(CopyVecSizeUnaligned_16) ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ test %edx, %edx ++ jnz L(CopyVecSizeUnaligned_32) ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %ecx ++ bsf %ecx, %edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 3)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 3) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 3), %rsi ++ add $(VEC_SIZE * 3), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++/* If source address alignment == destination address alignment */ ++ ++L(SourceStringAlignmentLessTwoVecSize): ++ VMOVU (%rsi), %YMM3 ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ vpcmpb $0, %YMM3, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $VEC_SIZE, %r8 ++# else ++ cmp $(VEC_SIZE + 1), %r8 ++# endif ++ jbe L(CopyVecSizeTail1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyVecSizeTail1) ++ ++ VMOVU %YMM3, (%rdi) ++ vpcmpb $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %edx ++ ++# ifdef USE_AS_STRNCPY ++# if defined USE_AS_STPCPY || defined USE_AS_STRCAT ++ cmp $(VEC_SIZE * 2), %r8 ++# else ++ cmp $((VEC_SIZE * 2) + 1), %r8 ++# endif ++ jbe L(CopyTwoVecSize1Case2OrCase3) ++# endif ++ test %edx, %edx ++ jnz L(CopyTwoVecSize1) ++ ++ and $-VEC_SIZE, %rsi ++ and $(VEC_SIZE - 1), 
%ecx ++ jmp L(UnalignVecSizeBoth) ++ ++/*------End of main part with loops---------------------*/ ++ ++/* Case1 */ ++ ++# if (!defined USE_AS_STRNCPY) || (defined USE_AS_STRCAT) ++ .p2align 4 ++L(CopyVecSize): ++ add %rcx, %rdi ++# endif ++L(CopyVecSizeTail): ++ add %rcx, %rsi ++L(CopyVecSizeTail1): ++ bsf %edx, %edx ++L(CopyVecSizeExit): ++ cmp $32, %edx ++ jae L(Exit32_63) ++ cmp $16, %edx ++ jae L(Exit16_31) ++ cmp $8, %edx ++ jae L(Exit8_15) ++ cmp $4, %edx ++ jae L(Exit4_7) ++ cmp $3, %edx ++ je L(Exit3) ++ cmp $1, %edx ++ ja L(Exit2) ++ je L(Exit1) ++ movb $0, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea (%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $1, %r8 ++ lea 1(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(CopyTwoVecSize1): ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $VEC_SIZE, %r8 ++# endif ++ jmp L(CopyVecSizeTail1) ++ ++ .p2align 4 ++L(CopyTwoVecSize): ++ bsf %edx, %edx ++ add %rcx, %rsi ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ jmp L(CopyVecSizeExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_0): ++ bsf %edx, %edx ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM4, (%rdi) ++ add $((VEC_SIZE * 4) - 1), %r8 ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_16): ++ bsf %ecx, %edx ++ VMOVU %YMM4, (%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea VEC_SIZE(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $((VEC_SIZE * 3) - 1), %r8 ++ sub %rdx, %r8 ++ lea (VEC_SIZE + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $VEC_SIZE, %rsi ++ add $VEC_SIZE, %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++ .p2align 4 ++L(CopyVecSizeUnaligned_32): ++ bsf %edx, 
%edx ++ VMOVU %YMM4, (%rdi) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 2)(%rdi, %rdx), %rax ++# endif ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ add $((VEC_SIZE * 2) - 1), %r8 ++ sub %rdx, %r8 ++ lea ((VEC_SIZE * 2) + 1)(%rdi, %rdx), %rdi ++ jmp L(StrncpyFillTailWithZero) ++# else ++ add $(VEC_SIZE * 2), %rsi ++ add $(VEC_SIZE * 2), %rdi ++ jmp L(CopyVecSizeExit) ++# endif ++ ++# ifdef USE_AS_STRNCPY ++# ifndef USE_AS_STRCAT ++ .p2align 4 ++L(CopyVecSizeUnalignedVec6): ++ VMOVU %YMM6, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec5): ++ VMOVU %YMM5, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec4): ++ VMOVU %YMM4, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec3): ++ VMOVU %YMM3, (%rdi, %rcx) ++ jmp L(CopyVecSizeVecExit) ++# endif ++ ++/* Case2 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ add $VEC_SIZE, %edx ++ sub %ecx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTailCase2): ++ add %rcx, %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++L(CopyVecSizeTail1Case2): ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++ jmp L(StrncpyExit) ++ ++/* Case2 or Case3, Case3 */ ++ ++ .p2align 4 ++L(CopyVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeCase2) ++L(CopyVecSizeCase3): ++ add $VEC_SIZE, %r8 ++ add %rcx, %rdi ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSizeCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyTwoVecSizeCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyVecSizeTailCase2OrCase3): ++ test 
%rdx, %rdx ++ jnz L(CopyVecSizeTailCase2) ++ add %rcx, %rsi ++ jmp L(StrncpyExit) ++ ++ .p2align 4 ++L(CopyTwoVecSize1Case2OrCase3): ++ add $VEC_SIZE, %rdi ++ add $VEC_SIZE, %rsi ++ sub $VEC_SIZE, %r8 ++L(CopyVecSizeTail1Case2OrCase3): ++ test %rdx, %rdx ++ jnz L(CopyVecSizeTail1Case2) ++ jmp L(StrncpyExit) ++# endif ++ ++/*------------End labels regarding with copying 1-VEC_SIZE bytes--and 1-(VEC_SIZE*2) bytes----*/ ++ ++ .p2align 4 ++L(Exit1): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $2, %r8 ++ lea 2(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit2): ++ movzwl (%rsi), %ecx ++ mov %cx, (%rdi) ++ movb $0, 2(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $3, %r8 ++ lea 3(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit3): ++ mov (%rsi), %edx ++ mov %edx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 3(%rdi), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub $4, %r8 ++ lea 4(%rdi), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit4_7): ++ mov (%rsi), %ecx ++ mov %ecx, (%rdi) ++ mov -3(%rsi, %rdx), %ecx ++ mov %ecx, -3(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit8_15): ++ mov (%rsi), %rcx ++ mov -7(%rsi, %rdx), %r9 ++ mov %rcx, (%rdi) ++ mov %r9, -7(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit16_31): ++ VMOVU 
(%rsi), %XMM2 ++ VMOVU -15(%rsi, %rdx), %XMM3 ++ VMOVU %XMM2, (%rdi) ++ VMOVU %XMM3, -15(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++ .p2align 4 ++L(Exit32_63): ++ VMOVU (%rsi), %YMM2 ++ VMOVU -31(%rsi, %rdx), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -31(%rdi, %rdx) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++# if defined USE_AS_STRNCPY && !defined USE_AS_STRCAT ++ sub %rdx, %r8 ++ sub $1, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ jnz L(StrncpyFillTailWithZero) ++# endif ++ ret ++ ++# ifdef USE_AS_STRNCPY ++ ++ .p2align 4 ++L(StrncpyExit1): ++ movzbl (%rsi), %edx ++ mov %dl, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 1(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 1(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit2): ++ movzwl (%rsi), %edx ++ mov %dx, (%rdi) ++# ifdef USE_AS_STPCPY ++ lea 2(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 2(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit3_4): ++ movzwl (%rsi), %ecx ++ movzwl -2(%rsi, %r8), %edx ++ mov %cx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit5_8): ++ mov (%rsi), %ecx ++ mov -4(%rsi, %r8), %edx ++ mov %ecx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit9_16): ++ mov (%rsi), %rcx ++ mov -8(%rsi, %r8), %rdx ++ mov %rcx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit17_32): ++ VMOVU (%rsi), %XMM2 ++ VMOVU -16(%rsi, %r8), %XMM3 ++ VMOVU %XMM2, 
(%rdi) ++ VMOVU %XMM3, -16(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit33_64): ++ /* 0/32, 31/16 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU -VEC_SIZE(%rsi, %r8), %YMM3 ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, -VEC_SIZE(%rdi, %r8) ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %r8), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi, %r8) ++# endif ++ ret ++ ++ .p2align 4 ++L(StrncpyExit65): ++ /* 0/32, 32/32, 64/1 */ ++ VMOVU (%rsi), %YMM2 ++ VMOVU 32(%rsi), %YMM3 ++ mov 64(%rsi), %cl ++ VMOVU %YMM2, (%rdi) ++ VMOVU %YMM3, 32(%rdi) ++ mov %cl, 64(%rdi) ++# ifdef USE_AS_STPCPY ++ lea 65(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, 65(%rdi) ++# endif ++ ret ++ ++# ifndef USE_AS_STRCAT ++ ++ .p2align 4 ++L(Fill1): ++ mov %dl, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill2): ++ mov %dx, (%rdi) ++ ret ++ ++ .p2align 4 ++L(Fill3_4): ++ mov %dx, (%rdi) ++ mov %dx, -2(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill5_8): ++ mov %edx, (%rdi) ++ mov %edx, -4(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill9_16): ++ mov %rdx, (%rdi) ++ mov %rdx, -8(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(Fill17_32): ++ VMOVU %XMMZERO, (%rdi) ++ VMOVU %XMMZERO, -16(%rdi, %r8) ++ ret ++ ++ .p2align 4 ++L(CopyVecSizeUnalignedVec2): ++ VMOVU %YMM2, (%rdi, %rcx) ++ ++ .p2align 4 ++L(CopyVecSizeVecExit): ++ bsf %edx, %edx ++ add $(VEC_SIZE - 1), %r8 ++ add %rcx, %rdi ++# ifdef USE_AS_STPCPY ++ lea (%rdi, %rdx), %rax ++# endif ++ sub %rdx, %r8 ++ lea 1(%rdi, %rdx), %rdi ++ ++ .p2align 4 ++L(StrncpyFillTailWithZero): ++ xor %edx, %edx ++ sub $VEC_SIZE, %r8 ++ jbe L(StrncpyFillExit) ++ ++ VMOVU %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ ++ mov %rdi, %rsi ++ and $(VEC_SIZE - 1), %esi ++ sub %rsi, %rdi ++ add %rsi, %r8 ++ sub $(VEC_SIZE * 4), %r8 ++ jb L(StrncpyFillLessFourVecSize) ++ ++L(StrncpyFillLoopVmovdqa): ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ VMOVA %YMMZERO, 
(VEC_SIZE * 2)(%rdi) ++ VMOVA %YMMZERO, (VEC_SIZE * 3)(%rdi) ++ add $(VEC_SIZE * 4), %rdi ++ sub $(VEC_SIZE * 4), %r8 ++ jae L(StrncpyFillLoopVmovdqa) ++ ++L(StrncpyFillLessFourVecSize): ++ add $(VEC_SIZE * 2), %r8 ++ jl L(StrncpyFillLessTwoVecSize) ++ VMOVA %YMMZERO, (%rdi) ++ VMOVA %YMMZERO, VEC_SIZE(%rdi) ++ add $(VEC_SIZE * 2), %rdi ++ sub $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillLessTwoVecSize): ++ add $VEC_SIZE, %r8 ++ jl L(StrncpyFillExit) ++ VMOVA %YMMZERO, (%rdi) ++ add $VEC_SIZE, %rdi ++ jmp L(Fill) ++ ++ .p2align 4 ++L(StrncpyFillExit): ++ add $VEC_SIZE, %r8 ++L(Fill): ++ cmp $17, %r8d ++ jae L(Fill17_32) ++ cmp $9, %r8d ++ jae L(Fill9_16) ++ cmp $5, %r8d ++ jae L(Fill5_8) ++ cmp $3, %r8d ++ jae L(Fill3_4) ++ cmp $1, %r8d ++ ja L(Fill2) ++ je L(Fill1) ++ ret ++ ++/* end of ifndef USE_AS_STRCAT */ ++# endif ++ ++ .p2align 4 ++L(UnalignedLeaveCase2OrCase3): ++ test %rdx, %rdx ++ jnz L(UnalignedFourVecSizeLeaveCase2) ++L(UnalignedFourVecSizeLeaveCase3): ++ lea (VEC_SIZE * 4)(%r8), %rcx ++ and $-VEC_SIZE, %rcx ++ add $(VEC_SIZE * 3), %r8 ++ jl L(CopyVecSizeCase3) ++ VMOVU %YMM4, (%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ sub $VEC_SIZE, %r8 ++ jb L(CopyVecSizeCase3) ++ VMOVU %YMM7, (VEC_SIZE * 3)(%rdi) ++# ifdef USE_AS_STPCPY ++ lea (VEC_SIZE * 4)(%rdi), %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (VEC_SIZE * 4)(%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(UnalignedFourVecSizeLeaveCase2): ++ xor %ecx, %ecx ++ vpcmpb $0, %YMM4, %YMMZERO, %k1 ++ kmovd %k1, %edx ++ add $(VEC_SIZE * 3), %r8 ++ jle L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec4) ++# else ++ jnz L(CopyVecSize) ++# endif ++ vpcmpb $0, %YMM5, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ VMOVU %YMM4, (%rdi) ++ add $VEC_SIZE, 
%rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec5) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM6, %YMMZERO, %k3 ++ kmovd %k3, %edx ++ VMOVU %YMM5, VEC_SIZE(%rdi) ++ add $VEC_SIZE, %rcx ++ sub $VEC_SIZE, %r8 ++ jbe L(CopyVecSizeCase2OrCase3) ++ test %edx, %edx ++# ifndef USE_AS_STRCAT ++ jnz L(CopyVecSizeUnalignedVec6) ++# else ++ jnz L(CopyVecSize) ++# endif ++ ++ vpcmpb $0, %YMM7, %YMMZERO, %k4 ++ kmovd %k4, %edx ++ VMOVU %YMM6, (VEC_SIZE * 2)(%rdi) ++ lea VEC_SIZE(%rdi, %rcx), %rdi ++ lea VEC_SIZE(%rsi, %rcx), %rsi ++ bsf %edx, %edx ++ cmp %r8d, %edx ++ jb L(CopyVecSizeExit) ++L(StrncpyExit): ++ cmp $65, %r8d ++ je L(StrncpyExit65) ++ cmp $33, %r8d ++ jae L(StrncpyExit33_64) ++ cmp $17, %r8d ++ jae L(StrncpyExit17_32) ++ cmp $9, %r8d ++ jae L(StrncpyExit9_16) ++ cmp $5, %r8d ++ jae L(StrncpyExit5_8) ++ cmp $3, %r8d ++ jae L(StrncpyExit3_4) ++ cmp $1, %r8d ++ ja L(StrncpyExit2) ++ je L(StrncpyExit1) ++# ifdef USE_AS_STPCPY ++ mov %rdi, %rax ++# endif ++# ifdef USE_AS_STRCAT ++ movb $0, (%rdi) ++# endif ++ ret ++ ++ .p2align 4 ++L(ExitZero): ++# ifndef USE_AS_STRCAT ++ mov %rdi, %rax ++# endif ++ ret ++ ++# endif ++ ++# ifndef USE_AS_STRCAT ++END (STRCPY) ++# else ++END (STRCAT) ++# endif ++#endif +diff --git a/sysdeps/x86_64/multiarch/strncat-evex.S b/sysdeps/x86_64/multiarch/strncat-evex.S +new file mode 100644 +index 00000000..8884f023 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_evex ++#include "strcat-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strncpy-evex.S b/sysdeps/x86_64/multiarch/strncpy-evex.S +new file mode 100644 +index 00000000..40e391f0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-evex.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_evex ++#include "strcpy-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-14.patch 
b/glibc-RHEL-15696-14.patch new file mode 100644 index 0000000..84a4593 --- /dev/null +++ b/glibc-RHEL-15696-14.patch @@ -0,0 +1,242 @@ +From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 06:46:08 -0800 +Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. +--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++-- + .../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++ + .../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++----- + 5 files changed, 104 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 46783cd1..4563fc56 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memmove-evex-unaligned-erms \ + memrchr-evex \ + rawmemchr-evex \ + stpcpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 082e4da3..6bd3abfc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ 
__memmove_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (SSSE3), + __memmove_chk_ssse3_back) +@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memmove_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) +@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + __memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __memcpy_chk_ssse3_back) +@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __memcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), + __memcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3), +@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + 
__mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (SSSE3), + __mempcpy_chk_ssse3_back) +@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __mempcpy_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), + __mempcpy_ssse3_back) + IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 5e5f0299..6f8bce5f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx_unaligned_erms); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if 
(CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms); + +- return OPTIMIZE (avx_unaligned); ++ return OPTIMIZE (avx_unaligned); ++ } + } + + if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3) +diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +new file mode 100644 +index 00000000..0cbce8f9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S +@@ -0,0 +1,33 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 ymm16 ++# define VEC1 ymm17 ++# define VEC2 ymm18 ++# define VEC3 ymm19 ++# define VEC4 ymm20 ++# define VEC5 ymm21 ++# define VEC6 ymm22 ++# define VEC7 ymm23 ++# define VEC8 ymm24 ++# define VEC9 ymm25 ++# define VEC10 ymm26 ++# define VEC11 ymm27 ++# define VEC12 ymm28 ++# define VEC13 ymm29 ++# define VEC14 ymm30 ++# define VEC15 ymm31 ++# define VEC(i) VEC##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define SECTION(p) p##.evex ++# define MEMMOVE_SYMBOL(p,s) p##_evex_##s ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 274aa1c7..08e21692 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -48,6 +48,14 @@ + # define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -277,20 
+285,20 @@ L(less_vec): + #if VEC_SIZE > 32 + L(between_32_63): + /* From 32 to 63. No branch when size == 32. */ +- vmovdqu (%rsi), %ymm0 +- vmovdqu -32(%rsi,%rdx), %ymm1 +- vmovdqu %ymm0, (%rdi) +- vmovdqu %ymm1, -32(%rdi,%rdx) ++ VMOVU (%rsi), %YMM0 ++ VMOVU -32(%rsi,%rdx), %YMM1 ++ VMOVU %YMM0, (%rdi) ++ VMOVU %YMM1, -32(%rdi,%rdx) + VZEROUPPER + ret + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu (%rsi), %xmm0 +- vmovdqu -16(%rsi,%rdx), %xmm1 +- vmovdqu %xmm0, (%rdi) +- vmovdqu %xmm1, -16(%rdi,%rdx) ++ VMOVU (%rsi), %XMM0 ++ VMOVU -16(%rsi,%rdx), %XMM1 ++ VMOVU %XMM0, (%rdi) ++ VMOVU %XMM1, -16(%rdi,%rdx) + ret + #endif + L(between_8_15): +-- +GitLab + diff --git a/glibc-RHEL-15696-15.patch b/glibc-RHEL-15696-15.patch new file mode 100644 index 0000000..72cd8cf --- /dev/null +++ b/glibc-RHEL-15696-15.patch @@ -0,0 +1,254 @@ +From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:15:03 -0800 +Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM +abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. 
+--- + sysdeps/x86_64/multiarch/Makefile | 1 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++---- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++---- + .../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++ + .../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++----- + 6 files changed, 90 insertions(+), 14 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 4563fc56..1cc0a10e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memchr-evex \ + memmove-evex-unaligned-erms \ + memrchr-evex \ ++ memset-evex-unaligned-erms \ + rawmemchr-evex \ + stpcpy-evex \ + stpncpy-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 6bd3abfc..7cf83485 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), + __memset_chk_avx512_unaligned_erms) +@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, 
memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), + __memset_avx512_unaligned_erms) +@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_avx512_unaligned)) +@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, __wmemset_chk, ++ CPU_FEATURE_USABLE (AVX512VL), ++ __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, + CPU_FEATURE_USABLE (AVX512F), + __wmemset_chk_avx512_unaligned)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 708bd72e..6f31f4dc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms) +@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void) + + if 
(CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx2_unaligned_erms); +- else +- return OPTIMIZE (avx2_unaligned); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (evex_unaligned_erms); ++ ++ return OPTIMIZE (evex_unaligned); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms); ++ ++ return OPTIMIZE (avx2_unaligned); ++ } + } + + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index eb242210..9290c4bf 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + + static inline void * +@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx512_unaligned); +- else ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ return OPTIMIZE 
(evex_unaligned); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } + +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +new file mode 100644 +index 00000000..ae0a4d6e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -0,0 +1,24 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 ymm16 ++# define VEC(i) VEC##i ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++# define VZEROUPPER ++ ++# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastb d, %VEC0 ++ ++# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++ movq r, %rax; \ ++ vpbroadcastd d, %VEC0 ++ ++# define SECTION(p) p##.evex ++# define MEMSET_SYMBOL(p,s) p##_evex_##s ++# define WMEMSET_SYMBOL(p,s) p##_evex_##s ++ ++# include "memset-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 9a0fd818..71e91a8f 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -34,6 +34,14 @@ + # define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s) + #endif + ++#ifndef XMM0 ++# define XMM0 xmm0 ++#endif ++ ++#ifndef YMM0 ++# define YMM0 ymm0 ++#endif ++ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper +@@ -67,7 +75,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. 
*/ +- pxor %xmm0, %xmm0 ++ pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) + weak_alias (__bzero, bzero) +@@ -223,7 +231,7 @@ L(less_vec): + cmpb $16, %dl + jae L(between_16_31) + # endif +- MOVQ %xmm0, %rcx ++ MOVQ %XMM0, %rcx + cmpb $8, %dl + jae L(between_8_15) + cmpb $4, %dl +@@ -238,16 +246,16 @@ L(less_vec): + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- vmovdqu %ymm0, -32(%rdi,%rdx) +- vmovdqu %ymm0, (%rdi) ++ VMOVU %YMM0, -32(%rdi,%rdx) ++ VMOVU %YMM0, (%rdi) + VZEROUPPER + ret + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): +- vmovdqu %xmm0, -16(%rdi,%rdx) +- vmovdqu %xmm0, (%rdi) ++ VMOVU %XMM0, -16(%rdi,%rdx) ++ VMOVU %XMM0, (%rdi) + VZEROUPPER + ret + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-16.patch b/glibc-RHEL-15696-16.patch new file mode 100644 index 0000000..b3f443d --- /dev/null +++ b/glibc-RHEL-15696-16.patch @@ -0,0 +1,561 @@ +From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:20:28 -0800 +Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX +instructions using YMM16-YMM31 registers to avoid RTM abort with usable +AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function +exit. 
+--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++ + sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 + + 5 files changed, 467 insertions(+), 4 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 1cc0a10e..9d79b138 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ + memchr-evex \ ++ memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ + memrchr-evex \ + memset-evex-unaligned-erms \ +@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsncmp-evex \ + wcsnlen-evex \ + wcsrchr-evex \ +- wmemchr-evex ++ wmemchr-evex \ ++ wmemcmp-evex-movbe + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 7cf83485..c8da910e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), + __memcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3), +@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && 
CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (MOVBE)), ++ __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), + __wmemcmp_sse4_1) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 6c1f3153..3ca1f0a6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- return OPTIMIZE (avx2_movbe); ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex_movbe); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2_movbe); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) + return OPTIMIZE (sse4_1); +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +new file mode 100644 +index 00000000..9c093972 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -0,0 +1,440 @@ ++/* memcmp/wmemcmp 
optimized with 256-bit EVEX instructions. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#if IS_IN (libc) ++ ++/* memcmp/wmemcmp is implemented as: ++ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap ++ to avoid branches. ++ 2. Use overlapping compare to avoid branch. ++ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 ++ bytes for wmemcmp. ++ 4. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ area. ++ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less.
*/ ++ ++# include <sysdep.h> ++ ++# ifndef MEMCMP ++# define MEMCMP __memcmp_evex_movbe ++# endif ++ ++# define VMOVU vmovdqu64 ++ ++# ifdef USE_AS_WMEMCMP ++# define VPCMPEQ vpcmpeqd ++# else ++# define VPCMPEQ vpcmpeqb ++# endif ++ ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM1 ymm17 ++# define YMM2 ymm18 ++# define YMM3 ymm19 ++# define YMM4 ymm20 ++# define YMM5 ymm21 ++# define YMM6 ymm22 ++ ++# define VEC_SIZE 32 ++# ifdef USE_AS_WMEMCMP ++# define VEC_MASK 0xff ++# define XMM_MASK 0xf ++# else ++# define VEC_MASK 0xffffffff ++# define XMM_MASK 0xffff ++# endif ++ ++/* Warning! ++ wmemcmp has to use SIGNED comparison for elements. ++ memcmp has to use UNSIGNED comparison for elements. ++*/ ++ ++ .section .text.evex,"ax",@progbits ++ENTRY (MEMCMP) ++# ifdef USE_AS_WMEMCMP ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec) ++ ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_vec) ++ ++ /* More than 2 * VEC. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) ++ cmpq $(VEC_SIZE * 4), %rdx ++ jb L(last_4x_vec) ++ ++ /* From 4 * VEC to 8 * VEC, inclusively.
*/ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ ++ kandd %k1, %k2, %k5 ++ kandd %k3, %k4, %k6 ++ kandd %k5, %k6, %k6 ++ ++ kmovd %k6, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k1, %k2, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++L(last_vec): ++ /* Use overlapping loads to avoid branches. */ ++ leaq -VEC_SIZE(%rdi, %rdx), %rdi ++ leaq -VEC_SIZE(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(first_vec): ++ /* A byte or int32 is different within 16 or 32 bytes. 
*/ ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi, %rcx, 4), %edx ++ cmpl (%rsi, %rcx, 4), %edx ++L(wmemcmp_return): ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++# ifdef USE_AS_WMEMCMP ++ .p2align 4 ++L(4): ++ xorl %eax, %eax ++ movl (%rdi), %edx ++ cmpl (%rsi), %edx ++ jne L(wmemcmp_return) ++ ret ++# else ++ .p2align 4 ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ je L(exit) ++ sbbl %eax, %eax ++ orl $1, %eax ++ ret ++ ++ .p2align 4 ++L(exit): ++ ret ++ ++ .p2align 4 ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movb -1(%rdi, %rdx), %al ++ movb -1(%rsi, %rdx), %cl ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4 ++L(1): ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax ++ ret ++# endif ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax ++ ret ++ ++ .p2align 4 ++L(less_vec): ++# ifdef USE_AS_WMEMCMP ++ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ ++ cmpb $4, %dl ++ je L(4) ++ jb L(zero) ++# else ++ cmpb $1, %dl ++ je L(1) ++ jb L(zero) ++ cmpb $4, %dl ++ jb L(between_2_3) ++ cmpb $8, %dl ++ jb L(between_4_7) ++# endif ++ cmpb $16, %dl ++ jae L(between_16_31) ++ /* It is between 8 and 15 bytes. */ ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ /* Use overlapping loads to avoid branches. 
*/ ++ leaq -8(%rdi, %rdx), %rdi ++ leaq -8(%rsi, %rdx), %rsi ++ vmovq (%rdi), %XMM1 ++ vmovq (%rsi), %XMM2 ++ VPCMPEQ %XMM1, %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. */ ++ leaq -16(%rdi, %rdx), %rdi ++ leaq -16(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %XMM2 ++ VPCMPEQ (%rdi), %XMM2, %k2 ++ kmovw %k2, %eax ++ subl $XMM_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* More than 8 * VEC. Check the first VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Align the first memory area for aligned loads in the loop. ++ Compute how much the first memory area is misaligned. */ ++ movq %rdi, %rcx ++ andl $(VEC_SIZE - 1), %ecx ++ /* Get the negative of offset for alignment. */ ++ subq $VEC_SIZE, %rcx ++ /* Adjust the second memory area. */ ++ subq %rcx, %rsi ++ /* Adjust the first memory area which should be aligned now. */ ++ subq %rcx, %rdi ++ /* Adjust length. */ ++ addq %rcx, %rdx ++ ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ VMOVU (%rsi), %YMM1 ++ VPCMPEQ (%rdi), %YMM1, %k1 ++ ++ VMOVU VEC_SIZE(%rsi), %YMM2 ++ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ kandd %k2, %k1, %k5 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ kandd %k3, %k5, %k5 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ kandd %k4, %k5, %k5 ++ ++ kmovd %k5, %eax ++ cmpl $VEC_MASK, %eax ++ jne L(4x_vec_end) ++ ++ addq $(VEC_SIZE * 4), %rdi ++ addq $(VEC_SIZE * 4), %rsi ++ ++ subq $(VEC_SIZE * 4), %rdx ++ cmpq $(VEC_SIZE * 4), %rdx ++ jae L(loop_4x_vec) ++ ++ /* Less than 4 * VEC. 
*/ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(last_vec) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(last_2x_vec) ++ ++L(last_4x_vec): ++ /* From 2 * VEC to 4 * VEC. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ /* Use overlapping loads to avoid branches. */ ++ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi ++ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ++ addq $VEC_SIZE, %rdi ++ addq $VEC_SIZE, %rsi ++ VMOVU (%rsi), %YMM2 ++ VPCMPEQ (%rdi), %YMM2, %k2 ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ ret ++ ++ .p2align 4 ++L(4x_vec_end): ++ kmovd %k1, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec) ++ kmovd %k2, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x1) ++ kmovd %k3, %eax ++ subl $VEC_MASK, %eax ++ jnz L(first_vec_x2) ++ kmovd %k4, %eax ++ subl $VEC_MASK, %eax ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x1): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl VEC_SIZE(%rdi, %rcx, 4), %edx ++ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %ecx ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx ++ jmp L(wmemcmp_return) ++# else ++ movzbl 
(VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ sub %edx, %eax ++# endif ++ ret ++END (MEMCMP) ++#endif +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +new file mode 100644 +index 00000000..4726d74a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_evex_movbe ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-evex-movbe.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-17.patch b/glibc-RHEL-15696-17.patch new file mode 100644 index 0000000..3176514 --- /dev/null +++ b/glibc-RHEL-15696-17.patch @@ -0,0 +1,2568 @@ +From 7ebba91361badf7531d4e75050627a88d424872f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 5 Mar 2021 07:26:42 -0800 +Subject: [PATCH] x86-64: Add AVX optimized string/memory functions for RTM +Content-type: text/plain; charset=UTF-8 + +Since VZEROUPPER triggers RTM abort while VZEROALL won't, select AVX +optimized string/memory functions with + + xtest + jz 1f + vzeroall + ret +1: + vzeroupper + ret + +at function exit on processors with usable RTM, but without 256-bit EVEX +instructions to avoid VZEROUPPER inside a transactionally executing RTM +region. 
+--- + sysdeps/x86_64/multiarch/Makefile | 27 +++ + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 170 ++++++++++++++++++ + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 + + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-memset.h | 12 ++ + sysdeps/x86_64/multiarch/ifunc-strcpy.h | 4 + + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 5 + + sysdeps/x86_64/multiarch/memchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memchr-avx2.S | 45 +++-- + .../x86_64/multiarch/memcmp-avx2-movbe-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 28 ++- + .../memmove-avx-unaligned-erms-rtm.S | 17 ++ + .../multiarch/memmove-vec-unaligned-erms.S | 33 ++-- + sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/memrchr-avx2.S | 53 +++--- + .../memset-avx2-unaligned-erms-rtm.S | 10 ++ + .../multiarch/memset-avx2-unaligned-erms.S | 12 +- + .../multiarch/memset-vec-unaligned-erms.S | 41 ++--- + sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strcat-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcat-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strchr-avx2.S | 22 +-- + sysdeps/x86_64/multiarch/strchr.c | 4 + + sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 55 +++--- + sysdeps/x86_64/multiarch/strcmp.c | 4 + + sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strcpy-avx2.S | 85 ++++----- + sysdeps/x86_64/multiarch/strlen-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strlen-avx2.S | 43 ++--- + sysdeps/x86_64/multiarch/strncat-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strncmp.c | 4 + + 
sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S | 12 ++ + sysdeps/x86_64/multiarch/strrchr-avx2.S | 19 +- + sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S | 4 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S | 5 + + sysdeps/x86_64/multiarch/wcsnlen.c | 4 + + sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S | 4 + + .../x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S | 4 + + sysdeps/x86_64/sysdep.h | 22 +++ + 52 files changed, 668 insertions(+), 244 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/memchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncat-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S + create mode 100644 
sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S + +Conflicts: + sysdeps/x86_64/multiarch/strchr-avx2.S + (same fix, different location) + + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 9d79b138..491c7698 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -40,6 +40,25 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \ + memset-sse2-unaligned-erms \ + memset-avx2-unaligned-erms \ + memset-avx512-unaligned-erms \ ++ memchr-avx2-rtm \ ++ memcmp-avx2-movbe-rtm \ ++ memmove-avx-unaligned-erms-rtm \ ++ memrchr-avx2-rtm \ ++ memset-avx2-unaligned-erms-rtm \ ++ rawmemchr-avx2-rtm \ ++ strchr-avx2-rtm \ ++ strcmp-avx2-rtm \ ++ strchrnul-avx2-rtm \ ++ stpcpy-avx2-rtm \ ++ stpncpy-avx2-rtm \ ++ strcat-avx2-rtm \ ++ strcpy-avx2-rtm \ ++ strlen-avx2-rtm \ ++ strncat-avx2-rtm \ ++ strncmp-avx2-rtm \ ++ strncpy-avx2-rtm \ ++ strnlen-avx2-rtm \ ++ strrchr-avx2-rtm \ + memchr-evex \ + memcmp-evex-movbe \ + memmove-evex-unaligned-erms \ +@@ -76,6 +95,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsrchr-sse2 wcsrchr-avx2 \ + wcsnlen-sse4_1 wcsnlen-c \ + wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcschr-avx2-rtm \ ++ wcscmp-avx2-rtm \ ++ wcslen-avx2-rtm \ ++ wcsncmp-avx2-rtm \ ++ wcsnlen-avx2-rtm \ ++ wcsrchr-avx2-rtm \ ++ wmemchr-avx2-rtm \ ++ wmemcmp-avx2-movbe-rtm \ + wcschr-evex \ + wcscmp-evex \ + wcslen-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h 
b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index 7081b0c9..e0f30e61 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -21,6 +21,7 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -36,6 +37,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c8da910e..c1efeec0 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -43,6 +43,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memchr, + CPU_FEATURE_USABLE (AVX2), + __memchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -56,6 +60,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, memcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -85,6 +94,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memmove_chk, + 
CPU_FEATURE_USABLE (AVX), + __memmove_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memmove_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_evex_unaligned) +@@ -113,6 +130,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX), + __memmove_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memmove, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memmove_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memmove, + CPU_FEATURE_USABLE (AVX512VL), + __memmove_evex_unaligned) +@@ -143,6 +168,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memrchr, + CPU_FEATURE_USABLE (AVX2), + __memrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, memrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, memrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -165,6 +194,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX2), + __memset_chk_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memset_chk, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) + && 
CPU_FEATURE_USABLE (AVX512BW)), +@@ -198,6 +235,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX2), + __memset_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -222,6 +267,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, rawmemchr, + CPU_FEATURE_USABLE (AVX2), + __rawmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __rawmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -234,6 +283,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strlen, + CPU_FEATURE_USABLE (AVX2), + __strlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -245,6 +298,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strnlen, + CPU_FEATURE_USABLE (AVX2), + __strnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, strnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -257,6 +314,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpncpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpncpy, 
CPU_FEATURE_USABLE (AVX2), + __stpncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -271,6 +332,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __stpcpy_ssse3) + IFUNC_IMPL_ADD (array, i, stpcpy, CPU_FEATURE_USABLE (AVX2), + __stpcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, stpcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __stpcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, stpcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -309,6 +374,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcat, + IFUNC_IMPL_ADD (array, i, strcat, CPU_FEATURE_USABLE (AVX2), + __strcat_avx2) ++ IFUNC_IMPL_ADD (array, i, strcat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -323,6 +392,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchr, + CPU_FEATURE_USABLE (AVX2), + __strchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -336,6 +409,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strchrnul, + CPU_FEATURE_USABLE (AVX2), + __strchrnul_avx2) ++ IFUNC_IMPL_ADD (array, i, strchrnul, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -348,6 +425,10 @@ __libc_ifunc_impl_list 
(const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strrchr, + CPU_FEATURE_USABLE (AVX2), + __strrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, strrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -359,6 +440,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strcmp, + CPU_FEATURE_USABLE (AVX2), + __strcmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -375,6 +460,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strcpy, + IFUNC_IMPL_ADD (array, i, strcpy, CPU_FEATURE_USABLE (AVX2), + __strcpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strcpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcpy_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -422,6 +511,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncat, + IFUNC_IMPL_ADD (array, i, strncat, CPU_FEATURE_USABLE (AVX2), + __strncat_avx2) ++ IFUNC_IMPL_ADD (array, i, strncat, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncat_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncat, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -436,6 +529,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, strncpy, + IFUNC_IMPL_ADD (array, i, strncpy, CPU_FEATURE_USABLE (AVX2), + __strncpy_avx2) ++ IFUNC_IMPL_ADD (array, i, strncpy, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncpy_avx2_rtm) + IFUNC_IMPL_ADD (array, 
i, strncpy, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +@@ -469,6 +566,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcschr, + CPU_FEATURE_USABLE (AVX2), + __wcschr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcschr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -481,6 +582,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsrchr, + CPU_FEATURE_USABLE (AVX2), + __wcsrchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsrchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsrchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsrchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -493,6 +598,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcscmp, + CPU_FEATURE_USABLE (AVX2), + __wcscmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcscmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcscmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcscmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -505,6 +614,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsncmp, + CPU_FEATURE_USABLE (AVX2), + __wcsncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -523,6 +636,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (AVX2), + __wcslen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcslen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ 
__wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -535,6 +652,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wcsnlen, + CPU_FEATURE_USABLE (AVX2), + __wcsnlen_avx2) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -550,6 +671,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemchr, + CPU_FEATURE_USABLE (AVX2), + __wmemchr_avx2) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -563,6 +688,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) ++ IFUNC_IMPL_ADD (array, i, wmemcmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (MOVBE) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) +@@ -581,6 +711,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX2), + __wmemset_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, wmemset, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) +@@ -606,6 +740,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), + 
__memcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __memcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_evex_unaligned) +@@ -634,6 +776,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX), + __memcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __memcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memcpy, + CPU_FEATURE_USABLE (AVX512VL), + __memcpy_evex_unaligned) +@@ -676,6 +826,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), + __mempcpy_chk_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_chk_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, __mempcpy_chk, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_chk_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_evex_unaligned) +@@ -713,6 +871,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), + __mempcpy_avx_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), ++ __mempcpy_avx_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, mempcpy, ++ (CPU_FEATURE_USABLE (AVX) ++ && CPU_FEATURE_USABLE (RTM)), 
++ __mempcpy_avx_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_evex_unaligned) +@@ -734,6 +900,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL_ADD (array, i, strncmp, + CPU_FEATURE_USABLE (AVX2), + __strncmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW)), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 3ca1f0a6..8043c635 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -23,6 +23,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden; + + static inline void * +@@ -38,6 +39,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex_movbe); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_movbe_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_movbe); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index 6f8bce5f..fa09b9fb 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE 
(avx_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -71,6 +75,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f31f4dc..6f3375cc 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms) + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms) + attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms) +@@ -69,6 +73,14 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (evex_unaligned); + } + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE (avx2_unaligned_rtm); ++ } ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +diff --git 
a/sysdeps/x86_64/multiarch/ifunc-strcpy.h b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +index deae6348..a924762e 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcpy.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcpy.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) + attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -39,6 +40,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index 9290c4bf..bdc94c6c 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -20,6 +20,8 @@ + + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_rtm) ++ attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden; + +@@ -39,6 +41,9 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) + return OPTIMIZE (evex_unaligned); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_unaligned_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2_unaligned); + } +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +new file 
mode 100644 +index 00000000..87b076c7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index c81da19b..cf893e77 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -34,9 +34,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -107,8 +111,8 @@ L(cros_page_boundary): + # endif + addq %rdi, %rax + addq %rcx, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -224,8 +228,7 @@ L(last_4x_vec_or_less): + + jnz L(first_vec_x3_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -243,8 +246,7 @@ L(last_2x_vec): + testl %eax, %eax + jnz L(first_vec_x1_check) + xorl %eax, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -253,8 +255,7 @@ L(first_vec_x0_check): + cmpq %rax, %rdx + jbe L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -264,8 +265,7 @@ L(first_vec_x1_check): + jbe L(zero) + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -275,8 +275,7 @@ L(first_vec_x2_check): + jbe L(zero) + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + 
L(first_vec_x3_check): +@@ -286,12 +285,14 @@ L(first_vec_x3_check): + jbe L(zero) + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ jmp L(return_vzeroupper) ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -301,24 +302,21 @@ L(null): + L(first_vec_x0): + tzcntl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax + addq $VEC_SIZE, %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax + addq $(VEC_SIZE * 2), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -337,8 +335,7 @@ L(first_vec_x3): + tzcntl %eax, %eax + addq $(VEC_SIZE * 3), %rax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (MEMCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..cf4eff5d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMCMP ++# define MEMCMP __memcmp_avx2_movbe_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memcmp-avx2-movbe.S" +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index e3a35b89..9d5c9c72 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -47,6 +47,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + # define VEC_MASK ((1 << VEC_SIZE) - 1) + +@@ -55,7 +59,7 @@ + memcmp has to use UNSIGNED comparison for elemnts. 
+ */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP + shl $2, %RDX_LP +@@ -123,8 +127,8 @@ ENTRY (MEMCMP) + vptest %ymm0, %ymm5 + jnc L(4x_vec_end) + xorl %eax, %eax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -144,8 +148,7 @@ L(last_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec): +@@ -164,8 +167,7 @@ L(wmemcmp_return): + movzbl (%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WMEMCMP + .p2align 4 +@@ -367,8 +369,7 @@ L(last_4x_vec): + vpmovmskb %ymm2, %eax + subl $VEC_MASK, %eax + jnz L(first_vec) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -394,8 +395,7 @@ L(4x_vec_end): + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -410,8 +410,7 @@ L(first_vec_x1): + movzbl VEC_SIZE(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -426,7 +425,6 @@ L(first_vec_x2): + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx + sub %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..1ec1962e +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memmove-avx-unaligned-erms-rtm.S +@@ -0,0 +1,17 @@ ++#if IS_IN (libc) ++# define VEC_SIZE 32 ++# define VEC(i) ymm##i ++# define VMOVNT vmovntdq ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa ++ ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++# define VZEROUPPER_RETURN jmp L(return) ++ ++# define SECTION(p) 
p##.avx.rtm ++# define MEMMOVE_SYMBOL(p,s) p##_avx_##s##_rtm ++ ++# include "memmove-vec-unaligned-erms.S" ++#endif +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 08e21692..71f5954d 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -140,11 +140,12 @@ L(last_2x_vec): + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(1) + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) +- VZEROUPPER + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(nop): +-#endif + ret ++#else ++ VZEROUPPER_RETURN ++#endif + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMMOVE_SYMBOL (__memmove, unaligned)) + +@@ -237,8 +238,11 @@ L(last_2x_vec): + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(1), -VEC_SIZE(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(movsb): + cmpq __x86_shared_non_temporal_threshold(%rip), %rdx +@@ -289,8 +293,7 @@ L(between_32_63): + VMOVU -32(%rsi,%rdx), %YMM1 + VMOVU %YMM0, (%rdi) + VMOVU %YMM1, -32(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + #if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ +@@ -299,7 +302,7 @@ L(between_16_31): + VMOVU -16(%rsi,%rdx), %XMM1 + VMOVU %XMM0, (%rdi) + VMOVU %XMM1, -16(%rdi,%rdx) +- ret ++ VZEROUPPER_RETURN + #endif + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +@@ -352,8 +355,7 @@ L(more_2x_vec): + VMOVU %VEC(5), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(6), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(last_4x_vec): + /* Copy from 2 * VEC to 4 * VEC. 
*/ + VMOVU (%rsi), %VEC(0) +@@ -364,8 +366,7 @@ L(last_4x_vec): + VMOVU %VEC(1), VEC_SIZE(%rdi) + VMOVU %VEC(2), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(3), -(VEC_SIZE * 2)(%rdi,%rdx) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec): + cmpq %rsi, %rdi +@@ -421,8 +422,7 @@ L(loop_4x_vec_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(more_8x_vec_backward): + /* Load the first 4 * VEC and last VEC to support overlapping +@@ -473,8 +473,7 @@ L(loop_4x_vec_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. */ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + L(large_forward): +@@ -509,8 +508,7 @@ L(loop_large_forward): + VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) + /* Store the first VEC. */ + VMOVU %VEC(4), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(large_backward): + /* Don't use non-temporal store if there is overlap between +@@ -544,8 +542,7 @@ L(loop_large_backward): + VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) + /* Store the last VEC. 
*/ + VMOVU %VEC(8), (%r11) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +new file mode 100644 +index 00000000..cea2d2a7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "memrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index ce488dd9..20efe7ac 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -20,14 +20,22 @@ + + # include + ++# ifndef MEMRCHR ++# define MEMRCHR __memrchr_avx2 ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits +-ENTRY (__memrchr_avx2) ++ .section SECTION(.text),"ax",@progbits ++ENTRY (MEMRCHR) + /* Broadcast CHAR to YMM0. 
*/ + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 +@@ -134,8 +142,8 @@ L(loop_4x_vec): + vpmovmskb %ymm1, %eax + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(last_4x_vec_or_less): +@@ -169,8 +177,7 @@ L(last_4x_vec_or_less): + addq %rax, %rdx + jl L(zero) + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -191,31 +198,27 @@ L(last_2x_vec): + jl L(zero) + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x0): + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x1): + bsrl %eax, %eax + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x2): + bsrl %eax, %eax + addl $(VEC_SIZE * 2), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3): +@@ -232,8 +235,7 @@ L(last_vec_x1_check): + jl L(zero) + addl $VEC_SIZE, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_x3_check): +@@ -243,12 +245,14 @@ L(last_vec_x3_check): + jl L(zero) + addl $(VEC_SIZE * 3), %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +- VZEROUPPER ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 + L(null): + xorl %eax, %eax + ret +@@ -273,8 +277,7 @@ L(last_vec_or_less_aligned): + + bsrl %eax, %eax + addq %rdi, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_or_less): +@@ -315,8 +318,7 @@ L(last_vec_or_less): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_vec_2x_aligned): +@@ -353,7 +355,6 @@ L(last_vec_2x_aligned): + bsrl %eax, %eax + addq %rdi, %rax + addq %r8, %rax +- VZEROUPPER +- ret +-END (__memrchr_avx2) ++ VZEROUPPER_RETURN ++END (MEMRCHR) + #endif +diff --git 
a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +new file mode 100644 +index 00000000..8ac3e479 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -0,0 +1,10 @@ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return) ++ ++#define SECTION(p) p##.avx.rtm ++#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++ ++#include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 7ab3d898..ae0860f3 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,9 +14,15 @@ + movq r, %rax; \ + vpbroadcastd %xmm0, %ymm0 + +-# define SECTION(p) p##.avx +-# define MEMSET_SYMBOL(p,s) p##_avx2_##s +-# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++# ifndef MEMSET_SYMBOL ++# define MEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif ++# ifndef WMEMSET_SYMBOL ++# define WMEMSET_SYMBOL(p,s) p##_avx2_##s ++# endif + + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 71e91a8f..bae5cba4 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -45,17 +45,14 @@ + #ifndef VZEROUPPER + # if VEC_SIZE > 16 + # define VZEROUPPER vzeroupper ++# define VZEROUPPER_SHORT_RETURN vzeroupper; ret + # else + # define VZEROUPPER + # endif + #endif + + #ifndef VZEROUPPER_SHORT_RETURN +-# if VEC_SIZE > 16 +-# define VZEROUPPER_SHORT_RETURN vzeroupper +-# else +-# define VZEROUPPER_SHORT_RETURN rep +-# endif ++# define VZEROUPPER_SHORT_RETURN rep; ret + #endif + + 
#ifndef MOVQ +@@ -117,8 +114,7 @@ L(entry_from_bzero): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + #if defined USE_MULTIARCH && IS_IN (libc) + END (MEMSET_SYMBOL (__memset, unaligned)) + +@@ -141,14 +137,12 @@ ENTRY (__memset_erms) + ENTRY (MEMSET_SYMBOL (__memset, erms)) + # endif + L(stosb): +- /* Issue vzeroupper before rep stosb. */ +- VZEROUPPER + mov %RDX_LP, %RCX_LP + movzbl %sil, %eax + mov %RDI_LP, %RDX_LP + rep stosb + mov %RDX_LP, %RAX_LP +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE == 16 + END (__memset_erms) + # else +@@ -175,8 +169,7 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +@@ -190,8 +183,11 @@ L(more_2x_vec): + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + L(return): +- VZEROUPPER ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else + ret ++#endif + + L(loop_start): + leaq (VEC_SIZE * 4)(%rdi), %rcx +@@ -217,7 +213,6 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN +- ret + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 +@@ -241,40 +236,34 @@ L(less_vec): + jb 1f + movb %cl, (%rdi) + 1: +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): + VMOVU %YMM0, -32(%rdi,%rdx) + VMOVU %YMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. */ + L(between_16_31): + VMOVU %XMM0, -16(%rdi,%rdx) + VMOVU %XMM0, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. 
*/ + L(between_8_15): + movq %rcx, -8(%rdi,%rdx) + movq %rcx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ + movl %ecx, -4(%rdi,%rdx) + movl %ecx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ + movw %cx, -2(%rdi,%rdx) + movw %cx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..acc5f6e2 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __rawmemchr_avx2_rtm ++#define USE_AS_RAWMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +new file mode 100644 +index 00000000..2b9c07a5 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpcpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STPCPY ++#define STRCPY __stpcpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +new file mode 100644 +index 00000000..60a2ccfe +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/stpncpy-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define USE_AS_STPCPY ++#define USE_AS_STRNCPY ++#define STRCPY __stpncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +new file mode 100644 +index 00000000..637fb557 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcat-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCAT ++# define STRCAT __strcat_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcat-avx2.S" +diff --git 
a/sysdeps/x86_64/multiarch/strcat-avx2.S b/sysdeps/x86_64/multiarch/strcat-avx2.S +index b0623564..aa48c058 100644 +--- a/sysdeps/x86_64/multiarch/strcat-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcat-avx2.S +@@ -30,7 +30,11 @@ + /* Number of bytes in a vector register */ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCAT) + mov %rdi, %r9 + # ifdef USE_AS_STRNCAT +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +new file mode 100644 +index 00000000..81f20d1d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCHR ++# define STRCHR __strchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 47bc3c99..da7d2620 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -38,9 +38,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM0. 
*/ +@@ -93,8 +97,8 @@ L(cros_page_boundary): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -190,8 +194,7 @@ L(first_vec_x0): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -205,8 +208,7 @@ L(first_vec_x1): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -220,8 +222,7 @@ L(first_vec_x2): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -247,8 +248,7 @@ L(first_vec_x3): + cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + END (STRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index be05e197..7e582f02 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +new file mode 100644 +index 00000000..cdcf818b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strchrnul-avx2-rtm.S +@@ -0,0 
+1,3 @@ ++#define STRCHR __strchrnul_avx2_rtm ++#define USE_AS_STRCHRNUL 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +new file mode 100644 +index 00000000..aecd30d9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCMP ++# define STRCMP __strcmp_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 8fb8eedc..5d1c9d90 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -55,6 +55,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -75,7 +79,7 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. 
*/ +@@ -137,8 +141,8 @@ L(return): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(return_vec_size): +@@ -171,8 +175,7 @@ L(return_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_2_vec_size): +@@ -205,8 +208,7 @@ L(return_2_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_3_vec_size): +@@ -239,8 +241,7 @@ L(return_3_vec_size): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(next_3_vectors): +@@ -366,8 +367,7 @@ L(back_to_loop): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_vec): +@@ -410,8 +410,7 @@ L(test_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_2_vec): +@@ -454,8 +453,7 @@ L(test_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(test_3_vec): +@@ -496,8 +494,7 @@ L(test_3_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page): +@@ -566,8 +563,7 @@ L(loop_cross_page): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(loop_cross_page_2_vec): +@@ -641,8 +637,7 @@ L(loop_cross_page_2_vec): + subl %edx, %eax + # endif + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP + L(string_nbyte_offset_check): +@@ -684,8 +679,7 @@ L(cross_page_loop): + # ifndef USE_AS_WCSCMP + L(different): + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_WCSCMP + .p2align 4 +@@ -695,16 +689,14 @@ L(different): + setl %al + negl %eax + orl $1, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + # ifdef USE_AS_STRNCMP + .p2align 4 + L(zero): + xorl %eax, 
%eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char0): +@@ -718,8 +710,7 @@ L(char0): + movzbl (%rdi), %eax + subl %ecx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + # endif + + .p2align 4 +@@ -744,8 +735,7 @@ L(last_vector): + movzbl (%rsi, %rdx), %edx + subl %edx, %eax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* Comparing on page boundary region requires special treatment: + It must done one vector at the time, starting with the wider +@@ -866,7 +856,6 @@ L(cross_page_4bytes): + testl %eax, %eax + jne L(cross_page_loop) + subl %ecx, %eax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + END (STRCMP) + #endif +diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c +index c5f38510..11bbea2b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp.c ++++ b/sysdeps/x86_64/multiarch/strcmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +new file mode 100644 +index 00000000..c2c581ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRCPY ++# define STRCPY __strcpy_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ 
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcpy-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcpy-avx2.S b/sysdeps/x86_64/multiarch/strcpy-avx2.S +index 81677f90..613c59aa 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcpy-avx2.S +@@ -37,6 +37,10 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + /* zero register */ + #define xmmZ xmm0 + #define ymmZ ymm0 +@@ -46,7 +50,7 @@ + + # ifndef USE_AS_STRCAT + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY + mov %rdx, %r8 +@@ -369,8 +373,8 @@ L(CopyVecSizeExit): + lea 1(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(CopyTwoVecSize1): +@@ -553,8 +557,7 @@ L(Exit1): + lea 2(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit2): +@@ -569,8 +572,7 @@ L(Exit2): + lea 3(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit3): +@@ -584,8 +586,7 @@ L(Exit3): + lea 4(%rdi), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit4_7): +@@ -602,8 +603,7 @@ L(Exit4_7): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit8_15): +@@ -620,8 +620,7 @@ L(Exit8_15): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit16_31): +@@ -638,8 +637,7 @@ L(Exit16_31): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Exit32_63): +@@ 
-656,8 +654,7 @@ L(Exit32_63): + lea 1(%rdi, %rdx), %rdi + jnz L(StrncpyFillTailWithZero) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCPY + +@@ -671,8 +668,7 @@ L(StrncpyExit1): + # ifdef USE_AS_STRCAT + movb $0, 1(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit2): +@@ -684,8 +680,7 @@ L(StrncpyExit2): + # ifdef USE_AS_STRCAT + movb $0, 2(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit3_4): +@@ -699,8 +694,7 @@ L(StrncpyExit3_4): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit5_8): +@@ -714,8 +708,7 @@ L(StrncpyExit5_8): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit9_16): +@@ -729,8 +722,7 @@ L(StrncpyExit9_16): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit17_32): +@@ -744,8 +736,7 @@ L(StrncpyExit17_32): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit33_64): +@@ -760,8 +751,7 @@ L(StrncpyExit33_64): + # ifdef USE_AS_STRCAT + movb $0, (%rdi, %r8) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(StrncpyExit65): +@@ -778,50 +768,43 @@ L(StrncpyExit65): + # ifdef USE_AS_STRCAT + movb $0, 65(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # ifndef USE_AS_STRCAT + + .p2align 4 + L(Fill1): + mov %dl, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill2): + mov %dx, (%rdi) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill3_4): + mov %dx, (%rdi) + mov %dx, -2(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill5_8): + mov %edx, (%rdi) + mov %edx, -4(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill9_16): + mov 
%rdx, (%rdi) + mov %rdx, -8(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(Fill17_32): + vmovdqu %xmmZ, (%rdi) + vmovdqu %xmmZ, -16(%rdi, %r8) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(CopyVecSizeUnalignedVec2): +@@ -898,8 +881,7 @@ L(Fill): + cmp $1, %r8d + ja L(Fill2) + je L(Fill1) +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + /* end of ifndef USE_AS_STRCAT */ + # endif +@@ -929,8 +911,7 @@ L(UnalignedFourVecSizeLeaveCase3): + # ifdef USE_AS_STRCAT + movb $0, (VEC_SIZE * 4)(%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(UnalignedFourVecSizeLeaveCase2): +@@ -1001,16 +982,14 @@ L(StrncpyExit): + # ifdef USE_AS_STRCAT + movb $0, (%rdi) + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(ExitZero): + # ifndef USE_AS_STRCAT + mov %rdi, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + # endif + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +new file mode 100644 +index 00000000..75b4b761 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRLEN ++# define STRLEN __strlen_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strlen-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 645e0446..82826e10 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. 
*/ +@@ -111,8 +115,8 @@ L(cros_page_boundary): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(aligned_more): +@@ -231,8 +235,7 @@ L(last_4x_vec_or_less): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(last_2x_vec): +@@ -253,8 +256,7 @@ L(last_2x_vec): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x0_check): +@@ -267,8 +269,7 @@ L(first_vec_x0_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1_check): +@@ -282,8 +283,7 @@ L(first_vec_x1_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2_check): +@@ -297,8 +297,7 @@ L(first_vec_x2_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x3_check): +@@ -312,8 +311,7 @@ L(first_vec_x3_check): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(max): +@@ -321,8 +319,7 @@ L(max): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(zero): +@@ -338,8 +335,7 @@ L(first_vec_x0): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x1): +@@ -350,8 +346,7 @@ L(first_vec_x1): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(first_vec_x2): +@@ -362,8 +357,7 @@ L(first_vec_x2): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(4x_vec_end): +@@ -389,8 +383,7 @@ L(first_vec_x3): + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif +- VZEROUPPER +- ret ++ 
VZEROUPPER_RETURN + + END (STRLEN) + #endif +diff --git a/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +new file mode 100644 +index 00000000..0dcea18d +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncat-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCAT ++#define STRCAT __strncat_avx2_rtm ++#include "strcat-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +new file mode 100644 +index 00000000..37d1224b +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCMP __strncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c +index 4c15542f..44c85116 100644 +--- a/sysdeps/x86_64/multiarch/strncmp.c ++++ b/sysdeps/x86_64/multiarch/strncmp.c +@@ -30,6 +30,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -46,6 +47,9 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +new file mode 100644 +index 00000000..79e70832 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncpy-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define USE_AS_STRNCPY ++#define STRCPY __strncpy_avx2_rtm ++#include "strcpy-avx2-rtm.S" 
+diff --git a/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +new file mode 100644 +index 00000000..04f1626a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strnlen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __strnlen_avx2_rtm ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +new file mode 100644 +index 00000000..5def14ec +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2-rtm.S +@@ -0,0 +1,12 @@ ++#ifndef STRRCHR ++# define STRRCHR __strrchr_avx2_rtm ++#endif ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strrchr-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 4381e6ab..9f22a15e 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -36,9 +36,13 @@ + # define VZEROUPPER vzeroupper + # endif + ++# ifndef SECTION ++# define SECTION(p) p##.avx ++# endif ++ + # define VEC_SIZE 32 + +- .section .text.avx,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (STRRCHR) + movd %esi, %xmm4 + movl %edi, %ecx +@@ -166,8 +170,8 @@ L(return_value): + # endif + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 + L(match): +@@ -198,8 +202,7 @@ L(find_nul): + jz L(return_value) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(char_and_nul): +@@ -222,14 +225,12 @@ L(char_and_nul_in_first_vec): + jz L(return_null) + bsrl %eax, %eax + leaq -VEC_SIZE(%rdi, %rax), %rax +- VZEROUPPER +- ret ++ VZEROUPPER_RETURN + + .p2align 4 + L(return_null): + xorl %eax, %eax +- VZEROUPPER +- ret ++ 
VZEROUPPER_RETURN + + END (STRRCHR) + #endif +diff --git a/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +new file mode 100644 +index 00000000..d49dbbf0 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcschr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRCHR __wcschr_avx2_rtm ++#define USE_AS_WCSCHR 1 ++#include "strchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +new file mode 100644 +index 00000000..d6ca2b80 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcscmp-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRCMP __wcscmp_avx2_rtm ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +new file mode 100644 +index 00000000..35658d73 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define STRLEN __wcslen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +new file mode 100644 +index 00000000..4e88c70c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRCMP __wcsncmp_avx2_rtm ++#define USE_AS_STRNCMP 1 ++#define USE_AS_WCSCMP 1 ++ ++#include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +new file mode 100644 +index 00000000..7437ebee +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsnlen-avx2-rtm.S +@@ -0,0 +1,5 @@ ++#define STRLEN __wcsnlen_avx2_rtm ++#define USE_AS_WCSLEN 1 ++#define USE_AS_STRNLEN 1 ++ ++#include "strlen-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 84254b83..20b731ae 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c ++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -29,6 +29,7 @@ + extern __typeof (REDIRECT_NAME) 
OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * +@@ -44,6 +45,9 @@ IFUNC_SELECTOR (void) + && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + return OPTIMIZE (evex); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ + if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) + return OPTIMIZE (avx2); + } +diff --git a/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +new file mode 100644 +index 00000000..9bf76083 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcsrchr-avx2-rtm.S +@@ -0,0 +1,3 @@ ++#define STRRCHR __wcsrchr_avx2_rtm ++#define USE_AS_WCSRCHR 1 ++#include "strrchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +new file mode 100644 +index 00000000..58ed21db +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemchr-avx2-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCHR __wmemchr_avx2_rtm ++#define USE_AS_WMEMCHR 1 ++ ++#include "memchr-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +new file mode 100644 +index 00000000..31104d12 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wmemcmp-avx2-movbe-rtm.S +@@ -0,0 +1,4 @@ ++#define MEMCMP __wmemcmp_avx2_movbe_rtm ++#define USE_AS_WMEMCMP 1 ++ ++#include "memcmp-avx2-movbe-rtm.S" +diff --git a/sysdeps/x86_64/sysdep.h b/sysdeps/x86_64/sysdep.h +index 1738d7f9..223f1a59 100644 +--- a/sysdeps/x86_64/sysdep.h ++++ b/sysdeps/x86_64/sysdep.h +@@ -95,6 +95,28 @@ lose: \ + #define R14_LP r14 + #define R15_LP r15 + ++/* Zero upper vector registers and return with xtest. 
NB: Use VZEROALL ++ to avoid RTM abort triggered by VZEROUPPER inside transactionally. */ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST \ ++ xtest; \ ++ jz 1f; \ ++ vzeroall; \ ++ ret; \ ++1: \ ++ vzeroupper; \ ++ ret ++ ++/* Zero upper vector registers and return. */ ++#ifndef ZERO_UPPER_VEC_REGISTERS_RETURN ++# define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ VZEROUPPER; \ ++ ret ++#endif ++ ++#ifndef VZEROUPPER_RETURN ++# define VZEROUPPER_RETURN VZEROUPPER; ret ++#endif ++ + #else /* __ASSEMBLER__ */ + + /* Long and pointer size in bytes. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-18.patch b/glibc-RHEL-15696-18.patch new file mode 100644 index 0000000..2cf0e45 --- /dev/null +++ b/glibc-RHEL-15696-18.patch @@ -0,0 +1,735 @@ +From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 23 Feb 2021 06:33:10 -0800 +Subject: [PATCH] x86: Add string/memory function tests in RTM region +Content-type: text/plain; charset=UTF-8 + +At function exit, AVX optimized string/memory functions have VZEROUPPER +which triggers RTM abort. When such functions are called inside a +transactionally executing RTM region, RTM abort causes severe performance +degradation. Add tests to verify that string/memory functions won't +cause RTM abort in RTM region. 
+--- + sysdeps/x86/Makefile | 23 +++++++++++ + sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++ + sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++ + sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++ + sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++ + sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++ + 12 files changed, 618 insertions(+) + create mode 100644 sysdeps/x86/tst-memchr-rtm.c + create mode 100644 sysdeps/x86/tst-memcmp-rtm.c + create mode 100644 sysdeps/x86/tst-memmove-rtm.c + create mode 100644 sysdeps/x86/tst-memrchr-rtm.c + create mode 100644 sysdeps/x86/tst-memset-rtm.c + create mode 100644 sysdeps/x86/tst-strchr-rtm.c + create mode 100644 sysdeps/x86/tst-strcpy-rtm.c + create mode 100644 sysdeps/x86/tst-string-rtm.h + create mode 100644 sysdeps/x86/tst-strlen-rtm.c + create mode 100644 sysdeps/x86/tst-strncmp-rtm.c + create mode 100644 sysdeps/x86/tst-strrchr-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 59e928e9..5be71ada 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -17,6 +17,29 @@ endif + + ifeq ($(subdir),string) + sysdep_routines += cacheinfo ++ ++tests += \ ++ tst-memchr-rtm \ ++ tst-memcmp-rtm \ ++ tst-memmove-rtm \ ++ tst-memrchr-rtm \ ++ tst-memset-rtm \ ++ tst-strchr-rtm \ ++ tst-strcpy-rtm \ ++ tst-strlen-rtm \ ++ tst-strncmp-rtm \ ++ tst-strrchr-rtm ++ ++CFLAGS-tst-memchr-rtm.c += -mrtm ++CFLAGS-tst-memcmp-rtm.c += -mrtm ++CFLAGS-tst-memmove-rtm.c += -mrtm ++CFLAGS-tst-memrchr-rtm.c += -mrtm ++CFLAGS-tst-memset-rtm.c += -mrtm ++CFLAGS-tst-strchr-rtm.c += 
-mrtm ++CFLAGS-tst-strcpy-rtm.c += -mrtm ++CFLAGS-tst-strlen-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c +new file mode 100644 +index 00000000..e4749401 +--- /dev/null ++++ b/sysdeps/x86/tst-memchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c +new file mode 100644 +index 00000000..e4c8a623 +--- /dev/null ++++ b/sysdeps/x86/tst-memcmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for memcmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ memset (string2, 'a', STRING_SIZE); ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memcmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memcmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c +new file mode 100644 +index 00000000..4bf97ef1 +--- /dev/null ++++ b/sysdeps/x86/tst-memmove-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for memmove inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (memmove (string2, string1, STRING_SIZE) == string2 ++ && memcmp (string2, string1, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memmove", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c +new file mode 100644 +index 00000000..a57a5a8e +--- /dev/null ++++ b/sysdeps/x86/tst-memrchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for memrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = memrchr (string1, 'c', STRING_SIZE); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memrchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c +new file mode 100644 +index 00000000..bf343a4d +--- /dev/null ++++ b/sysdeps/x86/tst-memset-rtm.c +@@ -0,0 +1,45 @@ ++/* Test case for memset inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return EXIT_SUCCESS; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ memset (string1, 'a', STRING_SIZE); ++ return 0; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("memset", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c +new file mode 100644 +index 00000000..a82e29c0 +--- /dev/null ++++ b/sysdeps/x86/tst-strchr-rtm.c +@@ -0,0 +1,54 @@ ++/* Test case for strchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[100] = 'c'; ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strchr (string1, 'c'); ++ if (p == &string1[100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strchr", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c +new file mode 100644 +index 00000000..2b2a583f +--- /dev/null ++++ b/sysdeps/x86/tst-strcpy-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strcpy inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strcpy (string2, string1) == string2 ++ && strcmp (string2, string1) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strcpy", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h +new file mode 100644 +index 00000000..d2470afa +--- /dev/null ++++ b/sysdeps/x86/tst-string-rtm.h +@@ -0,0 +1,72 @@ ++/* Test string function in a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++#include ++#include ++#include ++#include ++ ++static int ++do_test_1 (const char *name, unsigned int loop, int (*prepare) (void), ++ int (*function) (void)) ++{ ++ if (!CPU_FEATURE_USABLE (RTM)) ++ return EXIT_UNSUPPORTED; ++ ++ int status = prepare (); ++ if (status != EXIT_SUCCESS) ++ return status; ++ ++ unsigned int i; ++ unsigned int naborts = 0; ++ unsigned int failed = 0; ++ for (i = 0; i < loop; i++) ++ { ++ failed |= function (); ++ if (_xbegin() == _XBEGIN_STARTED) ++ { ++ failed |= function (); ++ _xend(); ++ } ++ else ++ { ++ failed |= function (); ++ ++naborts; ++ } ++ } ++ ++ if (failed) ++ FAIL_EXIT1 ("%s() failed", name); ++ ++ if (naborts) ++ { ++ /* NB: Low single digit (<= 5%) noise-level aborts are normal for ++ TSX. */ ++ double rate = 100 * ((double) naborts) / ((double) loop); ++ if (rate > 5) ++ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)", ++ rate, naborts, loop); ++ } ++ ++ return EXIT_SUCCESS; ++} ++ ++static int do_test (void); ++ ++#include +diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c +new file mode 100644 +index 00000000..0dcf14db +--- /dev/null ++++ b/sysdeps/x86/tst-strlen-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strlen inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = '\0'; ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ size_t len = strlen (string1); ++ if (len == STRING_SIZE - 100) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strlen", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +new file mode 100644 +index 00000000..236ad951 +--- /dev/null ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -0,0 +1,52 @@ ++/* Test case for strncmp inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++char string2[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ memset (string2, 'a', STRING_SIZE - 1); ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ if (strncmp (string1, string2, STRING_SIZE) == 0) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strncmp", LOOP, prepare, function); ++} +diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c +new file mode 100644 +index 00000000..e32bfaf5 +--- /dev/null ++++ b/sysdeps/x86/tst-strrchr-rtm.c +@@ -0,0 +1,53 @@ ++/* Test case for strrchr inside a transactionally executing RTM region. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define LOOP 3000 ++#define STRING_SIZE 1024 ++char string1[STRING_SIZE]; ++ ++__attribute__ ((noinline, noclone)) ++static int ++prepare (void) ++{ ++ memset (string1, 'a', STRING_SIZE - 1); ++ string1[STRING_SIZE - 100] = 'c'; ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return EXIT_SUCCESS; ++ else ++ return EXIT_FAILURE; ++} ++ ++__attribute__ ((noinline, noclone)) ++static int ++function (void) ++{ ++ char *p = strrchr (string1, 'c'); ++ if (p == &string1[STRING_SIZE - 100]) ++ return 0; ++ else ++ return 1; ++} ++ ++static int ++do_test (void) ++{ ++ return do_test_1 ("strrchr", LOOP, prepare, function); ++} +-- +GitLab + diff --git a/glibc-RHEL-15696-19.patch b/glibc-RHEL-15696-19.patch new file mode 100644 index 0000000..0500875 --- /dev/null +++ b/glibc-RHEL-15696-19.patch @@ -0,0 +1,148 @@ +From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:44:18 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized +with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort +with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at +function exit. 
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++----- + sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------ + .../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++-------- + 4 files changed, 31 insertions(+), 24 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c1efeec0..d969a156 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512VL), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + 
__wmemset_avx512_unaligned)) + + #ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 6f3375cc..19795938 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +index bdc94c6c..98c5d406 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h +@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512) +- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_unaligned); +- + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) +- return OPTIMIZE (evex_unaligned); ++ { ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ return OPTIMIZE (avx512_unaligned); ++ ++ return OPTIMIZE (evex_unaligned); ++ } + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_unaligned_rtm); +diff --git 
a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 0783979c..22e7b187 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,22 +1,22 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define YMM0 ymm16 ++# define VEC0 zmm16 ++# define VEC(i) VEC##i + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastb %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastb d, %VEC0 + + # define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ + movq r, %rax; \ +- vpbroadcastd %xmm0, %xmm0; \ +- vpbroadcastq %xmm0, %zmm0 ++ vpbroadcastd d, %VEC0 + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s + +-- +GitLab + diff --git a/glibc-RHEL-15696-2.patch b/glibc-RHEL-15696-2.patch new file mode 100644 index 0000000..54f3ac3 --- /dev/null +++ b/glibc-RHEL-15696-2.patch @@ -0,0 +1,230 @@ +From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:25:56 -0800 +Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and without the fix.
+ + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise. + * sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and + tst-size_t-wmemcmp. + * sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise. +--- + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++- + sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++ + 6 files changed, 114 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 30f764c3..e3a35b89 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -58,9 +58,12 @@ + .section .text.avx,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx + # endif +- cmpq $VEC_SIZE, %rdx ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 8e164f2c..302900f5 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -42,13 +42,16 @@ + .section .text.sse4.1,"ax",@progbits + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx ++ shl $2, %RDX_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ mov %edx, %edx + # endif + pxor %xmm0, %xmm0 +- cmp $79, %rdx ++ cmp $79, %RDX_LP + ja L(79bytesormore) + # ifndef USE_AS_WMEMCMP +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(firstbyte) + # endif + add %rdx, %rsi +diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +index 6f76c641..69d030fc 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S +@@ -33,9 +33,12 @@ + atom_text_section + ENTRY (MEMCMP) + # ifdef USE_AS_WMEMCMP +- shl $2, %rdx +- test %rdx, %rdx ++ shl $2, %RDX_LP ++ test %RDX_LP, %RDX_LP + jz L(equal) ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + # endif + mov %rdx, %rcx + mov %rdi, %rdx +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 7d528889..ddec7f04 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr ++tests += tst-size_t-memchr tst-size_t-memcmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +new file mode 100644 +index 00000000..9bd6fdb4 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c +@@ -0,0 +1,76 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +new file mode 100644 +index 00000000..e8b5ffd0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c +@@ -0,0 +1,20 @@ ++/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memcmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-20.patch b/glibc-RHEL-15696-20.patch new file mode 100644 index 0000000..c63b3fb --- /dev/null +++ b/glibc-RHEL-15696-20.patch @@ -0,0 +1,164 @@ +From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sun, 7 Mar 2021 09:45:23 -0800 +Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions +Content-type: text/plain; charset=UTF-8 + +Update ifunc-memmove.h to select the function optimized with AVX512 +instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable +AVX512VL since VZEROUPPER isn't needed at function exit. 
+--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++--------- + sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++---- + .../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++-- + 3 files changed, 42 insertions(+), 19 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d969a156..fec384f6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memmove_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memmove_chk, + CPU_FEATURE_USABLE (AVX), +@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memmove_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memmove, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memmove_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3), + __memmove_ssse3_back) +@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + 
__memcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __memcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __memcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned) + IFUNC_IMPL_ADD (array, i, memcpy, 1, +@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_chk_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __mempcpy_chk, + CPU_FEATURE_USABLE (AVX), +@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX512F), + __mempcpy_avx512_no_vzeroupper) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, mempcpy, +- CPU_FEATURE_USABLE (AVX512F), ++ CPU_FEATURE_USABLE (AVX512VL), + __mempcpy_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, mempcpy, + CPU_FEATURE_USABLE (AVX), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h +index fa09b9fb..014e95c7 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h +@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && 
!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { +- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx512_no_vzeroupper); ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE (avx512_unaligned_erms); + +- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) +- return OPTIMIZE (avx512_unaligned_erms); ++ return OPTIMIZE (avx512_unaligned); ++ } + +- return OPTIMIZE (avx512_unaligned); ++ return OPTIMIZE (avx512_no_vzeroupper); + } + + if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +index aac1515c..848848ab 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S +@@ -1,11 +1,32 @@ + #if IS_IN (libc) + # define VEC_SIZE 64 +-# define VEC(i) zmm##i ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define YMM0 ymm16 ++# define YMM1 ymm17 ++# define VEC0 zmm16 ++# define VEC1 zmm17 ++# define VEC2 zmm18 ++# define VEC3 zmm19 ++# define VEC4 zmm20 ++# define VEC5 zmm21 ++# define VEC6 zmm22 ++# define VEC7 zmm23 ++# define VEC8 zmm24 ++# define VEC9 zmm25 ++# define VEC10 zmm26 ++# define VEC11 zmm27 ++# define VEC12 zmm28 ++# define VEC13 zmm29 ++# define VEC14 zmm30 ++# define VEC15 zmm31 ++# define VEC(i) VEC##i + # define VMOVNT vmovntdq + # define VMOVU vmovdqu64 + # define VMOVA vmovdqa64 ++# define VZEROUPPER + +-# define SECTION(p) p##.avx512 ++# define SECTION(p) p##.evex512 + # define MEMMOVE_SYMBOL(p,s) p##_avx512_##s + + # include "memmove-vec-unaligned-erms.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-21.patch b/glibc-RHEL-15696-21.patch new file mode 100644 index 0000000..319c08d --- /dev/null +++ b/glibc-RHEL-15696-21.patch @@ -0,0 +1,71 @@ +From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001 +From: 
Sunil K Pandey +Date: Thu, 1 Apr 2021 15:47:04 -0700 +Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +Fix some indentations of ifdef in file strlen-evex.S which are off by 1 +and confusing to read. +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++-------- + 1 file changed, 8 insertions(+), 8 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index cd022509..05838190 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -276,10 +276,10 @@ L(last_2x_vec): + .p2align 4 + L(first_vec_x0_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -293,10 +293,10 @@ L(first_vec_x0_check): + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -311,10 +311,10 @@ L(first_vec_x1_check): + .p2align 4 + L(first_vec_x2_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. */ + cmpq %rax, %rsi + jbe L(max) +@@ -329,10 +329,10 @@ L(first_vec_x2_check): + .p2align 4 + L(first_vec_x3_check): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ + sall $2, %eax +-# endif ++# endif + /* Check the end of data. 
*/ + cmpq %rax, %rsi + jbe L(max) +-- +GitLab + diff --git a/glibc-RHEL-15696-22.patch b/glibc-RHEL-15696-22.patch new file mode 100644 index 0000000..c20557b --- /dev/null +++ b/glibc-RHEL-15696-22.patch @@ -0,0 +1,51 @@ +From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 07:07:21 -0700 +Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex +Content-type: text/plain; charset=UTF-8 + +Since __strlen_evex and __strnlen_evex added by + +commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 +Author: H.J. Lu +Date: Fri Mar 5 06:24:52 2021 -0800 + + x86-64: Add ifunc-avx2.h functions with 256-bit EVEX + +use sarx: + +c4 e2 6a f7 c0 sarx %edx,%eax,%eax + +require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c. +ifunc-avx2.h already requires BMI2 for EVEX implementation. +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index fec384f6..cbfc1a5d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_evex) + IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2)) + +@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_evex) + IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2)) + +-- +GitLab + diff --git 
a/glibc-RHEL-15696-23.patch b/glibc-RHEL-15696-23.patch new file mode 100644 index 0000000..ffde3d7 --- /dev/null +++ b/glibc-RHEL-15696-23.patch @@ -0,0 +1,584 @@ +From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:01:58 -0400 +Subject: [PATCH] x86: Optimize memchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-avx2.S. The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +and saving a few instructions in the loop return path. test-memchr, +test-rawmemchr, and test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++----------- + 1 file changed, 247 insertions(+), 178 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index cf893e77..b377f22e 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -26,8 +26,22 @@ + + # ifdef USE_AS_WMEMCHR + # define VPCMPEQ vpcmpeqd ++# define VPBROADCAST vpbroadcastd ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb ++# define VPBROADCAST vpbroadcastb ++# define CHAR_SIZE 1 ++# endif ++ ++# ifdef USE_AS_RAWMEMCHR ++# define ERAW_PTR_REG ecx ++# define RRAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define ERAW_PTR_REG edi ++# define RRAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx + # endif + + # ifndef VZEROUPPER +@@ -39,6 +53,7 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) +@@ -47,295 +62,349 @@ ENTRY (MEMCHR) + test %RDX_LP, %RDX_LP + jz L(null) + # endif +- movl %edi, %ecx +- /* Broadcast CHAR to YMM0.
*/ +- vmovd %esi, %xmm0 + # ifdef USE_AS_WMEMCHR + shl $2, %RDX_LP +- vpbroadcastd %xmm0, %ymm0 + # else + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- vpbroadcastb %xmm0, %ymm0 + # endif ++ /* Broadcast CHAR to YMMMATCH. */ ++ vmovd %esi, %xmm0 ++ VPBROADCAST %xmm0, %ymm0 + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++ VPCMPEQ (%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $VEC_SIZE, %rdx ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. */ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax), %rax ++ cmovle %rcx, %rax ++ VZEROUPPER_RETURN + +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) ++L(null): ++ xorl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) +- + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is necessary ++ for computer return address if byte is found or adjusting length ++ if it is not and this is memchr. 
*/ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and ++ rdi for rawmemchr. */ ++ orq $(VEC_SIZE - 1), %ALGN_PTR_REG ++ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Calculate length until end of page (length checked for a ++ match). */ ++ leaq 1(%ALGN_PTR_REG), %rsi ++ subq %RRAW_PTR_REG, %rsi ++# endif + /* Remove the leading bytes. */ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax ++ sarxl %ERAW_PTR_REG, %eax, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) + # endif +- addq %rdi, %rax +- addq %rcx, %rax ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++ addq %RRAW_PTR_REG, %rax + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ incq %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- /* Check the end of data. */ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 2 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 4 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++# ifndef USE_AS_RAWMEMCHR ++L(cross_page_continue): ++ /* Align data to VEC_SIZE - 1. */ ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ orq $(VEC_SIZE - 1), %rdi ++ /* esi is for adjusting length to see if near the end. */ ++ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# else ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. */ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + + # ifndef USE_AS_RAWMEMCHR ++ /* Check if at last VEC_SIZE * 4 length. */ + subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ jbe L(last_4x_vec_or_less_cmpeq) ++ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust ++ length. */ ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx + addq %rcx, %rdx ++# else ++ /* Align data to VEC_SIZE * 4 - 1 for loop. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif + ++ /* Compare 4 * VEC at a time forward. 
*/ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 +- ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4 + vpor %ymm1, %ymm2, %ymm5 + vpor %ymm3, %ymm4, %ymm6 + vpor %ymm5, %ymm6, %ymm5 + +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ vpmovmskb %ymm5, %ecx + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx +- ja L(loop_4x_vec) ++ testl %ecx, %ecx ++ jnz L(loop_4x_vec_end) + +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) ++ subq $-(VEC_SIZE * 4), %rdi + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++ .p2align 4 ++L(last_4x_vec_or_less): ++ /* Check if first VEC contained match. */ + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(first_vec_x1_check) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++ /* If remaining length > VEC_SIZE * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jg L(last_4x_vec) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++L(last_2x_vec): ++ /* If remaining length < VEC_SIZE. */ ++ addl $VEC_SIZE, %edx ++ jle L(zero_end) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 ++ /* Check VEC2 and compare any match with remaining length. 
*/ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +- testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- xorl %eax, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ addq $(VEC_SIZE + 1), %rdi ++ addq %rdi, %rax ++L(zero_end): + VZEROUPPER_RETURN + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMPEQ (%rdi), %ymm0, %ymm1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ + vpmovmskb %ymm1, %eax + testl %eax, %eax ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- VZEROUPPER_RETURN ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0_check): +- tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ vpmovmskb %ymm3, %eax ++ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 2 - 1), %rdi ++# else ++ subq $-(VEC_SIZE * 2 + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN ++# ifndef USE_AS_RAWMEMCHR + + .p2align 4 + L(first_vec_x1_check): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++ /* Adjust length. */ ++ subl $-(VEC_SIZE * 4), %edx ++ /* Check if match within remaining length. */ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ incq %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++ .p2align 4 ++L(set_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +- /* Check the end of data. 
*/ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 4 - 1), %rdi ++# else ++ incq %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax ++# ifdef USE_AS_RAWMEMCHR ++ subq $(VEC_SIZE * 3 - 1), %rdi ++# else ++ subq $-(VEC_SIZE + 1), %rdi ++# endif + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_RAWMEMCHR + .p2align 4 +-L(zero): +- xorl %eax, %eax +- jmp L(return_vzeroupper) ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) + ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(VEC_SIZE * 2), %edx ++ jle L(last_2x_vec) + .p2align 4 +-L(null): +- xorl %eax, %eax +- ret +-# endif ++L(last_4x_vec): ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %eax +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- VZEROUPPER_RETURN ++ /* Create mask for possible matches within remaining length. */ ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx + +- .p2align 4 +-L(first_vec_x2): ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= VEC_SIZE * 3 (Note this is after ++ remaining length was found to be > VEC_SIZE * 2. */ ++ subl $VEC_SIZE, %edx ++ jbe L(zero_end2) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Shift remaining length mask for last VEC. 
*/ ++ shrq $32, %rcx ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ addq $(VEC_SIZE * 3 + 1), %rdi + addq %rdi, %rax ++L(zero_end2): + VZEROUPPER_RETURN + + .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- vpmovmskb %ymm2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax ++ subq $-(VEC_SIZE * 2 + 1), %rdi + addq %rdi, %rax + VZEROUPPER_RETURN ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-24.patch b/glibc-RHEL-15696-24.patch new file mode 100644 index 0000000..c4f24ff --- /dev/null +++ b/glibc-RHEL-15696-24.patch @@ -0,0 +1,388 @@ +From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 9 Jun 2021 16:25:32 -0400 +Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. + +The previous implementations of wmemchr in these files relied +on n * sizeof(wchar_t) not overflowing, which is not guaranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J.
Lu +--- + sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++------- + sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------ + 2 files changed, 98 insertions(+), 37 deletions(-) + +diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S +index cb320257..24f9a0c5 100644 +--- a/sysdeps/x86_64/memchr.S ++++ b/sysdeps/x86_64/memchr.S +@@ -21,9 +21,11 @@ + #ifdef USE_AS_WMEMCHR + # define MEMCHR wmemchr + # define PCMPEQ pcmpeqd ++# define CHAR_PER_VEC 4 + #else + # define MEMCHR memchr + # define PCMPEQ pcmpeqb ++# define CHAR_PER_VEC 16 + #endif + + /* fast SSE2 version with using pmaxub and 64 byte loop */ +@@ -33,15 +35,14 @@ ENTRY(MEMCHR) + movd %esi, %xmm1 + mov %edi, %ecx + ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++#endif + #ifdef USE_AS_WMEMCHR + test %RDX_LP, %RDX_LP + jz L(return_null) +- shl $2, %RDX_LP + #else +-# ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx +-# endif + punpcklbw %xmm1, %xmm1 + test %RDX_LP, %RDX_LP + jz L(return_null) +@@ -60,13 +61,16 @@ ENTRY(MEMCHR) + test %eax, %eax + + jnz L(matches_1) +- sub $16, %rdx ++ sub $CHAR_PER_VEC, %rdx + jbe L(return_null) + add $16, %rdi + and $15, %ecx + and $-16, %rdi ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + jmp L(loop_prolog) + +@@ -77,16 +81,21 @@ L(crosscache): + movdqa (%rdi), %xmm0 + + PCMPEQ %xmm1, %xmm0 +-/* Check if there is a match. */ ++ /* Check if there is a match. */ + pmovmskb %xmm0, %eax +-/* Remove the leading bytes. */ ++ /* Remove the leading bytes. */ + sar %cl, %eax + test %eax, %eax + je L(unaligned_no_match) +-/* Check which byte is a match. */ ++ /* Check which byte is a match. 
*/ + bsf %eax, %eax +- ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + add %rcx, %rax +@@ -94,15 +103,18 @@ L(crosscache): + + .p2align 4 + L(unaligned_no_match): +- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using ++ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using + "rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void + possible addition overflow. */ + neg %rcx + add $16, %rcx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + sub %rcx, %rdx + jbe L(return_null) + add $16, %rdi +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + .p2align 4 +@@ -135,7 +147,7 @@ L(loop_prolog): + test $0x3f, %rdi + jz L(align64_loop) + +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + + movdqa (%rdi), %xmm0 +@@ -167,11 +179,14 @@ L(loop_prolog): + mov %rdi, %rcx + and $-64, %rdi + and $63, %ecx ++#ifdef USE_AS_WMEMCHR ++ shr $2, %ecx ++#endif + add %rcx, %rdx + + .p2align 4 + L(align64_loop): +- sub $64, %rdx ++ sub $(CHAR_PER_VEC * 4), %rdx + jbe L(exit_loop) + movdqa (%rdi), %xmm0 + movdqa 16(%rdi), %xmm2 +@@ -218,7 +233,7 @@ L(align64_loop): + + .p2align 4 + L(exit_loop): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + jle L(exit_loop_32) + + movdqa (%rdi), %xmm0 +@@ -238,7 +253,7 @@ L(exit_loop): + pmovmskb %xmm3, %eax + test %eax, %eax + jnz L(matches32_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jle L(return_null) + + PCMPEQ 48(%rdi), %xmm1 +@@ -250,13 +265,13 @@ L(exit_loop): + + .p2align 4 + L(exit_loop_32): +- add $32, %edx ++ add $(CHAR_PER_VEC * 2), %edx + movdqa (%rdi), %xmm0 + PCMPEQ %xmm1, %xmm0 + pmovmskb %xmm0, %eax + test %eax, %eax + jnz L(matches_1) +- sub $16, %edx ++ sub $CHAR_PER_VEC, %edx + jbe L(return_null) + + PCMPEQ 16(%rdi), %xmm1 +@@ -293,7 +308,13 @@ L(matches32): + .p2align 4 + L(matches_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub 
%rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + add %rdi, %rax + ret +@@ -301,7 +322,13 @@ L(matches_1): + .p2align 4 + L(matches16_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 16(%rdi, %rax), %rax + ret +@@ -309,7 +336,13 @@ L(matches16_1): + .p2align 4 + L(matches32_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 32(%rdi, %rax), %rax + ret +@@ -317,7 +350,13 @@ L(matches32_1): + .p2align 4 + L(matches48_1): + bsf %eax, %eax ++#ifdef USE_AS_WMEMCHR ++ mov %eax, %esi ++ shr $2, %esi ++ sub %rsi, %rdx ++#else + sub %rax, %rdx ++#endif + jbe L(return_null) + lea 48(%rdi, %rax), %rax + ret +diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S +index b377f22e..16027abb 100644 +--- a/sysdeps/x86_64/multiarch/memchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memchr-avx2.S +@@ -54,21 +54,19 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +- test %RDX_LP, %RDX_LP +- jz L(null) +-# endif +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else + # ifdef __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %edx, %edx ++ /* Clear upper bits. */ ++ and %RDX_LP, %RDX_LP ++# else ++ test %RDX_LP, %RDX_LP + # endif ++ jz L(null) + # endif + /* Broadcast CHAR to YMMMATCH. */ + vmovd %esi, %xmm0 +@@ -84,7 +82,7 @@ ENTRY (MEMCHR) + vpmovmskb %ymm1, %eax + # ifndef USE_AS_RAWMEMCHR + /* If length < CHAR_PER_VEC handle special. */ +- cmpq $VEC_SIZE, %rdx ++ cmpq $CHAR_PER_VEC, %rdx + jbe L(first_vec_x0) + # endif + testl %eax, %eax +@@ -98,6 +96,10 @@ ENTRY (MEMCHR) + L(first_vec_x0): + /* Check if first match was before length. 
*/ + tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + xorl %ecx, %ecx + cmpl %eax, %edx + leaq (%rdi, %rax), %rax +@@ -110,12 +112,12 @@ L(null): + # endif + .p2align 4 + L(cross_page_boundary): +- /* Save pointer before aligning as its original value is necessary +- for computer return address if byte is found or adjusting length +- if it is not and this is memchr. */ ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ + movq %rdi, %rcx +- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and +- rdi for rawmemchr. */ ++ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr ++ and rdi for rawmemchr. */ + orq $(VEC_SIZE - 1), %ALGN_PTR_REG + VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax +@@ -124,6 +126,10 @@ L(cross_page_boundary): + match). */ + leaq 1(%ALGN_PTR_REG), %rsi + subq %RRAW_PTR_REG, %rsi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %esi ++# endif + # endif + /* Remove the leading bytes. */ + sarxl %ERAW_PTR_REG, %eax, %eax +@@ -181,6 +187,10 @@ L(cross_page_continue): + orq $(VEC_SIZE - 1), %rdi + /* esi is for adjusting length to see if near the end. */ + leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif + # else + orq $(VEC_SIZE - 1), %rdi + L(cross_page_continue): +@@ -213,7 +223,7 @@ L(cross_page_continue): + + # ifndef USE_AS_RAWMEMCHR + /* Check if at last VEC_SIZE * 4 length. */ +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) + /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust + length. 
*/ +@@ -221,6 +231,10 @@ L(cross_page_continue): + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + addq %rcx, %rdx + # else + /* Align data to VEC_SIZE * 4 - 1 for loop. */ +@@ -250,15 +264,19 @@ L(loop_4x_vec): + + subq $-(VEC_SIZE * 4), %rdi + +- subq $(VEC_SIZE * 4), %rdx ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + +- /* Fall through into less than 4 remaining vectors of length case. +- */ ++ /* Fall through into less than 4 remaining vectors of length ++ case. */ + VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax + .p2align 4 + L(last_4x_vec_or_less): ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + /* Check if first VEC contained match. */ + testl %eax, %eax + jnz L(first_vec_x1_check) +@@ -355,6 +373,10 @@ L(last_vec_x2_return): + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1 + vpmovmskb %ymm1, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %edx ++# endif + subq $-(VEC_SIZE * 4), %rdi + /* Check first VEC regardless. */ + testl %eax, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-25.patch b/glibc-RHEL-15696-25.patch new file mode 100644 index 0000000..e0ed8ea --- /dev/null +++ b/glibc-RHEL-15696-25.patch @@ -0,0 +1,767 @@ +From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:07 -0400 +Subject: [PATCH] x86: Optimize strlen-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-avx2.S. The optimizations are +mostly small things but they add up to roughly 10-30% performance +improvement for strlen. The results for strnlen are a bit more +ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen +are all passing.
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +- + sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++-------- + 2 files changed, 334 insertions(+), 214 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index cbfc1a5d..f1a6460a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strlen.c. */ + IFUNC_IMPL (i, name, strlen, + IFUNC_IMPL_ADD (array, i, strlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strlen_avx2) + IFUNC_IMPL_ADD (array, i, strlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strlen, +@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strnlen.c. */ + IFUNC_IMPL (i, name, strnlen, + IFUNC_IMPL_ADD (array, i, strnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strnlen_avx2) + IFUNC_IMPL_ADD (array, i, strnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strnlen, +@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcslen.c. 
*/ + IFUNC_IMPL (i, name, wcslen, + IFUNC_IMPL_ADD (array, i, wcslen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcslen_avx2) + IFUNC_IMPL_ADD (array, i, wcslen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcslen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcslen, +@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ + IFUNC_IMPL (i, name, wcsnlen, + IFUNC_IMPL_ADD (array, i, wcsnlen, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcsnlen_avx2) + IFUNC_IMPL_ADD (array, i, wcsnlen, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcsnlen_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcsnlen, +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index 82826e10..be8a5db5 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -27,9 +27,11 @@ + # ifdef USE_AS_WCSLEN + # define VPCMPEQ vpcmpeqd + # define VPMINU vpminud ++# define CHAR_SIZE 4 + # else + # define VPCMPEQ vpcmpeqb + # define VPMINU vpminub ++# define CHAR_SIZE 1 + # endif + + # ifndef VZEROUPPER +@@ -41,349 +43,459 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) ++ /* Store max len in R8_LP before adjusting if using WCSLEN. */ ++ mov %RSI_LP, %R8_LP + # ifdef USE_AS_WCSLEN + shl $2, %RSI_LP + # elif defined __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif +- mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx ++ movl %edi, %eax + movq %rdi, %rdx + vpxor %xmm0, %xmm0, %xmm0 +- ++ /* Clear high bits from edi. 
Only keeping bits relevant to page ++ cross check. */ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- ++ VPCMPEQ (%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < VEC_SIZE handle special. */ ++ cmpq $VEC_SIZE, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* If empty continue to aligned_more. Otherwise return bit ++ position of first match. */ ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax +- testl %eax, %eax +- jz L(aligned_more) ++L(first_vec_x1): + tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. 
*/ +- cmpq %rax, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 4 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ incl %edi ++ addl %edi, %eax + # endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ shrl $2, %eax + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ VZEROUPPER_RETURN + + .p2align 4 +-L(aligned_more): ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 3 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ subl $(VEC_SIZE * 2 + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 2 + 1), %edi ++ addl %edi, %eax ++# endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. 
++ */ ++ subl $(VEC_SIZE + 1), %ecx ++ addl %ecx, %eax ++# else ++ subl %edx, %edi ++ addl $(VEC_SIZE * 3 + 1), %edi ++ addl %edi, %eax + # endif ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif ++ VZEROUPPER_RETURN + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. This is the same number of ++ instructions as using andq with -VEC_SIZE but saves 4 bytes of ++ code on the x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++# ifdef USE_AS_STRNLEN ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because ++ it simplies the logic in last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx ++ subq %rdx, %rcx ++# endif ++ /* Load first VEC regardless. */ ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. */ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. 
*/ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Before adjusting length check if at last VEC_SIZE * 4. */ ++ cmpq $(VEC_SIZE * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ incq %rdi ++ movl %edi, %ecx ++ orq $(VEC_SIZE * 4 - 1), %rdi ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* Readjust length. */ + addq %rcx, %rsi ++# else ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + # endif +- ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm1 +- vmovdqa VEC_SIZE(%rdi), %ymm2 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4 +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 +- VPMINU %ymm5, %ymm6, %ymm5 +- +- VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ + subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) +- +-L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. 
++ */ ++ vmovdqa 1(%rdi), %ymm1 ++ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 ++ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4 ++ ++ VPMINU %ymm2, %ymm4, %ymm5 ++ VPCMPEQ %ymm5, %ymm0, %ymm5 ++ vpmovmskb %ymm5, %ecx + +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) + +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm1, %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ subq %rdx, %rdi + testl %eax, %eax ++ jnz L(last_vec_return_x0) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) +- +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm0, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax +- +- jnz L(first_vec_x3_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN ++ jnz L(last_vec_return_x1) ++ ++ /* Combine last 2 VEC. */ ++ VPCMPEQ %ymm3, %ymm0, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used if ++ the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2 - 1), %rdi ++ addq %rdi, %rax ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++ ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi +- VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ subq $-(VEC_SIZE * 4), %rdi ++L(last_4x_vec_or_less_cmpeq): ++ VPCMPEQ 1(%rdi), %ymm0, %ymm1 ++L(last_4x_vec_or_less): + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. 
This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(VEC_SIZE * 2), %esi ++ jnz L(last_4x_vec) + +- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ /* length may have been negative or positive by an offset of ++ VEC_SIZE * 4 depending on where this was called from. This fixes ++ that. */ ++ andl $(VEC_SIZE * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- VZEROUPPER_RETURN ++ jnz L(last_vec_x1_check) + +- .p2align 4 +-L(first_vec_x0_check): ++ subl $VEC_SIZE, %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN ++# endif + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_return_x0): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax ++ subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_return_x1): + tzcntl %eax, %eax +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax ++ subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + ++# ifdef USE_AS_STRNLEN + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x1_check): ++ + tzcntl %eax, %eax + /* Check the end of data. 
*/ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 + L(max): + movq %r8, %rax ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ /* Normalize length. */ ++ andl $(VEC_SIZE * 4 - 1), %esi ++ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x3) ++ ++ subl $(VEC_SIZE * 3), %esi ++ jb L(max) ++ ++ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 3 + 1), %eax ++ addq %rdi, %rax + # ifdef USE_AS_WCSLEN + shrq $2, %rax + # endif + VZEROUPPER_RETURN + +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif + + .p2align 4 +-L(first_vec_x0): ++L(last_vec_x1): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax ++ subq %rdx, %rdi ++ incl %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(last_vec_x2): ++ /* essentially duplicates of first_vec_x1 but use 64 bit ++ instructions. */ + tzcntl %eax, %eax +- addq $VEC_SIZE, %rax ++ subq %rdx, %rdi ++ addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(last_vec_x3): + tzcntl %eax, %eax +- addq $(VEC_SIZE * 2), %rax ++ subl $(VEC_SIZE * 2), %esi ++ /* Check the end of data. 
*/ ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi ++ addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN ++# ifdef USE_AS_WCSLEN + shrq $2, %rax +-# endif ++# endif ++ VZEROUPPER_RETURN ++L(max_end): ++ movq %r8, %rax + VZEROUPPER_RETURN ++# endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(4x_vec_end): +- VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++L(cross_page_boundary): ++ /* Align data to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod rdx. */ ++ sarxl %edx, %eax, %eax ++# ifdef USE_AS_STRNLEN + testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax ++ jnz L(cross_page_less_vec) ++ leaq 1(%rdi), %rcx ++ subq %rdx, %rcx ++ /* Check length. 
*/ ++ cmpq %rsi, %rcx ++ jb L(cross_page_continue) ++ movq %r8, %rax ++# else + testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMPEQ %ymm4, %ymm0, %ymm4 +- vpmovmskb %ymm4, %eax +-L(first_vec_x3): ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + # endif ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++# ifdef USE_AS_STRNLEN ++ .p2align 4 ++L(cross_page_less_vec): ++ tzcntl %eax, %eax ++ cmpq %rax, %rsi ++ cmovb %esi, %eax ++# ifdef USE_AS_WCSLEN ++ shrl $2, %eax ++# endif + VZEROUPPER_RETURN ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-26.patch b/glibc-RHEL-15696-26.patch new file mode 100644 index 0000000..d46fe6e --- /dev/null +++ b/glibc-RHEL-15696-26.patch @@ -0,0 +1,701 @@ +From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 3 May 2021 03:03:19 -0400 +Subject: [PATCH] x86: Optimize memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memchr-evex.S. The optimizations include +replacing some branches with cmovcc, avoiding some branches entirely +in the less_4x_vec case, making the page cross logic less strict, +saving some ALU in the alignment process, and most importantly +increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and +test-wmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++---------- + 1 file changed, 322 insertions(+), 225 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 6dd5d67b..81d5cd64 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -26,14 +26,28 @@ + + # ifdef USE_AS_WMEMCHR + # define VPBROADCAST vpbroadcastd +-# define VPCMP vpcmpd +-# define SHIFT_REG r8d ++# define VPMINU vpminud ++# define VPCMP vpcmpd ++# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb +-# define VPCMP vpcmpb +-# define SHIFT_REG ecx ++# define VPMINU vpminub ++# define VPCMP vpcmpb ++# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 + # endif + ++# ifdef USE_AS_RAWMEMCHR ++# define RAW_PTR_REG rcx ++# define ALGN_PTR_REG rdi ++# else ++# define RAW_PTR_REG rdi ++# define ALGN_PTR_REG rcx ++# endif ++ ++# define XMMZERO xmm23 ++# define YMMZERO ymm23 + # define XMMMATCH xmm16 + # define YMMMATCH ymm16 + # define YMM1 ymm17 +@@ -44,6 +58,8 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++# define PAGE_SIZE 4096 + + .section .text.evex,"ax",@progbits + ENTRY (MEMCHR) +@@ -51,11 +67,7 @@ ENTRY (MEMCHR) + /* Check for zero length. */ + test %RDX_LP, %RDX_LP + jz L(zero) +-# endif +- movl %edi, %ecx +-# ifdef USE_AS_WMEMCHR +- shl $2, %RDX_LP +-# else ++ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -64,318 +76,403 @@ ENTRY (MEMCHR) + /* Broadcast CHAR to YMMMATCH. */ + VPBROADCAST %esi, %YMMMATCH + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. 
*/ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- ++ VPCMP $0, (%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + # ifndef USE_AS_RAWMEMCHR +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rdx +- jbe L(zero) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else +- jnz L(first_vec_x0) ++ addq %rdi, %rax + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ ret + + # ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ +- addq %rcx, %rdx +- +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif +- jmp L(more_4x_vec) ++L(zero): ++ xorl %eax, %eax ++ ret + ++ .p2align 5 ++L(first_vec_x0): ++ /* Check if first match was before length. */ ++ tzcntl %eax, %eax ++ xorl %ecx, %ecx ++ cmpl %eax, %edx ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ cmovle %rcx, %rax ++ ret ++# else ++ /* NB: first_vec_x0 is 17 bytes which will leave ++ cross_page_boundary (which is relatively cold) close enough ++ to ideal alignment. So only realign L(cross_page_boundary) if ++ rawmemchr. */ + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx ++# endif ++L(cross_page_boundary): ++ /* Save pointer before aligning as its original value is ++ necessary for computer return address if byte is found or ++ adjusting length if it is not and this is memchr. */ ++ movq %rdi, %rcx ++ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi ++ for rawmemchr. 
*/ ++ andq $-VEC_SIZE, %ALGN_PTR_REG ++ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0 ++ kmovd %k0, %r8d + # ifdef USE_AS_WMEMCHR +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %eax ++# endif ++# ifndef USE_AS_RAWMEMCHR ++ movl $(PAGE_SIZE / CHAR_SIZE), %esi ++ subl %eax, %esi + # endif +- andq $-VEC_SIZE, %rdi +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) +- tzcntl %eax, %eax + # ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++ andl $(CHAR_PER_VEC - 1), %eax + # endif ++ /* Remove the leading bytes. */ ++ sarxl %eax, %r8d, %eax + # ifndef USE_AS_RAWMEMCHR + /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) ++ cmpq %rsi, %rdx ++ jbe L(first_vec_x0) ++# endif ++ testl %eax, %eax ++ jz L(cross_page_continue) ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax ++# else ++ addq %RAW_PTR_REG, %rax + # endif +- addq %rdi, %rax +- addq %rcx, %rax + ret + + .p2align 4 +-L(aligned_more): +-# ifndef USE_AS_RAWMEMCHR +- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)" +- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition +- overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Check the end of data. 
*/ +- subq %rcx, %rdx +- jbe L(zero) +-# endif ++ .p2align 4 ++L(first_vec_x2): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- addq $VEC_SIZE, %rdi ++ .p2align 4 ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-# ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Align data to VEC_SIZE. */ ++L(cross_page_continue): ++ xorl %ecx, %ecx ++ subl %edi, %ecx ++ andq $-VEC_SIZE, %rdi ++ /* esi is for adjusting length to see if near the end. */ ++ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi ++# ifdef USE_AS_WMEMCHR ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %esi ++# endif ++# else ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++# endif ++ /* Load first VEC regardless. */ ++ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++# ifndef USE_AS_RAWMEMCHR ++ /* Adjust length. If near end handle specially. 
*/ ++ subq %rsi, %rdx ++ jbe L(last_4x_vec_or_less) ++# endif + testl %eax, %eax + jnz L(first_vec_x1) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ + + # ifndef USE_AS_RAWMEMCHR +- subq $(VEC_SIZE * 4), %rdx +- jbe L(last_4x_vec_or_less) +-# endif ++ /* Check if at last CHAR_PER_VEC * 4 length. */ ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_4x_vec_or_less_cmpeq) ++ addq $VEC_SIZE, %rdi + +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx ++ /* Align data to VEC_SIZE * 4 for the loop and readjust length. ++ */ ++# ifdef USE_AS_WMEMCHR ++ movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- +-# ifndef USE_AS_RAWMEMCHR +- /* Adjust length. */ ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx + addq %rcx, %rdx ++# else ++ addq %rdi, %rdx ++ andq $-(4 * VEC_SIZE), %rdi ++ subq %rdi, %rdx ++# endif ++# else ++ addq $VEC_SIZE, %rdi ++ andq $-(4 * VEC_SIZE), %rdi + # endif + ++ vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++ ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2 +- kord %k1, %k2, %k5 +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3 +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4 +- +- kord %k3, %k4, %k6 +- kortestd %k5, %k6 +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- ++ /* It would be possible to save some instructions using 4x VPCMP ++ but bottleneck on port 5 makes it not woth it. */ ++ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 ++ /* xor will set bytes match esi to zero. */ ++ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2 ++ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 ++ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 ++ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ ++ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR +- jmp L(loop_4x_vec) ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k2, %k3 ++ jz L(loop_4x_vec) + # else +- subq $(VEC_SIZE * 4), %rdx ++ kortestd %k2, %k3 ++ jnz L(loop_4x_vec_end) ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ ++ subq $(CHAR_PER_VEC * 4), %rdx + ja L(loop_4x_vec) + ++ /* Fall through into less than 4 remaining vectors of length case. ++ */ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ addq $(VEC_SIZE * 3), %rdi ++ .p2align 4 + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %edx +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ /* Check if first VEC contained match. */ + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(first_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* If remaining length > CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jg L(last_4x_vec) + +- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax ++L(last_2x_vec): ++ /* If remaining length < CHAR_PER_VEC. 
*/ ++ addl $CHAR_PER_VEC, %edx ++ jle L(zero_end) + +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ /* Check VEC2 and compare any match with remaining length. */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end): ++ ret + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax +- testl %eax, %eax + +- jnz L(first_vec_x3_check) ++ .p2align 4 ++L(first_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Adjust length. */ ++ subl $-(CHAR_PER_VEC * 4), %edx ++ /* Check if match within remaining length. */ ++ cmpl %eax, %edx ++ jbe L(set_zero_end) ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++L(set_zero_end): + xorl %eax, %eax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %edx +- VPCMP $0, (%rdi), %YMMMATCH, %k1 ++L(loop_4x_vec_end): ++# endif ++ /* rawmemchr will fall through into this if match was found in ++ loop. */ ++ ++ /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +- testl %eax, %eax ++# ifdef USE_AS_WMEMCHR ++ subl $((1 << CHAR_PER_VEC) - 1), %eax ++# else ++ incl %eax ++# endif ++ jnz L(last_vec_x1_return) + +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %edx +- jle L(zero) ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2_return) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1 +- kmovd %k1, %eax ++ kmovd %k2, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- xorl %eax, %eax +- ret ++ jnz L(last_vec_x3_return) + +- .p2align 4 +-L(first_vec_x0_check): ++ kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $VEC_SIZE, %rax ++# ifdef USE_AS_RAWMEMCHR ++# ifdef USE_AS_WMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else + addq %rdi, %rax +- ret +- +- .p2align 4 +-L(first_vec_x2_check): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# endif ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. */ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax ++# else ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax + # endif +- /* Check the end of data. 
*/ +- cmpq %rax, %rdx +- jbe L(zero) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax + ret + + .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(first_vec_x0): ++L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++# ifdef USE_AS_RAWMEMCHR ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax ++ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ ++ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax + # endif + ret + ++ ++# ifndef USE_AS_RAWMEMCHR ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check first VEC regardless. */ ++ testl %eax, %eax ++ jnz L(first_vec_x1_check) ++ ++ /* If remaining length <= CHAR_PER_VEC * 2. */ ++ addl $(CHAR_PER_VEC * 2), %edx ++ jle L(last_2x_vec) ++ + .p2align 4 +-L(first_vec_x1): ++L(last_4x_vec): ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(last_vec_x2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Create mask for possible matches within remaining length. */ ++# ifdef USE_AS_WMEMCHR ++ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx ++ bzhil %edx, %ecx, %ecx ++# else ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++# endif ++ /* Test matches in data against length match. */ ++ andl %ecx, %eax ++ jnz L(last_vec_x3) ++ ++ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after ++ remaining length was found to be > CHAR_PER_VEC * 2. */ ++ subl $CHAR_PER_VEC, %edx ++ jbe L(zero_end2) ++ ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ kmovd %k0, %eax ++ /* Shift remaining length mask for last VEC. 
*/ ++# ifdef USE_AS_WMEMCHR ++ shrl $CHAR_PER_VEC, %ecx ++# else ++ shrq $CHAR_PER_VEC, %rcx ++# endif ++ andl %ecx, %eax ++ jz L(zero_end2) + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++L(zero_end2): + ret + +- .p2align 4 +-L(first_vec_x2): ++L(last_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 +-L(4x_vec_end): +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- testl %eax, %eax +-L(first_vec_x3): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +-# endif ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax + ret ++# endif + + END (MEMCHR) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-27.patch b/glibc-RHEL-15696-27.patch new file mode 100644 index 0000000..9dcf16d --- /dev/null +++ b/glibc-RHEL-15696-27.patch @@ -0,0 +1,30 @@ +From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001 +From: Alice Xu +Date: Fri, 7 May 2021 19:03:21 -0700 +Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S +Content-type: text/plain; charset=UTF-8 + +An unknown vector operation occurred in commit 2a76821c308. Fixed it +by using "ymm{k1}{z}" but not "ymm {k1} {z}". + +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memchr-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index 81d5cd64..f3fdad4f 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -271,7 +271,7 @@ L(loop_4x_vec): + vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3 + VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3 + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ +- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z} ++ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi +-- +GitLab + diff --git a/glibc-RHEL-15696-28.patch b/glibc-RHEL-15696-28.patch new file mode 100644 index 0000000..3063d4d --- /dev/null +++ b/glibc-RHEL-15696-28.patch @@ -0,0 +1,566 @@ +From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 22 Jun 2021 20:42:10 -0700 +Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S +Content-type: text/plain; charset=UTF-8 + +Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1 +version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S +and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants. +This also removes the unused symbols, __GI___strlen_sse2 and +__GI___wcsnlen_sse4_1. 
+--- + sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +- + sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +- + sysdeps/x86_64/strlen.S | 243 +------------------- + 4 files changed, 262 insertions(+), 242 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S + +Conflicts: + sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S + (Copyright dates, URL) + +diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S +index 7bc57b8d..449c8a7f 100644 +--- a/sysdeps/x86_64/multiarch/strlen-sse2.S ++++ b/sysdeps/x86_64/multiarch/strlen-sse2.S +@@ -20,4 +20,4 @@ + # define strlen __strlen_sse2 + #endif + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +new file mode 100644 +index 00000000..8f660bb9 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -0,0 +1,257 @@ ++/* SSE2 version of strlen and SSE4.1 version of wcslen. ++ Copyright (C) 2012-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#ifdef AS_WCSLEN ++# define PMINU pminud ++# define PCMPEQ pcmpeqd ++# define SHIFT_RETURN shrq $2, %rax ++#else ++# define PMINU pminub ++# define PCMPEQ pcmpeqb ++# define SHIFT_RETURN ++#endif ++ ++/* Long lived register in strlen(s), strnlen(s, n) are: ++ ++ %xmm3 - zero ++ %rdi - s ++ %r10 (s+n) & (~(64-1)) ++ %r11 s+n ++*/ ++ ++ ++.text ++ENTRY(strlen) ++ ++/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ ++#define FIND_ZERO \ ++ PCMPEQ (%rax), %xmm0; \ ++ PCMPEQ 16(%rax), %xmm1; \ ++ PCMPEQ 32(%rax), %xmm2; \ ++ PCMPEQ 48(%rax), %xmm3; \ ++ pmovmskb %xmm0, %esi; \ ++ pmovmskb %xmm1, %edx; \ ++ pmovmskb %xmm2, %r8d; \ ++ pmovmskb %xmm3, %ecx; \ ++ salq $16, %rdx; \ ++ salq $16, %rcx; \ ++ orq %rsi, %rdx; \ ++ orq %r8, %rcx; \ ++ salq $32, %rcx; \ ++ orq %rcx, %rdx; ++ ++#ifdef AS_STRNLEN ++/* Do not read anything when n==0. */ ++ test %RSI_LP, %RSI_LP ++ jne L(n_nonzero) ++ xor %rax, %rax ++ ret ++L(n_nonzero): ++# ifdef AS_WCSLEN ++ shl $2, %RSI_LP ++# endif ++ ++/* Initialize long lived registers. */ ++ ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP ++#endif ++ ++ pxor %xmm0, %xmm0 ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++ movq %rdi, %rax ++ movq %rdi, %rcx ++ andq $4095, %rcx ++/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ ++ cmpq $4047, %rcx ++/* We cannot unify this branching as it would be ~6 cycles slower. */ ++ ja L(cross_page) ++ ++#ifdef AS_STRNLEN ++/* Test if end is among first 64 bytes. */ ++# define STRNLEN_PROLOG \ ++ mov %r11, %rsi; \ ++ subq %rax, %rsi; \ ++ andq $-64, %rax; \ ++ testq $-64, %rsi; \ ++ je L(strnlen_ret) ++#else ++# define STRNLEN_PROLOG andq $-64, %rax; ++#endif ++ ++/* Ignore bits in mask that come before start of string. 
*/ ++#define PROLOG(lab) \ ++ movq %rdi, %rcx; \ ++ xorq %rax, %rcx; \ ++ STRNLEN_PROLOG; \ ++ sarq %cl, %rdx; \ ++ test %rdx, %rdx; \ ++ je L(lab); \ ++ bsfq %rdx, %rax; \ ++ SHIFT_RETURN; \ ++ ret ++ ++#ifdef AS_STRNLEN ++ andq $-16, %rax ++ FIND_ZERO ++#else ++ /* Test first 16 bytes unaligned. */ ++ movdqu (%rax), %xmm4 ++ PCMPEQ %xmm0, %xmm4 ++ pmovmskb %xmm4, %edx ++ test %edx, %edx ++ je L(next48_bytes) ++ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ ++ SHIFT_RETURN ++ ret ++ ++L(next48_bytes): ++/* Same as FIND_ZERO except we do not check first 16 bytes. */ ++ andq $-16, %rax ++ PCMPEQ 16(%rax), %xmm1 ++ PCMPEQ 32(%rax), %xmm2 ++ PCMPEQ 48(%rax), %xmm3 ++ pmovmskb %xmm1, %edx ++ pmovmskb %xmm2, %r8d ++ pmovmskb %xmm3, %ecx ++ salq $16, %rdx ++ salq $16, %rcx ++ orq %r8, %rcx ++ salq $32, %rcx ++ orq %rcx, %rdx ++#endif ++ ++ /* When no zero byte is found xmm1-3 are zero so we do not have to ++ zero them. */ ++ PROLOG(loop) ++ ++ .p2align 4 ++L(cross_page): ++ andq $-64, %rax ++ FIND_ZERO ++ PROLOG(loop_init) ++ ++#ifdef AS_STRNLEN ++/* We must do this check to correctly handle strnlen (s, -1). */ ++L(strnlen_ret): ++ bts %rsi, %rdx ++ sarq %cl, %rdx ++ test %rdx, %rdx ++ je L(loop_init) ++ bsfq %rdx, %rax ++ SHIFT_RETURN ++ ret ++#endif ++ .p2align 4 ++L(loop_init): ++ pxor %xmm1, %xmm1 ++ pxor %xmm2, %xmm2 ++ pxor %xmm3, %xmm3 ++#ifdef AS_STRNLEN ++ .p2align 4 ++L(loop): ++ ++ addq $64, %rax ++ cmpq %rax, %r10 ++ je L(exit_end) ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit_end): ++ cmp %rax, %r11 ++ je L(first) /* Do not read when end is at page boundary. 
*/ ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++L(first): ++ bts %r11, %rdx ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++ .p2align 4 ++L(exit): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#else ++ ++ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ ++ .p2align 4 ++L(loop): ++ ++ movdqa 64(%rax), %xmm0 ++ PMINU 80(%rax), %xmm0 ++ PMINU 96(%rax), %xmm0 ++ PMINU 112(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit64) ++ ++ subq $-128, %rax ++ ++ movdqa (%rax), %xmm0 ++ PMINU 16(%rax), %xmm0 ++ PMINU 32(%rax), %xmm0 ++ PMINU 48(%rax), %xmm0 ++ PCMPEQ %xmm3, %xmm0 ++ pmovmskb %xmm0, %edx ++ testl %edx, %edx ++ jne L(exit0) ++ jmp L(loop) ++ ++ .p2align 4 ++L(exit64): ++ addq $64, %rax ++L(exit0): ++ pxor %xmm0, %xmm0 ++ FIND_ZERO ++ ++ bsfq %rdx, %rdx ++ addq %rdx, %rax ++ subq %rdi, %rax ++ SHIFT_RETURN ++ ret ++ ++#endif ++ ++END(strlen) +diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +index a8cab0cb..5fa51fe0 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S ++++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S +@@ -2,4 +2,4 @@ + #define AS_STRNLEN + #define strlen __wcsnlen_sse4_1 + +-#include "../strlen.S" ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index f845f3d4..ad047d84 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -1,5 +1,5 @@ +-/* SSE2 version of strlen/wcslen. +- Copyright (C) 2012-2018 Free Software Foundation, Inc. ++/* SSE2 version of strlen. ++ Copyright (C) 2021 Free Software Foundation, Inc. + This file is part of the GNU C Library. + + The GNU C Library is free software; you can redistribute it and/or +@@ -16,243 +16,6 @@ + License along with the GNU C Library; if not, see + . 
*/ + +-#include ++#include "multiarch/strlen-vec.S" + +-#ifdef AS_WCSLEN +-# define PMINU pminud +-# define PCMPEQ pcmpeqd +-# define SHIFT_RETURN shrq $2, %rax +-#else +-# define PMINU pminub +-# define PCMPEQ pcmpeqb +-# define SHIFT_RETURN +-#endif +- +-/* Long lived register in strlen(s), strnlen(s, n) are: +- +- %xmm3 - zero +- %rdi - s +- %r10 (s+n) & (~(64-1)) +- %r11 s+n +-*/ +- +- +-.text +-ENTRY(strlen) +- +-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */ +-#define FIND_ZERO \ +- PCMPEQ (%rax), %xmm0; \ +- PCMPEQ 16(%rax), %xmm1; \ +- PCMPEQ 32(%rax), %xmm2; \ +- PCMPEQ 48(%rax), %xmm3; \ +- pmovmskb %xmm0, %esi; \ +- pmovmskb %xmm1, %edx; \ +- pmovmskb %xmm2, %r8d; \ +- pmovmskb %xmm3, %ecx; \ +- salq $16, %rdx; \ +- salq $16, %rcx; \ +- orq %rsi, %rdx; \ +- orq %r8, %rcx; \ +- salq $32, %rcx; \ +- orq %rcx, %rdx; +- +-#ifdef AS_STRNLEN +-/* Do not read anything when n==0. */ +- test %RSI_LP, %RSI_LP +- jne L(n_nonzero) +- xor %rax, %rax +- ret +-L(n_nonzero): +-# ifdef AS_WCSLEN +- shl $2, %RSI_LP +-# endif +- +-/* Initialize long lived registers. */ +- +- add %RDI_LP, %RSI_LP +- mov %RSI_LP, %R10_LP +- and $-64, %R10_LP +- mov %RSI_LP, %R11_LP +-#endif +- +- pxor %xmm0, %xmm0 +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +- movq %rdi, %rax +- movq %rdi, %rcx +- andq $4095, %rcx +-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */ +- cmpq $4047, %rcx +-/* We cannot unify this branching as it would be ~6 cycles slower. */ +- ja L(cross_page) +- +-#ifdef AS_STRNLEN +-/* Test if end is among first 64 bytes. */ +-# define STRNLEN_PROLOG \ +- mov %r11, %rsi; \ +- subq %rax, %rsi; \ +- andq $-64, %rax; \ +- testq $-64, %rsi; \ +- je L(strnlen_ret) +-#else +-# define STRNLEN_PROLOG andq $-64, %rax; +-#endif +- +-/* Ignore bits in mask that come before start of string. 
*/ +-#define PROLOG(lab) \ +- movq %rdi, %rcx; \ +- xorq %rax, %rcx; \ +- STRNLEN_PROLOG; \ +- sarq %cl, %rdx; \ +- test %rdx, %rdx; \ +- je L(lab); \ +- bsfq %rdx, %rax; \ +- SHIFT_RETURN; \ +- ret +- +-#ifdef AS_STRNLEN +- andq $-16, %rax +- FIND_ZERO +-#else +- /* Test first 16 bytes unaligned. */ +- movdqu (%rax), %xmm4 +- PCMPEQ %xmm0, %xmm4 +- pmovmskb %xmm4, %edx +- test %edx, %edx +- je L(next48_bytes) +- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */ +- SHIFT_RETURN +- ret +- +-L(next48_bytes): +-/* Same as FIND_ZERO except we do not check first 16 bytes. */ +- andq $-16, %rax +- PCMPEQ 16(%rax), %xmm1 +- PCMPEQ 32(%rax), %xmm2 +- PCMPEQ 48(%rax), %xmm3 +- pmovmskb %xmm1, %edx +- pmovmskb %xmm2, %r8d +- pmovmskb %xmm3, %ecx +- salq $16, %rdx +- salq $16, %rcx +- orq %r8, %rcx +- salq $32, %rcx +- orq %rcx, %rdx +-#endif +- +- /* When no zero byte is found xmm1-3 are zero so we do not have to +- zero them. */ +- PROLOG(loop) +- +- .p2align 4 +-L(cross_page): +- andq $-64, %rax +- FIND_ZERO +- PROLOG(loop_init) +- +-#ifdef AS_STRNLEN +-/* We must do this check to correctly handle strnlen (s, -1). */ +-L(strnlen_ret): +- bts %rsi, %rdx +- sarq %cl, %rdx +- test %rdx, %rdx +- je L(loop_init) +- bsfq %rdx, %rax +- SHIFT_RETURN +- ret +-#endif +- .p2align 4 +-L(loop_init): +- pxor %xmm1, %xmm1 +- pxor %xmm2, %xmm2 +- pxor %xmm3, %xmm3 +-#ifdef AS_STRNLEN +- .p2align 4 +-L(loop): +- +- addq $64, %rax +- cmpq %rax, %r10 +- je L(exit_end) +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit) +- jmp L(loop) +- +- .p2align 4 +-L(exit_end): +- cmp %rax, %r11 +- je L(first) /* Do not read when end is at page boundary. 
*/ +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +-L(first): +- bts %r11, %rdx +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +- .p2align 4 +-L(exit): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#else +- +- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */ +- .p2align 4 +-L(loop): +- +- movdqa 64(%rax), %xmm0 +- PMINU 80(%rax), %xmm0 +- PMINU 96(%rax), %xmm0 +- PMINU 112(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit64) +- +- subq $-128, %rax +- +- movdqa (%rax), %xmm0 +- PMINU 16(%rax), %xmm0 +- PMINU 32(%rax), %xmm0 +- PMINU 48(%rax), %xmm0 +- PCMPEQ %xmm3, %xmm0 +- pmovmskb %xmm0, %edx +- testl %edx, %edx +- jne L(exit0) +- jmp L(loop) +- +- .p2align 4 +-L(exit64): +- addq $64, %rax +-L(exit0): +- pxor %xmm0, %xmm0 +- FIND_ZERO +- +- bsfq %rdx, %rdx +- addq %rdx, %rax +- subq %rdi, %rax +- SHIFT_RETURN +- ret +- +-#endif +- +-END(strlen) + libc_hidden_builtin_def (strlen) +-- +GitLab + diff --git a/glibc-RHEL-15696-29.patch b/glibc-RHEL-15696-29.patch new file mode 100644 index 0000000..112821a --- /dev/null +++ b/glibc-RHEL-15696-29.patch @@ -0,0 +1,181 @@ +From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:19:34 -0400 +Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1 +Content-type: text/plain; charset=UTF-8 + +No bug. This commit adds the ifunc / build infrastructure +necessary for wcslen to prefer the sse4.1 implementation +in strlen-vec.S. test-wcslen.c is passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 +- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++ + sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++ + sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++ + sysdeps/x86_64/multiarch/wcslen.c | 2 +- + sysdeps/x86_64/multiarch/wcsnlen.c | 34 +------------- + 6 files changed, 63 insertions(+), 36 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h + create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 491c7698..65fde4eb 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcscpy-ssse3 wcscpy-c \ + wcschr-sse2 wcschr-avx2 \ + wcsrchr-sse2 wcsrchr-avx2 \ +- wcsnlen-sse4_1 wcsnlen-c \ +- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \ ++ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ ++ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ + wcschr-avx2-rtm \ + wcscmp-avx2-rtm \ + wcslen-avx2-rtm \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index f1a6460a..580913ca 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) ++ IFUNC_IMPL_ADD (array, i, wcsnlen, ++ CPU_FEATURE_USABLE (SSE4_1), ++ __wcsnlen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. 
*/ +diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +new file mode 100644 +index 00000000..39e33473 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h +@@ -0,0 +1,52 @@ ++/* Common definition for ifunc selections for wcslen and wcsnlen ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) ++ return OPTIMIZE (sse4_1); ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +new file mode 100644 +index 00000000..7e62621a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S +@@ -0,0 +1,4 @@ ++#define AS_WCSLEN ++#define strlen __wcslen_sse4_1 ++ ++#include "strlen-vec.S" +diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c +index 6d06e47c..3b04b75b 100644 +--- a/sysdeps/x86_64/multiarch/wcslen.c ++++ b/sysdeps/x86_64/multiarch/wcslen.c +@@ -24,7 +24,7 @@ + # undef __wcslen + + # define SYMBOL_NAME wcslen +-# include "ifunc-avx2.h" ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ()); + weak_alias (__wcslen, wcslen); +diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c +index 20b731ae..06736410 100644 +--- a/sysdeps/x86_64/multiarch/wcsnlen.c 
++++ b/sysdeps/x86_64/multiarch/wcsnlen.c +@@ -24,39 +24,7 @@ + # undef __wcsnlen + + # define SYMBOL_NAME wcsnlen +-# include +- +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +- +-static inline void * +-IFUNC_SELECTOR (void) +-{ +- const struct cpu_features* cpu_features = __get_cpu_features (); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) +- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) +- { +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) +- return OPTIMIZE (evex); +- +- if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +- return OPTIMIZE (avx2_rtm); +- +- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) +- return OPTIMIZE (avx2); +- } +- +- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1)) +- return OPTIMIZE (sse4_1); +- +- return OPTIMIZE (sse2); +-} ++# include "ifunc-wcslen.h" + + libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ()); + weak_alias (__wcsnlen, wcsnlen); +-- +GitLab + diff --git a/glibc-RHEL-15696-3.patch b/glibc-RHEL-15696-3.patch new file mode 100644 index 0000000..8f5093c --- /dev/null +++ b/glibc-RHEL-15696-3.patch @@ -0,0 +1,396 @@ +From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:27:25 -0800 +Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. 
The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes memcpy for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for + length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise. + * sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S: + Likewise. + * sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S: + Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy. + tst-size_t-wmemchr. + * sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file. +--- + sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++-- + sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++-- + .../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++-- + .../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++-------- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++ + 6 files changed, 122 insertions(+), 42 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +index 3cd11233..568eebd3 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ 
cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +index 0240bfa3..0bd5ee99 100644 +--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S +@@ -45,28 +45,33 @@ + .section .text.ssse3,"ax",@progbits + #if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE + ENTRY (MEMPCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMPCPY_CHK) + + ENTRY (MEMPCPY) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY) + #endif + + #if !defined USE_AS_BCOPY + ENTRY (MEMCPY_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMCPY_CHK) + #endif + + ENTRY (MEMCPY) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + #ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP ++#endif ++ ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ mov %edx, %edx + #endif + + #ifdef USE_AS_MEMMOVE +diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +index effc3ac2..6ca2bbc9 100644 +--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S +@@ -24,27 +24,31 @@ + + .section .text.avx512,"ax",@progbits + ENTRY (__mempcpy_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_avx512_no_vzeroupper) + + ENTRY (__mempcpy_avx512_no_vzeroupper) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (__mempcpy_avx512_no_vzeroupper) + + ENTRY (__memmove_chk_avx512_no_vzeroupper) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_avx512_no_vzeroupper) + + ENTRY (__memmove_avx512_no_vzeroupper) +- mov %rdi, %rax ++ mov %RDI_LP, %RAX_LP + # ifdef USE_AS_MEMPCPY +- add %rdx, %rax ++ add %RDX_LP, %RAX_LP + # endif + L(start): ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ mov %edx, %edx ++# endif + lea (%rsi, %rdx), %rcx + lea (%rdi, %rdx), %r9 + cmp $512, %rdx +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c952576c..274aa1c7 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -95,20 +95,20 @@ + .section SECTION(.text),"ax",@progbits + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned)) + #endif + + ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start) + END (MEMPCPY_SYMBOL (__mempcpy, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + #endif +@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned)) + movq %rdi, %rax + L(start): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(more_2x_vec) + #if !defined USE_MULTIARCH || !IS_IN (libc) + L(last_2x_vec): +@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__mempcpy_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__mempcpy_chk_erms) + + /* Only used to measure performance of REP MOVSB. */ + ENTRY (__mempcpy_erms) +- movq %rdi, %rax ++ mov %RDI_LP, %RAX_LP + /* Skip zero length. 
*/ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f +- addq %rdx, %rax ++ add %RDX_LP, %RAX_LP + jmp L(start_movsb) + END (__mempcpy_erms) + + ENTRY (__memmove_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memmove_chk_erms) + + ENTRY (__memmove_erms) + movq %rdi, %rax + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jz 2f + L(start_movsb): +- movq %rdx, %rcx +- cmpq %rsi, %rdi ++ mov %RDX_LP, %RCX_LP ++ cmp %RSI_LP, %RDI_LP + jb 1f + /* Source == destination is less common. */ + je 2f +- leaq (%rsi,%rcx), %rdx +- cmpq %rdx, %rdi ++ lea (%rsi,%rcx), %RDX_LP ++ cmp %RDX_LP, %RDI_LP + jb L(movsb_backward) + 1: + rep movsb +@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms)) + # endif + + ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) +- movq %rdi, %rax +- addq %rdx, %rax ++ mov %RDI_LP, %RAX_LP ++ add %RDX_LP, %RAX_LP + jmp L(start_erms) + END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms)) + + # ifdef SHARED + ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + # endif +@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms)) + ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) + movq %rdi, %rax + L(start_erms): +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(movsb_more_2x_vec) + L(last_2x_vec): + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
*/ +@@ -236,7 +244,7 @@ L(movsb): + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) + 1: +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + rep movsb + L(nop): + ret +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index ddec7f04..2fe1e5ac 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +new file mode 100644 +index 00000000..66b71e17 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c +@@ -0,0 +1,58 @@ ++/* Test memcpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_NAME "memcpy" ++#include "test-size_t.h" ++ ++IMPL (memcpy, 1) ++ ++typedef void *(*proto_t) (void *, const void *, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memcpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_memcpy (dest, src); ++ int res = memcmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-30.patch b/glibc-RHEL-15696-30.patch new file mode 100644 index 0000000..0b16f0f --- /dev/null +++ b/glibc-RHEL-15696-30.patch @@ -0,0 +1,497 @@ +From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 01:56:29 -0400 +Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ + #27974] +Content-type: text/plain; charset=UTF-8 + +This commit fixes the bug mentioned in the previous commit. + +The previous implementations of wmemchr in these files relied +on maxlen * sizeof(wchar_t) which was not guaranteed by the standard. + +The new overflow tests added in the previous commit now +pass (As well as all the other tests). + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++------- + sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++- + 2 files changed, 107 insertions(+), 38 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index be8a5db5..37688966 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -44,21 +44,21 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section SECTION(.text),"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check zero length. */ ++# ifdef __ILP32__ ++ /* Clear upper bits. */ ++ and %RSI_LP, %RSI_LP ++# else + test %RSI_LP, %RSI_LP ++# endif + jz L(zero) + /* Store max len in R8_LP before adjusting if using WCSLEN. */ + mov %RSI_LP, %R8_LP +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ +- /* Clear the upper 32 bits. */ +- movl %esi, %esi +-# endif + # endif + movl %edi, %eax + movq %rdi, %rdx +@@ -72,10 +72,10 @@ ENTRY (STRLEN) + + /* Check the first VEC_SIZE bytes. */ + VPCMPEQ (%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + # ifdef USE_AS_STRNLEN + /* If length < VEC_SIZE handle special. */ +- cmpq $VEC_SIZE, %rsi ++ cmpq $CHAR_PER_VEC, %rsi + jbe L(first_vec_x0) + # endif + /* If empty continue to aligned_more. Otherwise return bit +@@ -84,6 +84,7 @@ ENTRY (STRLEN) + jz L(aligned_more) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -97,9 +98,14 @@ L(zero): + L(first_vec_x0): + /* Set bit for max len so that tzcnt will return min of max len + and position of first match. */ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + btsq %rsi, %rax + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. 
*/ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -113,14 +119,19 @@ L(first_vec_x1): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 4 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + incl %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -133,14 +144,19 @@ L(first_vec_x2): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 3 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -153,14 +169,19 @@ L(first_vec_x3): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE * 2 + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 2 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -173,14 +194,19 @@ L(first_vec_x4): + # ifdef USE_AS_STRNLEN + /* Use ecx which was computed earlier to compute correct value. + */ ++# ifdef USE_AS_WCSLEN ++ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax ++# else + subl $(VEC_SIZE + 1), %ecx + addl %ecx, %eax ++# endif + # else + subl %edx, %edi + addl $(VEC_SIZE * 3 + 1), %edi + addl %edi, %eax + # endif + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. 
*/ + shrl $2, %eax + # endif + VZEROUPPER_RETURN +@@ -195,10 +221,14 @@ L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ + # ifdef USE_AS_STRNLEN +- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because +- it simplies the logic in last_4x_vec_or_less. */ ++ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE ++ because it simplies the logic in last_4x_vec_or_less. */ + leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + # endif + /* Load first VEC regardless. */ + VPCMPEQ 1(%rdi), %ymm0, %ymm1 +@@ -207,34 +237,38 @@ L(cross_page_continue): + subq %rcx, %rsi + jb L(last_4x_vec_or_less) + # endif +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x2) + + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x3) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x4) + + /* Align data to VEC_SIZE * 4 - 1. */ + # ifdef USE_AS_STRNLEN + /* Before adjusting length check if at last VEC_SIZE * 4. */ +- cmpq $(VEC_SIZE * 4 - 1), %rsi ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi + jbe L(last_4x_vec_or_less_load) + incq %rdi + movl %edi, %ecx + orq $(VEC_SIZE * 4 - 1), %rdi + andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif + /* Readjust length. */ + addq %rcx, %rsi + # else +@@ -246,13 +280,13 @@ L(cross_page_continue): + L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + /* Break if at end of length. 
*/ +- subq $(VEC_SIZE * 4), %rsi ++ subq $(CHAR_PER_VEC * 4), %rsi + jb L(last_4x_vec_or_less_cmpeq) + # endif +- /* Save some code size by microfusing VPMINU with the load. Since +- the matches in ymm2/ymm4 can only be returned if there where no +- matches in ymm1/ymm3 respectively there is no issue with overlap. +- */ ++ /* Save some code size by microfusing VPMINU with the load. ++ Since the matches in ymm2/ymm4 can only be returned if there ++ where no matches in ymm1/ymm3 respectively there is no issue ++ with overlap. */ + vmovdqa 1(%rdi), %ymm1 + VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2 + vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3 +@@ -260,7 +294,7 @@ L(loop_4x_vec): + + VPMINU %ymm2, %ymm4, %ymm5 + VPCMPEQ %ymm5, %ymm0, %ymm5 +- vpmovmskb %ymm5, %ecx ++ vpmovmskb %ymm5, %ecx + + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx +@@ -268,27 +302,28 @@ L(loop_4x_vec): + + + VPCMPEQ %ymm1, %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + subq %rdx, %rdi + testl %eax, %eax + jnz L(last_vec_return_x0) + + VPCMPEQ %ymm2, %ymm0, %ymm2 +- vpmovmskb %ymm2, %eax ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_return_x1) + + /* Combine last 2 VEC. */ + VPCMPEQ %ymm3, %ymm0, %ymm3 +- vpmovmskb %ymm3, %eax +- /* rcx has combined result from all 4 VEC. It will only be used if +- the first 3 other VEC all did not contain a match. */ ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax + subq $(VEC_SIZE * 2 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -297,15 +332,19 @@ L(loop_4x_vec): + # ifdef USE_AS_STRNLEN + .p2align 4 + L(last_4x_vec_or_less_load): +- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */ ++ /* Depending on entry adjust rdi / prepare first VEC in ymm1. 
++ */ + subq $-(VEC_SIZE * 4), %rdi + L(last_4x_vec_or_less_cmpeq): + VPCMPEQ 1(%rdi), %ymm0, %ymm1 + L(last_4x_vec_or_less): +- +- vpmovmskb %ymm1, %eax +- /* If remaining length > VEC_SIZE * 2. This works if esi is off by +- VEC_SIZE * 4. */ ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif ++ vpmovmskb %ymm1, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off ++ by VEC_SIZE * 4. */ + testl $(VEC_SIZE * 2), %esi + jnz L(last_4x_vec) + +@@ -320,7 +359,7 @@ L(last_4x_vec_or_less): + jb L(max) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -329,6 +368,7 @@ L(last_4x_vec_or_less): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -340,6 +380,7 @@ L(last_vec_return_x0): + subq $(VEC_SIZE * 4 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -350,6 +391,7 @@ L(last_vec_return_x1): + subq $(VEC_SIZE * 3 - 1), %rdi + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -366,6 +408,7 @@ L(last_vec_x1_check): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -381,14 +424,14 @@ L(last_4x_vec): + jnz L(last_vec_x1) + + VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x2) + + /* Normalize length. 
*/ + andl $(VEC_SIZE * 4 - 1), %esi + VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(last_vec_x3) + +@@ -396,7 +439,7 @@ L(last_4x_vec): + jb L(max) + + VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + tzcntl %eax, %eax + /* Check the end of data. */ + cmpl %eax, %esi +@@ -405,6 +448,7 @@ L(last_4x_vec): + addl $(VEC_SIZE * 3 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -419,6 +463,7 @@ L(last_vec_x1): + incl %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -432,6 +477,7 @@ L(last_vec_x2): + addl $(VEC_SIZE + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -447,6 +493,7 @@ L(last_vec_x3): + addl $(VEC_SIZE * 2 + 1), %eax + addq %rdi, %rax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ + shrq $2, %rax + # endif + VZEROUPPER_RETURN +@@ -455,13 +502,13 @@ L(max_end): + VZEROUPPER_RETURN + # endif + +- /* Cold case for crossing page with first load. */ ++ /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): + /* Align data to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi + VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1 +- vpmovmskb %ymm1, %eax ++ vpmovmskb %ymm1, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod rdx. */ + sarxl %edx, %eax, %eax +@@ -470,6 +517,10 @@ L(cross_page_boundary): + jnz L(cross_page_less_vec) + leaq 1(%rdi), %rcx + subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get wchar_t count. */ ++ shrl $2, %ecx ++# endif + /* Check length. 
*/ + cmpq %rsi, %rcx + jb L(cross_page_continue) +@@ -479,6 +530,7 @@ L(cross_page_boundary): + jz L(cross_page_continue) + tzcntl %eax, %eax + # ifdef USE_AS_WCSLEN ++ /* NB: Divide length by 4 to get wchar_t count. */ + shrl $2, %eax + # endif + # endif +@@ -489,6 +541,10 @@ L(return_vzeroupper): + .p2align 4 + L(cross_page_less_vec): + tzcntl %eax, %eax ++# ifdef USE_AS_WCSLEN ++ /* NB: Multiply length by 4 to get byte count. */ ++ sall $2, %esi ++# endif + cmpq %rax, %rsi + cmovb %esi, %eax + # ifdef USE_AS_WCSLEN +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 8f660bb9..439e486a 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -65,12 +65,25 @@ ENTRY(strlen) + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shl $2, %RSI_LP ++/* Check for overflow from maxlen * sizeof(wchar_t). If it would ++ overflow the only way this program doesn't have undefined behavior ++ is if there is a null terminator in valid memory so wcslen will ++ suffice. */ ++ mov %RSI_LP, %R10_LP ++ sar $62, %R10_LP ++ test %R10_LP, %R10_LP ++ jnz __wcslen_sse4_1 ++ sal $2, %RSI_LP + # endif + ++ + /* Initialize long lived registers. */ + + add %RDI_LP, %RSI_LP ++# ifdef AS_WCSLEN ++/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ ++ jbe __wcslen_sse4_1 ++# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-31.patch b/glibc-RHEL-15696-31.patch new file mode 100644 index 0000000..4ef6911 --- /dev/null +++ b/glibc-RHEL-15696-31.patch @@ -0,0 +1,745 @@ +From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 19:36:06 -0400 +Subject: [PATCH] x86: Optimize strlen-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strlen-evex.S. 
The +optimizations are mostly small things but they add up to roughly +10-30% performance improvement for strlen. The results for strnlen are a +bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and +test-wcsnlen are all passing. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++----------- + 1 file changed, 317 insertions(+), 264 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S +index 05838190..4bf6874b 100644 +--- a/sysdeps/x86_64/multiarch/strlen-evex.S ++++ b/sysdeps/x86_64/multiarch/strlen-evex.S +@@ -29,11 +29,13 @@ + # ifdef USE_AS_WCSLEN + # define VPCMP vpcmpd + # define VPMINU vpminud +-# define SHIFT_REG r9d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPCMP vpcmpb + # define VPMINU vpminub +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -46,132 +48,165 @@ + # define YMM6 ymm22 + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN +- /* Check for zero length. */ ++ /* Check zero length. */ + test %RSI_LP, %RSI_LP + jz L(zero) +-# ifdef USE_AS_WCSLEN +- shl $2, %RSI_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %esi, %esi + # endif + mov %RSI_LP, %R8_LP + # endif +- movl %edi, %ecx +- movq %rdi, %rdx ++ movl %edi, %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. */ ++ andl $(PAGE_SIZE - 1), %eax + /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Each bit in K0 represents a + null byte.
*/ + VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- + # ifdef USE_AS_STRNLEN +- jnz L(first_vec_x0_check) +- /* Adjust length and check the end of data. */ +- subq $VEC_SIZE, %rsi +- jbe L(max) +-# else +- jnz L(first_vec_x0) ++ /* If length < CHAR_PER_VEC handle special. */ ++ cmpq $CHAR_PER_VEC, %rsi ++ jbe L(first_vec_x0) + # endif +- +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- ++ testl %eax, %eax ++ jz L(aligned_more) ++ tzcntl %eax, %eax ++ ret + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ +- addq %rcx, %rsi ++L(zero): ++ xorl %eax, %eax ++ ret + +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ .p2align 4 ++L(first_vec_x0): ++ /* Set bit for max len so that tzcnt will return min of max len ++ and position of first match. */ ++ btsq %rsi, %rax ++ tzcntl %eax, %eax ++ ret + # endif +- jmp L(more_4x_vec) + + .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +-# ifdef USE_AS_WCSLEN +- /* NB: Divide shift count by 4 since each bit in K0 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++L(first_vec_x1): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ ++# ifdef USE_AS_STRNLEN ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal CHAR_PER_VEC(%rdi, %rax), %eax + # endif +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax ++ ret + +- /* Remove the leading bytes. */ +- sarxl %SHIFT_REG, %eax, %eax +- testl %eax, %eax +- jz L(aligned_more) ++ .p2align 4 ++L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %eax +-# endif ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +-# endif +- addq %rdi, %rax +- addq %rcx, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax + # endif + ret + + .p2align 4 +-L(aligned_more): ++L(first_vec_x3): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE" +- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE" +- to void possible addition overflow. */ +- negq %rcx +- addq $VEC_SIZE, %rcx +- +- /* Check the end of data. */ +- subq %rcx, %rsi +- jbe L(max) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax + # endif ++ ret + +- addq $VEC_SIZE, %rdi +- ++ .p2align 4 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ /* Safe to use 32 bit instructions as these are only called for ++ size = [1, 159]. */ + # ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) ++ /* Use ecx which was computed earlier to compute correct value. ++ */ ++ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax ++# else ++ subl %edx, %edi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarl $2, %edi ++# endif ++ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax + # endif ++ ret + +-L(more_4x_vec): ++ .p2align 5 ++L(aligned_more): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-(VEC_SIZE), %rdi ++L(cross_page_continue): + /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. */ +- VPCMP $0, (%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- ++# ifdef USE_AS_STRNLEN ++ /* + CHAR_SIZE because it simplies the logic in ++ last_4x_vec_or_less. */ ++ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx ++ subq %rdx, %rcx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarl $2, %ecx ++# endif ++# endif ++ /* Load first VEC regardless. */ + VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++# ifdef USE_AS_STRNLEN ++ /* Adjust length. If near end handle specially. */ ++ subq %rcx, %rsi ++ jb L(last_4x_vec_or_less) ++# endif + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax ++ test %eax, %eax + jnz L(first_vec_x2) + + VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +@@ -179,258 +214,276 @@ L(more_4x_vec): + testl %eax, %eax + jnz L(first_vec_x3) + +- addq $(VEC_SIZE * 4), %rdi +- +-# ifdef USE_AS_STRNLEN +- subq $(VEC_SIZE * 4), %rsi +- jbe L(last_4x_vec_or_less) +-# endif +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) + ++ addq $VEC_SIZE, %rdi + # ifdef USE_AS_STRNLEN +- /* Adjust length. */ ++ /* Check if at last VEC_SIZE * 4 length. */ ++ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi ++ jbe L(last_4x_vec_or_less_load) ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarl $2, %ecx ++# endif ++ /* Readjust length. */ + addq %rcx, %rsi + # endif ++ /* Align data to VEC_SIZE * 4. */ ++ andq $-(VEC_SIZE * 4), %rdi + ++ /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- VMOVA (%rdi), %YMM1 +- VMOVA VEC_SIZE(%rdi), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3 +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4 +- +- VPMINU %YMM1, %YMM2, %YMM5 +- VPMINU %YMM3, %YMM4, %YMM6 ++ /* Load first VEC regardless. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++# ifdef USE_AS_STRNLEN ++ /* Break if at end of length. */ ++ subq $(CHAR_PER_VEC * 4), %rsi ++ jb L(last_4x_vec_or_less_cmpeq) ++# endif ++ /* Save some code size by microfusing VPMINU with the load. Since ++ the matches in ymm2/ymm4 can only be returned if there where no ++ matches in ymm1/ymm3 respectively there is no issue with overlap. ++ */ ++ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2 ++ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 ++ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4 ++ ++ VPCMP $0, %YMM2, %YMMZERO, %k0 ++ VPCMP $0, %YMM4, %YMMZERO, %k1 ++ subq $-(VEC_SIZE * 4), %rdi ++ kortestd %k0, %k1 ++ jz L(loop_4x_vec) ++ ++ /* Check if end was in first half. */ ++ kmovd %k0, %eax ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ shrq $2, %rdi ++# endif ++ testl %eax, %eax ++ jz L(second_vec_return) + +- VPMINU %YMM5, %YMM6, %YMM5 +- VPCMP $0, %YMM5, %YMMZERO, %k0 +- ktestd %k0, %k0 +- jnz L(4x_vec_end) ++ VPCMP $0, %YMM1, %YMMZERO, %k2 ++ kmovd %k2, %edx ++ /* Combine VEC1 matches (edx) with VEC2 matches (eax). 
*/ ++# ifdef USE_AS_WCSLEN ++ sall $CHAR_PER_VEC, %eax ++ orl %edx, %eax ++ tzcntl %eax, %eax ++# else ++ salq $CHAR_PER_VEC, %rax ++ orq %rdx, %rax ++ tzcntq %rax, %rax ++# endif ++ addq %rdi, %rax ++ ret + +- addq $(VEC_SIZE * 4), %rdi + +-# ifndef USE_AS_STRNLEN +- jmp L(loop_4x_vec) +-# else +- subq $(VEC_SIZE * 4), %rsi +- ja L(loop_4x_vec) ++# ifdef USE_AS_STRNLEN + ++L(last_4x_vec_or_less_load): ++ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++L(last_4x_vec_or_less_cmpeq): ++ VPCMP $0, %YMM1, %YMMZERO, %k0 ++ addq $(VEC_SIZE * 3), %rdi + L(last_4x_vec_or_less): +- /* Less than 4 * VEC and aligned to VEC_SIZE. */ +- addl $(VEC_SIZE * 2), %esi +- jle L(last_2x_vec) +- +- VPCMP $0, (%rdi), %YMMZERO, %k0 + kmovd %k0, %eax ++ /* If remaining length > VEC_SIZE * 2. This works if esi is off by ++ VEC_SIZE * 4. */ ++ testl $(CHAR_PER_VEC * 2), %esi ++ jnz L(last_4x_vec) ++ ++ /* length may have been negative or positive by an offset of ++ CHAR_PER_VEC * 4 depending on where this was called from. This ++ fixes that. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1_check) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) ++ /* Check the end of data. */ ++ subl $CHAR_PER_VEC, %esi ++ jb L(max) + + VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) + +- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x3_check) ++ subq %rdx, %rdi ++# ifdef USE_AS_WCSLEN ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarq $2, %rdi ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++L(max): + movq %r8, %rax ++ ret ++# endif ++ ++ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less) ++ in the 4x VEC loop can use 2 byte encoding. */ ++ .p2align 4 ++L(second_vec_return): ++ VPCMP $0, %YMM3, %YMMZERO, %k0 ++ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */ ++# ifdef USE_AS_WCSLEN ++ kunpckbw %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax ++# else ++ kunpckdq %k0, %k1, %k0 ++ kmovq %k0, %rax ++ tzcntq %rax, %rax ++# endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax ++ ret ++ ++ ++# ifdef USE_AS_STRNLEN ++L(last_vec_x1_check): ++ tzcntl %eax, %eax ++ /* Check the end of data. */ ++ cmpl %eax, %esi ++ jb L(max) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(last_2x_vec): +- addl $(VEC_SIZE * 2), %esi ++L(last_4x_vec): ++ /* Test first 2x VEC normally. */ ++ testl %eax, %eax ++ jnz L(last_vec_x1) + +- VPCMP $0, (%rdi), %YMMZERO, %k0 ++ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0_check) +- subl $VEC_SIZE, %esi +- jle L(max) ++ jnz L(last_vec_x2) + +- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0 ++ /* Normalize length. */ ++ andl $(CHAR_PER_VEC * 4 - 1), %esi ++ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1_check) +- movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret ++ jnz L(last_vec_x3) + +- .p2align 4 +-L(first_vec_x0_check): ++ /* Check the end of data. */ ++ subl $(CHAR_PER_VEC * 3), %esi ++ jb L(max) ++ ++ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif + /* Check the end of data. 
*/ +- cmpq %rax, %rsi +- jbe L(max) +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x1_check): ++L(last_vec_x1): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x2_check): ++L(last_vec_x2): + tzcntl %eax, %eax ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. */ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax + ret + + .p2align 4 +-L(first_vec_x3_check): ++L(last_vec_x3): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif ++ subl $(CHAR_PER_VEC * 2), %esi + /* Check the end of data. */ +- cmpq %rax, %rsi +- jbe L(max) +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax ++ cmpl %eax, %esi ++ jb L(max_end) ++ subq %rdx, %rdi + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide bytes by 4 to get the wchar_t count. 
*/ ++ sarq $2, %rdi + # endif ++ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax + ret +- +- .p2align 4 +-L(max): ++L(max_end): + movq %r8, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax + ret + # endif + ++ /* Cold case for crossing page with first load. */ + .p2align 4 +-L(first_vec_x0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq %rdi, %rax +- subq %rdx, %rax ++L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++ VPCMP $0, (%rdi), %YMMZERO, %k0 ++ kmovd %k0, %eax ++ /* Remove the leading bytes. */ + # ifdef USE_AS_WCSLEN +- shrq $2, %rax ++ /* NB: Divide shift count by 4 since each bit in K0 represent 4 ++ bytes. */ ++ movl %edx, %ecx ++ shrl $2, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx + # endif +- ret +- +- .p2align 4 +-L(first_vec_x1): ++ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */ ++ sarxl %SHIFT_REG, %eax, %eax ++ testl %eax, %eax ++# ifndef USE_AS_STRNLEN ++ jz L(cross_page_continue) + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif + ret +- +- .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++# else ++ jnz L(cross_page_less_vec) ++# ifndef USE_AS_WCSLEN ++ movl %edx, %ecx ++ andl $(CHAR_PER_VEC - 1), %ecx ++# endif ++ movl $CHAR_PER_VEC, %eax ++ subl %ecx, %eax ++ /* Check the end of data. 
*/ ++ cmpq %rax, %rsi ++ ja L(cross_page_continue) ++ movl %esi, %eax + ret +- +- .p2align 4 +-L(4x_vec_end): +- VPCMP $0, %YMM1, %YMMZERO, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- VPCMP $0, %YMM2, %YMMZERO, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- VPCMP $0, %YMM3, %YMMZERO, %k2 +- kmovd %k2, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- VPCMP $0, %YMM4, %YMMZERO, %k3 +- kmovd %k3, %eax +-L(first_vec_x3): ++L(cross_page_less_vec): + tzcntl %eax, %eax +-# ifdef USE_AS_WCSLEN +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %eax +-# endif +- addq $(VEC_SIZE * 3), %rax +- addq %rdi, %rax +- subq %rdx, %rax +-# ifdef USE_AS_WCSLEN +- shrq $2, %rax +-# endif ++ /* Select min of length and position of first null. */ ++ cmpq %rax, %rsi ++ cmovb %esi, %eax + ret ++# endif + + END (STRLEN) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-32.patch b/glibc-RHEL-15696-32.patch new file mode 100644 index 0000000..8f1a94a --- /dev/null +++ b/glibc-RHEL-15696-32.patch @@ -0,0 +1,158 @@ +From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Wed, 30 Jun 2021 10:47:06 -0700 +Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033] +Content-type: text/plain; charset=UTF-8 + +From + +https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html + +* Intel TSX will be disabled by default. +* The processor will force abort all Restricted Transactional Memory (RTM) + transactions by default. +* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated, + which is set to indicate to updated software that the loaded microcode is + forcing RTM abort. +* On processors that enumerate support for RTM, the CPUID enumeration bits + for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to + be set by default after microcode update. 
+* Workloads that were benefited from Intel TSX might experience a change + in performance. +* System software may use a new bit in Model-Specific Register (MSR) 0x10F + TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock + Elision (HLE) and RTM bits to indicate to software that Intel TSX is + disabled. + +1. Add RTM_ALWAYS_ABORT to CPUID features. +2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the +string/tst-memchr-rtm etc. testcases on the affected processors, which +always fail after a microcode update. +3. Check RTM feature, instead of usability, against /proc/cpuinfo. + +This fixes BZ #28033. +--- + manual/platform.texi | 3 +++ + sysdeps/x86/cpu-features.c | 5 ++++- + sysdeps/x86/sys/platform/x86.h | 6 +++--- + sysdeps/x86/tst-cpu-features-supports.c | 2 +- + sysdeps/x86/tst-get-cpu-features.c | 2 ++ + 5 files changed, 13 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86/bits/platform/x86.h + (doesn't exist) + sysdeps/x86/bits/platform/x86.h + (account for lack of upstream renames) + +diff --git a/manual/platform.texi b/manual/platform.texi +index 8fec2933..b7e8aef7 100644 +--- a/manual/platform.texi ++++ b/manual/platform.texi +@@ -510,6 +510,9 @@ capability. + @item + @code{RTM} -- RTM instruction extensions. + ++@item ++@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable. ++ + @item + @code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
+ +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 3610ee5c..4889f062 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, HLE); + CPU_FEATURE_SET_USABLE (cpu_features, BMI2); + CPU_FEATURE_SET_USABLE (cpu_features, ERMS); +- CPU_FEATURE_SET_USABLE (cpu_features, RTM); + CPU_FEATURE_SET_USABLE (cpu_features, RDSEED); + CPU_FEATURE_SET_USABLE (cpu_features, ADX); + CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT); +@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features) + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI); + CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B); + CPU_FEATURE_SET_USABLE (cpu_features, FSRM); ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT); + CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE); + CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK); + CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64); +@@ -779,6 +779,9 @@ no_cpuid: + GLRO(dl_platform) = "i586"; + #endif + ++ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT)) ++ CPU_FEATURE_SET_USABLE (cpu_features, RTM); ++ + #if CET_ENABLED + # if HAVE_TUNABLES + TUNABLE_GET (x86_ibt, tunable_val_t *, +diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h +index e5cc7c68..7a434926 100644 +--- a/sysdeps/x86/sys/platform/x86.h ++++ b/sysdeps/x86/sys/platform/x86.h +@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define bit_cpu_AVX512_VP2INTERSECT (1u << 8) + #define bit_cpu_INDEX_7_EDX_9 (1u << 9) + #define bit_cpu_MD_CLEAR (1u << 10) +-#define bit_cpu_INDEX_7_EDX_11 (1u << 11) ++#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11) + #define bit_cpu_INDEX_7_EDX_12 (1u << 12) + #define bit_cpu_INDEX_7_EDX_13 (1u << 13) + #define bit_cpu_SERIALIZE (1u << 14) +@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + 
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7 + #define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7 +-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7 ++#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7 + #define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7 + #define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7 +@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int) + #define reg_AVX512_VP2INTERSECT edx + #define reg_INDEX_7_EDX_9 edx + #define reg_MD_CLEAR edx +-#define reg_INDEX_7_EDX_11 edx ++#define reg_RTM_ALWAYS_ABORT edx + #define reg_INDEX_7_EDX_12 edx + #define reg_INDEX_7_EDX_13 edx + #define reg_SERIALIZE edx +diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c +index 287cf01f..8100a319 100644 +--- a/sysdeps/x86/tst-cpu-features-supports.c ++++ b/sysdeps/x86/tst-cpu-features-supports.c +@@ -152,7 +152,7 @@ do_test (int argc, char **argv) + fails += CHECK_SUPPORTS (rdpid, RDPID); + fails += CHECK_SUPPORTS (rdrnd, RDRAND); + fails += CHECK_SUPPORTS (rdseed, RDSEED); +- fails += CHECK_SUPPORTS (rtm, RTM); ++ fails += CHECK_CPU_SUPPORTS (rtm, RTM); + fails += CHECK_SUPPORTS (serialize, SERIALIZE); + fails += CHECK_SUPPORTS (sha, SHA); + fails += CHECK_CPU_SUPPORTS (shstk, SHSTK); +diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c +index 2763deb6..0717e5d8 100644 +--- a/sysdeps/x86/tst-get-cpu-features.c ++++ b/sysdeps/x86/tst-get-cpu-features.c +@@ -183,6 +183,7 @@ do_test (void) + CHECK_CPU_FEATURE (UINTR); + CHECK_CPU_FEATURE (AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE (MD_CLEAR); ++ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE (SERIALIZE); + CHECK_CPU_FEATURE (HYBRID); + CHECK_CPU_FEATURE (TSXLDTRK); +@@ -344,6 +345,7 @@ do_test (void) + CHECK_CPU_FEATURE_USABLE (FSRM); + CHECK_CPU_FEATURE_USABLE 
(AVX512_VP2INTERSECT); + CHECK_CPU_FEATURE_USABLE (MD_CLEAR); ++ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT); + CHECK_CPU_FEATURE_USABLE (SERIALIZE); + CHECK_CPU_FEATURE_USABLE (HYBRID); + CHECK_CPU_FEATURE_USABLE (TSXLDTRK); +-- +GitLab + diff --git a/glibc-RHEL-15696-33.patch b/glibc-RHEL-15696-33.patch new file mode 100644 index 0000000..1196471 --- /dev/null +++ b/glibc-RHEL-15696-33.patch @@ -0,0 +1,51 @@ +From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 8 Jul 2021 16:13:19 -0400 +Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ + #28064] +Content-type: text/plain; charset=UTF-8 + +The following commit + +commit 6f573a27b6c8b4236445810a44660612323f5a73 +Author: Noah Goldstein +Date: Wed Jun 23 01:19:34 2021 -0400 + + x86-64: Add wcslen optimize for sse4.1 + +Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did +not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit +fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc +implementation list and adding wcslen-sse4.1 to the ifunc +implementation list. + +Testing: +test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as +well as all other tests in wcsmbs and string. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 580913ca..695cdba6 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wcslen_evex) +- IFUNC_IMPL_ADD (array, i, wcsnlen, ++ IFUNC_IMPL_ADD (array, i, wcslen, + CPU_FEATURE_USABLE (SSE4_1), +- __wcsnlen_sse4_1) ++ __wcslen_sse4_1) + IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2)) + + /* Support sysdeps/x86_64/multiarch/wcsnlen.c. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-34.patch b/glibc-RHEL-15696-34.patch new file mode 100644 index 0000000..f7c9a56 --- /dev/null +++ b/glibc-RHEL-15696-34.patch @@ -0,0 +1,135 @@ +From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 08:18:15 -0600 +Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +no checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. + +Co-authored-by: H.J. 
Lu +--- + sysdeps/x86/Makefile | 2 +- + sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 + + sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +- + sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +- + 7 files changed, 22 insertions(+), 5 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (split into two patches due to upstream bug differences) + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 5be71ada..2d814915 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm + CFLAGS-tst-strchr-rtm.c += -mrtm + CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm +-CFLAGS-tst-strncmp-rtm.c += -mrtm ++CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm + endif + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 236ad951..4d0004b5 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -16,6 +16,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#include + #include + + #define LOOP 3000 +@@ -45,8 +46,22 @@ function (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow (void) ++{ ++ if (strncmp (string1, string2, SIZE_MAX) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +- return do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 5d1c9d90..433ae047 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -95,7 +95,7 @@ ENTRY (STRCMP) + length to bound a valid memory region. In these cases just use + 'wcscmp'. */ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + # endif + /* Convert units: from wide to byte char. 
*/ + shl $2, %RDX_LP +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +index 37d1224b..68bad365 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2_rtm + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S +index 1678bcc2..f138e9f1 100644 +--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S +@@ -1,3 +1,4 @@ + #define STRCMP __strncmp_avx2 + #define USE_AS_STRNCMP 1 ++#define OVERFLOW_STRCMP __strcmp_avx2 + #include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +index 4e88c70c..f467582c 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2_rtm + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2_rtm + #include "strcmp-avx2-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +index 4fa1de4d..e9ede522 100644 +--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S +@@ -1,5 +1,5 @@ + #define STRCMP __wcsncmp_avx2 + #define USE_AS_STRNCMP 1 + #define USE_AS_WCSCMP 1 +- ++#define OVERFLOW_STRCMP __wcscmp_avx2 + #include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-35.patch b/glibc-RHEL-15696-35.patch new file mode 100644 index 0000000..5e4fbdd --- /dev/null +++ b/glibc-RHEL-15696-35.patch @@ -0,0 +1,51 @@ +From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Sat, 9 May 2020 12:04:23 -0700 +Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ + #25966] +Content-type: text/plain; charset=UTF-8 + +Since __x86_shared_non_temporal_threshold is defined as + +long int __x86_shared_non_temporal_threshold; + +and long int is 4 bytes for x32, use RDX_LP to compare against +__x86_shared_non_temporal_threshold in assembly code. +--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 71f5954d..673b73aa 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -245,7 +245,7 @@ L(return): + #endif + + L(movsb): +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + jae L(more_8x_vec) + cmpq %rsi, %rdi + jb 1f +@@ -397,7 +397,7 @@ L(more_8x_vec): + addq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_forward) + #endif + L(loop_4x_vec_forward): +@@ -448,7 +448,7 @@ L(more_8x_vec_backward): + subq %r8, %rdx + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ +- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_backward) + #endif + L(loop_4x_vec_backward): +-- +GitLab + diff --git a/glibc-RHEL-15696-36.patch b/glibc-RHEL-15696-36.patch new file mode 100644 index 0000000..e00b96e --- /dev/null +++ b/glibc-RHEL-15696-36.patch @@ -0,0 +1,44 @@ +From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Thu, 11 Jun 2020 12:41:18 -0700 +Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register +Content-type: text/plain; charset=UTF-8 + +Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use +%xmmN, instead of %ymmN, with vpxor to clear a vector register. +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++-- + sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 433ae047..70d8499b 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -105,8 +105,8 @@ ENTRY (STRCMP) + # endif + movl %edi, %eax + xorl %edx, %edx +- /* Make %ymm7 all zeros in this function. */ +- vpxor %ymm7, %ymm7, %ymm7 ++ /* Make %xmm7 (%ymm7) all zeros in this function. */ ++ vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S +index 9f22a15e..c949410b 100644 +--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S +@@ -48,7 +48,7 @@ ENTRY (STRRCHR) + movl %edi, %ecx + /* Broadcast CHAR to YMM4. */ + VPBROADCAST %xmm4, %ymm4 +- vpxor %ymm0, %ymm0, %ymm0 ++ vpxor %xmm0, %xmm0, %xmm0 + + /* Check if we may cross page boundary with one vector load. */ + andl $(2 * VEC_SIZE - 1), %ecx +-- +GitLab + diff --git a/glibc-RHEL-15696-37.patch b/glibc-RHEL-15696-37.patch new file mode 100644 index 0000000..10b0cc4 --- /dev/null +++ b/glibc-RHEL-15696-37.patch @@ -0,0 +1,359 @@ +From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001 +From: noah +Date: Wed, 3 Feb 2021 00:38:59 -0500 +Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. Just seemed the performance could be improved a bit. 
Observed +and expected behavior are unchanged. Optimized body of main +loop. Updated page cross logic and optimized accordingly. Made a few +minor instruction selection modifications. No regressions in test +suite. Both test-strchrnul and test-strchr passed. +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++------------- + sysdeps/x86_64/multiarch/strchr.c | 4 +- + 2 files changed, 114 insertions(+), 115 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strchr.c + (account for missing upstream macros) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index da7d2620..919d256c 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -27,10 +27,12 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMPEQ vpcmpeqd ++# define VPMINU vpminud + # define CHAR_REG esi + # else + # define VPBROADCAST vpbroadcastb + # define VPCMPEQ vpcmpeqb ++# define VPMINU vpminub + # define CHAR_REG sil + # endif + +@@ -43,71 +45,54 @@ + # endif + + # define VEC_SIZE 32 ++# define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) + movl %edi, %ecx +- /* Broadcast CHAR to YMM0. */ ++# ifndef USE_AS_STRCHRNUL ++ xorl %edx, %edx ++# endif ++ ++ /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + vpxor %xmm9, %xmm9, %xmm9 + VPBROADCAST %xmm0, %ymm0 +- /* Check if we may cross page boundary with one vector load. */ +- andl $(2 * VEC_SIZE - 1), %ecx +- cmpl $VEC_SIZE, %ecx +- ja L(cros_page_boundary) + +- /* Check the first VEC_SIZE bytes. Search for both CHAR and the +- null byte. */ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) ++ /* Check if we cross page boundary with one vector load. 
*/ ++ andl $(PAGE_SIZE - 1), %ecx ++ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx ++ ja L(cross_page_boundary) + +- /* Align data for aligned loads in the loop. */ +- addq $VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi +- +- jmp L(more_4x_vec) +- +- .p2align 4 +-L(cros_page_boundary): +- andl $(VEC_SIZE - 1), %ecx +- andq $-VEC_SIZE, %rdi ++ /* Check the first VEC_SIZE bytes. Search for both CHAR and the ++ null byte. */ + vmovdqu (%rdi), %ymm8 + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bytes. */ +- sarl %cl, %eax + testl %eax, %eax +- jz L(aligned_more) +- /* Found CHAR or the null byte. */ ++ jz L(more_vecs) + tzcntl %eax, %eax +- addq %rcx, %rax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 ++L(more_vecs): ++ /* Align data for aligned loads in the loop. */ ++ andq $-VEC_SIZE, %rdi + L(aligned_more): +- addq $VEC_SIZE, %rdi + +-L(more_4x_vec): +- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa (%rdi), %ymm8 ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. */ ++ vmovdqa VEC_SIZE(%rdi), %ymm8 ++ addq $VEC_SIZE, %rdi + VPCMPEQ %ymm8, %ymm0, %ymm1 + VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 +@@ -137,61 +122,24 @@ L(more_4x_vec): + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x3) +- +- addq $(VEC_SIZE * 4), %rdi +- +- /* Align data to 4 * VEC_SIZE. */ +- movq %rdi, %rcx +- andl $(4 * VEC_SIZE - 1), %ecx +- andq $-(4 * VEC_SIZE), %rdi +- +- .p2align 4 +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- vmovdqa (%rdi), %ymm5 +- vmovdqa VEC_SIZE(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- +- VPCMPEQ %ymm5, %ymm0, %ymm1 +- VPCMPEQ %ymm6, %ymm0, %ymm2 +- VPCMPEQ %ymm7, %ymm0, %ymm3 +- VPCMPEQ %ymm8, %ymm0, %ymm4 +- +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- VPCMPEQ %ymm7, %ymm9, %ymm7 +- VPCMPEQ %ymm8, %ymm9, %ymm8 +- +- vpor %ymm1, %ymm5, %ymm1 +- vpor %ymm2, %ymm6, %ymm2 +- vpor %ymm3, %ymm7, %ymm3 +- vpor %ymm4, %ymm8, %ymm4 +- +- vpor %ymm1, %ymm2, %ymm5 +- vpor %ymm3, %ymm4, %ymm6 +- +- vpor %ymm5, %ymm6, %ymm5 +- +- vpmovmskb %ymm5, %eax +- testl %eax, %eax +- jnz L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi ++ jz L(prep_loop_4x) + +- jmp L(loop_4x_vec) ++ tzcntl %eax, %eax ++ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret + + .p2align 4 + L(first_vec_x0): +- /* Found CHAR or the null byte. */ + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -199,13 +147,9 @@ L(first_vec_x0): + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $VEC_SIZE, %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx + leaq VEC_SIZE(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN +@@ -213,42 +157,97 @@ L(first_vec_x1): + .p2align 4 + L(first_vec_x2): + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 2), %rax +- addq %rdi, %rax +-# else +- xorl %edx, %edx ++ /* Found CHAR or the null byte. 
*/ + leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + ++L(prep_loop_4x): ++ /* Align data to 4 * VEC_SIZE. */ ++ andq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 +-L(4x_vec_end): ++L(loop_4x_vec): ++ /* Compare 4 * VEC at a time forward. */ ++ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 ++ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxor %ymm5, %ymm0, %ymm1 ++ vpxor %ymm6, %ymm0, %ymm2 ++ vpxor %ymm7, %ymm0, %ymm3 ++ vpxor %ymm8, %ymm0, %ymm4 ++ ++ VPMINU %ymm1, %ymm5, %ymm1 ++ VPMINU %ymm2, %ymm6, %ymm2 ++ VPMINU %ymm3, %ymm7, %ymm3 ++ VPMINU %ymm4, %ymm8, %ymm4 ++ ++ VPMINU %ymm1, %ymm2, %ymm5 ++ VPMINU %ymm3, %ymm4, %ymm6 ++ ++ VPMINU %ymm5, %ymm6, %ymm5 ++ ++ VPCMPEQ %ymm5, %ymm9, %ymm5 ++ vpmovmskb %ymm5, %eax ++ ++ addq $(VEC_SIZE * 4), %rdi ++ testl %eax, %eax ++ jz L(loop_4x_vec) ++ ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax + jnz L(first_vec_x0) ++ ++ VPCMPEQ %ymm2, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ VPCMPEQ %ymm4, %ymm9, %ymm4 ++ vpmovmskb %ymm3, %ecx + vpmovmskb %ymm4, %eax ++ salq $32, %rax ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG ++ cmovne %rdx, %rax ++# endif ++ VZEROUPPER ++ ret ++ ++ /* Cold case for crossing page with first load. */ ++ .p2align 4 ++L(cross_page_boundary): ++ andq $-VEC_SIZE, %rdi ++ andl $(VEC_SIZE - 1), %ecx ++ ++ vmovdqa (%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* Remove the leading bits. 
*/ ++ sarxl %ecx, %eax, %eax + testl %eax, %eax +-L(first_vec_x3): ++ jz L(aligned_more) + tzcntl %eax, %eax +-# ifdef USE_AS_STRCHRNUL +- addq $(VEC_SIZE * 3), %rax ++ addq %rcx, %rdi + addq %rdi, %rax +-# else +- xorl %edx, %edx +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax +- cmp (%rax), %CHAR_REG ++# ifndef USE_AS_STRCHRNUL ++ cmp (%rax), %CHAR_REG + cmovne %rdx, %rax + # endif + VZEROUPPER_RETURN + + END (STRCHR) +-#endif ++# endif +diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c +index 7e582f02..5225bd4f 100644 +--- a/sysdeps/x86_64/multiarch/strchr.c ++++ b/sysdeps/x86_64/multiarch/strchr.c +@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +-- +GitLab + diff --git a/glibc-RHEL-15696-38.patch b/glibc-RHEL-15696-38.patch new file mode 100644 index 0000000..f97ab23 --- /dev/null +++ b/glibc-RHEL-15696-38.patch @@ -0,0 +1,67 @@ +From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 25 Jan 2020 14:19:40 -0800 +Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130] +Content-type: text/plain; charset=UTF-8 + +When copying with "rep movsb", if the distance between source and +destination is N*4GB + [1..63] with N >= 0, performance may be very +slow. 
This patch updates memmove-vec-unaligned-erms.S for AVX and +AVX512 versions with the distance in RCX: + + cmpl $63, %ecx + // Don't use "rep movsb" if ECX <= 63 + jbe L(Don't use rep movsb") + Use "rep movsb" + +Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random +and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its +performance impact is within noise range as "rep movsb" is only used for +data size >= 4KB. +--- + .../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++ + 1 file changed, 21 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 673b73aa..c475fed4 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -64,6 +64,13 @@ + # endif + #endif + ++/* Avoid short distance rep movsb only with non-SSE vector. */ ++#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB ++# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) ++#else ++# define AVOID_SHORT_DISTANCE_REP_MOVSB 0 ++#endif ++ + #ifndef PREFETCH + # define PREFETCH(addr) prefetcht0 addr + #endif +@@ -255,7 +262,21 @@ L(movsb): + cmpq %r9, %rdi + /* Avoid slow backward REP MOVSB. */ + jb L(more_8x_vec_backward) ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rdi, %rcx ++ subq %rsi, %rcx ++ jmp 2f ++# endif + 1: ++# if AVOID_SHORT_DISTANCE_REP_MOVSB ++ movq %rsi, %rcx ++ subq %rdi, %rcx ++2: ++/* Avoid "rep movsb" if RCX, the distance between source and destination, ++ is N*4GB + [1..63] with N >= 0. */ ++ cmpl $63, %ecx ++ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. 
*/ +# endif + mov %RDX_LP, %RCX_LP + rep movsb + L(nop): +-- +GitLab + diff --git a/glibc-RHEL-15696-39.patch b/glibc-RHEL-15696-39.patch new file mode 100644 index 0000000..8343ba9 --- /dev/null +++ b/glibc-RHEL-15696-39.patch @@ -0,0 +1,449 @@ +From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001 +From: noah +Date: Sat, 3 Apr 2021 04:12:15 -0400 +Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No Bug. This commit updates the large memcpy case (no overlap). The +update is to perform memcpy on either 2 or 4 contiguous pages at +once. This 1) helps to alleviate the effects of false memory aliasing +when destination and source have a close 4k alignment and 2) In most +cases and for most DRAM units is a modestly more efficient access +pattern. These changes are a clear performance improvement for +VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, +test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all +pass. + +Signed-off-by: Noah Goldstein +--- + .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- + 1 file changed, 265 insertions(+), 73 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S + (different number of sections) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index c475fed4..3e2dd6bc 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -32,7 +32,16 @@ + overlapping addresses. + 6. If size >= __x86_shared_non_temporal_threshold and there is no + overlap between destination and source, use non-temporal store +- instead of aligned store. */ ++ instead of aligned store copying from either 2 or 4 pages at ++ once. ++ 8. 
For point 7) if size < 16 * __x86_shared_non_temporal_threshold ++ and source and destination do not page alias, copy from 2 pages ++ at once using non-temporal stores. Page aliasing in this case is ++ considered true if destination's page alignment - sources' page ++ alignment is less than 8 * VEC_SIZE. ++ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source ++ and destination do page alias copy from 4 pages at once using ++ non-temporal stores. */ + + #include + +@@ -64,6 +73,34 @@ + # endif + #endif + ++#ifndef PAGE_SIZE ++# define PAGE_SIZE 4096 ++#endif ++ ++#if PAGE_SIZE != 4096 ++# error Unsupported PAGE_SIZE ++#endif ++ ++#ifndef LOG_PAGE_SIZE ++# define LOG_PAGE_SIZE 12 ++#endif ++ ++#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) ++# error Invalid LOG_PAGE_SIZE ++#endif ++ ++/* Byte per page for large_memcpy inner loop. */ ++#if VEC_SIZE == 64 ++# define LARGE_LOAD_SIZE (VEC_SIZE * 2) ++#else ++# define LARGE_LOAD_SIZE (VEC_SIZE * 4) ++#endif ++ ++/* Amount to shift rdx by to compare for memcpy_large_4x. */ ++#ifndef LOG_4X_MEMCPY_THRESH ++# define LOG_4X_MEMCPY_THRESH 4 ++#endif ++ + /* Avoid short distance rep movsb only with non-SSE vector. */ + #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB + # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) +@@ -103,6 +140,28 @@ + # error Unsupported PREFETCH_SIZE! + #endif + ++#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; ++# define STORE_ONE_SET(base, offset, vec0, vec1, ...) 
\ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; ++#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) ++# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVU (offset)base, vec0; \ ++ VMOVU ((offset) + VEC_SIZE)base, vec1; \ ++ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ ++ VMOVU ((offset) + VEC_SIZE * 3)base, vec3; ++# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ ++ VMOVNT vec0, (offset)base; \ ++ VMOVNT vec1, ((offset) + VEC_SIZE)base; \ ++ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ ++ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; ++#else ++# error Invalid LARGE_LOAD_SIZE ++#endif ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -390,6 +449,15 @@ L(last_4x_vec): + VZEROUPPER_RETURN + + L(more_8x_vec): ++ /* Check if non-temporal move candidate. */ ++#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) ++ /* Check non-temporal store threshold. */ ++ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP ++ ja L(large_memcpy_2x) ++#endif ++ /* Entry if rdx is greater than non-temporal threshold but there ++ is overlap. */ ++L(more_8x_vec_check): + cmpq %rsi, %rdi + ja L(more_8x_vec_backward) + /* Source == destination is less common. */ +@@ -416,24 +484,21 @@ L(more_8x_vec): + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_forward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_forward): + /* Copy 4 * VEC a time forward. 
*/ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $(VEC_SIZE * 4), %rsi +- subq $(VEC_SIZE * 4), %rdx ++ subq $-(VEC_SIZE * 4), %rsi ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rdi + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_forward) + /* Store the last 4 * VEC. */ +@@ -467,24 +532,21 @@ L(more_8x_vec_backward): + subq %r8, %r9 + /* Adjust length. */ + subq %r8, %rdx +-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +- /* Check non-temporal store threshold. */ +- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP +- ja L(large_backward) +-#endif ++ ++ .p2align 4 + L(loop_4x_vec_backward): + /* Copy 4 * VEC a time backward. */ + VMOVU (%rcx), %VEC(0) + VMOVU -VEC_SIZE(%rcx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) + VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $(VEC_SIZE * 4), %rcx +- subq $(VEC_SIZE * 4), %rdx ++ addq $-(VEC_SIZE * 4), %rcx ++ addq $-(VEC_SIZE * 4), %rdx + VMOVA %VEC(0), (%r9) + VMOVA %VEC(1), -VEC_SIZE(%r9) + VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) + VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $(VEC_SIZE * 4), %r9 ++ addq $-(VEC_SIZE * 4), %r9 + cmpq $(VEC_SIZE * 4), %rdx + ja L(loop_4x_vec_backward) + /* Store the first 4 * VEC. */ +@@ -497,72 +559,202 @@ L(loop_4x_vec_backward): + VZEROUPPER_RETURN + + #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) +-L(large_forward): ++ .p2align 4 ++L(large_memcpy_2x): ++ /* Compute absolute value of difference between source and ++ destination. 
*/ ++ movq %rdi, %r9 ++ subq %rsi, %r9 ++ movq %r9, %r8 ++ leaq -1(%r9), %rcx ++ sarq $63, %r8 ++ xorq %r8, %r9 ++ subq %r8, %r9 + /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rdi, %rdx), %r10 +- cmpq %r10, %rsi +- jb L(loop_4x_vec_forward) +-L(loop_large_forward): ++ destination and source since destination may be in cache when ++ source is loaded. */ ++ cmpq %r9, %rdx ++ ja L(more_8x_vec_check) ++ ++ /* Cache align destination. First store the first 64 bytes then ++ adjust alignments. */ ++ VMOVU (%rsi), %VEC(8) ++#if VEC_SIZE < 64 ++ VMOVU VEC_SIZE(%rsi), %VEC(9) ++#if VEC_SIZE < 32 ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) ++#endif ++#endif ++ VMOVU %VEC(8), (%rdi) ++#if VEC_SIZE < 64 ++ VMOVU %VEC(9), VEC_SIZE(%rdi) ++#if VEC_SIZE < 32 ++ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) ++ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) ++#endif ++#endif ++ /* Adjust source, destination, and size. */ ++ movq %rdi, %r8 ++ andq $63, %r8 ++ /* Get the negative of offset for alignment. */ ++ subq $64, %r8 ++ /* Adjust source. */ ++ subq %r8, %rsi ++ /* Adjust destination which should be aligned now. */ ++ subq %r8, %rdi ++ /* Adjust length. */ ++ addq %r8, %rdx ++ ++ /* Test if source and destination addresses will alias. If they do ++ the larger pipeline in large_memcpy_4x alleviated the ++ performance drop. */ ++ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx ++ jz L(large_memcpy_4x) ++ ++ movq %rdx, %r10 ++ shrq $LOG_4X_MEMCPY_THRESH, %r10 ++ cmp __x86_shared_non_temporal_threshold(%rip), %r10 ++ jae L(large_memcpy_4x) ++ ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 2 - 1), %edx ++ /* r10 stores outer loop counter. */ ++ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 ++ /* Copy 4x VEC at a time from 2 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_2x_outer): ++ /* ecx stores inner loop counter. 
*/ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_2x_inner): ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. */ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_2x_inner) ++ addq $PAGE_SIZE, %rdi ++ addq $PAGE_SIZE, %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_2x_outer) ++ sfence ++ ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_2x_end) ++ ++ /* Handle the last 2 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_2x_tail): + /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) +- addq $PREFETCHED_LOAD_SIZE, %rsi +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%rdi) +- VMOVNT %VEC(1), VEC_SIZE(%rdi) +- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) +- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) +- addq $PREFETCHED_LOAD_SIZE, %rdi +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_forward) +- sfence ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_2x_tail) ++ ++L(large_memcpy_2x_end): + /* Store the last 4 * VEC. */ +- VMOVU %VEC(5), (%rcx) +- VMOVU %VEC(6), -VEC_SIZE(%rcx) +- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) +- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) +- /* Store the first VEC. */ +- VMOVU %VEC(4), (%r11) ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + +-L(large_backward): +- /* Don't use non-temporal store if there is overlap between +- destination and source since destination may be in cache +- when source is loaded. */ +- leaq (%rcx, %rdx), %r10 +- cmpq %r10, %r9 +- jb L(loop_4x_vec_backward) +-L(loop_large_backward): +- /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) +- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) +- VMOVU (%rcx), %VEC(0) +- VMOVU -VEC_SIZE(%rcx), %VEC(1) +- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) +- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) +- subq $PREFETCHED_LOAD_SIZE, %rcx +- subq $PREFETCHED_LOAD_SIZE, %rdx +- VMOVNT %VEC(0), (%r9) +- VMOVNT %VEC(1), -VEC_SIZE(%r9) +- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) +- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) +- subq $PREFETCHED_LOAD_SIZE, %r9 +- cmpq $PREFETCHED_LOAD_SIZE, %rdx +- ja L(loop_large_backward) ++ .p2align 4 ++L(large_memcpy_4x): ++ movq %rdx, %r10 ++ /* edx will store remainder size for copying tail. */ ++ andl $(PAGE_SIZE * 4 - 1), %edx ++ /* r10 stores outer loop counter. */ ++ shrq $(LOG_PAGE_SIZE + 2), %r10 ++ /* Copy 4x VEC at a time from 4 pages. */ ++ .p2align 4 ++L(loop_large_memcpy_4x_outer): ++ /* ecx stores inner loop counter. */ ++ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx ++L(loop_large_memcpy_4x_inner): ++ /* Only one prefetch set per page as doing 4 pages give more time ++ for prefetcher to keep up. */ ++ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) ++ /* Load vectors from rsi. */ ++ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rsi ++ /* Non-temporal store vectors to rdi. 
*/ ++ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) ++ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) ++ subq $-LARGE_LOAD_SIZE, %rdi ++ decl %ecx ++ jnz L(loop_large_memcpy_4x_inner) ++ addq $(PAGE_SIZE * 3), %rdi ++ addq $(PAGE_SIZE * 3), %rsi ++ decq %r10 ++ jne L(loop_large_memcpy_4x_outer) + sfence +- /* Store the first 4 * VEC. */ +- VMOVU %VEC(4), (%rdi) +- VMOVU %VEC(5), VEC_SIZE(%rdi) +- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) +- /* Store the last VEC. */ +- VMOVU %VEC(8), (%r11) ++ /* Check if only last 4 loads are needed. */ ++ cmpl $(VEC_SIZE * 4), %edx ++ jbe L(large_memcpy_4x_end) ++ ++ /* Handle the last 4 * PAGE_SIZE bytes. */ ++L(loop_large_memcpy_4x_tail): ++ /* Copy 4 * VEC a time forward with non-temporal stores. */ ++ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) ++ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) ++ VMOVU (%rsi), %VEC(0) ++ VMOVU VEC_SIZE(%rsi), %VEC(1) ++ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) ++ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $-(VEC_SIZE * 4), %edx ++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(1), VEC_SIZE(%rdi) ++ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpl $(VEC_SIZE * 4), %edx ++ ja L(loop_large_memcpy_4x_tail) ++ ++L(large_memcpy_4x_end): ++ /* Store the last 4 * VEC. 
*/ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) ++ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) ++ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) ++ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) ++ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) ++ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-4.patch b/glibc-RHEL-15696-4.patch new file mode 100644 index 0000000..531c171 --- /dev/null +++ b/glibc-RHEL-15696-4.patch @@ -0,0 +1,151 @@ +From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:29:58 -0800 +Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64, +libc.so is the same with and withou the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/memrchr.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr. + * sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file. 
+--- + sysdeps/x86_64/memrchr.S | 4 +- + sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++ + 4 files changed, 63 insertions(+), 5 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S +index b8e3fa1d..dc82f8f7 100644 +--- a/sysdeps/x86_64/memrchr.S ++++ b/sysdeps/x86_64/memrchr.S +@@ -24,13 +24,13 @@ + ENTRY (__memrchr) + movd %esi, %xmm1 + +- sub $16, %rdx ++ sub $16, %RDX_LP + jbe L(length_less16) + + punpcklbw %xmm1, %xmm1 + punpcklbw %xmm1, %xmm1 + +- add %rdx, %rdi ++ add %RDX_LP, %RDI_LP + pshufd $0, %xmm1, %xmm1 + + movdqu (%rdi), %xmm0 +diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S +index b41a58bc..ce488dd9 100644 +--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S +@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2) + vmovd %esi, %xmm0 + vpbroadcastb %xmm0, %ymm0 + +- subq $VEC_SIZE, %rdx ++ sub $VEC_SIZE, %RDX_LP + jbe L(last_vec_or_less) + +- addq %rdx, %rdi ++ add %RDX_LP, %RDI_LP + + /* Check the last VEC_SIZE bytes. 
*/ + vpcmpeqb (%rdi), %ymm0, %ymm1 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2fe1e5ac..e99dbd7c 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround + endif + + ifeq ($(subdir),string) +-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy ++tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ ++ tst-size_t-memrchr + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +new file mode 100644 +index 00000000..c83699c0 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c +@@ -0,0 +1,57 @@ ++/* Test memrchr with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define TEST_NAME "memrchr" ++#include "test-size_t.h" ++ ++IMPL (memchr, 1) ++ ++typedef void * (*proto_t) (const void *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memrchr (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t src = { { page_size }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ void * res = do_memrchr (src, c); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %p != NULL", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-40.patch b/glibc-RHEL-15696-40.patch new file mode 100644 index 0000000..7b7c07b --- /dev/null +++ b/glibc-RHEL-15696-40.patch @@ -0,0 +1,92 @@ +From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 19 Apr 2021 10:45:07 -0700 +Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +Since strchr-avx2.S updated by + +commit 1f745ecc2109890886b161d4791e1406fdfc29b8 +Author: noah +Date: Wed Feb 3 00:38:59 2021 -0500 + + x86-64: Refactor and improve performance of strchr-avx2.S + +uses sarx: + +c4 e2 72 f7 c0 sarx %ecx,%eax,%eax + +for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and +ifunc-avx2.h. 
+--- + sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++-- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++--- + 2 files changed, 11 insertions(+), 5 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h +index e0f30e61..ef72b73f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h ++++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h +@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void) + const struct cpu_features* cpu_features = __get_cpu_features (); + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) +- && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) + return OPTIMIZE (evex); + + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 695cdba6..85b8863a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchr.c. */ + IFUNC_IMPL (i, name, strchr, + IFUNC_IMPL_ADD (array, i, strchr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchr_avx2) + IFUNC_IMPL_ADD (array, i, strchr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchr, +@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/strchrnul.c. 
*/ + IFUNC_IMPL (i, name, strchrnul, + IFUNC_IMPL_ADD (array, i, strchrnul, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __strchrnul_avx2) + IFUNC_IMPL_ADD (array, i, strchrnul, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __strchrnul_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strchrnul, +@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/x86_64/multiarch/wcschr.c. */ + IFUNC_IMPL (i, name, wcschr, + IFUNC_IMPL_ADD (array, i, wcschr, +- CPU_FEATURE_USABLE (AVX2), ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2)), + __wcschr_avx2) + IFUNC_IMPL_ADD (array, i, wcschr, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (RTM)), + __wcschr_avx2_rtm) + IFUNC_IMPL_ADD (array, i, wcschr, +-- +GitLab + diff --git a/glibc-RHEL-15696-41.patch b/glibc-RHEL-15696-41.patch new file mode 100644 index 0000000..aa8fc69 --- /dev/null +++ b/glibc-RHEL-15696-41.patch @@ -0,0 +1,265 @@ +From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 19 Apr 2021 17:48:10 -0400 +Subject: [PATCH] x86: Optimize less_vec evex and avx512 + memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit adds optimized cased for less_vec memset case that +uses the avx512vl/avx512bw mask store avoiding the excessive +branches. test-memset and test-wmemset are passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++----- + sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++- + .../multiarch/memset-avx512-unaligned-erms.S | 2 +- + .../multiarch/memset-evex-unaligned-erms.S | 2 +- + .../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++---- + 5 files changed, 74 insertions(+), 27 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 85b8863a..d59d65f8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_chk_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, __memset_chk, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_chk_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, __memset_chk, + CPU_FEATURE_USABLE (AVX512F), +@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx2_unaligned_erms_rtm) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE 
(BMI2)), + __memset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_evex_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned_erms) + IFUNC_IMPL_ADD (array, i, memset, + (CPU_FEATURE_USABLE (AVX512VL) +- && CPU_FEATURE_USABLE (AVX512BW)), ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __memset_avx512_unaligned) + IFUNC_IMPL_ADD (array, i, memset, + CPU_FEATURE_USABLE (AVX512F), +@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (RTM)), + __wmemset_avx2_unaligned_rtm) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_evex_unaligned) + IFUNC_IMPL_ADD (array, i, wmemset, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_avx512_unaligned)) + + #ifdef SHARED +@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + CPU_FEATURE_USABLE (AVX2), + __wmemset_chk_avx2_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512VL), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_evex_unaligned) + IFUNC_IMPL_ADD (array, i, __wmemset_chk, +- CPU_FEATURE_USABLE (AVX512F), ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), + __wmemset_chk_avx512_unaligned)) + #endif + +diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h 
b/sysdeps/x86_64/multiarch/ifunc-memset.h +index 19795938..100e3707 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memset.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memset.h +@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void) + && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (avx512_unaligned_erms); +@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) + return OPTIMIZE (evex_unaligned_erms); +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 22e7b187..8ad842fc 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s + # define WMEMSET_SYMBOL(p,s) p##_avx512_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index ae0a4d6e..640f0929 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,6 @@ + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s + # define WMEMSET_SYMBOL(p,s) p##_evex_##s +- ++# define USE_LESS_VEC_MASK_STORE 1 + # include "memset-vec-unaligned-erms.S" + #endif 
+diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index bae5cba4..f877ac9d 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,6 +63,8 @@ + # endif + #endif + ++#define PAGE_SIZE 4096 ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -213,11 +215,38 @@ L(loop): + cmpq %rcx, %rdx + jne L(loop) + VZEROUPPER_SHORT_RETURN ++ ++ .p2align 4 + L(less_vec): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif ++# ifdef USE_LESS_VEC_MASK_STORE ++ /* Clear high bits from edi. Only keeping bits relevant to page ++ cross check. Note that we are using rax which is set in ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. ++ */ ++ andl $(PAGE_SIZE - 1), %edi ++ /* Check if VEC_SIZE store cross page. Mask stores suffer serious ++ performance degradation when it has to fault supress. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ ja L(cross_page) ++# if VEC_SIZE > 32 ++ movq $-1, %rcx ++ bzhiq %rdx, %rcx, %rcx ++ kmovq %rcx, %k1 ++# else ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k1 ++# endif ++ vmovdqu8 %VEC(0), (%rax) {%k1} ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(cross_page): ++# endif + # if VEC_SIZE > 32 + cmpb $32, %dl + jae L(between_32_63) +@@ -234,36 +263,36 @@ L(less_vec): + cmpb $1, %dl + ja L(between_2_3) + jb 1f +- movb %cl, (%rdi) ++ movb %cl, (%rax) + 1: + VZEROUPPER_RETURN + # if VEC_SIZE > 32 + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, -32(%rdi,%rdx) +- VMOVU %YMM0, (%rdi) ++ VMOVU %YMM0, -32(%rax,%rdx) ++ VMOVU %YMM0, (%rax) + VZEROUPPER_RETURN + # endif + # if VEC_SIZE > 16 + /* From 16 to 31. No branch when size == 16. 
*/ + L(between_16_31): +- VMOVU %XMM0, -16(%rdi,%rdx) +- VMOVU %XMM0, (%rdi) ++ VMOVU %XMM0, -16(%rax,%rdx) ++ VMOVU %XMM0, (%rax) + VZEROUPPER_RETURN + # endif + /* From 8 to 15. No branch when size == 8. */ + L(between_8_15): +- movq %rcx, -8(%rdi,%rdx) +- movq %rcx, (%rdi) ++ movq %rcx, -8(%rax,%rdx) ++ movq %rcx, (%rax) + VZEROUPPER_RETURN + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rdi,%rdx) +- movl %ecx, (%rdi) ++ movl %ecx, -4(%rax,%rdx) ++ movl %ecx, (%rax) + VZEROUPPER_RETURN + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %cx, -2(%rdi,%rdx) +- movw %cx, (%rdi) ++ movw %cx, -2(%rax,%rdx) ++ movw %cx, (%rax) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-42.patch b/glibc-RHEL-15696-42.patch new file mode 100644 index 0000000..e2ca245 --- /dev/null +++ b/glibc-RHEL-15696-42.patch @@ -0,0 +1,396 @@ +From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:24 -0400 +Subject: [PATCH] x86: Optimize strchr-avx2.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-avx2.S. The optimizations are all +small things such as save an ALU in the alignment process, saving a +few instructions in the loop return, saving some bytes in the main +loop, and increasing the ILP in the return cases. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++---------- + 1 file changed, 170 insertions(+), 120 deletions(-) + +Conflics: + sysdeps/x86_64/multiarch/strchr-avx2.S + (rearranged to account for branch changes) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 919d256c..5884726b 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -49,133 +49,144 @@ + + .section SECTION(.text),"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ VPBROADCAST %xmm0, %ymm0 + vpxor %xmm9, %xmm9, %xmm9 +- VPBROADCAST %xmm0, %ymm0 + + /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. */ + vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jz L(more_vecs) ++ jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif + addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x4): ++ tzcntl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN +- +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- addq $VEC_SIZE, %rdi +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- vmovdqa VEC_SIZE(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax +- testl %eax, %eax +- jz L(prep_loop_4x) ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- tzcntl %eax, %eax +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- addq %rdi, %rax ++ incq %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x1): ++L(first_vec_x2): + tzcntl %eax, %eax +- leaq VEC_SIZE(%rdi, %rax), %rax ++ addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): ++L(first_vec_x3): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) + # endif ++ addq %rdi, %rax + VZEROUPPER_RETURN + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ +- andq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE - 1. This is the same number of ++ instructions as using andq -VEC_SIZE but saves 4 bytes of code ++ on x4 check. */ ++ orq $(VEC_SIZE - 1), %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time ++ since data is only aligned to VEC_SIZE. 
*/ ++ vmovdqa 1(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x2) ++ ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) + ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 ++ vpor %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x4) ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ addq $(VEC_SIZE * 4 + 1), %rdi ++ andq $-(VEC_SIZE * 4), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5 +- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8 ++ vmovdqa (%rdi), %ymm5 ++ vmovdqa (VEC_SIZE)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 ++ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 + + /* Leaves only CHARS matching esi as 0. 
*/ + vpxor %ymm5, %ymm0, %ymm1 +@@ -191,63 +202,102 @@ L(loop_4x_vec): + VPMINU %ymm1, %ymm2, %ymm5 + VPMINU %ymm3, %ymm4, %ymm6 + +- VPMINU %ymm5, %ymm6, %ymm5 ++ VPMINU %ymm5, %ymm6, %ymm6 + +- VPCMPEQ %ymm5, %ymm9, %ymm5 +- vpmovmskb %ymm5, %eax ++ VPCMPEQ %ymm6, %ymm9, %ymm6 ++ vpmovmskb %ymm6, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx ++ jz L(loop_4x_vec) + +- addq $(VEC_SIZE * 4), %rdi +- testl %eax, %eax +- jz L(loop_4x_vec) + +- VPCMPEQ %ymm1, %ymm9, %ymm1 ++ VPCMPEQ %ymm1, %ymm9, %ymm1 + vpmovmskb %ymm1, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x0) ++ + +- VPCMPEQ %ymm2, %ymm9, %ymm2 ++ VPCMPEQ %ymm5, %ymm9, %ymm2 + vpmovmskb %ymm2, %eax + testl %eax, %eax +- jnz L(first_vec_x1) ++ jnz L(last_vec_x1) ++ ++ VPCMPEQ %ymm3, %ymm9, %ymm3 ++ vpmovmskb %ymm3, %eax ++ /* rcx has combined result from all 4 VEC. It will only be used ++ if the first 3 other VEC all did not contain a match. */ ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ subq $(VEC_SIZE * 2), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4 ++L(last_vec_x0): ++ tzcntl %eax, %eax ++ addq $-(VEC_SIZE * 4), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- VPCMPEQ %ymm4, %ymm9, %ymm4 +- vpmovmskb %ymm3, %ecx +- vpmovmskb %ymm4, %eax +- salq $32, %rax +- orq %rcx, %rax +- tzcntq %rax, %rax +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VZEROUPPER +- ret ++ ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++ subq $(VEC_SIZE * 3), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. 
*/ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero_end) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): +- andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- +- vmovdqa (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 ++ movq %rdi, %rdx ++ /* Align rdi to VEC_SIZE - 1. */ ++ orq $(VEC_SIZE - 1), %rdi ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 ++ VPCMPEQ %ymm8, %ymm0, %ymm1 ++ VPCMPEQ %ymm8, %ymm9, %ymm2 + vpor %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %eax +- /* Remove the leading bits. */ +- sarxl %ecx, %eax, %eax ++ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT ++ so no need to manually mod edx. */ ++ sarxl %edx, %eax, %eax + testl %eax, %eax +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi +- addq %rdi, %rax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ xorl %ecx, %ecx ++ /* Found CHAR or the null byte. */ ++ cmp (%rdx, %rax), %CHAR_REG ++ leaq (%rdx, %rax), %rax ++ cmovne %rcx, %rax ++# else ++ addq %rdx, %rax + # endif +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + END (STRCHR) + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-43.patch b/glibc-RHEL-15696-43.patch new file mode 100644 index 0000000..9f76b11 --- /dev/null +++ b/glibc-RHEL-15696-43.patch @@ -0,0 +1,532 @@ +From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 23 Apr 2021 15:56:25 -0400 +Subject: [PATCH] x86: Optimize strchr-evex.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes strchr-evex.S. The optimizations are +mostly small things such as save an ALU in the alignment process, +saving a few instructions in the loop return. The one significant +change is saving 2 instructions in the 4x loop. test-strchr, +test-strchrnul, test-wcschr, and test-wcschrnul are all passing. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++----------- + 1 file changed, 218 insertions(+), 174 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index ddc86a70..7f9d4ee4 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -32,13 +32,15 @@ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define CHAR_REG esi +-# define SHIFT_REG r8d ++# define SHIFT_REG ecx ++# define CHAR_SIZE 4 + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb + # define VPMINU vpminub + # define CHAR_REG sil +-# define SHIFT_REG ecx ++# define SHIFT_REG edx ++# define CHAR_SIZE 1 + # endif + + # define XMMZERO xmm16 +@@ -56,23 +58,20 @@ + + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits + ENTRY (STRCHR) +- movl %edi, %ecx +-# ifndef USE_AS_STRCHRNUL +- xorl %edx, %edx +-# endif +- + /* Broadcast CHAR to YMM0. */ +- VPBROADCAST %esi, %YMM0 +- ++ VPBROADCAST %esi, %YMM0 ++ movl %edi, %eax ++ andl $(PAGE_SIZE - 1), %eax + vpxorq %XMMZERO, %XMMZERO, %XMMZERO + +- /* Check if we cross page boundary with one vector load. */ +- andl $(PAGE_SIZE - 1), %ecx +- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx +- ja L(cross_page_boundary) ++ /* Check if we cross page boundary with one vector load. ++ Otherwise it is safe to use an unaligned load. */ ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ ja L(cross_page_boundary) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null bytes. */ +@@ -83,251 +82,296 @@ ENTRY (STRCHR) + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ + VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(more_vecs) + kmovd %k0, %eax ++ testl %eax, %eax ++ jz L(aligned_more) + tzcntl %eax, %eax +- /* Found CHAR or the null byte. 
*/ + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. ++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax + # endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (%rax), %CHAR_REG ++ jne L(zero) + # endif + ret + +- .p2align 4 +-L(more_vecs): +- /* Align data for aligned loads in the loop. */ +- andq $-VEC_SIZE, %rdi +-L(aligned_more): +- +- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time +- since data is only aligned to VEC_SIZE. */ +- VMOVA VEC_SIZE(%rdi), %YMM1 +- addq $VEC_SIZE, %rdi +- +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x0) +- +- VMOVA VEC_SIZE(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x1) +- +- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(first_vec_x2) +- +- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 +- /* Leaves only CHARS matching esi as 0. */ +- vpxorq %YMM1, %YMM0, %YMM2 +- VPMINU %YMM2, %YMM1, %YMM2 +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ +- VPCMP $0, %YMMZERO, %YMM2, %k0 +- ktestd %k0, %k0 +- jz L(prep_loop_4x) +- +- kmovd %k0, %eax ++ /* .p2align 5 helps keep performance more consistent if ENTRY() ++ alignment % 32 was either 16 or 0. As well this makes the ++ alignment % 32 of the loop_4x_vec fixed which makes tuning it ++ easier. */ ++ .p2align 5 ++L(first_vec_x3): + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax +-# endif ++L(zero): ++ xorl %eax, %eax + ret ++# endif + + .p2align 4 +-L(first_vec_x0): ++L(first_vec_x4): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x1): + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- leaq VEC_SIZE(%rdi, %rax, 4), %rax +-# else +- leaq VEC_SIZE(%rdi, %rax), %rax +-# endif + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++ + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + .p2align 4 + L(first_vec_x2): ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if first match was CHAR (k0) or null (k1). */ ++ kmovd %k0, %eax + tzcntl %eax, %eax +- /* Found CHAR or the null byte. */ +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax ++ kmovd %k1, %ecx ++ /* bzhil will not be 0 if first match was null. */ ++ bzhil %eax, %ecx, %ecx ++ jne L(zero) + # else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Combine CHAR and null matches. */ ++ kord %k0, %k1, %k0 ++ kmovd %k0, %eax ++ tzcntl %eax, %eax + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-L(prep_loop_4x): +- /* Align data to 4 * VEC_SIZE. */ ++ .p2align 4 ++L(aligned_more): ++ /* Align data to VEC_SIZE. */ ++ andq $-VEC_SIZE, %rdi ++L(cross_page_continue): ++ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since ++ data is only aligned to VEC_SIZE. Use two alternating methods ++ for checking VEC to balance latency and port contention. */ ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x1) ++ ++ /* This method has higher latency but has better port ++ distribution. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x2) ++ ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1 ++ /* Leaves only CHARS matching esi as 0. */ ++ vpxorq %YMM1, %YMM0, %YMM2 ++ VPMINU %YMM2, %YMM1, %YMM2 ++ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(first_vec_x3) ++ ++ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 ++ /* Each bit in K0 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMM0, %k0 ++ /* Each bit in K1 represents a CHAR in YMM1. */ ++ VPCMP $0, %YMM1, %YMMZERO, %k1 ++ kortestd %k0, %k1 ++ jnz L(first_vec_x4) ++ ++ /* Align data to VEC_SIZE * 4 for the loop. */ ++ addq $VEC_SIZE, %rdi + andq $-(VEC_SIZE * 4), %rdi + + .p2align 4 + L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ ++ /* Check 4x VEC at a time. No penalty to imm32 offset with evex ++ encoding. */ + VMOVA (VEC_SIZE * 4)(%rdi), %YMM1 + VMOVA (VEC_SIZE * 5)(%rdi), %YMM2 + VMOVA (VEC_SIZE * 6)(%rdi), %YMM3 + VMOVA (VEC_SIZE * 7)(%rdi), %YMM4 + +- /* Leaves only CHARS matching esi as 0. */ ++ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to ++ zero. */ + vpxorq %YMM1, %YMM0, %YMM5 +- vpxorq %YMM2, %YMM0, %YMM6 ++ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in ++ k register. Its possible to save either 1 or 2 instructions ++ using cmp no equals method for either YMM1 or YMM1 and YMM3 ++ respectively but bottleneck on p5 makes it not worth it. 
*/ ++ VPCMP $4, %YMM0, %YMM2, %k2 + vpxorq %YMM3, %YMM0, %YMM7 +- vpxorq %YMM4, %YMM0, %YMM8 +- +- VPMINU %YMM5, %YMM1, %YMM5 +- VPMINU %YMM6, %YMM2, %YMM6 +- VPMINU %YMM7, %YMM3, %YMM7 +- VPMINU %YMM8, %YMM4, %YMM8 +- +- VPMINU %YMM5, %YMM6, %YMM1 +- VPMINU %YMM7, %YMM8, %YMM2 +- +- VPMINU %YMM1, %YMM2, %YMM1 +- +- /* Each bit in K0 represents a CHAR or a null byte. */ +- VPCMP $0, %YMMZERO, %YMM1, %k0 +- +- addq $(VEC_SIZE * 4), %rdi +- +- ktestd %k0, %k0 ++ VPCMP $4, %YMM0, %YMM4, %k4 ++ ++ /* Use min to select all zeros from either xor or end of string). ++ */ ++ VPMINU %YMM1, %YMM5, %YMM1 ++ VPMINU %YMM3, %YMM7, %YMM3 ++ ++ /* Use min + zeromask to select for zeros. Since k2 and k4 will ++ have 0 as positions that matched with CHAR which will set ++ zero in the corresponding destination bytes in YMM2 / YMM4. ++ */ ++ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z} ++ VPMINU %YMM3, %YMM4, %YMM4 ++ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} ++ ++ VPCMP $0, %YMMZERO, %YMM4, %k1 ++ kmovd %k1, %ecx ++ subq $-(VEC_SIZE * 4), %rdi ++ testl %ecx, %ecx + jz L(loop_4x_vec) + +- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM5, %k0 ++ VPCMP $0, %YMMZERO, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x0) ++ jnz L(last_vec_x1) + +- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */ +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- kmovd %k1, %eax ++ VPCMP $0, %YMMZERO, %YMM2, %k0 ++ kmovd %k0, %eax + testl %eax, %eax +- jnz L(first_vec_x1) +- +- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */ +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */ +- VPCMP $0, %YMMZERO, %YMM8, %k3 ++ jnz L(last_vec_x2) + ++ VPCMP $0, %YMMZERO, %YMM3, %k0 ++ kmovd %k0, %eax ++ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR +- /* NB: Each bit in K2/K3 represents 4-byte element. 
*/ +- kshiftlw $8, %k3, %k1 ++ sall $8, %ecx ++ orl %ecx, %eax ++ tzcntl %eax, %eax + # else +- kshiftlq $32, %k3, %k1 ++ salq $32, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax + # endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was CHAR or null. */ ++ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax ++ ret + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rax ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ ret ++# endif + +- tzcntq %rax, %rax +-# ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax +-# else +- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax ++ .p2align 4 ++L(last_vec_x1): ++ tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Check if match was null. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ ++ .p2align 4 ++L(last_vec_x2): ++ tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ /* Check if match was null. */ ++ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) + # endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + + /* Cold case for crossing page with first load. */ + .p2align 4 + L(cross_page_boundary): ++ movq %rdi, %rdx ++ /* Align rdi. */ + andq $-VEC_SIZE, %rdi +- andl $(VEC_SIZE - 1), %ecx +- + VMOVA (%rdi), %YMM1 +- + /* Leaves only CHARS matching esi as 0. */ + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. 
*/ + VPCMP $0, %YMMZERO, %YMM2, %k0 + kmovd %k0, %eax +- testl %eax, %eax +- ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR ++ movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ +- movl %ecx, %SHIFT_REG +- sarl $2, %SHIFT_REG ++ sarl $2, %SHIFT_REG ++ andl $(CHAR_PER_VEC - 1), %SHIFT_REG + # endif +- +- /* Remove the leading bits. */ + sarxl %SHIFT_REG, %eax, %eax ++ /* If eax is zero continue. */ + testl %eax, %eax +- +- jz L(aligned_more) ++ jz L(cross_page_continue) + tzcntl %eax, %eax +- addq %rcx, %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. */ ++ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero_end) ++# endif + # ifdef USE_AS_WCSCHR +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- leaq (%rdi, %rax, 4), %rax ++ /* NB: Multiply wchar_t count by 4 to get the number of ++ bytes. */ ++ leaq (%rdx, %rax, CHAR_SIZE), %rax + # else +- addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- cmp (%rax), %CHAR_REG +- cmovne %rdx, %rax ++ addq %rdx, %rax + # endif + ret + +-- +GitLab + diff --git a/glibc-RHEL-15696-44.patch b/glibc-RHEL-15696-44.patch new file mode 100644 index 0000000..52fec88 --- /dev/null +++ b/glibc-RHEL-15696-44.patch @@ -0,0 +1,536 @@ +From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 4 May 2021 19:02:40 -0400 +Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM +Content-type: text/plain; charset=UTF-8 + +No bug. + +This commit adds a new implementation for EVEX memchr that is not safe +for RTM because it uses vzeroupper. The benefit is that by using +ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is +faster than the RTM safe version which cannot use vpcmpeq because +there is no EVEX encoding for the instruction. 
All parts of the +implementation aside from the 4x loop are the same for the two +versions and the optimization is only relevant for large sizes. + +Tigerlake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16 +512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21 +2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2 +2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06 +2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4 +2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <-- + +Icelake: +size , algn , Pos , Cur T , New T , Win , Dif +512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3 +512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36 +2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1 +2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15 +2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54 +2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <-- + +test-memchr, test-wmemchr, and test-rawmemchr are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 7 +- + sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++ + sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 + + sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++---- + sysdeps/x86_64/multiarch/memchr.c | 2 +- + sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/rawmemchr.c | 2 +- + sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 + + sysdeps/x86_64/multiarch/wmemchr.c | 2 +- + 10 files changed, 217 insertions(+), 41 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h + create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 65fde4eb..26be4095 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -77,7 +77,9 @@ 
sysdep_routines += strncat-c stpncpy-c strncpy-c \ + strncmp-evex \ + strncpy-evex \ + strnlen-evex \ +- strrchr-evex ++ strrchr-evex \ ++ memchr-evex-rtm \ ++ rawmemchr-evex-rtm + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ + wcsnlen-evex \ + wcsrchr-evex \ + wmemchr-evex \ +- wmemcmp-evex-movbe ++ wmemcmp-evex-movbe \ ++ wmemchr-evex-rtm + endif + + ifeq ($(subdir),debug) +diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h +new file mode 100644 +index 00000000..fc391edb +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/ifunc-evex.h +@@ -0,0 +1,55 @@ ++/* Common definition for ifunc selection optimized with EVEX. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2017-2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++#include <init-arch.h> ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden; ++ ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (evex_rtm); ++ ++ return OPTIMIZE (evex); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ ++ return OPTIMIZE (sse2); ++} +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d59d65f8..ac097e8d 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __memchr_evex) ++ IFUNC_IMPL_ADD (array, i, memchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __memchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/memcmp.c. 
*/ +@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __rawmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, rawmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __rawmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/strlen.c. */ +@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + && CPU_FEATURE_USABLE (AVX512BW) + && CPU_FEATURE_USABLE (BMI2)), + __wmemchr_evex) ++ IFUNC_IMPL_ADD (array, i, wmemchr, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __wmemchr_evex_rtm) + IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2)) + + /* Support sysdeps/x86_64/multiarch/wmemcmp.c. */ +diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +new file mode 100644 +index 00000000..19871882 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S +@@ -0,0 +1,8 @@ ++#ifndef MEMCHR ++# define MEMCHR __memchr_evex_rtm ++#endif ++ ++#define USE_IN_RTM 1 ++#define SECTION(p) p##.evex.rtm ++ ++#include "memchr-evex.S" +diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S +index f3fdad4f..4d0ed6d1 100644 +--- a/sysdeps/x86_64/multiarch/memchr-evex.S ++++ b/sysdeps/x86_64/multiarch/memchr-evex.S +@@ -38,10 +38,32 @@ + # define CHAR_SIZE 1 + # endif + ++ /* In the 4x loop the RTM and non-RTM versions have data pointer ++ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater. ++ This is represented by BASE_OFFSET. As well because the RTM ++ version uses vpcmp which stores a bit per element compared where ++ the non-RTM version uses vpcmpeq which stores a bit per byte ++ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM ++ version. 
*/ ++# ifdef USE_IN_RTM ++# define VZEROUPPER ++# define BASE_OFFSET (VEC_SIZE * 4) ++# define RET_SCALE CHAR_SIZE ++# else ++# define VZEROUPPER vzeroupper ++# define BASE_OFFSET 0 ++# define RET_SCALE 1 ++# endif ++ ++ /* In the return from 4x loop memchr and rawmemchr versions have ++ data pointers off by VEC_SIZE * 4 with memchr version being ++ VEC_SIZE * 4 greater. */ + # ifdef USE_AS_RAWMEMCHR ++# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4)) + # define RAW_PTR_REG rcx + # define ALGN_PTR_REG rdi + # else ++# define RET_OFFSET BASE_OFFSET + # define RAW_PTR_REG rdi + # define ALGN_PTR_REG rcx + # endif +@@ -57,11 +79,15 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + ++# ifndef SECTION ++# define SECTION(p) p##.evex ++# endif ++ + # define VEC_SIZE 32 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + # define PAGE_SIZE 4096 + +- .section .text.evex,"ax",@progbits ++ .section SECTION(.text),"ax",@progbits + ENTRY (MEMCHR) + # ifndef USE_AS_RAWMEMCHR + /* Check for zero length. */ +@@ -237,14 +263,15 @@ L(cross_page_continue): + /* Check if at last CHAR_PER_VEC * 4 length. */ + subq $(CHAR_PER_VEC * 4), %rdx + jbe L(last_4x_vec_or_less_cmpeq) +- addq $VEC_SIZE, %rdi ++ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */ ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + + /* Align data to VEC_SIZE * 4 for the loop and readjust length. + */ + # ifdef USE_AS_WMEMCHR + movl %edi, %ecx + andq $-(4 * VEC_SIZE), %rdi +- andl $(VEC_SIZE * 4 - 1), %ecx ++ subl %edi, %ecx + /* NB: Divide bytes by 4 to get the wchar_t count. */ + sarl $2, %ecx + addq %rcx, %rdx +@@ -254,15 +281,28 @@ L(cross_page_continue): + subq %rdi, %rdx + # endif + # else +- addq $VEC_SIZE, %rdi ++ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi + andq $-(4 * VEC_SIZE), %rdi + # endif +- ++# ifdef USE_IN_RTM + vpxorq %XMMZERO, %XMMZERO, %XMMZERO ++# else ++ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not ++ encodable with EVEX registers (ymm16-ymm31). 
*/ ++ vmovdqa64 %YMMMATCH, %ymm0 ++# endif + + /* Compare 4 * VEC at a time forward. */ + .p2align 4 + L(loop_4x_vec): ++ /* Two versions of the loop. One that does not require ++ vzeroupper by not using ymm0-ymm15 and another does that require ++ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15 ++ is used at all is because there is no EVEX encoding vpcmpeq and ++ with vpcmpeq this loop can be performed more efficiently. The ++ non-vzeroupper version is safe for RTM while the vzeroupper ++ version should be prefered if RTM are not supported. */ ++# ifdef USE_IN_RTM + /* It would be possible to save some instructions using 4x VPCMP + but bottleneck on port 5 makes it not woth it. */ + VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1 +@@ -273,12 +313,55 @@ L(loop_4x_vec): + /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */ + VPMINU %YMM2, %YMM3, %YMM3{%k1}{z} + VPCMP $0, %YMM3, %YMMZERO, %k2 ++# else ++ /* Since vptern can only take 3x vectors fastest to do 1 vec ++ seperately with EVEX vpcmp. */ ++# ifdef USE_AS_WMEMCHR ++ /* vptern can only accept masks for epi32/epi64 so can only save ++ instruction using not equals mask on vptern with wmemchr. */ ++ VPCMP $4, (%rdi), %YMMMATCH, %k1 ++# else ++ VPCMP $0, (%rdi), %YMMMATCH, %k1 ++# endif ++ /* Compare 3x with vpcmpeq and or them all together with vptern. ++ */ ++ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4 ++# ifdef USE_AS_WMEMCHR ++ /* This takes the not of or between ymm2, ymm3, ymm4 as well as ++ combines result from VEC0 with zero mask. */ ++ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z} ++ vpmovmskb %ymm4, %ecx ++# else ++ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. 
*/ ++ vpternlogd $254, %ymm2, %ymm3, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ kmovd %k1, %eax ++# endif ++# endif ++ + # ifdef USE_AS_RAWMEMCHR + subq $-(VEC_SIZE * 4), %rdi ++# endif ++# ifdef USE_IN_RTM + kortestd %k2, %k3 ++# else ++# ifdef USE_AS_WMEMCHR ++ /* ecx contains not of matches. All 1s means no matches. incl will ++ overflow and set zeroflag if that is the case. */ ++ incl %ecx ++# else ++ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding ++ to ecx is not an issue because if eax is non-zero it will be ++ used for returning the match. If it is zero the add does ++ nothing. */ ++ addq %rax, %rcx ++# endif ++# endif ++# ifdef USE_AS_RAWMEMCHR + jz L(loop_4x_vec) + # else +- kortestd %k2, %k3 + jnz L(loop_4x_vec_end) + + subq $-(VEC_SIZE * 4), %rdi +@@ -288,10 +371,11 @@ L(loop_4x_vec): + + /* Fall through into less than 4 remaining vectors of length case. + */ +- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0 ++ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0 ++ addq $(BASE_OFFSET - VEC_SIZE), %rdi + kmovd %k0, %eax +- addq $(VEC_SIZE * 3), %rdi +- .p2align 4 ++ VZEROUPPER ++ + L(last_4x_vec_or_less): + /* Check if first VEC contained match. */ + testl %eax, %eax +@@ -338,73 +422,78 @@ L(loop_4x_vec_end): + /* rawmemchr will fall through into this if match was found in + loop. */ + ++# if defined USE_IN_RTM || defined USE_AS_WMEMCHR + /* k1 has not of matches with VEC1. */ + kmovd %k1, %eax +-# ifdef USE_AS_WMEMCHR ++# ifdef USE_AS_WMEMCHR + subl $((1 << CHAR_PER_VEC) - 1), %eax +-# else ++# else + incl %eax ++# endif ++# else ++ /* eax already has matches for VEC1. 
*/ ++ testl %eax, %eax + # endif + jnz L(last_vec_x1_return) + ++# ifdef USE_IN_RTM + VPCMP $0, %YMM2, %YMMZERO, %k0 + kmovd %k0, %eax ++# else ++ vpmovmskb %ymm2, %eax ++# endif + testl %eax, %eax + jnz L(last_vec_x2_return) + ++# ifdef USE_IN_RTM + kmovd %k2, %eax + testl %eax, %eax + jnz L(last_vec_x3_return) + + kmovd %k3, %eax + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + # else +- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax ++ vpmovmskb %ymm3, %eax ++ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */ ++ salq $VEC_SIZE, %rcx ++ orq %rcx, %rax ++ tzcntq %rax, %rax ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax ++ VZEROUPPER + # endif + ret + + .p2align 4 + L(last_vec_x1_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +-# ifdef USE_AS_WMEMCHR ++# if defined USE_AS_WMEMCHR || RET_OFFSET != 0 + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax +-# else +- addq %rdi, %rax +-# endif ++ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax + # else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax ++ addq %rdi, %rax + # endif ++ VZEROUPPER + ret + + .p2align 4 + L(last_vec_x2_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax +-# else +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count ++ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and ++ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. 
*/ ++ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax ++ VZEROUPPER + ret + ++# ifdef USE_IN_RTM + .p2align 4 + L(last_vec_x3_return): + tzcntl %eax, %eax +-# ifdef USE_AS_RAWMEMCHR +- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax +-# else + /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */ +- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax +-# endif ++ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax + ret +- ++# endif + + # ifndef USE_AS_RAWMEMCHR + L(last_4x_vec_or_less_cmpeq): +diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c +index 016f5784..f28aea77 100644 +--- a/sysdeps/x86_64/multiarch/memchr.c ++++ b/sysdeps/x86_64/multiarch/memchr.c +@@ -24,7 +24,7 @@ + # undef memchr + + # define SYMBOL_NAME memchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ()); + strong_alias (memchr, __memchr) +diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +new file mode 100644 +index 00000000..deda1ca3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __rawmemchr_evex_rtm ++#define USE_AS_RAWMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c +index 8a0bc313..1f764f35 100644 +--- a/sysdeps/x86_64/multiarch/rawmemchr.c ++++ b/sysdeps/x86_64/multiarch/rawmemchr.c +@@ -26,7 +26,7 @@ + # undef __rawmemchr + + # define SYMBOL_NAME rawmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr, + IFUNC_SELECTOR ()); +diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +new file mode 100644 +index 00000000..a346cd35 +--- /dev/null ++++ 
b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S +@@ -0,0 +1,3 @@ ++#define MEMCHR __wmemchr_evex_rtm ++#define USE_AS_WMEMCHR 1 ++#include "memchr-evex-rtm.S" +diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c +index 6d833702..f9c91915 100644 +--- a/sysdeps/x86_64/multiarch/wmemchr.c ++++ b/sysdeps/x86_64/multiarch/wmemchr.c +@@ -26,7 +26,7 @@ + # undef __wmemchr + + # define SYMBOL_NAME wmemchr +-# include "ifunc-avx2.h" ++# include "ifunc-evex.h" + + libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ()); + weak_alias (__wmemchr, wmemchr) +-- +GitLab + diff --git a/glibc-RHEL-15696-45.patch b/glibc-RHEL-15696-45.patch new file mode 100644 index 0000000..380217e --- /dev/null +++ b/glibc-RHEL-15696-45.patch @@ -0,0 +1,873 @@ +From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:56:52 -0400 +Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-avx2.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, and removing some unnecissary ALU instructions from the +main loop. test-memcmp and test-wmemcmp are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 + + sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 + + sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++-------- + 3 files changed, 402 insertions(+), 281 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index ac097e8d..8be0d78a 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, memcmp, + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __memcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, memcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __memcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1), +@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, wmemcmp, + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_avx2_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE) + && CPU_FEATURE_USABLE (RTM)), + __wmemcmp_avx2_movbe_rtm) + IFUNC_IMPL_ADD (array, i, wmemcmp, + (CPU_FEATURE_USABLE (AVX512VL) + && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2) + && CPU_FEATURE_USABLE (MOVBE)), + __wmemcmp_evex_movbe) + IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1), +diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h 
b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +index 8043c635..690dffe8 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h +@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void) + + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURE_USABLE_P (cpu_features, MOVBE) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) +diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +index 9d5c9c72..16fc673e 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S +@@ -19,17 +19,23 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < VEC_SIZE ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * VEC_SIZE one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. 
Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less. ++ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less. ++ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ + + # include + +@@ -38,8 +44,10 @@ + # endif + + # ifdef USE_AS_WMEMCMP ++# define CHAR_SIZE 4 + # define VPCMPEQ vpcmpeqd + # else ++# define CHAR_SIZE 1 + # define VPCMPEQ vpcmpeqb + # endif + +@@ -52,7 +60,7 @@ + # endif + + # define VEC_SIZE 32 +-# define VEC_MASK ((1 << VEC_SIZE) - 1) ++# define PAGE_SIZE 4096 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -71,136 +79,359 @@ ENTRY (MEMCMP) + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ vmovdqu (%rsi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ incl %eax ++ jnz L(return_vec_0) + + cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- VPCMPEQ %ymm0, %ymm0, %ymm0 +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- +- /* From 4 * VEC to 8 * VEC, inclusively. */ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ /* If all 4 VEC where equal eax will be all 1s so incl will ++ overflow and set zero flag. */ ++ incl %eax ++ jnz L(return_vec_1) + +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ /* Less than 4 * VEC. 
*/ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- vpand %ymm1, %ymm2, %ymm5 +- vpand %ymm3, %ymm4, %ymm6 +- vpand %ymm5, %ymm6, %ymm5 ++ /* Go to 4x VEC loop. */ ++ cmpq $(VEC_SIZE * 8), %rdx ++ ja L(more_8x_vec) + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + ++ /* Load first two VEC from s2 before adjusting addresses. */ ++ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1 ++ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2 + leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi + leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 + +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2 + + vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 + vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 + +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- xorl %eax, %eax ++ /* Reduce VEC0 - VEC4. */ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. 
*/ ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(return_vec_2): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */ ++ .p2align 5 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0) + +-L(last_vec): +- /* Use overlapping loads to avoid branches. 
*/ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 + vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ incl %eax ++ jnz L(return_vec_1) ++ ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi). ++ */ ++ vmovdqu (%rsi, %rdi), %ymm1 ++ VPCMPEQ (%rdi), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 ++ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ /* Check if s1 pointer at end. */ ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. */ ++ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ /* Check last 4 VEC. 
*/ ++ vmovdqu (%rsi, %rdx), %ymm1 ++ VPCMPEQ (%rdx), %ymm1, %ymm1 ++ ++ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2 ++ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ ++ vpand %ymm1, %ymm2, %ymm5 ++ vpand %ymm3, %ymm4, %ymm6 ++ vpand %ymm5, %ymm6, %ymm7 ++ vpmovmskb %ymm7, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ incl %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ VZEROUPPER_RETURN ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ /* Check second to last VEC. rdx store end pointer of s1 and ++ ymm3 has already been loaded with second to last VEC from s2. ++ */ ++ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3 ++ vpmovmskb %ymm3, %eax ++ incl %eax ++ jnz L(8x_return_vec_2) ++ /* Check last VEC. */ ++ .p2align 4 ++L(8x_last_1x_vec): ++ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4 ++ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4 ++ vpmovmskb %ymm4, %eax ++ incl %eax ++ jnz L(8x_return_vec_3) + VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++L(last_2x_vec): ++ /* Check second to last VEC. */ ++ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_1_end) ++ /* Check last VEC. 
*/ ++L(last_1x_vec): ++ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1 ++ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1 ++ vpmovmskb %ymm1, %eax ++ incl %eax ++ jnz L(return_vec_0_end) ++ VZEROUPPER_RETURN ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++ addq %rdx, %rax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx), %edx +- cmpl (%rsi, %rcx), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax + # endif + VZEROUPPER_RETURN + +-# ifdef USE_AS_WMEMCMP + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) +- ret ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. 
*/ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax +- ret ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ VZEROUPPER_RETURN + + .p2align 4 +-L(exit): +- ret ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $CHAR_SIZE, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ vmovdqu (%rsi), %ymm2 ++ VPCMPEQ (%rdi), %ymm2, %ymm2 ++ vpmovmskb %ymm2, %eax ++ incl %eax ++ /* Result will be zero if s1 and s2 match. Otherwise first set ++ bit will be first mismatch. */ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0) ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + + .p2align 4 +-L(between_2_3): ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $16, %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) ++ + /* Load as big endian to avoid branches. 
*/ + movzwl (%rdi), %eax + movzwl (%rsi), %ecx +@@ -208,223 +439,106 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ /* No ymm register was touched. */ + ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. */ + leaq -8(%rdi, %rdx), %rdi + leaq -8(%rsi, %rdx), %rsi + vmovq (%rdi), %xmm1 + vmovq (%rsi), %xmm2 +- VPCMPEQ %xmm1, %xmm2, %xmm2 ++ VPCMPEQ %xmm1, %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ ++ ret ++ ++ .p2align 4 ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. 
*/ + vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. */ ++ ++ vmovdqu -16(%rsi, %rdx), %xmm2 + leaq -16(%rdi, %rdx), %rdi + leaq -16(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %xmm2 +- VPCMPEQ (%rdi), %xmm2, %xmm2 ++ VPCMPEQ (%rdi), %xmm2, %xmm2 + vpmovmskb %xmm2, %eax +- subl $0xffff, %eax +- jnz L(first_vec) ++ subl $0xffff, %eax ++ jnz L(return_vec_0) ++ /* No ymm register was touched. */ + ret + +- .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. */ +- vmovdqu (%rsi), %ymm1 +- VPCMPEQ (%rdi), %ymm1, %ymm1 +- +- vmovdqu VEC_SIZE(%rsi), %ymm2 +- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2 +- vpand %ymm2, %ymm1, %ymm5 +- +- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3 +- vpand %ymm3, %ymm5, %ymm5 +- +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4 +- vpand %ymm4, %ymm5, %ymm5 +- +- vptest %ymm0, %ymm5 +- jnc L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. 
*/ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- vmovdqu (%rsi), %ymm2 +- VPCMPEQ (%rdi), %ymm2, %ymm2 +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- VZEROUPPER_RETURN +- +- .p2align 4 +-L(4x_vec_end): +- vpmovmskb %ymm1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- vpmovmskb %ymm2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- vpmovmskb %ymm3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- vpmovmskb %ymm4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN +- + .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx), %edx +- cmpl VEC_SIZE(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ /* No ymm register was touched. 
*/ ++ ret + # else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif +- VZEROUPPER_RETURN + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax ++L(between_4_7): ++ /* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): ++ /* No ymm register was touched. */ ++ ret + # endif +- VZEROUPPER_RETURN ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-46.patch b/glibc-RHEL-15696-46.patch new file mode 100644 index 0000000..881fe81 --- /dev/null +++ b/glibc-RHEL-15696-46.patch @@ -0,0 +1,851 @@ +From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 17 May 2021 13:57:24 -0400 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit optimizes memcmp-evex.S. The optimizations include +adding a new vec compare path for small sizes, reorganizing the entry +control flow, removing some unnecissary ALU instructions from the main +loop, and most importantly replacing the heavy use of vpcmp + kand +logic with vpxor + vptern. test-memcmp and test-wmemcmp are both +passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++-------- + 1 file changed, 408 insertions(+), 302 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 9c093972..654dc7ac 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -19,17 +19,22 @@ + #if IS_IN (libc) + + /* memcmp/wmemcmp is implemented as: +- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap +- to avoid branches. +- 2. Use overlapping compare to avoid branch. +- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8 +- bytes for wmemcmp. +- 4. If size is 8 * VEC_SIZE or less, unroll the loop. +- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory ++ 1. Use ymm vector compares when possible. The only case where ++ vector compares is not possible for when size < CHAR_PER_VEC ++ and loading from either s1 or s2 would cause a page cross. ++ 2. For size from 2 to 7 bytes on page cross, load as big endian ++ with movbe and bswap to avoid branches. ++ 3. Use xmm vector compare when size >= 4 bytes for memcmp or ++ size >= 8 bytes for wmemcmp. ++ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a ++ to check for early mismatches. Only do this if its guranteed the ++ work is not wasted. ++ 5. If size is 8 * VEC_SIZE or less, unroll the loop. ++ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory + area. +- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less. +- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less. +- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */ ++ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. ++ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. 
*/ + + # include + +@@ -40,11 +45,21 @@ + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP +-# define VPCMPEQ vpcmpeqd ++# define CHAR_SIZE 4 ++# define VPCMP vpcmpd + # else +-# define VPCMPEQ vpcmpeqb ++# define CHAR_SIZE 1 ++# define VPCMP vpcmpub + # endif + ++# define VEC_SIZE 32 ++# define PAGE_SIZE 4096 ++# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) ++ ++# define XMM0 xmm16 ++# define XMM1 xmm17 ++# define XMM2 xmm18 ++# define YMM0 ymm16 + # define XMM1 xmm17 + # define XMM2 xmm18 + # define YMM1 ymm17 +@@ -54,15 +69,6 @@ + # define YMM5 ymm21 + # define YMM6 ymm22 + +-# define VEC_SIZE 32 +-# ifdef USE_AS_WMEMCMP +-# define VEC_MASK 0xff +-# define XMM_MASK 0xf +-# else +-# define VEC_MASK 0xffffffff +-# define XMM_MASK 0xffff +-# endif +- + /* Warning! + wmemcmp has to use SIGNED comparison for elements. + memcmp has to use UNSIGNED comparison for elemnts. +@@ -70,145 +76,370 @@ + + .section .text.evex,"ax",@progbits + ENTRY (MEMCMP) +-# ifdef USE_AS_WMEMCMP +- shl $2, %RDX_LP +-# elif defined __ILP32__ ++# ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx + # endif +- cmp $VEC_SIZE, %RDX_LP ++ cmp $CHAR_PER_VEC, %RDX_LP + jb L(less_vec) + + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k1 ++ VMOVU (%rsi), %YMM1 ++ /* Use compare not equals to directly check for mismatch. */ ++ VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_vec) +- +- /* More than 2 * VEC. */ +- cmpq $(VEC_SIZE * 8), %rdx +- ja L(more_8x_vec) +- cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) ++ /* NB: eax must be destination register if going to ++ L(return_vec_[0,2]). For L(return_vec_3 destination register ++ must be ecx. */ ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* From 4 * VEC to 8 * VEC, inclusively. 
*/ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(last_1x_vec) + ++ /* Check second VEC no matter what. */ + VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 ++ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ /* Less than 4 * VEC. */ ++ cmpq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(last_2x_vec) + ++ /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 ++ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 ++ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_3) + +- kandd %k1, %k2, %k5 +- kandd %k3, %k4, %k6 +- kandd %k5, %k6, %k6 ++ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so ++ compare with zero to get a mask is needed. */ ++ vpxorq %XMM0, %XMM0, %XMM0 + +- kmovd %k6, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) ++ /* Go to 4x VEC loop. */ ++ cmpq $(CHAR_PER_VEC * 8), %rdx ++ ja L(more_8x_vec) + +- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 ++ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any ++ branches. */ + +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k1, %k2, %k5 ++ /* Load first two VEC from s2 before adjusting addresses. */ ++ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2 ++ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi ++ ++ /* Wait to load from s1 until addressed adjust due to ++ unlamination of microfusion with complex address mode. */ ++ ++ /* vpxor will be all 0s if s1 and s2 are equal. 
Otherwise it ++ will have some 1s. */ ++ vpxorq (%rdi), %YMM1, %YMM1 ++ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2 + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ /* Or together YMM1, YMM2, and YMM3 into YMM3. */ ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while ++ oring with YMM3. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret + +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- xorl %eax, %eax ++ /* NB: aligning 32 here allows for the rest of the jump targets ++ to be tuned for 32 byte alignment. Most important this ensures ++ the L(more_8x_vec) loop is 32 byte aligned. */ ++ .p2align 5 ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size = 0 but ++ is also faster for size = CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Create mask in ecx for potentially in bound matches. */ ++ bzhil %edx, %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(last_2x_vec): +- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. 
*/ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret + +-L(last_vec): +- /* Use overlapping loads to avoid branches. */ +- leaq -VEC_SIZE(%rdi, %rdx), %rdi +- leaq -VEC_SIZE(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++ /* NB: No p2align necessary. Alignment % 16 is naturally 1 ++ which is good enough for a target not in a loop. */ ++L(return_vec_1): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl VEC_SIZE(%rsi, %rax), %ecx ++ movzbl VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + +- .p2align 4 +-L(first_vec): +- /* A byte or int32 is different within 16 or 32 bytes. */ +- tzcntl %eax, %ecx ++ /* NB: No p2align necessary. Alignment % 16 is naturally 2 ++ which is good enough for a target not in a loop. 
*/ ++L(return_vec_2): ++ tzcntl %eax, %eax + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (%rdi, %rcx, 4), %edx +- cmpl (%rsi, %rcx, 4), %edx +-L(wmemcmp_return): +- setl %al +- negl %eax +- orl $1, %eax ++ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- movzbl (%rdi, %rcx), %eax +- movzbl (%rsi, %rcx), %edx +- sub %edx, %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif + ret + ++ .p2align 4 ++L(8x_return_vec_0_1_2_3): ++ /* Returning from L(more_8x_vec) requires restoring rsi. */ ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPCMP $4, %YMM1, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ++ VPCMP $4, %YMM2, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) ++ ++ VPCMP $4, %YMM3, %YMM0, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(4): +- xorl %eax, %eax +- movl (%rdi), %edx +- cmpl (%rsi), %edx +- jne L(wmemcmp_return) ++L(more_8x_vec): ++ /* Set end of s1 in rdx. */ ++ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx ++ /* rsi stores s2 - s1. This allows loop to only update one ++ pointer. */ ++ subq %rdi, %rsi ++ /* Align s1 pointer. */ ++ andq $-VEC_SIZE, %rdi ++ /* Adjust because first 4x vec where check already. 
*/ ++ subq $-(VEC_SIZE * 4), %rdi ++ .p2align 4 ++L(loop_4x_vec): ++ VMOVU (%rsi, %rdi), %YMM1 ++ vpxorq (%rdi), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 ++ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 ++ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rdx, %rdi ++ jb L(loop_4x_vec) ++ ++ subq %rdx, %rdi ++ /* rdi has 4 * VEC_SIZE - remaining length. */ ++ cmpl $(VEC_SIZE * 3), %edi ++ jae L(8x_last_1x_vec) ++ /* Load regardless of branch. */ ++ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3 ++ cmpl $(VEC_SIZE * 2), %edi ++ jae L(8x_last_2x_vec) ++ ++ VMOVU (%rsi, %rdx), %YMM1 ++ vpxorq (%rdx), %YMM1, %YMM1 ++ ++ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 ++ ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 ++ ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 ++ VPCMP $4, %YMM4, %YMM0, %k1 ++ kmovd %k1, %ecx ++ /* Restore s1 pointer to rdi. */ ++ movq %rdx, %rdi ++ testl %ecx, %ecx ++ jnz L(8x_return_vec_0_1_2_3) ++ /* NB: eax must be zero to reach here. */ ++ ret ++ ++ /* Only entry is from L(more_8x_vec). */ ++ .p2align 4 ++L(8x_last_2x_vec): ++ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_2) ++ /* Naturally aligned to 16 bytes. */ ++L(8x_last_1x_vec): ++ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 ++ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(8x_return_vec_3) ++ ret ++ ++ .p2align 4 ++L(last_2x_vec): ++ /* Check second to last VEC. 
*/ ++ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1_end) ++ ++ /* Check last VEC. */ ++ .p2align 4 ++L(last_1x_vec): ++ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1 ++ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0_end) + ret ++ ++ .p2align 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ + .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- je L(exit) +- sbbl %eax, %eax +- orl $1, %eax ++L(return_vec_0_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -VEC_SIZE(%rsi, %rax), %ecx ++ movzbl -VEC_SIZE(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +-L(exit): ++L(return_vec_1_end): ++ tzcntl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ subl %ecx, %eax ++# 
endif + ret + ++ + .p2align 4 ++L(page_cross_less_vec): ++ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 ++ bytes. */ ++ cmpl $(16 / CHAR_SIZE), %edx ++ jae L(between_16_31) ++# ifndef USE_AS_WMEMCMP ++ cmpl $8, %edx ++ jae L(between_8_15) ++ cmpl $4, %edx ++ jae L(between_4_7) + L(between_2_3): + /* Load as big endian to avoid branches. */ + movzwl (%rdi), %eax +@@ -217,224 +448,99 @@ L(between_2_3): + shll $8, %ecx + bswap %eax + bswap %ecx +- movb -1(%rdi, %rdx), %al +- movb -1(%rsi, %rdx), %cl ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx + /* Subtraction is okay because the upper 8 bits are zero. */ + subl %ecx, %eax + ret +- + .p2align 4 +-L(1): +- movzbl (%rdi), %eax ++L(one_or_less): ++ jb L(zero) + movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + subl %ecx, %eax + ret +-# endif +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax +- ret + + .p2align 4 +-L(less_vec): +-# ifdef USE_AS_WMEMCMP +- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */ +- cmpb $4, %dl +- je L(4) +- jb L(zero) +-# else +- cmpb $1, %dl +- je L(1) +- jb L(zero) +- cmpb $4, %dl +- jb L(between_2_3) +- cmpb $8, %dl +- jb L(between_4_7) ++L(between_8_15): + # endif +- cmpb $16, %dl +- jae L(between_16_31) +- /* It is between 8 and 15 bytes. */ ++ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + /* Use overlapping loads to avoid branches. 
*/ +- leaq -8(%rdi, %rdx), %rdi +- leaq -8(%rsi, %rdx), %rsi ++ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi + vmovq (%rdi), %XMM1 + vmovq (%rsi), %XMM2 +- VPCMPEQ %XMM1, %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++ VPCMP $4, %XMM1, %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + + .p2align 4 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) +- +- /* Use overlapping loads to avoid branches. */ +- leaq -16(%rdi, %rdx), %rdi +- leaq -16(%rsi, %rdx), %rsi +- VMOVU (%rsi), %XMM2 +- VPCMPEQ (%rdi), %XMM2, %k2 +- kmovw %k2, %eax +- subl $XMM_MASK, %eax +- jnz L(first_vec) ++L(zero): ++ xorl %eax, %eax + ret + + .p2align 4 +-L(more_8x_vec): +- /* More than 8 * VEC. Check the first VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- /* Align the first memory area for aligned loads in the loop. +- Compute how much the first memory area is misaligned. */ +- movq %rdi, %rcx +- andl $(VEC_SIZE - 1), %ecx +- /* Get the negative of offset for alignment. */ +- subq $VEC_SIZE, %rcx +- /* Adjust the second memory area. */ +- subq %rcx, %rsi +- /* Adjust the first memory area which should be aligned now. */ +- subq %rcx, %rdi +- /* Adjust length. */ +- addq %rcx, %rdx +- +-L(loop_4x_vec): +- /* Compare 4 * VEC at a time forward. 
*/ +- VMOVU (%rsi), %YMM1 +- VPCMPEQ (%rdi), %YMM1, %k1 +- +- VMOVU VEC_SIZE(%rsi), %YMM2 +- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2 +- kandd %k2, %k1, %k5 +- +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3 +- kandd %k3, %k5, %k5 +- +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4 +- kandd %k4, %k5, %k5 +- +- kmovd %k5, %eax +- cmpl $VEC_MASK, %eax +- jne L(4x_vec_end) +- +- addq $(VEC_SIZE * 4), %rdi +- addq $(VEC_SIZE * 4), %rsi +- +- subq $(VEC_SIZE * 4), %rdx +- cmpq $(VEC_SIZE * 4), %rdx +- jae L(loop_4x_vec) +- +- /* Less than 4 * VEC. */ +- cmpq $VEC_SIZE, %rdx +- jbe L(last_vec) +- cmpq $(VEC_SIZE * 2), %rdx +- jbe L(last_2x_vec) +- +-L(last_4x_vec): +- /* From 2 * VEC to 4 * VEC. */ +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) ++L(between_16_31): ++ /* From 16 to 31 bytes. No branch when size == 16. */ ++ VMOVU (%rsi), %XMM2 ++ VPCMP $4, (%rdi), %XMM2, %k1 ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + + /* Use overlapping loads to avoid branches. 
*/ +- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi +- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) + +- addq $VEC_SIZE, %rdi +- addq $VEC_SIZE, %rsi +- VMOVU (%rsi), %YMM2 +- VPCMPEQ (%rdi), %YMM2, %k2 +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- ret +- +- .p2align 4 +-L(4x_vec_end): ++ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 ++ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi ++ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi ++ VPCMP $4, (%rdi), %XMM2, %k1 + kmovd %k1, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec) +- kmovd %k2, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x1) +- kmovd %k3, %eax +- subl $VEC_MASK, %eax +- jnz L(first_vec_x2) +- kmovd %k4, %eax +- subl $VEC_MASK, %eax +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ testl %eax, %eax ++ jnz L(return_vec_0) + ret + +- .p2align 4 +-L(first_vec_x1): +- tzcntl %eax, %ecx + # ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rcx, 4), %edx +- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl VEC_SIZE(%rdi, %rcx), %eax +- movzbl VEC_SIZE(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++ .p2align 4 ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + ret ++# else + + .p2align 4 +-L(first_vec_x2): +- tzcntl %eax, %ecx +-# ifdef USE_AS_WMEMCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx +- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx +- jmp L(wmemcmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx +- sub %edx, %eax +-# endif ++L(between_4_7): ++ 
/* Load as big endian with overlapping movbe to avoid branches. ++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ jz L(zero_4_7) ++ sbbl %eax, %eax ++ orl $1, %eax ++L(zero_4_7): + ret ++# endif ++ + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-47.patch b/glibc-RHEL-15696-47.patch new file mode 100644 index 0000000..70c3171 --- /dev/null +++ b/glibc-RHEL-15696-47.patch @@ -0,0 +1,104 @@ +From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 20 May 2021 13:13:51 -0400 +Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. This commit makes a few small improvements to +memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64 +instead of 128. Either alignment will perform equally well in a loop +and 128 just increases the odds of having to do an extra iteration +which can be significant overhead for small values. 2) Align some +targets and the loop. 3) Remove an ALU from the alignment process. 4) +Reorder the last 4x VEC so that they are stored after the loop. 5) +Move the condition for leq 8x VEC to before the alignment +process. test-memset and test-wmemset are both passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + .../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++-------- + 1 file changed, 28 insertions(+), 22 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f877ac9d..909c33f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + VMOVU %VEC(0), (%rdi) + VZEROUPPER_RETURN + ++ .p2align 4 + L(stosb_more_2x_vec): + cmp __x86_rep_stosb_threshold(%rip), %RDX_LP + ja L(stosb) ++#else ++ .p2align 4 + #endif + L(more_2x_vec): +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ + VMOVU %VEC(0), (%rdi) + VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) ++ cmpq $(VEC_SIZE * 4), %rdx ++ ja L(loop_start) + VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) + L(return): + #if VEC_SIZE > 16 + ZERO_UPPER_VEC_REGISTERS_RETURN +@@ -192,28 +197,29 @@ L(return): + #endif + + L(loop_start): +- leaq (VEC_SIZE * 4)(%rdi), %rcx +- VMOVU %VEC(0), (%rdi) +- andq $-(VEC_SIZE * 4), %rcx +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx) +- addq %rdi, %rdx +- andq $-(VEC_SIZE * 4), %rdx +- cmpq %rdx, %rcx +- je L(return) ++ cmpq $(VEC_SIZE * 8), %rdx ++ jbe L(loop_end) ++ andq $-(VEC_SIZE * 2), %rdi ++ subq $-(VEC_SIZE * 4), %rdi ++ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx ++ .p2align 4 + L(loop): +- VMOVA %VEC(0), (%rcx) +- VMOVA %VEC(0), VEC_SIZE(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx) +- addq $(VEC_SIZE * 4), %rcx +- cmpq %rcx, %rdx +- jne L(loop) 
++ VMOVA %VEC(0), (%rdi) ++ VMOVA %VEC(0), VEC_SIZE(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) ++ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) ++ subq $-(VEC_SIZE * 4), %rdi ++ cmpq %rcx, %rdi ++ jb L(loop) ++L(loop_end): ++ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. ++ rdx as length is also unchanged. */ ++ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) ++ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_SHORT_RETURN + + .p2align 4 +-- +GitLab + diff --git a/glibc-RHEL-15696-48.patch b/glibc-RHEL-15696-48.patch new file mode 100644 index 0000000..645536e --- /dev/null +++ b/glibc-RHEL-15696-48.patch @@ -0,0 +1,84 @@ +From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 23 May 2021 19:43:24 -0400 +Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +This patch changes the condition for copy 4x VEC so that if length is +exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of +8x VEC case. 
+ +Results For Skylake memcpy-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.137 , 6.873 , New , 75.22 +128 , 7 , 0 , 12.933 , 7.732 , New , 59.79 +128 , 0 , 7 , 11.852 , 6.76 , New , 57.04 +128 , 7 , 7 , 12.587 , 6.808 , New , 54.09 + +Results For Icelake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 9.963 , 5.416 , New , 54.36 +128 , 7 , 0 , 16.467 , 8.061 , New , 48.95 +128 , 0 , 7 , 14.388 , 7.644 , New , 53.13 +128 , 7 , 7 , 14.546 , 7.642 , New , 52.54 + +Results For Tigerlake memcpy-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 0 , 8.979 , 4.95 , New , 55.13 +128 , 7 , 0 , 14.245 , 7.122 , New , 50.0 +128 , 0 , 7 , 12.668 , 6.675 , New , 52.69 +128 , 7 , 7 , 13.042 , 6.802 , New , 52.15 + +Results For Skylake memmove-avx2-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 6.181 , 5.691 , New , 92.07 +128 , 32 , 0 , 6.165 , 5.752 , New , 93.3 +128 , 0 , 7 , 13.923 , 9.37 , New , 67.3 +128 , 7 , 0 , 12.049 , 10.182 , New , 84.5 + +Results For Icelake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.479 , 4.889 , New , 89.23 +128 , 32 , 0 , 5.127 , 4.911 , New , 95.79 +128 , 0 , 7 , 18.885 , 13.547 , New , 71.73 +128 , 7 , 0 , 15.565 , 14.436 , New , 92.75 + +Results For Tigerlake memmove-evex-erms +size, al1 , al2 , Cur T , New T , Win , New / Cur +128 , 0 , 32 , 5.275 , 4.815 , New , 91.28 +128 , 32 , 0 , 5.376 , 4.565 , New , 84.91 +128 , 0 , 7 , 19.426 , 14.273 , New , 73.47 +128 , 7 , 0 , 15.924 , 14.951 , New , 93.89 + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +index 3e2dd6bc..572cef04 100644 +--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S ++++ 
b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +@@ -417,8 +417,8 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 8), %rdx + ja L(more_8x_vec) + cmpq $(VEC_SIZE * 4), %rdx +- jb L(last_4x_vec) +- /* Copy from 4 * VEC to 8 * VEC, inclusively. */ ++ jbe L(last_4x_vec) ++ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) +@@ -437,7 +437,7 @@ L(more_2x_vec): + VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx) + VZEROUPPER_RETURN + L(last_4x_vec): +- /* Copy from 2 * VEC to 4 * VEC. */ ++ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */ + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2) +-- +GitLab + diff --git a/glibc-RHEL-15696-49.patch b/glibc-RHEL-15696-49.patch new file mode 100644 index 0000000..b59f582 --- /dev/null +++ b/glibc-RHEL-15696-49.patch @@ -0,0 +1,55 @@ +From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Jun 2021 19:19:34 -0400 +Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S +Content-type: text/plain; charset=UTF-8 + +No bug. The way wcsnlen will check if near the end of maxlen +is the following macro: + + mov %r11, %rsi; \ + subq %rax, %rsi; \ + andq $-64, %rax; \ + testq $-64, %rsi; \ + je L(strnlen_ret) + +Which works independently of s + maxlen overflowing. So the +second overflow check is unnecessary for correctness and +just extra overhead in the common no overflow case. + +test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are +all passing + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J.
Lu +--- + sysdeps/x86_64/multiarch/strlen-vec.S | 7 ------- + 1 file changed, 7 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S +index 439e486a..b7657282 100644 +--- a/sysdeps/x86_64/multiarch/strlen-vec.S ++++ b/sysdeps/x86_64/multiarch/strlen-vec.S +@@ -71,19 +71,12 @@ L(n_nonzero): + suffice. */ + mov %RSI_LP, %R10_LP + sar $62, %R10_LP +- test %R10_LP, %R10_LP + jnz __wcslen_sse4_1 + sal $2, %RSI_LP + # endif + +- + /* Initialize long lived registers. */ +- + add %RDI_LP, %RSI_LP +-# ifdef AS_WCSLEN +-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */ +- jbe __wcslen_sse4_1 +-# endif + mov %RSI_LP, %R10_LP + and $-64, %R10_LP + mov %RSI_LP, %R11_LP +-- +GitLab + diff --git a/glibc-RHEL-15696-5.patch b/glibc-RHEL-15696-5.patch new file mode 100644 index 0000000..75d3978 --- /dev/null +++ b/glibc-RHEL-15696-5.patch @@ -0,0 +1,290 @@ +From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:32:24 -0800 +Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes memset/wmemset for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use + RDX_LP for length. Clear the upper 32 bits of RDX register. + * sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset. + * sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
+ * sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise. +--- + .../multiarch/memset-avx512-no-vzeroupper.S | 6 +- + .../multiarch/memset-vec-unaligned-erms.S | 34 +++++---- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++ + 5 files changed, 121 insertions(+), 16 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +index 689cc119..99e25519 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S +@@ -29,12 +29,16 @@ + .section .text.avx512,"ax",@progbits + #if defined PIC + ENTRY (MEMSET_CHK) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (MEMSET_CHK) + #endif + + ENTRY (MEMSET) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + vpxor %xmm0, %xmm0, %xmm0 + vmovd %esi, %xmm1 + lea (%rdi, %rdx), %rsi +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 270a1d49..9a0fd818 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -65,8 +65,8 @@ + .section SECTION(.text),"ax",@progbits + #if VEC_SIZE == 16 && IS_IN (libc) + ENTRY (__bzero) +- movq %rdi, %rax /* Set return value. */ +- movq %rsi, %rdx /* Set n. */ ++ mov %RDI_LP, %RAX_LP /* Set return value. */ ++ mov %RSI_LP, %RDX_LP /* Set n. 
*/ + pxor %xmm0, %xmm0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero) + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + # endif + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) +- shlq $2, %rdx ++ shl $2, %RDX_LP + WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + jmp L(entry_from_bzero) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) +@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned)) + + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) +@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned)) + + # if VEC_SIZE == 16 + ENTRY (__memset_chk_erms) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END (__memset_chk_erms) + + /* Only used to measure performance of REP STOSB. */ + ENTRY (__memset_erms) + /* Skip zero length. */ +- testq %rdx, %rdx ++ test %RDX_LP, %RDX_LP + jnz L(stosb) + movq %rdi, %rax + ret +@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms)) + L(stosb): + /* Issue vzeroupper before rep stosb. 
*/ + VZEROUPPER +- movq %rdx, %rcx ++ mov %RDX_LP, %RCX_LP + movzbl %sil, %eax +- movq %rdi, %rdx ++ mov %RDI_LP, %RDX_LP + rep stosb +- movq %rdx, %rax ++ mov %RDX_LP, %RAX_LP + ret + # if VEC_SIZE == 16 + END (__memset_erms) +@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms)) + + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) +- cmpq %rdx, %rcx ++ cmp %RDX_LP, %RCX_LP + jb HIDDEN_JUMPTARGET (__chk_fail) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- cmpq $VEC_SIZE, %rdx ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ mov %edx, %edx ++# endif ++ cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) +- cmpq $(VEC_SIZE * 2), %rdx ++ cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index e99dbd7c..98bd9ae9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,9 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr ++ tst-size_t-memrchr tst-size_t-memset + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c +new file mode 100644 +index 00000000..2c367af6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memset.c +@@ -0,0 +1,73 @@ ++/* Test memset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef WIDE ++# define TEST_NAME "wmemset" ++#else ++# define TEST_NAME "memset" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define MEMSET wmemset ++# define CHAR wchar_t ++#else ++# define MEMSET memset ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (MEMSET, 1) ++ ++typedef CHAR *(*proto_t) (CHAR *, int, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_memset (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, (uintptr_t) b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ CHAR ch = 0x23; ++ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 }; ++ parameter_t c = { { 0 }, (void *) (uintptr_t) ch }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ c.fn = impl->fn; ++ CHAR *p = (CHAR *) do_memset (src, c); ++ size_t i; ++ for (i = 0; i < src.len; i++) ++ if (p[i] != ch) ++ { ++ error (0, 0, "Wrong result in function %s", impl->name); ++ ret = 1; ++ } ++ } ++ ++ return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +new file mode 100644 +index 00000000..955eb488 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c +@@ -0,0 +1,20 @@ ++/* Test wmemset with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-memset.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-50.patch b/glibc-RHEL-15696-50.patch new file mode 100644 index 0000000..e896698 --- /dev/null +++ b/glibc-RHEL-15696-50.patch @@ -0,0 +1,43 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +Author: Shen-Ta Hsieh 2021-05-23 21:43:10 +Committer: H.J. Lu 2021-06-27 10:56:57 +Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc) +Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support) +Branches: master, remotes/origin/master and many more (41) +Follows: glibc-2.33.9000 +Precedes: glibc-2.34 + + math: redirect roundeven function + + This patch redirects the roundeven function for further changes. + + Signed-off-by: Shen-Ta Hsieh + Reviewed-by: H.J.
Lu + +Conflicts: + * + (rewritten for older branch) + +diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +index 7bbbb2dc..8728d0f2 100644 +--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,5 +68,6 @@ __roundeven (double x) + INSERT_WORDS64 (x, ix); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-51.patch b/glibc-RHEL-15696-51.patch new file mode 100644 index 0000000..105843d --- /dev/null +++ b/glibc-RHEL-15696-51.patch @@ -0,0 +1,118 @@ +From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:10 +0800 +Subject: [PATCH] math: redirect roundeven function +Content-type: text/plain; charset=UTF-8 + +This patch redirects the roundeven function for further changes. + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + include/math.h | 3 ++- + sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++- + sysdeps/ieee754/float128/s_roundevenf128.c | 1 + + sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++ + sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 + + sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 + + 6 files changed, 11 insertions(+), 2 deletions(-) + +Conflicts: + include/math.h + (missing MATH_REDIRECT macros) + +diff --git a/include/math.h b/include/math.h +index e21d34b8..1f9f9a54 100644 +--- a/include/math.h ++++ b/include/math.h +@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling) + libm_hidden_proto (__issignalingf) + libm_hidden_proto (__exp) + libm_hidden_proto (__expf) +-libm_hidden_proto (__roundeven) + + # ifndef __NO_LONG_DOUBLE_MATH + libm_hidden_proto (__fpclassifyl) +@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128) + + # if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0) + # ifndef NO_MATH_REDIRECT ++float (roundevenf) (float) asm ("__roundevenf"); ++double (roundeven) (double) asm ("__roundeven"); + /* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a + single instruction. Use an asm to avoid use of PLTs if it doesn't. */ + float (sqrtf) (float) asm ("__ieee754_sqrtf"); +diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c +index 1438e81d..61962184 100644 +--- a/sysdeps/ieee754/dbl-64/s_roundeven.c ++++ b/sysdeps/ieee754/dbl-64/s_roundeven.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -101,5 +102,6 @@ __roundeven (double x) + INSERT_WORDS (x, hx, lx); + return x; + } +-hidden_def (__roundeven) ++#ifndef __roundeven + libm_alias_double (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c +index 5a9b3f39..e0faf727 100644 +--- a/sysdeps/ieee754/float128/s_roundevenf128.c ++++ b/sysdeps/ieee754/float128/s_roundevenf128.c +@@ -1,2 +1,3 @@ ++#define NO_MATH_REDIRECT + #include + #include "../ldbl-128/s_roundevenl.c" +diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c +index 90f991d5..a661875e 100644 +--- a/sysdeps/ieee754/flt-32/s_roundevenf.c ++++ b/sysdeps/ieee754/flt-32/s_roundevenf.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +@@ -67,4 +68,6 @@ __roundevenf (float x) + SET_FLOAT_WORD (x, ix); + return x; + } ++#ifndef __roundevenf + libm_alias_float (__roundeven, roundeven) ++#endif +diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +index 5fc59af4..b9375b6c 100644 +--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . */ + ++#define NO_MATH_REDIRECT + #include + #include + #include +diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +index be2e4fa4..65031ab7 100644 +--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c ++++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c +@@ -17,6 +17,7 @@ + License along with the GNU C Library; if not, see + . 
*/ + ++#define NO_MATH_REDIRECT + #include + #include + #include +-- +GitLab + diff --git a/glibc-RHEL-15696-52.patch b/glibc-RHEL-15696-52.patch new file mode 100644 index 0000000..4602f51 --- /dev/null +++ b/glibc-RHEL-15696-52.patch @@ -0,0 +1,242 @@ +From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001 +From: Shen-Ta Hsieh +Date: Mon, 24 May 2021 09:43:11 +0800 +Subject: [PATCH] x86_64: roundeven with sse4.1 support +Content-type: text/plain; charset=UTF-8 + +This patch adds support for the sse4.1 hardware floating point +roundeven. + +Here are some benchmark results on my systems: + +=AMD Ryzen 9 3900X 12-Core Processor= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 3.75587e+09 | 3.75114e+09 | +| iterations | 3.93053e+08 | 4.35402e+08 | +| max | 52.592 | 58.71 | +| min | 7.98 | 7.22 | +| mean | 9.55563 | 8.61535 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|---------------|--------------| +| duration | 3.73815e+09 | 3.73738e+09 | +| iterations | 5.82692e+08 | 5.91498e+08 | +| max | 56.468 | 51.642 | +| min | 6.27 | 6.156 | +| mean | 6.41532 | 6.3185 | + +=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz= + +* benchmark result before this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.18208e+09 | 2.18258e+09 | +| iterations | 2.39932e+08 | 2.46924e+08 | +| max | 96.378 | 98.035 | +| min | 6.776 | 5.94 | +| mean | 9.09456 | 8.83907 | + +* benchmark result after this commit +| | roundeven | roundevenf | +|------------|--------------|--------------| +| duration | 2.17415e+09 | 2.17005e+09 | +| iterations | 3.56193e+08 | 4.09824e+08 | +| max | 51.693 | 97.192 | +| min | 5.926 | 5.093 | +| mean | 6.10385 | 5.29507 | + +Signed-off-by: Shen-Ta Hsieh +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/fpu/multiarch/Makefile | 5 +-- + sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++ + .../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++ + .../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++ + sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++ + 7 files changed, 118 insertions(+), 2 deletions(-) + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S + create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c + +diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile +index 9f387248..6ddd1c01 100644 +--- a/sysdeps/x86_64/fpu/multiarch/Makefile ++++ b/sysdeps/x86_64/fpu/multiarch/Makefile +@@ -1,11 +1,12 @@ + ifeq ($(subdir),math) + libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \ + s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \ +- s_trunc-c s_truncf-c ++ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c + + libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \ + s_floorf-sse4_1 s_nearbyint-sse4_1 \ +- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ ++ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \ ++ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \ + s_trunc-sse4_1 s_truncf-sse4_1 + + libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \ +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +new file mode 100644 +index 00000000..c7be43cb +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c +@@ -0,0 +1,2 @@ ++#define __roundeven __roundeven_c 
++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +new file mode 100644 +index 00000000..6ae8f6b1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundeven_sse41) ++ roundsd $8, %xmm0, %xmm0 ++ ret ++END(__roundeven_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +new file mode 100644 +index 00000000..d92eda65 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundeven. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#define roundeven __redirect_roundeven ++#define __roundeven __redirect___roundeven ++#include ++#undef roundeven ++#undef __roundeven ++ ++#define SYMBOL_NAME roundeven ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ()); ++libm_alias_double (__roundeven, roundeven) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +new file mode 100644 +index 00000000..72a6e7d1 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c +@@ -0,0 +1,3 @@ ++#undef __roundevenf ++#define __roundevenf __roundevenf_c ++#include +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +new file mode 100644 +index 00000000..a76e1080 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S +@@ -0,0 +1,24 @@ ++/* Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. 
++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ .section .text.sse4.1,"ax",@progbits ++ENTRY(__roundevenf_sse41) ++ roundss $8, %xmm0, %xmm0 ++ ret ++END(__roundevenf_sse41) +diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +new file mode 100644 +index 00000000..2ee196e6 +--- /dev/null ++++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c +@@ -0,0 +1,31 @@ ++/* Multiple versions of __roundevenf. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#include ++ ++#define roundevenf __redirect_roundevenf ++#define __roundevenf __redirect___roundevenf ++#include ++#undef roundevenf ++#undef __roundevenf ++ ++#define SYMBOL_NAME roundevenf ++#include "ifunc-sse4_1.h" ++ ++libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ()); ++libm_alias_float (__roundeven, roundeven) +-- +GitLab + diff --git a/glibc-RHEL-15696-53.patch b/glibc-RHEL-15696-53.patch new file mode 100644 index 0000000..7221d38 --- /dev/null +++ b/glibc-RHEL-15696-53.patch @@ -0,0 +1,41 @@ +From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 9 Jan 2022 16:02:28 -0600 +Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755] +Content-type: text/plain; charset=UTF-8 + +Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to +__wcscmp_evex. For x86_64 this covers the entire address range so any +length larger could not possibly be used to bound `s1` or `s2`. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 459eeed0..d5aa6daa 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -97,6 +97,16 @@ ENTRY (STRCMP) + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP ++# ifndef __ILP32__ ++ movq %rdx, %rcx ++ /* Check if length could overflow when multiplied by ++ sizeof(wchar_t). Checking top 8 bits will cover all potential ++ overflow cases as well as redirect cases where its impossible to ++ length to bound a valid memory region. In these cases just use ++ 'wcscmp'. */ ++ shrq $56, %rcx ++ jnz __wcscmp_evex ++# endif + /* Convert units: from wide to byte char. 
*/ + shl $2, %RDX_LP + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-54.patch b/glibc-RHEL-15696-54.patch new file mode 100644 index 0000000..b2aaaa1 --- /dev/null +++ b/glibc-RHEL-15696-54.patch @@ -0,0 +1,268 @@ +From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 20 Aug 2021 06:42:24 -0700 +Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ + #28252] +Content-type: text/plain; charset=UTF-8 + +Optimize loads of all bits set into ZMM register in AVX512 SVML codes +by replacing + + vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX + +and + + vmovups .L_2il0floatpacket.13(%rip), %zmmX + +with + vpternlogd $0xff, %zmmX, %zmmX, %zmmX + +This fixes BZ #28252. +--- + .../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------ + .../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++---------- + .../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------ + .../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------ + 10 files changed, 11 insertions(+), 64 deletions(-) + +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +index 24e3b363..07dfed85 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S +@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + vmovaps %zmm0, %zmm8 + + /* Check for large arguments path */ +- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2 ++ vpternlogd $0xff, %zmm2, %zmm2, %zmm2 + + /* + ARGUMENT RANGE REDUCTION: +@@ 
-456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_cos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.16: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.16,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +index ae8af8d8..ddb60e5b 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S +@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + + /* preserve mantissa, set input exponent to 2^(-10) */ + vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2 +- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm1 + vpsrlq $32, %zmm4, %zmm6 + + /* reciprocal approximation good to at least 11 bits */ +@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_log_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.12: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.12,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +index 2d4b14fd..529c454a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S +@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + andq $-64, %rsp + subq $1280, %rsp + movq __svml_d_trig_data@GOTPCREL(%rip), %rax +- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14 ++ vpternlogd $0xff, %zmm1, %zmm1, %zmm14 + vmovups __dAbsMask(%rax), %zmm7 + vmovups __dInvPI(%rax), %zmm2 + vmovups __dRShifter(%rax), %zmm1 +@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin + jmp .LBL_2_7 + #endif + END (_ZGVeN8v_sin_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.14: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.14,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S 
b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +index 2df626c0..e501a53a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S +@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos + + /* SinPoly = SinR*SinPoly */ + vfmadd213pd %zmm5, %zmm5, %zmm4 +- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + + /* Update Cos result's sign */ + vxorpd %zmm2, %zmm1, %zmm1 +@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl) + ENTRY (_ZGVeN8vvv_sincos_skx) + WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx + END (_ZGVeN8vvv_sincos_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.15: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.15,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +index 6ea1137b..377af394 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S +@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + X = X - Y*PI1 - Y*PI2 - Y*PI3 + */ + vmovaps %zmm0, %zmm6 +- vmovups .L_2il0floatpacket.13(%rip), %zmm12 ++ vpternlogd $0xff, %zmm12, %zmm12, %zmm12 + vmovups __sRShifter(%rax), %zmm3 + vmovups __sPI1_FMA(%rax), %zmm5 + vmovups __sA9_FMA(%rax), %zmm9 +@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_cosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +index 89ba0df2..46f33d46 100644 +--- 
a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S +@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + vmovaps %zmm0, %zmm7 + + /* compare against threshold */ +- vmovups .L_2il0floatpacket.13(%rip), %zmm3 ++ vpternlogd $0xff, %zmm3, %zmm3, %zmm3 + vmovups __sInvLn2(%rax), %zmm4 + vmovups __sShifter(%rax), %zmm1 + vmovups __sLn2hi(%rax), %zmm6 +@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf + + #endif + END (_ZGVeN16v_expf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +index 4cf0a96f..9e254956 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S +@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + andq $-64, %rsp + subq $1280, %rsp + movq __svml_slog_data@GOTPCREL(%rip), %rax +- vmovups .L_2il0floatpacket.7(%rip), %zmm6 ++ vpternlogd $0xff, %zmm6, %zmm6, %zmm6 + vmovups _iBrkValue(%rax), %zmm4 + vmovups _sPoly_7(%rax), %zmm8 + +@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf + + #endif + END (_ZGVeN16v_logf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.7: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.7,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +index bdcd50af..e8331ba1 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S ++++ 
b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S +@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpsrlq $32, %zmm3, %zmm2 + vpmovqd %zmm2, %ymm11 + vcvtps2pd %ymm14, %zmm13 +- vmovups .L_2il0floatpacket.23(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovaps %zmm14, %zmm26 + vpandd _ABSMASK(%rax), %zmm1, %zmm8 + vpcmpd $1, _INF(%rax), %zmm8, %k2 +@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + vpmovqd %zmm11, %ymm5 + vpxord %zmm10, %zmm10, %zmm10 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3} +- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4 ++ vpternlogd $0xff, %zmm4, %zmm4, %zmm4 + vpxord %zmm11, %zmm11, %zmm11 + vcvtdq2pd %ymm7, %zmm7 + vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1} +@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf + jmp .LBL_2_7 + #endif + END (_ZGVeN16vv_powf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.23: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.23,@object +-.L_2il0floatpacket.24: +- .long 0xffffffff,0xffffffff +- .type .L_2il0floatpacket.24,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +index 5fa4bc41..1f46f334 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S +@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf + + /* Result sign calculations */ + vpternlogd $150, %zmm0, %zmm14, %zmm1 +- vmovups .L_2il0floatpacket.13(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + + /* Add correction term 0.5 for cos() part */ + vaddps %zmm8, %zmm5, %zmm15 +@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl) + ENTRY (_ZGVeN16vvv_sincosf_skx) + WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx + END 
(_ZGVeN16vvv_sincosf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.13: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.13,@object +diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +index 141f747e..1fc9308a 100644 +--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S ++++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S +@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + movq __svml_s_trig_data@GOTPCREL(%rip), %rax + + /* Check for large and special values */ +- vmovups .L_2il0floatpacket.11(%rip), %zmm14 ++ vpternlogd $0xff, %zmm14, %zmm14, %zmm14 + vmovups __sAbsMask(%rax), %zmm5 + vmovups __sInvPI(%rax), %zmm1 + vmovups __sRShifter(%rax), %zmm2 +@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf + jmp .LBL_2_7 + #endif + END (_ZGVeN16v_sinf_skx) +- +- .section .rodata, "a" +-.L_2il0floatpacket.11: +- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff +- .type .L_2il0floatpacket.11,@object +-- +GitLab + diff --git a/glibc-RHEL-15696-55.patch b/glibc-RHEL-15696-55.patch new file mode 100644 index 0000000..d44eef1 --- /dev/null +++ b/glibc-RHEL-15696-55.patch @@ -0,0 +1,48 @@ +From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:31:49 -0500 +Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be + specified +Content-type: text/plain; charset=UTF-8 + +No bug. + +This change adds a new macro ENTRY_P2ALIGN which takes a second +argument, log2 of the desired function alignment. 
+ +The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this +doesn't affect any existing functionality. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86/sysdep.h | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index 01bac0f6..a70bb3a2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -78,15 +78,18 @@ enum cf_protection_level + #define ASM_SIZE_DIRECTIVE(name) .size name,.-name; + + /* Define an entry point visible from C. */ +-#define ENTRY(name) \ ++#define ENTRY_P2ALIGN(name, alignment) \ + .globl C_SYMBOL_NAME(name); \ + .type C_SYMBOL_NAME(name),@function; \ +- .align ALIGNARG(4); \ ++ .align ALIGNARG(alignment); \ + C_LABEL(name) \ + cfi_startproc; \ + _CET_ENDBR; \ + CALL_MCOUNT + ++/* Common entry 16 byte aligns. */ ++#define ENTRY(name) ENTRY_P2ALIGN (name, 4) ++ + #undef END + #define END(name) \ + cfi_endproc; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-56.patch b/glibc-RHEL-15696-56.patch new file mode 100644 index 0000000..45b9975 --- /dev/null +++ b/glibc-RHEL-15696-56.patch @@ -0,0 +1,658 @@ +From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 21 Sep 2021 18:45:03 -0500 +Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and + size +Content-type: text/plain; charset=UTF-8 + +No bug. + +The frontend optimizations are to: +1. Reorganize logically connected basic blocks so they are either in + the same cache line or adjacent cache lines. +2. Avoid cases when basic blocks unnecessarily cross cache lines. +3. Try and 32 byte align any basic blocks possible without sacrificing + code size. Smaller / Less hot basic blocks are used for this. + +Overall code size shrunk by 168 bytes. This should make up for any +extra costs due to aligning to 64 bytes. + +In general performance before deviated a great deal depending on +whether entry alignment % 64 was 0, 16, 32, or 48. 
These changes +essentially make it so that the current implementation is at least +equal to the best alignment of the original for any arguments. + +The only additional optimization is in the page cross case. Branch on +equals case was removed from the size == [4, 7] case. As well the [4, +7] and [2, 3] case where swapped as [4, 7] is likely a more hot +argument size. + +test-memcmp and test-wmemcmp are both passing. +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++-------- + 1 file changed, 242 insertions(+), 192 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 654dc7ac..2761b54f 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -34,7 +34,24 @@ + area. + 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less. + 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less. +- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */ ++ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. ++ ++When possible the implementation tries to optimize for frontend in the ++following ways: ++Throughput: ++ 1. All code sections that fit are able to run optimally out of the ++ LSD. ++ 2. All code sections that fit are able to run optimally out of the ++ DSB ++ 3. Basic blocks are contained in minimum number of fetch blocks ++ necessary. ++ ++Latency: ++ 1. Logically connected basic blocks are put in the same ++ cache-line. ++ 2. Logically connected basic blocks that do not fit in the same ++ cache-line are put in adjacent lines. This can get beneficial ++ L2 spatial prefetching and L1 next-line prefetching. 
*/ + + # include + +@@ -47,9 +64,11 @@ + # ifdef USE_AS_WMEMCMP + # define CHAR_SIZE 4 + # define VPCMP vpcmpd ++# define VPTEST vptestmd + # else + # define CHAR_SIZE 1 + # define VPCMP vpcmpub ++# define VPTEST vptestmb + # endif + + # define VEC_SIZE 32 +@@ -75,7 +94,9 @@ + */ + + .section .text.evex,"ax",@progbits +-ENTRY (MEMCMP) ++/* Cache align memcmp entry. This allows for much more thorough ++ frontend optimization. */ ++ENTRY_P2ALIGN (MEMCMP, 6) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -89,7 +110,7 @@ ENTRY (MEMCMP) + VPCMP $4, (%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to +- L(return_vec_[0,2]). For L(return_vec_3 destination register ++ L(return_vec_[0,2]). For L(return_vec_3) destination register + must be ecx. */ + testl %eax, %eax + jnz L(return_vec_0) +@@ -121,10 +142,6 @@ ENTRY (MEMCMP) + testl %ecx, %ecx + jnz L(return_vec_3) + +- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so +- compare with zero to get a mask is needed. */ +- vpxorq %XMM0, %XMM0, %XMM0 +- + /* Go to 4x VEC loop. */ + cmpq $(CHAR_PER_VEC * 8), %rdx + ja L(more_8x_vec) +@@ -148,47 +165,61 @@ ENTRY (MEMCMP) + + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- /* Or together YMM1, YMM2, and YMM3 into YMM3. */ +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while +- oring with YMM3. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */ +- VPCMP $4, %YMM4, %YMM0, %k1 ++ oring with YMM1. Result is stored in YMM4. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ ++ /* Or together YMM2, YMM3, and YMM4 into YMM4. */ ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ ++ /* Test YMM4 against itself. Store any CHAR mismatches in k1. 
++ */ ++ VPTEST %YMM4, %YMM4, %k1 ++ /* k1 must go to ecx for L(return_vec_0_1_2_3). */ + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + +- /* NB: aligning 32 here allows for the rest of the jump targets +- to be tuned for 32 byte alignment. Most important this ensures +- the L(more_8x_vec) loop is 32 byte aligned. */ +- .p2align 5 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size = 0 but +- is also faster for size = CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) ++ .p2align 4 ++L(8x_end_return_vec_0_1_2_3): ++ movq %rdx, %rdi ++L(8x_return_vec_0_1_2_3): ++ addq %rdi, %rsi ++L(return_vec_0_1_2_3): ++ VPTEST %YMM1, %YMM1, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) + +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) ++ VPTEST %YMM2, %YMM2, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_1) + +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Create mask in ecx for potentially in bound matches. */ +- bzhil %edx, %eax, %eax +- jnz L(return_vec_0) ++ VPTEST %YMM3, %YMM3, %k0 ++ kmovd %k0, %eax ++ testl %eax, %eax ++ jnz L(return_vec_2) ++L(return_vec_3): ++ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one ++ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache ++ line. 
*/ ++ bsfl %ecx, %ecx ++# ifdef USE_AS_WMEMCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif + ret + + .p2align 4 +@@ -209,10 +240,11 @@ L(return_vec_0): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 1 +- which is good enough for a target not in a loop. */ ++ .p2align 4 + L(return_vec_1): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -226,10 +258,11 @@ L(return_vec_1): + # endif + ret + +- /* NB: No p2align necessary. Alignment % 16 is naturally 2 +- which is good enough for a target not in a loop. */ ++ .p2align 4,, 10 + L(return_vec_2): +- tzcntl %eax, %eax ++ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one ++ fetch block. */ ++ bsfl %eax, %eax + # ifdef USE_AS_WMEMCMP + movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +@@ -243,40 +276,6 @@ L(return_vec_2): + # endif + ret + +- .p2align 4 +-L(8x_return_vec_0_1_2_3): +- /* Returning from L(more_8x_vec) requires restoring rsi. 
*/ +- addq %rdi, %rsi +-L(return_vec_0_1_2_3): +- VPCMP $4, %YMM1, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_0) +- +- VPCMP $4, %YMM2, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_1) +- +- VPCMP $4, %YMM3, %YMM0, %k0 +- kmovd %k0, %eax +- testl %eax, %eax +- jnz L(return_vec_2) +-L(return_vec_3): +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WMEMCMP +- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax +- xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx +- subl %ecx, %eax +-# endif +- ret +- + .p2align 4 + L(more_8x_vec): + /* Set end of s1 in rdx. */ +@@ -288,21 +287,19 @@ L(more_8x_vec): + andq $-VEC_SIZE, %rdi + /* Adjust because first 4x vec where check already. */ + subq $-(VEC_SIZE * 4), %rdi ++ + .p2align 4 + L(loop_4x_vec): + VMOVU (%rsi, %rdi), %YMM1 + vpxorq (%rdi), %YMM1, %YMM1 +- + VMOVU VEC_SIZE(%rsi, %rdi), %YMM2 + vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2 +- + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4 +- VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(8x_return_vec_0_1_2_3) +@@ -319,28 +316,25 @@ L(loop_4x_vec): + cmpl $(VEC_SIZE * 2), %edi + jae L(8x_last_2x_vec) + ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 ++ + VMOVU (%rsi, %rdx), %YMM1 + vpxorq (%rdx), %YMM1, %YMM1 + + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 +- +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3 +- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3 +- + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4 +- 
VPCMP $4, %YMM4, %YMM0, %k1 ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 ++ VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +- /* Restore s1 pointer to rdi. */ +- movq %rdx, %rdi + testl %ecx, %ecx +- jnz L(8x_return_vec_0_1_2_3) ++ jnz L(8x_end_return_vec_0_1_2_3) + /* NB: eax must be zero to reach here. */ + ret + + /* Only entry is from L(more_8x_vec). */ +- .p2align 4 ++ .p2align 4,, 10 + L(8x_last_2x_vec): + VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax +@@ -355,7 +349,31 @@ L(8x_last_1x_vec): + jnz L(8x_return_vec_3) + ret + +- .p2align 4 ++ /* Not ideally aligned (at offset +9 bytes in fetch block) but ++ not aligning keeps it in the same cache line as ++ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code ++ size. */ ++ .p2align 4,, 4 ++L(8x_return_vec_2): ++ subq $VEC_SIZE, %rdx ++L(8x_return_vec_3): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ leaq (%rdx, %rax, CHAR_SIZE), %rax ++ movl (VEC_SIZE * 3)(%rax), %ecx ++ xorl %edx, %edx ++ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ addq %rdx, %rax ++ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ movzbl (VEC_SIZE * 3)(%rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4,, 10 + L(last_2x_vec): + /* Check second to last VEC. */ + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1 +@@ -374,26 +392,49 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4 +-L(8x_return_vec_2): +- subq $VEC_SIZE, %rdx +-L(8x_return_vec_3): +- tzcntl %eax, %eax ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ /* Use bsf to save code size. This is necessary to have ++ L(one_or_less) fit in aligning bytes between. 
*/ ++ bsfl %eax, %eax ++ addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- leaq (%rdx, %rax, CHAR_SIZE), %rax +- movl (VEC_SIZE * 3)(%rax), %ecx ++ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx ++ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- addq %rdx, %rax +- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx +- movzbl (VEC_SIZE * 3)(%rax), %eax ++ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx ++ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + ++ /* NB: L(one_or_less) fits in alignment padding between ++ L(return_vec_1_end) and L(return_vec_0_end). */ ++# ifdef USE_AS_WMEMCMP ++L(one_or_less): ++ jb L(zero) ++ movl (%rdi), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi), %ecx ++ je L(zero) ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++ ret ++# else ++L(one_or_less): ++ jb L(zero) ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax ++ subl %ecx, %eax ++ ret ++# endif ++L(zero): ++ xorl %eax, %eax ++ ret ++ + .p2align 4 + L(return_vec_0_end): + tzcntl %eax, %eax +@@ -412,23 +453,56 @@ L(return_vec_0_end): + ret + + .p2align 4 +-L(return_vec_1_end): ++L(less_vec): ++ /* Check if one or less CHAR. This is necessary for size == 0 ++ but is also faster for size == CHAR_SIZE. */ ++ cmpl $1, %edx ++ jbe L(one_or_less) ++ ++ /* Check if loading one VEC from either s1 or s2 could cause a ++ page cross. This can have false positives but is by far the ++ fastest method. */ ++ movl %edi, %eax ++ orl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ cmpl $(PAGE_SIZE - VEC_SIZE), %eax ++ jg L(page_cross_less_vec) ++ ++ /* No page cross possible. */ ++ VMOVU (%rsi), %YMM2 ++ VPCMP $4, (%rdi), %YMM2, %k1 ++ kmovd %k1, %eax ++ /* Check if any matches where in bounds. Intentionally not ++ storing result in eax to limit dependency chain if it goes to ++ L(return_vec_0_lv). 
*/ ++ bzhil %edx, %eax, %edx ++ jnz L(return_vec_0_lv) ++ xorl %eax, %eax ++ ret ++ ++ /* Essentially duplicate of L(return_vec_0). Ends up not costing ++ any code as shrinks L(less_vec) by allowing 2-byte encoding of ++ the jump and ends up fitting in aligning bytes. As well fits on ++ same cache line as L(less_vec) so also saves a line from having ++ to be fetched on cold calls to memcmp. */ ++ .p2align 4,, 4 ++L(return_vec_0_lv): + tzcntl %eax, %eax +- addl %edx, %eax + # ifdef USE_AS_WMEMCMP +- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx ++ movl (%rdi, %rax, CHAR_SIZE), %ecx + xorl %edx, %edx +- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ + setg %dl + leal -1(%rdx, %rdx), %eax + # else +- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx +- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax + subl %ecx, %eax + # endif + ret + +- + .p2align 4 + L(page_cross_less_vec): + /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +@@ -439,108 +513,84 @@ L(page_cross_less_vec): + cmpl $8, %edx + jae L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax ++ jb L(between_2_3) ++ ++ /* Load as big endian with overlapping movbe to avoid branches. 
++ */ ++ movbe (%rdi), %eax ++ movbe (%rsi), %ecx ++ shlq $32, %rax ++ shlq $32, %rcx ++ movbe -4(%rdi, %rdx), %edi ++ movbe -4(%rsi, %rdx), %esi ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ /* edx is guranteed to be positive int32 in range [4, 7]. */ ++ cmovne %edx, %eax ++ /* ecx is -1 if rcx > rax. Otherwise 0. */ ++ sbbl %ecx, %ecx ++ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == ++ rax then eax and ecx are zero. If rax < rax then ecx is -1 so ++ eax doesn't matter. */ ++ orl %ecx, %eax + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_8_15): + # endif + /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq (%rdi), %xmm1 ++ vmovq (%rsi), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi +- vmovq (%rdi), %XMM1 +- vmovq (%rsi), %XMM2 +- VPCMP $4, %XMM1, %XMM2, %k1 ++ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 ++ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, %xmm1, %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +- .p2align 4 +-L(zero): +- xorl %eax, %eax ++ jnz L(return_vec_0_end) + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ +- VMOVU (%rsi), %XMM2 +- VPCMP $4, (%rdi), %XMM2, %k1 ++ ++ /* Use movups to save code size. */ ++ movups (%rsi), %xmm2 ++ VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ++ jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. 
*/ +- +- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2 +- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi +- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi +- VPCMP $4, (%rdi), %XMM2, %k1 ++ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 ++ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax + testl %eax, %eax +- jnz L(return_vec_0) +- ret +- +-# ifdef USE_AS_WMEMCMP +- .p2align 4 +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax ++ jnz L(return_vec_0_end) + ret +-# else + +- .p2align 4 +-L(between_4_7): +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- jz L(zero_4_7) +- sbbl %eax, %eax +- orl $1, %eax +-L(zero_4_7): ++# ifndef USE_AS_WMEMCMP ++L(between_2_3): ++ /* Load as big endian to avoid branches. */ ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ /* Subtraction is okay because the upper 8 bits are zero. */ ++ subl %ecx, %eax + ret + # endif +- + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-57.patch b/glibc-RHEL-15696-57.patch new file mode 100644 index 0000000..51d5dd0 --- /dev/null +++ b/glibc-RHEL-15696-57.patch @@ -0,0 +1,510 @@ +From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 20 Sep 2021 16:20:15 -0500 +Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Optimization are + +1. change control flow for L(more_2x_vec) to fall through to loop and + jump for L(less_4x_vec) and L(less_8x_vec). 
This uses less code + size and saves jumps for length > 4x VEC_SIZE. + +2. For EVEX/AVX512 move L(less_vec) closer to entry. + +3. Avoid complex address mode for length > 2x VEC_SIZE + +4. Slightly better aligning code for the loop from the perspective of + code size and uops. + +5. Align targets so they make full use of their fetch block and if + possible cache line. + +6. Try and reduce total number of icache lines that will need to be + pulled in for a given length. + +7. Include "local" version of stosb target. For AVX2/EVEX/AVX512 + jumping to the stosb target in the sse2 code section will almost + certainly be to a new page. The new version does increase code size + marginally by duplicating the target but should get better iTLB + behavior as a result. + +test-memset, test-wmemset, and test-bzero are all passing. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/memset.S | 10 +- + .../multiarch/memset-avx2-unaligned-erms.S | 10 +- + .../multiarch/memset-avx512-unaligned-erms.S | 11 +- + .../multiarch/memset-evex-unaligned-erms.S | 11 +- + .../multiarch/memset-vec-unaligned-erms.S | 285 ++++++++++++------ + 5 files changed, 232 insertions(+), 95 deletions(-) + +Conflicts: + sysdeps/x86_64/memset.S + (GNU URL) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index b3426795..8672b030 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -18,13 +18,15 @@ + . */ + + #include ++#define USE_WITH_SSE2 1 + + #define VEC_SIZE 16 ++#define MOV_SIZE 3 ++#define RET_SIZE 1 ++ + #define VEC(i) xmm##i +-/* Don't use movups and movaps since it will get larger nop paddings for +- alignment. 
*/ +-#define VMOVU movdqu +-#define VMOVA movdqa ++#define VMOVU movups ++#define VMOVA movaps + + #define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index ae0860f3..1af668af 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -1,8 +1,14 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX2 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 4 ++# define RET_SIZE 4 ++ + # define VEC(i) ymm##i +-# define VMOVU vmovdqu +-# define VMOVA vmovdqa ++ ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 8ad842fc..f14d6f84 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_AVX512 1 ++ + # define VEC_SIZE 64 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 zmm16 + # define VEC(i) VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 640f0929..64b09e77 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -1,11 +1,18 @@ + #if IS_IN (libc) ++# define USE_WITH_EVEX 1 ++ + # define VEC_SIZE 32 ++# define MOV_SIZE 6 ++# define RET_SIZE 1 ++ + # define XMM0 xmm16 + # define YMM0 ymm16 + # define VEC0 ymm16 + # define VEC(i) 
VEC##i +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++ ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 ++ + # define VZEROUPPER + + # define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 909c33f6..f08b7323 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -63,8 +63,27 @@ + # endif + #endif + ++#if VEC_SIZE == 64 ++# define LOOP_4X_OFFSET (VEC_SIZE * 4) ++#else ++# define LOOP_4X_OFFSET (0) ++#endif ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++# define END_REG rcx ++# define LOOP_REG rdi ++#else ++# define END_REG rdi ++# define LOOP_REG rdx ++#endif ++ + #define PAGE_SIZE 4096 + ++/* Macro to calculate size of small memset block for aligning ++ purposes. */ ++#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1) ++ ++ + #ifndef SECTION + # error SECTION is not defined! + #endif +@@ -74,6 +93,7 @@ + ENTRY (__bzero) + mov %RDI_LP, %RAX_LP /* Set return value. */ + mov %RSI_LP, %RDX_LP /* Set n. */ ++ xorl %esi, %esi + pxor %XMM0, %XMM0 + jmp L(entry_from_bzero) + END (__bzero) +@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + +-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) ++ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) + MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms)) + jb L(less_vec) + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +- VMOVU %VEC(0), (%rdi) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. 
++ */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) + VZEROUPPER_RETURN +- +- .p2align 4 +-L(stosb_more_2x_vec): +- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP +- ja L(stosb) +-#else +- .p2align 4 + #endif +-L(more_2x_vec): +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rdi) +- VMOVU %VEC(0), VEC_SIZE(%rdi) +- cmpq $(VEC_SIZE * 4), %rdx +- ja L(loop_start) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx) +-L(return): +-#if VEC_SIZE > 16 +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ ++ .p2align 4,, 10 ++L(last_2x_vec): ++#ifdef USE_LESS_VEC_MASK_STORE ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) + #else +- ret ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) + #endif ++ VZEROUPPER_RETURN + +-L(loop_start): +- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi) +- cmpq $(VEC_SIZE * 8), %rdx +- jbe L(loop_end) +- andq $-(VEC_SIZE * 2), %rdi +- subq $-(VEC_SIZE * 4), %rdi +- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx +- .p2align 4 +-L(loop): +- VMOVA %VEC(0), (%rdi) +- VMOVA %VEC(0), VEC_SIZE(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi) +- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi) +- subq $-(VEC_SIZE * 4), %rdi +- cmpq %rcx, %rdi +- jb L(loop) +-L(loop_end): +- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN. +- rdx as length is also unchanged. */ +- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx) +- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) +- VZEROUPPER_SHORT_RETURN +- +- .p2align 4 ++ /* If have AVX512 mask instructions put L(less_vec) close to ++ entry as it doesn't take much space and is likely a hot target. ++ */ ++#ifdef USE_LESS_VEC_MASK_STORE ++ .p2align 4,, 10 + L(less_vec): + /* Less than 1 VEC. 
*/ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! + # endif +-# ifdef USE_LESS_VEC_MASK_STORE + /* Clear high bits from edi. Only keeping bits relevant to page + cross check. Note that we are using rax which is set in +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. +- */ ++ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */ + andl $(PAGE_SIZE - 1), %edi +- /* Check if VEC_SIZE store cross page. Mask stores suffer serious +- performance degradation when it has to fault supress. */ ++ /* Check if VEC_SIZE store cross page. Mask stores suffer ++ serious performance degradation when it has to fault supress. ++ */ + cmpl $(PAGE_SIZE - VEC_SIZE), %edi ++ /* This is generally considered a cold target. */ + ja L(cross_page) + # if VEC_SIZE > 32 + movq $-1, %rcx +@@ -247,58 +235,185 @@ L(less_vec): + bzhil %edx, %ecx, %ecx + kmovd %ecx, %k1 + # endif +- vmovdqu8 %VEC(0), (%rax) {%k1} ++ vmovdqu8 %VEC(0), (%rax){%k1} + VZEROUPPER_RETURN + ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* Include L(stosb_local) here if including L(less_vec) between ++ L(stosb_more_2x_vec) and ENTRY. This is to cache align the ++ L(stosb_more_2x_vec) target. */ ++ .p2align 4,, 10 ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN ++# endif ++#endif ++ ++#if defined USE_MULTIARCH && IS_IN (libc) + .p2align 4 +-L(cross_page): ++L(stosb_more_2x_vec): ++ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP ++ ja L(stosb_local) ++#endif ++ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] ++ and (4x, 8x] jump to target. */ ++L(more_2x_vec): ++ ++ /* Two different methods of setting up pointers / compare. The ++ two methods are based on the fact that EVEX/AVX512 mov ++ instructions take more bytes then AVX2/SSE2 mov instructions. As ++ well that EVEX/AVX512 machines also have fast LEA_BID. 
Both ++ setup and END_REG to avoid complex address mode. For EVEX/AVX512 ++ this saves code size and keeps a few targets in one fetch block. ++ For AVX2/SSE2 this helps prevent AGU bottlenecks. */ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + ++ LOOP_4X_OFFSET) with LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ ++ /* Stores to first 2x VEC before cmp as any path forward will ++ require it. */ ++ VMOVU %VEC(0), (%rax) ++ VMOVU %VEC(0), VEC_SIZE(%rax) ++ ++ ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ ++ addq %rdx, %END_REG ++#endif ++ ++ cmpq $(VEC_SIZE * 4), %rdx ++ jbe L(last_2x_vec) ++ ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) ++ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) ++ ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add ++ extra offset to addresses in loop. Used for AVX512 to save space ++ as no way to get (VEC_SIZE * 4) in imm8. */ ++# if LOOP_4X_OFFSET == 0 ++ subq $-(VEC_SIZE * 4), %LOOP_REG + # endif +-# if VEC_SIZE > 32 +- cmpb $32, %dl +- jae L(between_32_63) ++ /* Avoid imm32 compare here to save code size. */ ++ cmpq %rdi, %rcx ++#else ++ addq $-(VEC_SIZE * 4), %END_REG ++ cmpq $(VEC_SIZE * 8), %rdx ++#endif ++ jbe L(last_4x_vec) ++#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) ++ /* Set LOOP_REG (rdx). */ ++ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG ++#endif ++ /* Align dst for loop. 
*/ ++ andq $(VEC_SIZE * -2), %LOOP_REG ++ .p2align 4 ++L(loop): ++ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG) ++ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG) ++ subq $-(VEC_SIZE * 4), %LOOP_REG ++ cmpq %END_REG, %LOOP_REG ++ jb L(loop) ++ .p2align 4,, MOV_SIZE ++L(last_4x_vec): ++ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG) ++ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG) ++L(return): ++#if VEC_SIZE > 16 ++ ZERO_UPPER_VEC_REGISTERS_RETURN ++#else ++ ret ++#endif ++ ++ .p2align 4,, 10 ++#ifndef USE_LESS_VEC_MASK_STORE ++# if defined USE_MULTIARCH && IS_IN (libc) ++ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in ++ range for 2-byte jump encoding. */ ++L(stosb_local): ++ movzbl %sil, %eax ++ mov %RDX_LP, %RCX_LP ++ mov %RDI_LP, %RDX_LP ++ rep stosb ++ mov %RDX_LP, %RAX_LP ++ VZEROUPPER_RETURN + # endif +-# if VEC_SIZE > 16 +- cmpb $16, %dl ++ /* Define L(less_vec) only if not otherwise defined. */ ++ .p2align 4 ++L(less_vec): ++#endif ++L(cross_page): ++#if VEC_SIZE > 32 ++ cmpl $32, %edx ++ jae L(between_32_63) ++#endif ++#if VEC_SIZE > 16 ++ cmpl $16, %edx + jae L(between_16_31) +-# endif +- MOVQ %XMM0, %rcx +- cmpb $8, %dl ++#endif ++ MOVQ %XMM0, %rdi ++ cmpl $8, %edx + jae L(between_8_15) +- cmpb $4, %dl ++ cmpl $4, %edx + jae L(between_4_7) +- cmpb $1, %dl ++ cmpl $1, %edx + ja L(between_2_3) +- jb 1f +- movb %cl, (%rax) +-1: ++ jb L(return) ++ movb %sil, (%rax) + VZEROUPPER_RETURN +-# if VEC_SIZE > 32 ++ ++ /* Align small targets only if not doing so would cross a fetch ++ line. */ ++#if VEC_SIZE > 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. 
*/ + L(between_32_63): +- VMOVU %YMM0, -32(%rax,%rdx) + VMOVU %YMM0, (%rax) ++ VMOVU %YMM0, -32(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +-# if VEC_SIZE > 16 +- /* From 16 to 31. No branch when size == 16. */ ++#endif ++ ++#if VEC_SIZE >= 32 ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + L(between_16_31): +- VMOVU %XMM0, -16(%rax,%rdx) ++ /* From 16 to 31. No branch when size == 16. */ + VMOVU %XMM0, (%rax) ++ VMOVU %XMM0, -16(%rax, %rdx) + VZEROUPPER_RETURN +-# endif +- /* From 8 to 15. No branch when size == 8. */ ++#endif ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_8_15): +- movq %rcx, -8(%rax,%rdx) +- movq %rcx, (%rax) ++ /* From 8 to 15. No branch when size == 8. */ ++ movq %rdi, (%rax) ++ movq %rdi, -8(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %ecx, -4(%rax,%rdx) +- movl %ecx, (%rax) ++ movl %edi, (%rax) ++ movl %edi, -4(%rax, %rdx) + VZEROUPPER_RETURN ++ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. */ +- movw %cx, -2(%rax,%rdx) +- movw %cx, (%rax) ++ movw %di, (%rax) ++ movb %dil, -1(%rax, %rdx) + VZEROUPPER_RETURN + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-58.patch b/glibc-RHEL-15696-58.patch new file mode 100644 index 0000000..cec0788 --- /dev/null +++ b/glibc-RHEL-15696-58.patch @@ -0,0 +1,45 @@ +From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 23 Oct 2021 01:26:47 -0400 +Subject: [PATCH] x86: Replace sse2 instructions with avx in + memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'. + +it could potentially be dangerous to use SSE2 if this function is ever +called without using 'vzeroupper' beforehand. 
While compilers appear +to use 'vzeroupper' before function calls if AVX2 has been used, using +SSE2 here is more brittle. Since it is not absolutely necessary it +should be avoided. + +It costs 2-extra bytes but the extra bytes should only eat into +alignment padding. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 2761b54f..640f6757 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -561,13 +561,13 @@ L(between_16_31): + /* From 16 to 31 bytes. No branch when size == 16. */ + + /* Use movups to save code size. */ +- movups (%rsi), %xmm2 ++ vmovdqu (%rsi), %xmm2 + VPCMP $4, (%rdi), %xmm2, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_0_lv) + /* Use overlapping loads to avoid branches. */ +- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2 ++ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 + VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 + addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx + kmovd %k1, %eax +-- +GitLab + diff --git a/glibc-RHEL-15696-59.patch b/glibc-RHEL-15696-59.patch new file mode 100644 index 0000000..efc618c --- /dev/null +++ b/glibc-RHEL-15696-59.patch @@ -0,0 +1,695 @@ +From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 29 Oct 2021 12:40:20 -0700 +Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load +Content-type: text/plain; charset=UTF-8 + +In strcmp-evex.S, to compare 2 32-byte strings, replace + + VMOVU (%rdi, %rdx), %YMM0 + VMOVU (%rsi, %rdx), %YMM1 + /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ + VPCMP $4, %YMM0, %YMM1, %k0 + VPCMP $0, %YMMZERO, %YMM0, %k1 + VPCMP $0, %YMMZERO, %YMM1, %k2 + /* Each bit in K1 represents a NULL in YMM0 or YMM1. 
*/ + kord %k1, %k2, %k1 + /* Each bit in K1 represents a NULL or a mismatch. */ + kord %k0, %k1, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jne L(last_vector) + +with + + VMOVU (%rdi, %rdx), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi, %rdx). */ + VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx + incl %ecx + jne L(last_vector) + +It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake +and Ice Lake. + +Co-Authored-By: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------ + 1 file changed, 243 insertions(+), 218 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index d5aa6daa..82f12ac8 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -41,6 +41,8 @@ + # ifdef USE_AS_WCSCMP + /* Compare packed dwords. */ + # define VPCMP vpcmpd ++# define VPMINU vpminud ++# define VPTESTM vptestmd + # define SHIFT_REG32 r8d + # define SHIFT_REG64 r8 + /* 1 dword char == 4 bytes. */ +@@ -48,6 +50,8 @@ + # else + /* Compare packed bytes. */ + # define VPCMP vpcmpb ++# define VPMINU vpminub ++# define VPTESTM vptestmb + # define SHIFT_REG32 ecx + # define SHIFT_REG64 rcx + /* 1 byte char == 1 byte. */ +@@ -67,6 +71,9 @@ + # define YMM5 ymm22 + # define YMM6 ymm23 + # define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -76,7 +83,7 @@ + /* The main idea of the string comparison (byte or dword) using 256-bit + EVEX instructions consists of comparing (VPCMP) two ymm vectors. The + latter can be on either packed bytes or dwords depending on +- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the ++ USE_AS_WCSCMP. 
In order to check the null CHAR, algorithm keeps the + matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2 + KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes) + are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd +@@ -123,27 +130,21 @@ ENTRY (STRCMP) + jg L(cross_page) + /* Start comparing 4 vectors. */ + VMOVU (%rdi), %YMM0 +- VMOVU (%rsi), %YMM1 + +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 + +- /* Check for NULL in YMM0. */ +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- /* Check for NULL in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi). */ ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} + +- /* Each bit in K1 represents: +- 1. A mismatch in YMM0 and YMM1. Or +- 2. A NULL in YMM0 or YMM1. +- */ +- kord %k0, %k1, %k1 +- +- ktestd %k1, %k1 +- je L(next_3_vectors) + kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(next_3_vectors) + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -172,9 +173,7 @@ L(return): + # endif + ret + +- .p2align 4 + L(return_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -210,9 +209,7 @@ L(return_vec_size): + # endif + ret + +- .p2align 4 + L(return_2_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +@@ -248,9 +245,7 @@ L(return_2_vec_size): + # endif + ret + +- .p2align 4 + L(return_3_vec_size): +- kmovd %k1, %ecx + tzcntl %ecx, %edx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -289,43 +284,45 @@ L(return_3_vec_size): + .p2align 4 + L(next_3_vectors): + VMOVU VEC_SIZE(%rdi), %YMM0 +- VMOVU VEC_SIZE(%rsi), %YMM1 +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */ ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_vec_size) + +- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4 +- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5 +- +- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */ +- VPCMP $4, %YMM2, %YMM4, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). 
*/ ++ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_2_vec_size) + +- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */ +- VPCMP $4, %YMM3, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM3, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- ktestd %k1, %k1 ++ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 ++ /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(return_3_vec_size) + L(main_loop_header): + leaq (VEC_SIZE * 4)(%rdi), %rdx +@@ -375,56 +372,51 @@ L(back_to_loop): + VMOVA VEC_SIZE(%rax), %YMM2 + VMOVA (VEC_SIZE * 2)(%rax), %YMM4 + VMOVA (VEC_SIZE * 3)(%rax), %YMM6 +- VMOVU (%rdx), %YMM1 +- VMOVU VEC_SIZE(%rdx), %YMM3 +- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5 +- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7 +- +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and +- YMM1. */ +- kord %k0, %k1, %k4 +- +- VPCMP $4, %YMM2, %YMM3, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and +- YMM3. */ +- kord %k0, %k1, %k5 +- +- VPCMP $4, %YMM4, %YMM5, %k0 +- VPCMP $0, %YMMZERO, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and +- YMM5. 
*/ +- kord %k0, %k1, %k6 +- +- VPCMP $4, %YMM6, %YMM7, %k0 +- VPCMP $0, %YMMZERO, %YMM6, %k1 +- VPCMP $0, %YMMZERO, %YMM7, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and +- YMM7. */ +- kord %k0, %k1, %k7 +- +- kord %k4, %k5, %k0 +- kord %k6, %k7, %k1 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- kortestd %k0, %k1 +- je L(loop) +- ktestd %k4, %k4 ++ ++ VPMINU %YMM0, %YMM2, %YMM8 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ ++ /* A zero CHAR in YMM8 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM8 ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ VPTESTM %YMM8, %YMM8, %k1 ++ ++ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ ++ vpxorq (%rdx), %YMM0, %YMM1 ++ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 ++ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ ++ vporq %YMM1, %YMM3, %YMM9 ++ vporq %YMM5, %YMM7, %YMM10 ++ ++ /* A non-zero CHAR in YMM9 represents a mismatch. */ ++ vporq %YMM9, %YMM10, %YMM9 ++ ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ ++ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ je L(loop) ++ ++ /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ VPTESTM %YMM0, %YMM0, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM0 and (%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_vec) +- kmovd %k4, %edi +- tzcntl %edi, %ecx ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %ecx +@@ -466,9 +458,18 @@ L(test_vec): + cmpq $VEC_SIZE, %r11 + jbe L(zero) + # endif +- ktestd %k5, %k5 ++ /* Each bit set in K1 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM2 and VEC_SIZE(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_2_vec) +- kmovd %k5, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -512,9 +513,18 @@ L(test_2_vec): + cmpq $(VEC_SIZE * 2), %r11 + jbe L(zero) + # endif +- ktestd %k6, %k6 ++ /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ VPTESTM %YMM4, %YMM4, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM4 and (VEC_SIZE * 2)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + je L(test_3_vec) +- kmovd %k6, %ecx + tzcntl %ecx, %edi + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +@@ -558,8 +568,18 @@ L(test_3_vec): + cmpq $(VEC_SIZE * 3), %r11 + jbe L(zero) + # endif +- kmovd %k7, %esi +- tzcntl %esi, %ecx ++ /* Each bit set in K1 represents a non-null CHAR in YMM6. */ ++ VPTESTM %YMM6, %YMM6, %k1 ++ /* Each bit cleared in K0 represents a mismatch or a null CHAR ++ in YMM6 and (VEC_SIZE * 3)(%rdx). */ ++ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} ++ kmovd %k0, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP + /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ + sall $2, %ecx +@@ -615,39 +635,51 @@ L(loop_cross_page): + + VMOVU (%rax, %r10), %YMM2 + VMOVU VEC_SIZE(%rax, %r10), %YMM3 +- VMOVU (%rdx, %r10), %YMM4 +- VMOVU VEC_SIZE(%rdx, %r10), %YMM5 +- +- VPCMP $4, %YMM4, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM4, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and +- YMM4. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM5, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM3, %k4 +- VPCMP $0, %YMMZERO, %YMM5, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and +- YMM5. */ +- kord %k3, %k4, %k3 ++ ++ /* Each bit set in K2 represents a non-null CHAR in YMM2. */ ++ VPTESTM %YMM2, %YMM2, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM2 and 32 bytes at (%rdx, %r10). */ ++ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d ++# ifdef USE_AS_WCSCMP ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ /* Each bit set in K4 represents a non-null CHAR in YMM3. */ ++ VPTESTM %YMM3, %YMM3, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ ++ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. */ ++ notl %edi ++ andl $0xff, %edi ++# else ++ incl %edi ++# endif + + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi + /* NB: Divide shift count by 4 since each bit in K1 represent 4 + bytes. */ + movl %ecx, %SHIFT_REG32 + sarl $2, %SHIFT_REG32 ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. 
*/ ++ orl %r9d, %edi + # else +- kshiftlq $32, %k3, %k2 +-# endif ++ salq $32, %rdi + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ + shrxq %SHIFT_REG64, %rdi, %rdi +@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec): + /* The first VEC_SIZE * 2 bytes match or are ignored. */ + VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 + VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 +- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2 +- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3 +- +- VPCMP $4, %YMM0, %YMM2, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM2, %k2 +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and +- YMM2. */ +- kord %k0, %k1, %k1 +- +- VPCMP $4, %YMM1, %YMM3, %k3 +- VPCMP $0, %YMMZERO, %YMM1, %k4 +- VPCMP $0, %YMMZERO, %YMM3, %k5 +- kord %k4, %k5, %k4 +- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and +- YMM3. */ +- kord %k3, %k4, %k3 + ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} ++ kmovd %k1, %r9d ++ /* Don't use subl since it is the lower 16/32 bits of RDI ++ below. */ ++ notl %r9d + # ifdef USE_AS_WCSCMP +- /* NB: Each bit in K1/K3 represents 4-byte element. */ +- kshiftlw $8, %k3, %k2 ++ /* Only last 8 bits are valid. */ ++ andl $0xff, %r9d ++# endif ++ ++ VPTESTM %YMM1, %YMM1, %k4 ++ /* Each bit cleared in K3 represents a mismatch or a null CHAR ++ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ ++ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} ++ kmovd %k3, %edi ++# ifdef USE_AS_WCSCMP ++ /* Don't use subl since it is the upper 8 bits of EDI below. 
*/ ++ notl %edi ++ andl $0xff, %edi + # else +- kshiftlq $32, %k3, %k2 ++ incl %edi + # endif + +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korq %k1, %k2, %k1 +- kmovq %k1, %rdi ++# ifdef USE_AS_WCSCMP ++ /* NB: Each bit in EDI/R9D represents 4-byte element. */ ++ sall $8, %edi ++ ++ /* Each bit in EDI represents a null CHAR or a mismatch. */ ++ orl %r9d, %edi ++# else ++ salq $32, %rdi ++ ++ /* Each bit in RDI represents a null CHAR or a mismatch. */ ++ orq %r9, %rdi ++# endif + + xorl %r8d, %r8d + /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec): + /* R8 has number of bytes skipped. */ + movl %ecx, %r8d + # ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 ++ /* NB: Divide shift count by 4 since each bit in RDI represent 4 + bytes. */ + sarl $2, %ecx +-# endif ++ /* Skip ECX bytes. */ ++ shrl %cl, %edi ++# else + /* Skip ECX bytes. */ + shrq %cl, %rdi ++# endif + 1: + /* Before jumping back to the loop, set ESI to the number of + VEC_SIZE * 4 blocks before page crossing. */ +@@ -818,7 +863,7 @@ L(cross_page_loop): + movzbl (%rdi, %rdx), %eax + movzbl (%rsi, %rdx), %ecx + # endif +- /* Check null char. */ ++ /* Check null CHAR. */ + testl %eax, %eax + jne L(cross_page_loop) + /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +@@ -901,18 +946,17 @@ L(cross_page): + jg L(cross_page_1_vector) + L(loop_1_vector): + VMOVU (%rdi, %rdx), %YMM0 +- VMOVU (%rsi, %rdx), %YMM1 +- +- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */ +- VPCMP $4, %YMM0, %YMM1, %k0 +- VPCMP $0, %YMMZERO, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k2 +- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in YMM0 and 32 bytes at (%rsi, %rdx). 
*/ ++ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} + kmovd %k1, %ecx +- testl %ecx, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xff, %ecx ++# else ++ incl %ecx ++# endif + jne L(last_vector) + + addl $VEC_SIZE, %edx +@@ -931,18 +975,17 @@ L(cross_page_1_vector): + cmpl $(PAGE_SIZE - 16), %eax + jg L(cross_page_1_xmm) + VMOVU (%rdi, %rdx), %XMM0 +- VMOVU (%rsi, %rdx), %XMM1 +- +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- korw %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- korw %k0, %k1, %k1 +- kmovw %k1, %ecx +- testl %ecx, %ecx ++ ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and 16 bytes at (%rsi, %rdx). */ ++ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ kmovd %k1, %ecx ++# ifdef USE_AS_WCSCMP ++ subl $0xf, %ecx ++# else ++ subl $0xffff, %ecx ++# endif + jne L(last_vector) + + addl $16, %edx +@@ -965,25 +1008,16 @@ L(cross_page_1_xmm): + vmovq (%rdi, %rdx), %XMM0 + vmovq (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 +- kmovd %k1, %ecx +- ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ kmovb %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* Only last 2 bits are valid. */ +- andl $0x3, %ecx ++ subl $0x3, %ecx + # else +- /* Only last 8 bits are valid. 
*/ +- andl $0xff, %ecx ++ subl $0xff, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $8, %edx +@@ -1002,25 +1036,16 @@ L(cross_page_8bytes): + vmovd (%rdi, %rdx), %XMM0 + vmovd (%rsi, %rdx), %XMM1 + +- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */ +- VPCMP $4, %XMM0, %XMM1, %k0 +- VPCMP $0, %XMMZERO, %XMM0, %k1 +- VPCMP $0, %XMMZERO, %XMM1, %k2 +- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */ +- kord %k1, %k2, %k1 +- /* Each bit in K1 represents a NULL or a mismatch. */ +- kord %k0, %k1, %k1 ++ VPTESTM %YMM0, %YMM0, %k2 ++ /* Each bit cleared in K1 represents a mismatch or a null CHAR ++ in XMM0 and XMM1. */ ++ VPCMP $0, %XMM1, %XMM0, %k1{%k2} + kmovd %k1, %ecx +- + # ifdef USE_AS_WCSCMP +- /* Only the last bit is valid. */ +- andl $0x1, %ecx ++ subl $0x1, %ecx + # else +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx ++ subl $0xf, %ecx + # endif +- +- testl %ecx, %ecx + jne L(last_vector) + + addl $4, %edx +-- +GitLab + diff --git a/glibc-RHEL-15696-6.patch b/glibc-RHEL-15696-6.patch new file mode 100644 index 0000000..f6725a6 --- /dev/null +++ b/glibc-RHEL-15696-6.patch @@ -0,0 +1,300 @@ +From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:33:52 -0800 +Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes the strncmp family for x32. Tested on x86-64 and x32. +On x86-64, libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length. 
+ * sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise. + * sysdeps/x86_64/strcmp.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp, + tst-size_t-strncmp and tst-size_t-wcsncmp. + * sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file. + * sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise. + * sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise. +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +- + sysdeps/x86_64/strcmp.S | 6 +- + sysdeps/x86_64/x32/Makefile | 6 +- + sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++ + 7 files changed, 170 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 327e3d87..156c1949 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -79,15 +79,15 @@ + ENTRY (STRCMP) + # ifdef USE_AS_STRNCMP + /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je L(char0) + jb L(zero) + # ifdef USE_AS_WCSCMP + /* Convert units: from wide to byte char. */ +- shl $2, %rdx ++ shl $2, %RDX_LP + # endif + /* Register %r11 tracks the maximum offset. 
*/ +- movq %rdx, %r11 ++ mov %RDX_LP, %R11_LP + # endif + movl %edi, %eax + xorl %edx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d3c07bd2..a1ebea46 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -156,11 +156,11 @@ STRCMP_SSE42: + #endif + + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index e16945b9..f47c8ad4 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -135,11 +135,11 @@ ENTRY (STRCMP) + * This implementation uses SSE to compare up to 16 bytes at a time. + */ + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L +- test %rdx, %rdx ++ test %RDX_LP, %RDX_LP + je LABEL(strcmp_exitz) +- cmp $1, %rdx ++ cmp $1, %RDX_LP + je LABEL(Byte0) +- mov %rdx, %r11 ++ mov %RDX_LP, %R11_LP + #endif + mov %esi, %ecx + mov %edi, %eax +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 98bd9ae9..db302839 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -7,9 +7,11 @@ endif + + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ +- tst-size_t-memrchr tst-size_t-memset ++ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ ++ tst-size_t-strncmp + endif + + ifeq ($(subdir),wcsmbs) +-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset ++tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ ++ tst-size_t-wcsncmp + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +new file mode 100644 +index 00000000..86233593 +--- /dev/null ++++ 
b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c +@@ -0,0 +1,59 @@ ++/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_NAME "strncasecmp" ++#include "test-size_t.h" ++ ++IMPL (strncasecmp, 1) ++ ++typedef int (*proto_t) (const char *, const char *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncasecmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ strncpy ((char *) buf1, (const char *) buf2, page_size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncasecmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +new file mode 100644 +index 00000000..54e6bd83 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c +@@ -0,0 +1,78 @@ ++/* Test strncmp with size_t in the lower 32 bits of 64-bit register. 
++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsncmp" ++#else ++# define TEST_NAME "strncmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++ ++# define STRNCMP wcsncmp ++# define STRNCPY wcsncpy ++# define CHAR wchar_t ++#else ++# define STRNCMP strncmp ++# define STRNCPY strncpy ++# define CHAR char ++#endif ++ ++IMPL (STRNCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_strncmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t dest = { { size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size); ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_strncmp (dest, src); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +new file mode 100644 +index 00000000..4829647c +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c +@@ -0,0 +1,20 @@ ++/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strncmp.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-60.patch b/glibc-RHEL-15696-60.patch new file mode 100644 index 0000000..a3739eb --- /dev/null +++ b/glibc-RHEL-15696-60.patch @@ -0,0 +1,54 @@ +From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001 +From: Fangrui Song +Date: Tue, 2 Nov 2021 20:59:52 -0700 +Subject: [PATCH] x86-64: Replace movzx with movzbl +Content-type: text/plain; charset=UTF-8 + +Clang cannot assemble movzx in the AT&T dialect mode. + +../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction + movzx (%rsi), %ecx + ^~~~ + +Change movzx to movzbl, which follows the AT&T dialect and is used +elsewhere in the file. + +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++-- + sysdeps/x86_64/strcmp.S | 4 ++-- + 2 files changed, 4 insertions(+), 4 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index a1ebea46..d8fdeb3a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz): + .p2align 4 + // XXX Same as code above + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index f47c8ad4..aa6df898 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz): + + .p2align 4 + LABEL(Byte0): +- movzx (%rsi), %ecx +- movzx (%rdi), %eax ++ movzbl (%rsi), %ecx ++ movzbl (%rdi), %eax + + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx +-- +GitLab + diff --git a/glibc-RHEL-15696-61.patch b/glibc-RHEL-15696-61.patch new file mode 100644 index 0000000..d6dbe81 --- /dev/null +++ b/glibc-RHEL-15696-61.patch @@ -0,0 +1,56 @@ +From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Fri, 30 Apr 2021 05:58:59 -0700 +Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM +Content-type: text/plain; charset=UTF-8 + +The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed +that REP MOVSB became faster after 2112 bytes: + + Vector Move REP MOVSB +length=2112, align1=0, align2=0: 24.20 24.40 +length=2112, align1=1, align2=0: 26.07 23.13 +length=2112, align1=0, align2=1: 27.18 28.13 +length=2112, align1=1, align2=1: 26.23 25.16 +length=2176, align1=0, align2=0: 23.18 22.52 +length=2176, align1=2, align2=0: 25.45 22.52 +length=2176, align1=0, align2=2: 27.14 27.82 +length=2176, align1=2, align2=2: 22.73 25.56 +length=2240, align1=0, align2=0: 24.62 24.25 +length=2240, align1=3, align2=0: 29.77 27.15 +length=2240, align1=0, align2=3: 35.55 29.93 +length=2240, align1=3, align2=3: 34.49 25.15 +length=2304, align1=0, align2=0: 34.75 26.64 +length=2304, align1=4, align2=0: 32.09 22.63 +length=2304, align1=0, align2=4: 28.43 31.24 + +Use REP MOVSB for data size > 2112 bytes in memcpy on processors with +fast short REP MOVSB (FSRM). + + * sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set + rep_movsb_threshold to 2112 on processors with fast short REP + MOVSB (FSRM). +--- + sysdeps/x86/cacheinfo.h | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index f72f634a..cc3941d3 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -430,6 +430,12 @@ init_cacheinfo (void) + rep_movsb_threshold = 2048 * (16 / 16); + minimum_rep_movsb_threshold = 16 * 8; + } ++ ++ /* NB: The default REP MOVSB threshold is 2112 on processors with fast ++ short REP MOVSB (FSRM). 
*/ ++ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM)) ++ rep_movsb_threshold = 2112; ++ + if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold) + __x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold; + else +-- +GitLab + diff --git a/glibc-RHEL-15696-62.patch b/glibc-RHEL-15696-62.patch new file mode 100644 index 0000000..a7a9286 --- /dev/null +++ b/glibc-RHEL-15696-62.patch @@ -0,0 +1,136 @@ +From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 1 Nov 2021 00:49:52 -0500 +Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in + dl-cacheinfo.h +Content-type: text/plain; charset=UTF-8 + +No bug. + +This patch doubles the rep_movsb_threshold when using ERMS. Based on +benchmarks the vector copy loop, especially now that it handles 4k +aliasing, is better for these medium-ranged sizes. + +On Skylake with ERMS: + +Size, Align1, Align2, dst>src,(rep movsb) / (vec copy) +4096, 0, 0, 0, 0.975 +4096, 0, 0, 1, 0.953 +4096, 12, 0, 0, 0.969 +4096, 12, 0, 1, 0.872 +4096, 44, 0, 0, 0.979 +4096, 44, 0, 1, 0.83 +4096, 0, 12, 0, 1.006 +4096, 0, 12, 1, 0.989 +4096, 0, 44, 0, 0.739 +4096, 0, 44, 1, 0.942 +4096, 12, 12, 0, 1.009 +4096, 12, 12, 1, 0.973 +4096, 44, 44, 0, 0.791 +4096, 44, 44, 1, 0.961 +4096, 2048, 0, 0, 0.978 +4096, 2048, 0, 1, 0.951 +4096, 2060, 0, 0, 0.986 +4096, 2060, 0, 1, 0.963 +4096, 2048, 12, 0, 0.971 +4096, 2048, 12, 1, 0.941 +4096, 2060, 12, 0, 0.977 +4096, 2060, 12, 1, 0.949 +8192, 0, 0, 0, 0.85 +8192, 0, 0, 1, 0.845 +8192, 13, 0, 0, 0.937 +8192, 13, 0, 1, 0.939 +8192, 45, 0, 0, 0.932 +8192, 45, 0, 1, 0.927 +8192, 0, 13, 0, 0.621 +8192, 0, 13, 1, 0.62 +8192, 0, 45, 0, 0.53 +8192, 0, 45, 1, 0.516 +8192, 13, 13, 0, 0.664 +8192, 13, 13, 1, 0.659 +8192, 45, 45, 0, 0.593 +8192, 45, 45, 1, 0.575 +8192, 2048, 0, 0, 0.854 +8192, 2048, 0, 1, 0.834 +8192, 2061, 0, 0, 0.863 +8192, 2061, 0, 1, 0.857 +8192, 2048, 13, 0, 0.63 +8192, 2048, 13, 1, 0.629 +8192, 2061, 13, 0, 0.627 +8192, 2061, 13, 1, 
0.62 + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86/cacheinfo.h | 8 +++++--- + sysdeps/x86/dl-tunables.list | 26 +++++++++++++++----------- + 2 files changed, 20 insertions(+), 14 deletions(-) + +diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h +index cc3941d3..ac025e08 100644 +--- a/sysdeps/x86/cacheinfo.h ++++ b/sysdeps/x86/cacheinfo.h +@@ -411,18 +411,20 @@ init_cacheinfo (void) + + /* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */ + unsigned int minimum_rep_movsb_threshold; +- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */ ++ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for ++ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB ++ threshold is 2048 * (VEC_SIZE / 16). */ + unsigned int rep_movsb_threshold; + if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) + && !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512)) + { +- rep_movsb_threshold = 2048 * (64 / 16); ++ rep_movsb_threshold = 4096 * (64 / 16); + minimum_rep_movsb_threshold = 64 * 8; + } + else if (CPU_FEATURE_PREFERRED_P (cpu_features, + AVX_Fast_Unaligned_Load)) + { +- rep_movsb_threshold = 2048 * (32 / 16); ++ rep_movsb_threshold = 4096 * (32 / 16); + minimum_rep_movsb_threshold = 32 * 8; + } + else +diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list +index 89bf2966..56c6834a 100644 +--- a/sysdeps/x86/dl-tunables.list ++++ b/sysdeps/x86/dl-tunables.list +@@ -32,17 +32,21 @@ glibc { + } + x86_rep_movsb_threshold { + type: SIZE_T +- # Since there is overhead to set up REP MOVSB operation, REP MOVSB +- # isn't faster on short data. The memcpy micro benchmark in glibc +- # shows that 2KB is the approximate value above which REP MOVSB +- # becomes faster than SSE2 optimization on processors with Enhanced +- # REP MOVSB. Since larger register size can move more data with a +- # single load and store, the threshold is higher with larger register +- # size. 
Note: Since the REP MOVSB threshold must be greater than 8 +- # times of vector size and the default value is 2048 * (vector size +- # / 16), the default value and the minimum value must be updated at +- # run-time. NB: Don't set the default value since we can't tell if +- # the tunable value is set by user or not [BZ #27069]. ++ # Since there is overhead to set up REP MOVSB operation, REP ++ # MOVSB isn't faster on short data. The memcpy micro benchmark ++ # in glibc shows that 2KB is the approximate value above which ++ # REP MOVSB becomes faster than SSE2 optimization on processors ++ # with Enhanced REP MOVSB. Since larger register size can move ++ # more data with a single load and store, the threshold is ++ # higher with larger register size. Micro benchmarks show AVX ++ # REP MOVSB becomes faster apprximately at 8KB. The AVX512 ++ # threshold is extrapolated to 16KB. For machines with FSRM the ++ # threshold is universally set at 2112 bytes. Note: Since the ++ # REP MOVSB threshold must be greater than 8 times of vector ++ # size and the default value is 4096 * (vector size / 16), the ++ # default value and the minimum value must be updated at ++ # run-time. NB: Don't set the default value since we can't tell ++ # if the tunable value is set by user or not [BZ #27069]. + minval: 1 + } + x86_rep_stosb_threshold { +-- +GitLab + diff --git a/glibc-RHEL-15696-63.patch b/glibc-RHEL-15696-63.patch new file mode 100644 index 0000000..c14e8b3 --- /dev/null +++ b/glibc-RHEL-15696-63.patch @@ -0,0 +1,2428 @@ +From 2f9062d7171850451e6044ef78d91ff8c017b9c0 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 10 Nov 2021 16:18:56 -0600 +Subject: [PATCH] x86: Shrink memcmp-sse4.S code size +Content-type: text/plain; charset=UTF-8 + +No bug. + +This implementation refactors memcmp-sse4.S primarily with minimizing +code size in mind. It does this by removing the lookup table logic and +removing the unrolled check from (256, 512] bytes. 
+ +memcmp-sse4 code size reduction : -3487 bytes +wmemcmp-sse4 code size reduction: -1472 bytes + +The current memcmp-sse4.S implementation has a large code size +cost. This has serious adverse effects on the ICache / ITLB. While +in micro-benchmarks the implementation appears fast, traces of +real-world code have shown that the speed in micro benchmarks does not +translate when the ICache/ITLB are not primed, and that the cost +of the code size has measurable negative effects on overall +application performance. + +See https://research.google/pubs/pub48320/ for more details. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/memcmp-sse4.S | 2267 +++++++----------------- + 1 file changed, 646 insertions(+), 1621 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S +index 302900f5..50060006 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S ++++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S +@@ -25,14 +25,14 @@ + # define MEMCMP __memcmp_sse4_1 + # endif + +-# define JMPTBL(I, B) (I - B) ++#ifdef USE_AS_WMEMCMP ++# define CMPEQ pcmpeqd ++# define CHAR_SIZE 4 ++#else ++# define CMPEQ pcmpeqb ++# define CHAR_SIZE 1 ++#endif + +-# define BRANCH_TO_JMPTBL_ENTRY(TABLE, INDEX, SCALE) \ +- lea TABLE(%rip), %r11; \ +- movslq (%r11, INDEX, SCALE), %rcx; \ +- add %r11, %rcx; \ +- _CET_NOTRACK jmp *%rcx; \ +- ud2 + + /* Warning! + wmemcmp has to use SIGNED comparison for elements. +@@ -47,33 +47,253 @@ ENTRY (MEMCMP) + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif +- pxor %xmm0, %xmm0 + cmp $79, %RDX_LP + ja L(79bytesormore) ++ ++ cmp $CHAR_SIZE, %RDX_LP ++ jbe L(firstbyte) ++ ++ /* N in (CHAR_SIZE, 79) bytes. 
*/ ++ cmpl $32, %edx ++ ja L(more_32_bytes) ++ ++ cmpl $16, %edx ++ jae L(16_to_32_bytes) ++ + # ifndef USE_AS_WMEMCMP +- cmp $1, %RDX_LP +- je L(firstbyte) ++ cmpl $8, %edx ++ jae L(8_to_16_bytes) ++ ++ cmpl $4, %edx ++ jb L(2_to_3_bytes) ++ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ ++ bswap %eax ++ bswap %ecx ++ ++ shlq $32, %rax ++ shlq $32, %rcx ++ ++ movl -4(%rdi, %rdx), %edi ++ movl -4(%rsi, %rdx), %esi ++ ++ bswap %edi ++ bswap %esi ++ ++ orq %rdi, %rax ++ orq %rsi, %rcx ++ subq %rcx, %rax ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(2_to_3_bytes): ++ movzwl (%rdi), %eax ++ movzwl (%rsi), %ecx ++ shll $8, %eax ++ shll $8, %ecx ++ bswap %eax ++ bswap %ecx ++ movzbl -1(%rdi, %rdx), %edi ++ movzbl -1(%rsi, %rdx), %esi ++ orl %edi, %eax ++ orl %esi, %ecx ++ subl %ecx, %eax ++ ret ++ ++ .p2align 4,, 8 ++L(8_to_16_bytes): ++ movq (%rdi), %rax ++ movq (%rsi), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ jne L(8_to_16_bytes_done) ++ ++ movq -8(%rdi, %rdx), %rax ++ movq -8(%rsi, %rdx), %rcx ++ ++ bswap %rax ++ bswap %rcx ++ ++ subq %rcx, %rax ++ ++L(8_to_16_bytes_done): ++ cmovne %edx, %eax ++ sbbl %ecx, %ecx ++ orl %ecx, %eax ++ ret ++# else ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl 4(%rdi), %ecx ++ cmpl 4(%rsi), %ecx ++ jne L(8_to_16_bytes_done) ++ movl -4(%rdi, %rdx), %ecx ++ cmpl -4(%rsi, %rdx), %ecx ++ jne L(8_to_16_bytes_done) ++ ret + # endif +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-# ifndef USE_AS_WMEMCMP +- .p2align 4 ++ .p2align 4,, 3 ++L(ret_zero): ++ xorl %eax, %eax ++L(zero): ++ ret ++ ++ .p2align 4,, 8 + L(firstbyte): ++ jb L(ret_zero) ++# ifdef USE_AS_WMEMCMP ++ xorl %eax, %eax ++ movl (%rdi), %ecx ++ cmpl (%rsi), %ecx ++ je L(zero) ++L(8_to_16_bytes_done): ++ setg %al ++ leal -1(%rax, %rax), %eax ++# else + movzbl (%rdi), %eax + movzbl (%rsi), %ecx + sub %ecx, %eax ++# endif + ret 
++ ++ .p2align 4 ++L(vec_return_begin_48): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin_32): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl 32(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl 32(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl 32(%rsi, %rax), %ecx ++ movzbl 32(%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_begin_16): ++ addq $16, %rdi ++ addq $16, %rsi ++L(vec_return_begin): ++ bsfl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ .p2align 4 ++L(vec_return_end_16): ++ subl $16, %edx ++L(vec_return_end): ++ bsfl %eax, %eax ++ addl %edx, %eax ++# ifdef USE_AS_WMEMCMP ++ movl -16(%rdi, %rax), %ecx ++ xorl %edx, %edx ++ cmpl -16(%rsi, %rax), %ecx ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl -16(%rsi, %rax), %ecx ++ movzbl -16(%rdi, %rax), %eax ++ subl %ecx, %eax + # endif ++ ret ++ ++ .p2align 4,, 8 ++L(more_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm0 ++ movdqu 16(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ cmpl $64, %edx ++ jbe L(32_to_64_bytes) ++ movdqu 32(%rdi), %xmm0 ++ movdqu 32(%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ .p2align 4,, 6 ++L(32_to_64_bytes): ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ ++ .p2align 4 
++L(16_to_32_bytes): ++ movdqu (%rdi), %xmm0 ++ movdqu (%rsi), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret ++ + + .p2align 4 + L(79bytesormore): ++ movdqu (%rdi), %xmm0 + movdqu (%rsi), %xmm1 +- movdqu (%rdi), %xmm2 +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ + mov %rsi, %rcx + and $-16, %rsi + add $16, %rsi +@@ -86,1694 +306,499 @@ L(79bytesormore): + + cmp $128, %rdx + ja L(128bytesormore) +-L(less128bytes): +- sub $64, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) + +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(less128bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 
48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(last_64_bytes): ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + ++ .p2align 4 + L(128bytesormore): +- cmp $512, %rdx +- ja L(512bytesormore) + cmp $256, %rdx +- ja L(less512bytes) ++ ja L(unaligned_loop) + L(less256bytes): +- sub $128, %rdx +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi +- +- cmp $64, %rdx +- jae L(less128bytes) +- +- cmp $32, %rdx +- jb L(less32bytesin128) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), 
%xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +-L(less512bytes): +- sub $256, %rdx +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqu 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqu 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqu 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqu 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqu 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqu 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqu 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqu 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqu 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqu 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqu 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqu 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqu 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqu 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytes) ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ 
incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqu (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqu 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqu 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytes) ++ ja L(less128bytes) + + cmp $32, %rdx +- jb L(less32bytesin256) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormore): ++L(unaligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add 
%r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_unaglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_unaligned) + sub $64, %rdx + .p2align 4 + L(64bytesormore_loop): +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loop) ++ ja L(64bytesormore_loop) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ .p2align 4,, 6 ++L(loop_tail): ++ addq %rdx, %rdi ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 ++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ addq %rdx, %rsi ++ movdqu (%rsi), %xmm4 ++ movdqu 16(%rsi), %xmm5 ++ movdqu 32(%rsi), %xmm6 ++ movdqu 48(%rsi), %xmm7 ++ ++ CMPEQ %xmm4, %xmm0 ++ CMPEQ %xmm5, %xmm1 ++ CMPEQ %xmm6, %xmm2 ++ CMPEQ %xmm7, %xmm3 ++ ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 ++ ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) ++ ret + +-L(L2_L3_cache_unaglined): +- sub $64, %rdx ++L(L2_L3_cache_unaligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_unaligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 + +- movdqu 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqu (%rdi), %xmm0 ++ movdqu 16(%rdi), %xmm1 
++ movdqu 32(%rdi), %xmm2 ++ movdqu 48(%rdi), %xmm3 ++ ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqu 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- movdqu 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(L2_L3_unaligned_128bytes_loop) ++ ja L(L2_L3_unaligned_128bytes_loop) ++ jmp L(loop_tail) + +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) + +-/* +- * This case is for machines which are sensitive for unaligned instructions. +- */ ++ /* This case is for machines which are sensitive for unaligned ++ * instructions. */ + .p2align 4 + L(2aligned): + cmp $128, %rdx + ja L(128bytesormorein2aligned) + L(less128bytesin2aligned): +- sub $64, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- cmp $32, %rdx +- jb L(less32bytesin64in2alinged) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin64in2alinged): +- add $64, %rdi +- add $64, %rsi +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ 
++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ cmp $96, %rdx ++ jb L(32_to_64_bytes) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ subq $64, %rdx ++ ++ .p2align 4,, 6 ++L(aligned_last_64_bytes): ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 + L(128bytesormorein2aligned): +- cmp $512, %rdx +- ja L(512bytesormorein2aligned) + cmp $256, %rdx +- ja L(256bytesormorein2aligned) ++ ja L(aligned_loop) + L(less256bytesin2alinged): +- sub $128, %rdx +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 
112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- add $128, %rsi +- add $128, %rdi ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $64, %rdi ++ addq $64, %rsi ++ ++ movdqa (%rdi), %xmm1 ++ CMPEQ (%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin) ++ ++ movdqa 16(%rdi), %xmm1 ++ CMPEQ 16(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_16) ++ ++ movdqa 32(%rdi), %xmm1 ++ CMPEQ 32(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_32) ++ ++ movdqa 48(%rdi), %xmm1 ++ CMPEQ 48(%rsi), %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_begin_48) ++ ++ addq $-128, %rdx ++ subq $-64, %rsi ++ subq $-64, %rdi + + cmp $64, %rdx +- jae L(less128bytesin2aligned) ++ ja L(less128bytesin2aligned) + + cmp $32, %rdx +- jb L(less32bytesin128in2aligned) +- +- movdqu (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqu 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin128in2aligned): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +- +- .p2align 4 +-L(256bytesormorein2aligned): +- +- sub $256, %rdx +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- +- movdqa 32(%rdi), %xmm2 +- pxor 32(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc 
L(48bytesin256) +- +- movdqa 48(%rdi), %xmm2 +- pxor 48(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(64bytesin256) +- +- movdqa 64(%rdi), %xmm2 +- pxor 64(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(80bytesin256) +- +- movdqa 80(%rdi), %xmm2 +- pxor 80(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(96bytesin256) +- +- movdqa 96(%rdi), %xmm2 +- pxor 96(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(112bytesin256) +- +- movdqa 112(%rdi), %xmm2 +- pxor 112(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(128bytesin256) +- +- movdqa 128(%rdi), %xmm2 +- pxor 128(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(144bytesin256) +- +- movdqa 144(%rdi), %xmm2 +- pxor 144(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(160bytesin256) +- +- movdqa 160(%rdi), %xmm2 +- pxor 160(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(176bytesin256) +- +- movdqa 176(%rdi), %xmm2 +- pxor 176(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(192bytesin256) +- +- movdqa 192(%rdi), %xmm2 +- pxor 192(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(208bytesin256) +- +- movdqa 208(%rdi), %xmm2 +- pxor 208(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(224bytesin256) +- +- movdqa 224(%rdi), %xmm2 +- pxor 224(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(240bytesin256) +- +- movdqa 240(%rdi), %xmm2 +- pxor 240(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(256bytesin256) +- +- add $256, %rsi +- add $256, %rdi +- +- cmp $128, %rdx +- jae L(less256bytesin2alinged) +- +- cmp $64, %rdx +- jae L(less128bytesin2aligned) +- +- cmp $32, %rdx +- jb L(less32bytesin256in2alinged) +- +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(16bytesin256) +- +- movdqa 16(%rdi), %xmm2 +- pxor 16(%rsi), %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(32bytesin256) +- sub $32, %rdx +- add $32, %rdi +- add $32, %rsi +-L(less32bytesin256in2alinged): +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ ja L(aligned_last_64_bytes) ++ ++ movdqu -32(%rdi, %rdx), %xmm0 ++ movdqu -32(%rsi, %rdx), %xmm1 ++ CMPEQ 
%xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end_16) ++ ++ movdqu -16(%rdi, %rdx), %xmm0 ++ movdqu -16(%rsi, %rdx), %xmm1 ++ CMPEQ %xmm0, %xmm1 ++ pmovmskb %xmm1, %eax ++ incw %ax ++ jnz L(vec_return_end) ++ ret + + .p2align 4 +-L(512bytesormorein2aligned): ++L(aligned_loop): + # ifdef DATA_CACHE_SIZE_HALF + mov $DATA_CACHE_SIZE_HALF, %R8_LP + # else + mov __x86_data_cache_size_half(%rip), %R8_LP + # endif +- mov %r8, %r9 +- shr $1, %r8 +- add %r9, %r8 +- cmp %r8, %rdx +- ja L(L2_L3_cache_aglined) ++ movq %r8, %r9 ++ addq %r8, %r8 ++ addq %r9, %r8 ++ cmpq %r8, %rdx ++ ja L(L2_L3_cache_aligned) + + sub $64, %rdx + .p2align 4 + L(64bytesormore_loopin2aligned): +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 ++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + add $64, %rsi + add $64, %rdi + sub $64, %rdx +- jae L(64bytesormore_loopin2aligned) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) +-L(L2_L3_cache_aglined): +- sub $64, %rdx ++ ja L(64bytesormore_loopin2aligned) ++ jmp L(loop_tail) + ++L(L2_L3_cache_aligned): ++ subq $64, %rdx + .p2align 4 + L(L2_L3_aligned_128bytes_loop): + prefetchnta 0x1c0(%rdi) + prefetchnta 0x1c0(%rsi) +- movdqa (%rdi), %xmm2 +- pxor (%rsi), %xmm2 +- movdqa %xmm2, %xmm1 +- +- movdqa 16(%rdi), %xmm3 +- pxor 16(%rsi), %xmm3 +- por %xmm3, %xmm1 ++ movdqa (%rdi), %xmm0 ++ movdqa 16(%rdi), %xmm1 
++ movdqa 32(%rdi), %xmm2 ++ movdqa 48(%rdi), %xmm3 + +- movdqa 32(%rdi), %xmm4 +- pxor 32(%rsi), %xmm4 +- por %xmm4, %xmm1 ++ CMPEQ (%rsi), %xmm0 ++ CMPEQ 16(%rsi), %xmm1 ++ CMPEQ 32(%rsi), %xmm2 ++ CMPEQ 48(%rsi), %xmm3 + +- movdqa 48(%rdi), %xmm5 +- pxor 48(%rsi), %xmm5 +- por %xmm5, %xmm1 ++ pand %xmm0, %xmm1 ++ pand %xmm2, %xmm3 ++ pand %xmm1, %xmm3 + +- ptest %xmm1, %xmm0 +- jnc L(64bytesormore_loop_end) +- add $64, %rsi +- add $64, %rdi +- sub $64, %rdx +- jae L(L2_L3_aligned_128bytes_loop) +- +- add $64, %rdx +- add %rdx, %rsi +- add %rdx, %rdi +- BRANCH_TO_JMPTBL_ENTRY(L(table_64bytes), %rdx, 4) ++ pmovmskb %xmm3, %eax ++ incw %ax ++ jnz L(64bytesormore_loop_end) + ++ addq $64, %rsi ++ addq $64, %rdi ++ subq $64, %rdx ++ ja L(L2_L3_aligned_128bytes_loop) ++ jmp L(loop_tail) + + .p2align 4 + L(64bytesormore_loop_end): +- add $16, %rdi +- add $16, %rsi +- ptest %xmm2, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm3, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- ptest %xmm4, %xmm0 +- jnc L(16bytes) +- +- add $16, %rdi +- add $16, %rsi +- jmp L(16bytes) +- +-L(256bytesin256): +- add $256, %rdi +- add $256, %rsi +- jmp L(16bytes) +-L(240bytesin256): +- add $240, %rdi +- add $240, %rsi +- jmp L(16bytes) +-L(224bytesin256): +- add $224, %rdi +- add $224, %rsi +- jmp L(16bytes) +-L(208bytesin256): +- add $208, %rdi +- add $208, %rsi +- jmp L(16bytes) +-L(192bytesin256): +- add $192, %rdi +- add $192, %rsi +- jmp L(16bytes) +-L(176bytesin256): +- add $176, %rdi +- add $176, %rsi +- jmp L(16bytes) +-L(160bytesin256): +- add $160, %rdi +- add $160, %rsi +- jmp L(16bytes) +-L(144bytesin256): +- add $144, %rdi +- add $144, %rsi +- jmp L(16bytes) +-L(128bytesin256): +- add $128, %rdi +- add $128, %rsi +- jmp L(16bytes) +-L(112bytesin256): +- add $112, %rdi +- add $112, %rsi +- jmp L(16bytes) +-L(96bytesin256): +- add $96, %rdi +- add $96, %rsi +- jmp L(16bytes) +-L(80bytesin256): +- add $80, %rdi +- add $80, %rsi +- jmp 
L(16bytes) +-L(64bytesin256): +- add $64, %rdi +- add $64, %rsi +- jmp L(16bytes) +-L(48bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(32bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytesin256): +- add $16, %rdi +- add $16, %rsi +-L(16bytes): +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(8bytes): +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(12bytes): +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(4bytes): +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +-L(0bytes): +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal case for wmemcmp */ +- .p2align 4 +-L(65bytes): +- movdqu -65(%rdi), %xmm1 +- movdqu -65(%rsi), %xmm2 +- mov $-65, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(49bytes): +- movdqu -49(%rdi), %xmm1 +- movdqu -49(%rsi), %xmm2 +- mov $-49, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(33bytes): +- movdqu -33(%rdi), %xmm1 +- movdqu -33(%rsi), %xmm2 +- mov $-33, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(17bytes): +- mov -17(%rdi), %rax +- mov -17(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(9bytes): +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- ret +- +- .p2align 4 +-L(13bytes): +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(5bytes): +- mov -5(%rdi), %eax +- mov -5(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %edx +- sub %edx, %eax +- 
ret +- +- .p2align 4 +-L(66bytes): +- movdqu -66(%rdi), %xmm1 +- movdqu -66(%rsi), %xmm2 +- mov $-66, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(50bytes): +- movdqu -50(%rdi), %xmm1 +- movdqu -50(%rsi), %xmm2 +- mov $-50, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(34bytes): +- movdqu -34(%rdi), %xmm1 +- movdqu -34(%rsi), %xmm2 +- mov $-34, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(18bytes): +- mov -18(%rdi), %rax +- mov -18(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(10bytes): +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(14bytes): +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(6bytes): +- mov -6(%rdi), %eax +- mov -6(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +-L(2bytes): +- movzwl -2(%rsi), %ecx +- movzwl -2(%rdi), %eax +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(67bytes): +- movdqu -67(%rdi), %xmm2 +- movdqu -67(%rsi), %xmm1 +- mov $-67, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(51bytes): +- movdqu -51(%rdi), %xmm2 +- movdqu -51(%rsi), %xmm1 +- mov $-51, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(35bytes): +- movdqu -35(%rsi), %xmm1 +- movdqu -35(%rdi), %xmm2 +- mov $-35, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(19bytes): +- mov -19(%rdi), %rax +- mov -19(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +-L(11bytes): +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 
-4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(15bytes): +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(7bytes): +- mov -7(%rdi), %eax +- mov -7(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(3bytes): +- movzwl -3(%rdi), %eax +- movzwl -3(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin2bytes) +-L(1bytes): +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +-# endif +- +- .p2align 4 +-L(68bytes): +- movdqu -68(%rdi), %xmm2 +- movdqu -68(%rsi), %xmm1 +- mov $-68, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(52bytes): +- movdqu -52(%rdi), %xmm2 +- movdqu -52(%rsi), %xmm1 +- mov $-52, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(36bytes): +- movdqu -36(%rdi), %xmm2 +- movdqu -36(%rsi), %xmm1 +- mov $-36, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(20bytes): +- movdqu -20(%rdi), %xmm2 +- movdqu -20(%rsi), %xmm1 +- mov $-20, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -4(%rsi), %ecx +- +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(69bytes): +- movdqu -69(%rsi), %xmm1 +- movdqu -69(%rdi), %xmm2 +- mov $-69, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(53bytes): +- movdqu -53(%rsi), %xmm1 +- movdqu -53(%rdi), %xmm2 +- mov $-53, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(37bytes): +- movdqu -37(%rsi), %xmm1 +- movdqu 
-37(%rdi), %xmm2 +- mov $-37, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(21bytes): +- movdqu -21(%rsi), %xmm1 +- movdqu -21(%rdi), %xmm2 +- mov $-21, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(70bytes): +- movdqu -70(%rsi), %xmm1 +- movdqu -70(%rdi), %xmm2 +- mov $-70, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(54bytes): +- movdqu -54(%rsi), %xmm1 +- movdqu -54(%rdi), %xmm2 +- mov $-54, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(38bytes): +- movdqu -38(%rsi), %xmm1 +- movdqu -38(%rdi), %xmm2 +- mov $-38, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(22bytes): +- movdqu -22(%rsi), %xmm1 +- movdqu -22(%rdi), %xmm2 +- mov $-22, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(71bytes): +- movdqu -71(%rsi), %xmm1 +- movdqu -71(%rdi), %xmm2 +- mov $-71, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(55bytes): +- movdqu -55(%rdi), %xmm2 +- movdqu -55(%rsi), %xmm1 +- mov $-55, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(39bytes): +- movdqu -39(%rdi), %xmm2 +- movdqu -39(%rsi), %xmm1 +- mov $-39, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(23bytes): +- movdqu -23(%rdi), %xmm2 +- movdqu -23(%rsi), %xmm1 +- mov $-23, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- +- .p2align 4 +-L(72bytes): +- movdqu -72(%rsi), %xmm1 +- movdqu -72(%rdi), %xmm2 +- mov $-72, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) 
+-L(56bytes): +- movdqu -56(%rdi), %xmm2 +- movdqu -56(%rsi), %xmm1 +- mov $-56, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(40bytes): +- movdqu -40(%rdi), %xmm2 +- movdqu -40(%rsi), %xmm1 +- mov $-40, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(24bytes): +- movdqu -24(%rdi), %xmm2 +- movdqu -24(%rsi), %xmm1 +- mov $-24, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -8(%rsi), %rcx +- mov -8(%rdi), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(73bytes): +- movdqu -73(%rsi), %xmm1 +- movdqu -73(%rdi), %xmm2 +- mov $-73, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(57bytes): +- movdqu -57(%rdi), %xmm2 +- movdqu -57(%rsi), %xmm1 +- mov $-57, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(41bytes): +- movdqu -41(%rdi), %xmm2 +- movdqu -41(%rsi), %xmm1 +- mov $-41, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(25bytes): +- movdqu -25(%rdi), %xmm2 +- movdqu -25(%rsi), %xmm1 +- mov $-25, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -9(%rdi), %rax +- mov -9(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzbl -1(%rdi), %eax +- movzbl -1(%rsi), %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(74bytes): +- movdqu -74(%rsi), %xmm1 +- movdqu -74(%rdi), %xmm2 +- mov $-74, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(58bytes): +- movdqu -58(%rdi), %xmm2 +- movdqu -58(%rsi), %xmm1 +- mov $-58, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(42bytes): +- movdqu -42(%rdi), %xmm2 +- movdqu -42(%rsi), %xmm1 +- mov $-42, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(26bytes): +- movdqu -26(%rdi), %xmm2 +- movdqu -26(%rsi), %xmm1 +- mov $-26, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- 
jnc L(less16bytes) +- mov -10(%rdi), %rax +- mov -10(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- movzwl -2(%rdi), %eax +- movzwl -2(%rsi), %ecx +- jmp L(diffin2bytes) +- +- .p2align 4 +-L(75bytes): +- movdqu -75(%rsi), %xmm1 +- movdqu -75(%rdi), %xmm2 +- mov $-75, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(59bytes): +- movdqu -59(%rdi), %xmm2 +- movdqu -59(%rsi), %xmm1 +- mov $-59, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(43bytes): +- movdqu -43(%rdi), %xmm2 +- movdqu -43(%rsi), %xmm1 +- mov $-43, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(27bytes): +- movdqu -27(%rdi), %xmm2 +- movdqu -27(%rsi), %xmm1 +- mov $-27, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -11(%rdi), %rax +- mov -11(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rdi), %eax +- mov -4(%rsi), %ecx +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(76bytes): +- movdqu -76(%rsi), %xmm1 +- movdqu -76(%rdi), %xmm2 +- mov $-76, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(60bytes): +- movdqu -60(%rdi), %xmm2 +- movdqu -60(%rsi), %xmm1 +- mov $-60, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(44bytes): +- movdqu -44(%rdi), %xmm2 +- movdqu -44(%rsi), %xmm1 +- mov $-44, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(28bytes): +- movdqu -28(%rdi), %xmm2 +- movdqu -28(%rsi), %xmm1 +- mov $-28, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -12(%rdi), %rax +- mov -12(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -4(%rsi), %ecx +-# ifndef USE_AS_WMEMCMP +- mov -4(%rdi), %eax +- cmp %eax, %ecx +-# else +- cmp -4(%rdi), %ecx +-# endif +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +- +-# ifndef USE_AS_WMEMCMP +-/* unreal cases for wmemcmp */ +- .p2align 4 +-L(77bytes): +- movdqu -77(%rsi), %xmm1 
+- movdqu -77(%rdi), %xmm2 +- mov $-77, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(61bytes): +- movdqu -61(%rdi), %xmm2 +- movdqu -61(%rsi), %xmm1 +- mov $-61, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(45bytes): +- movdqu -45(%rdi), %xmm2 +- movdqu -45(%rsi), %xmm1 +- mov $-45, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(29bytes): +- movdqu -29(%rdi), %xmm2 +- movdqu -29(%rsi), %xmm1 +- mov $-29, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -13(%rdi), %rax +- mov -13(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(78bytes): +- movdqu -78(%rsi), %xmm1 +- movdqu -78(%rdi), %xmm2 +- mov $-78, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(62bytes): +- movdqu -62(%rdi), %xmm2 +- movdqu -62(%rsi), %xmm1 +- mov $-62, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(46bytes): +- movdqu -46(%rdi), %xmm2 +- movdqu -46(%rsi), %xmm1 +- mov $-46, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(30bytes): +- movdqu -30(%rdi), %xmm2 +- movdqu -30(%rsi), %xmm1 +- mov $-30, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -14(%rdi), %rax +- mov -14(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +- .p2align 4 +-L(79bytes): +- movdqu -79(%rsi), %xmm1 +- movdqu -79(%rdi), %xmm2 +- mov $-79, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(63bytes): +- movdqu -63(%rdi), %xmm2 +- movdqu -63(%rsi), %xmm1 +- mov $-63, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(47bytes): +- movdqu -47(%rdi), %xmm2 +- movdqu -47(%rsi), %xmm1 +- mov $-47, %dl +- pxor %xmm1, %xmm2 
+- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(31bytes): +- movdqu -31(%rdi), %xmm2 +- movdqu -31(%rsi), %xmm1 +- mov $-31, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- mov -15(%rdi), %rax +- mov -15(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +-# endif +- .p2align 4 +-L(64bytes): +- movdqu -64(%rdi), %xmm2 +- movdqu -64(%rsi), %xmm1 +- mov $-64, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(48bytes): +- movdqu -48(%rdi), %xmm2 +- movdqu -48(%rsi), %xmm1 +- mov $-48, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +-L(32bytes): +- movdqu -32(%rdi), %xmm2 +- movdqu -32(%rsi), %xmm1 +- mov $-32, %dl +- pxor %xmm1, %xmm2 +- ptest %xmm2, %xmm0 +- jnc L(less16bytes) +- +- mov -16(%rdi), %rax +- mov -16(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- +- mov -8(%rdi), %rax +- mov -8(%rsi), %rcx +- cmp %rax, %rcx +- jne L(diffin8bytes) +- xor %eax, %eax +- ret +- +-/* +- * Aligned 8 bytes to avoid 2 branch "taken" in one 16 alinged code block. 
+- */ +- .p2align 3 +-L(less16bytes): +- movsbq %dl, %rdx +- mov (%rsi, %rdx), %rcx +- mov (%rdi, %rdx), %rax +- cmp %rax, %rcx +- jne L(diffin8bytes) +- mov 8(%rsi, %rdx), %rcx +- mov 8(%rdi, %rdx), %rax +-L(diffin8bytes): +- cmp %eax, %ecx +- jne L(diffin4bytes) +- shr $32, %rcx +- shr $32, %rax +- ++ pmovmskb %xmm0, %ecx ++ incw %cx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm1, %ecx ++ notw %cx ++ sall $16, %ecx ++ jnz L(loop_end_ret) ++ ++ pmovmskb %xmm2, %ecx ++ notw %cx ++ shlq $32, %rcx ++ jnz L(loop_end_ret) ++ ++ addq $48, %rdi ++ addq $48, %rsi ++ movq %rax, %rcx ++ ++ .p2align 4,, 6 ++L(loop_end_ret): ++ bsfq %rcx, %rcx + # ifdef USE_AS_WMEMCMP +-/* for wmemcmp */ +- cmp %eax, %ecx +- jne L(diffin4bytes) +- xor %eax, %eax +- ret +-# endif +- +-L(diffin4bytes): +-# ifndef USE_AS_WMEMCMP +- cmp %cx, %ax +- jne L(diffin2bytes) +- shr $16, %ecx +- shr $16, %eax +-L(diffin2bytes): +- cmp %cl, %al +- jne L(end) +- and $0xffff, %eax +- and $0xffff, %ecx +- sub %ecx, %eax +- ret +- +- .p2align 4 +-L(end): +- and $0xff, %eax +- and $0xff, %ecx +- sub %ecx, %eax +- ret ++ movl (%rdi, %rcx), %eax ++ xorl %edx, %edx ++ cmpl (%rsi, %rcx), %eax ++ setg %dl ++ leal -1(%rdx, %rdx), %eax + # else +- +-/* for wmemcmp */ +- mov $1, %eax +- jl L(nequal_bigger) +- neg %eax +- ret +- +- .p2align 4 +-L(nequal_bigger): +- ret +- +-L(unreal_case): +- xor %eax, %eax +- ret ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +- ++ ret + END (MEMCMP) +- +- .section .rodata.sse4.1,"a",@progbits +- .p2align 3 +-# ifndef USE_AS_WMEMCMP +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(1bytes), L(table_64bytes)) +- .int JMPTBL (L(2bytes), L(table_64bytes)) +- .int JMPTBL (L(3bytes), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(5bytes), L(table_64bytes)) +- .int JMPTBL (L(6bytes), L(table_64bytes)) +- .int JMPTBL (L(7bytes), L(table_64bytes)) +- .int JMPTBL (L(8bytes), 
L(table_64bytes)) +- .int JMPTBL (L(9bytes), L(table_64bytes)) +- .int JMPTBL (L(10bytes), L(table_64bytes)) +- .int JMPTBL (L(11bytes), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(13bytes), L(table_64bytes)) +- .int JMPTBL (L(14bytes), L(table_64bytes)) +- .int JMPTBL (L(15bytes), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(17bytes), L(table_64bytes)) +- .int JMPTBL (L(18bytes), L(table_64bytes)) +- .int JMPTBL (L(19bytes), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(21bytes), L(table_64bytes)) +- .int JMPTBL (L(22bytes), L(table_64bytes)) +- .int JMPTBL (L(23bytes), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(25bytes), L(table_64bytes)) +- .int JMPTBL (L(26bytes), L(table_64bytes)) +- .int JMPTBL (L(27bytes), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(29bytes), L(table_64bytes)) +- .int JMPTBL (L(30bytes), L(table_64bytes)) +- .int JMPTBL (L(31bytes), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(33bytes), L(table_64bytes)) +- .int JMPTBL (L(34bytes), L(table_64bytes)) +- .int JMPTBL (L(35bytes), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(37bytes), L(table_64bytes)) +- .int JMPTBL (L(38bytes), L(table_64bytes)) +- .int JMPTBL (L(39bytes), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(41bytes), L(table_64bytes)) +- .int JMPTBL (L(42bytes), L(table_64bytes)) +- .int JMPTBL (L(43bytes), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(45bytes), L(table_64bytes)) +- .int JMPTBL (L(46bytes), L(table_64bytes)) +- .int JMPTBL (L(47bytes), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(49bytes), L(table_64bytes)) +- .int JMPTBL (L(50bytes), L(table_64bytes)) +- .int JMPTBL (L(51bytes), L(table_64bytes)) +- 
.int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(53bytes), L(table_64bytes)) +- .int JMPTBL (L(54bytes), L(table_64bytes)) +- .int JMPTBL (L(55bytes), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(57bytes), L(table_64bytes)) +- .int JMPTBL (L(58bytes), L(table_64bytes)) +- .int JMPTBL (L(59bytes), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(61bytes), L(table_64bytes)) +- .int JMPTBL (L(62bytes), L(table_64bytes)) +- .int JMPTBL (L(63bytes), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(65bytes), L(table_64bytes)) +- .int JMPTBL (L(66bytes), L(table_64bytes)) +- .int JMPTBL (L(67bytes), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(69bytes), L(table_64bytes)) +- .int JMPTBL (L(70bytes), L(table_64bytes)) +- .int JMPTBL (L(71bytes), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(73bytes), L(table_64bytes)) +- .int JMPTBL (L(74bytes), L(table_64bytes)) +- .int JMPTBL (L(75bytes), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(77bytes), L(table_64bytes)) +- .int JMPTBL (L(78bytes), L(table_64bytes)) +- .int JMPTBL (L(79bytes), L(table_64bytes)) +-# else +-L(table_64bytes): +- .int JMPTBL (L(0bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(4bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(8bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(12bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- 
.int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(16bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(20bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(24bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(28bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(32bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(36bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(40bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(44bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(48bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(52bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), 
L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(56bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(60bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(64bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(68bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(72bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(76bytes), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +- .int JMPTBL (L(unreal_case), L(table_64bytes)) +-# endif + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-64.patch b/glibc-RHEL-15696-64.patch new file mode 100644 index 0000000..ba7f14a --- /dev/null +++ b/glibc-RHEL-15696-64.patch @@ -0,0 +1,39 @@ +From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:31:51 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ + #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. 
+ +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 29cc143e..60ada70d 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-65.patch b/glibc-RHEL-15696-65.patch new file mode 100644 index 0000000..296d4a9 --- /dev/null +++ b/glibc-RHEL-15696-65.patch @@ -0,0 +1,39 @@ +From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 11 Nov 2021 06:54:01 -0800 +Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common + [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +Replace boolean CAS with value CAS to avoid the extra load. + +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_timedlock.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index 888c12fe..c4627ef6 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. 
*/ + if ((oldval & FUTEX_WAITERS) == 0) + { +- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock, +- oldval | FUTEX_WAITERS, +- oldval) +- != 0) ++ int val; ++ if ((val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, ++ oldval)) != oldval) + { +- oldval = mutex->__data.__lock; ++ oldval = val; + continue; + } + oldval |= FUTEX_WAITERS; +-- +GitLab + diff --git a/glibc-RHEL-15696-66.patch b/glibc-RHEL-15696-66.patch new file mode 100644 index 0000000..4579636 --- /dev/null +++ b/glibc-RHEL-15696-66.patch @@ -0,0 +1,51 @@ +From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Tue, 2 Nov 2021 18:33:07 -0700 +Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537] +Content-type: text/plain; charset=UTF-8 + +CAS instruction is expensive. From the x86 CPU's point of view, getting +a cache line for writing is more expensive than reading. See Appendix +A.2 Spinlock in: + +https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf + +The full compare and swap will grab the cache line exclusive and cause +excessive cache line bouncing. + +Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock +loop if compare may fail to reduce cache line bouncing on contended locks. 
+ +Reviewed-by: Szabolcs Nagy +--- + nptl/pthread_mutex_lock.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index 60ada70d..eb4d8baa 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -56,6 +56,11 @@ + #define FORCE_ELISION(m, s) + #endif + ++#ifndef LLL_MUTEX_READ_LOCK ++# define LLL_MUTEX_READ_LOCK(mutex) \ ++ atomic_load_relaxed (&(mutex)->__data.__lock) ++#endif ++ + static int __pthread_mutex_lock_full (pthread_mutex_t *mutex) + __attribute_noinline__; + +@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); ++ if (LLL_MUTEX_READ_LOCK (mutex) != 0) ++ continue; + } + while (LLL_MUTEX_TRYLOCK (mutex) != 0); + +-- +GitLab + diff --git a/glibc-RHEL-15696-67.patch b/glibc-RHEL-15696-67.patch new file mode 100644 index 0000000..73c8306 --- /dev/null +++ b/glibc-RHEL-15696-67.patch @@ -0,0 +1,71 @@ +From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 12 Nov 2021 11:47:42 -0800 +Subject: [PATCH] Move assignment out of the CAS condition +Content-type: text/plain; charset=UTF-8 + +Update + +commit 49302b8fdf9103b6fc0a398678668a22fa19574c +Author: H.J. Lu +Date: Thu Nov 11 06:54:01 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +and + +commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f +Author: H.J. Lu +Date: Thu Nov 11 06:31:51 2021 -0800 + + Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537] + + Replace boolean CAS with value CAS to avoid the extra load. + +by moving assignment out of the CAS condition. 
+--- + nptl/pthread_mutex_lock.c | 7 +++---- + nptl/pthread_mutex_timedlock.c | 7 +++---- + 2 files changed, 6 insertions(+), 8 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index eb4d8baa..a633d95e 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex) + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c +index c4627ef6..a76c30b7 100644 +--- a/nptl/pthread_mutex_timedlock.c ++++ b/nptl/pthread_mutex_timedlock.c +@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex, + meantime. */ + if ((oldval & FUTEX_WAITERS) == 0) + { +- int val; +- if ((val = atomic_compare_and_exchange_val_acq +- (&mutex->__data.__lock, oldval | FUTEX_WAITERS, +- oldval)) != oldval) ++ int val = atomic_compare_and_exchange_val_acq ++ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval); ++ if (val != oldval) + { + oldval = val; + continue; +-- +GitLab + diff --git a/glibc-RHEL-15696-68.patch b/glibc-RHEL-15696-68.patch new file mode 100644 index 0000000..df35b31 --- /dev/null +++ b/glibc-RHEL-15696-68.patch @@ -0,0 +1,60 @@ +From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 3 Dec 2021 15:29:25 -0800 +Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646] +Content-type: text/plain; charset=UTF-8 + +Must use notl %edi here as lower bits are for CHAR comparisons +potentially out of range thus can be 0 without indicating mismatch. +This fixes BZ #28646. + +Co-Authored-By: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +Conflicts: + string/test-strcmp.c + (new check omitted) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 82f12ac8..6f5c4bf9 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -656,12 +656,13 @@ L(loop_cross_page): + in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ + VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec): + in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ + VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} + kmovd %k3, %edi ++ /* Must use notl %edi here as lower bits are for CHAR ++ comparisons potentially out of range thus can be 0 without ++ indicating mismatch. */ ++ notl %edi + # ifdef USE_AS_WCSCMP + /* Don't use subl since it is the upper 8 bits of EDI below. */ +- notl %edi + andl $0xff, %edi +-# else +- incl %edi + # endif + + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-69.patch b/glibc-RHEL-15696-69.patch new file mode 100644 index 0000000..9f859f2 --- /dev/null +++ b/glibc-RHEL-15696-69.patch @@ -0,0 +1,35 @@ +From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Mon, 6 Dec 2021 07:14:12 -0800 +Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512 + and AVX-VNNI +Content-type: text/plain; charset=UTF-8 + +Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since +they won't lower CPU frequency when ZMM load and store instructions are +used. +--- + sysdeps/x86/cpu-features.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c +index 956bfb4f..5ff2baa0 100644 +--- a/sysdeps/x86/cpu-features.c ++++ b/sysdeps/x86/cpu-features.c +@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features) + |= bit_arch_Prefer_No_VZEROUPPER; + else + { +- cpu_features->preferred[index_arch_Prefer_No_AVX512] +- |= bit_arch_Prefer_No_AVX512; ++ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency ++ when ZMM load and store instructions are used. */ ++ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI)) ++ cpu_features->preferred[index_arch_Prefer_No_AVX512] ++ |= bit_arch_Prefer_No_AVX512; + + /* Avoid RTM abort triggered by VZEROUPPER inside a + transactionally executing RTM region. */ +-- +GitLab + diff --git a/glibc-RHEL-15696-7.patch b/glibc-RHEL-15696-7.patch new file mode 100644 index 0000000..8ef468c --- /dev/null +++ b/glibc-RHEL-15696-7.patch @@ -0,0 +1,153 @@ +From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 21 Jan 2019 11:35:18 -0800 +Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ# + 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes strncpy for x32. Tested on x86-64 and x32.
On x86-64, +libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length. + * sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise. + * sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy. + * sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file. +--- + .../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +- + sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +- + sysdeps/x86_64/x32/Makefile | 2 +- + sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++ + 4 files changed, 64 insertions(+), 6 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c + +Conflicts: + ChangeLog + (removed) + sysdeps/x86_64/multiarch/strcpy-avx2.S + (skipped, only needed for x32 arch) + +diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +index 72bf7e85..50aca22d 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S ++++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S +@@ -40,8 +40,8 @@ + .text + ENTRY (STRCPY) + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 +- test %r8, %r8 ++ mov %RDX_LP, %R8_LP ++ test %R8_LP, %R8_LP + jz L(ExitZero) + # endif + mov %rsi, %rcx +diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +index 9858d0c4..0a62814a 100644 +--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S ++++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S +@@ -31,13 +31,13 @@ ENTRY (STRCPY) + + mov %rsi, %rcx + # ifdef USE_AS_STRNCPY +- mov %rdx, %r8 ++ mov %RDX_LP, %R8_LP + # endif + mov %rdi, %rdx + # ifdef USE_AS_STRNCPY +- test %r8, %r8 ++ test %R8_LP, %R8_LP + jz L(Exit0) +- cmp $8, %r8 ++ cmp $8, %R8_LP + jbe L(StrncpyExit8Bytes) + # endif + cmpb $0, (%rcx) +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index db302839..2a9e20a9 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ 
b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,7 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp ++ tst-size_t-strncmp tst-size_t-strncpy + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +new file mode 100644 +index 00000000..4dec71e6 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c +@@ -0,0 +1,58 @@ ++/* Test strncpy with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <http://www.gnu.org/licenses/>. 
*/ ++ ++#define TEST_NAME "strncpy" ++#include "test-size_t.h" ++ ++IMPL (strncpy, 1) ++ ++typedef char *(*proto_t) (char *, const char*, size_t); ++ ++static void * ++__attribute__ ((noinline, noclone)) ++do_strncpy (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ do_strncpy (dest, src); ++ int res = strncmp (dest.p, src.p, dest.len); ++ if (res) ++ { ++ error (0, 0, "Wrong result in function %s: %i != 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include <support/test-driver.c> +-- +GitLab + diff --git a/glibc-RHEL-15696-70.patch b/glibc-RHEL-15696-70.patch new file mode 100644 index 0000000..8935ac5 --- /dev/null +++ b/glibc-RHEL-15696-70.patch @@ -0,0 +1,389 @@ +From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 24 Dec 2021 18:54:41 -0600 +Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S +Content-type: text/plain; charset=UTF-8 + +No bug. +Optimizations are twofold. + +1) Replace page cross and 0/1 checks with masked load instructions in + L(less_vec). In applications this reduces branch-misses in the + hot [0, 32] case. +2) Change controlflow so that L(less_vec) case gets the fall through. + +Change 2) helps copies in the [0, 32] size range but comes at the cost +of copies in the [33, 64] size range. From profiles of GCC and +Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this +appears to be the right tradeoff. + +Signed-off-by: Noah Goldstein +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++-------------- + 1 file changed, 56 insertions(+), 193 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +index 640f6757..d2899e7c 100644 +--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S ++++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S +@@ -62,15 +62,18 @@ Latency: + # define VMOVU vmovdqu64 + + # ifdef USE_AS_WMEMCMP ++# define VMOVU_MASK vmovdqu32 + # define CHAR_SIZE 4 + # define VPCMP vpcmpd + # define VPTEST vptestmd + # else ++# define VMOVU_MASK vmovdqu8 + # define CHAR_SIZE 1 + # define VPCMP vpcmpub + # define VPTEST vptestmb + # endif + ++ + # define VEC_SIZE 32 + # define PAGE_SIZE 4096 + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) +@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6) + movl %edx, %edx + # endif + cmp $CHAR_PER_VEC, %RDX_LP +- jb L(less_vec) ++ /* Fall through for [0, VEC_SIZE] as its the hottest. */ ++ ja L(more_1x_vec) ++ ++ /* Create mask for CHAR's we want to compare. This allows us to ++ avoid having to include page cross logic. */ ++ movl $-1, %ecx ++ bzhil %edx, %ecx, %ecx ++ kmovd %ecx, %k2 ++ ++ /* Safe to load full ymm with mask. */ ++ VMOVU_MASK (%rsi), %YMM2{%k2} ++ VPCMP $4,(%rdi), %YMM2, %k1{%k2} ++ kmovd %k1, %eax ++ testl %eax, %eax ++ jnz L(return_vec_0) ++ ret + ++ .p2align 4 ++L(return_vec_0): ++ tzcntl %eax, %eax ++# ifdef USE_AS_WMEMCMP ++ movl (%rdi, %rax, CHAR_SIZE), %ecx ++ xorl %edx, %edx ++ cmpl (%rsi, %rax, CHAR_SIZE), %ecx ++ /* NB: no partial register stall here because xorl zero idiom ++ above. */ ++ setg %dl ++ leal -1(%rdx, %rdx), %eax ++# else ++ movzbl (%rsi, %rax), %ecx ++ movzbl (%rdi, %rax), %eax ++ subl %ecx, %eax ++# endif ++ ret ++ ++ ++ .p2align 4 ++L(more_1x_vec): + /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */ + VMOVU (%rsi), %YMM1 + /* Use compare not equals to directly check for mismatch. 
*/ +- VPCMP $4, (%rdi), %YMM1, %k1 ++ VPCMP $4,(%rdi), %YMM1, %k1 + kmovd %k1, %eax + /* NB: eax must be destination register if going to + L(return_vec_[0,2]). For L(return_vec_3) destination register +@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6) + + /* Check third and fourth VEC no matter what. */ + VMOVU (VEC_SIZE * 2)(%rsi), %YMM3 +- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 +- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1 + kmovd %k1, %ecx + testl %ecx, %ecx + jnz L(return_vec_3) +@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6) + VMOVU (VEC_SIZE * 3)(%rsi), %YMM4 + /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while + oring with YMM1. Result is stored in YMM4. */ +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + + /* Or together YMM2, YMM3, and YMM4 into YMM4. */ + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 +@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6) + /* NB: eax must be zero to reach here. */ + ret + +- .p2align 4 ++ ++ .p2align 4,, 8 + L(8x_end_return_vec_0_1_2_3): + movq %rdx, %rdi + L(8x_return_vec_0_1_2_3): +@@ -222,23 +262,6 @@ L(return_vec_3): + # endif + ret + +- .p2align 4 +-L(return_vec_0): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. 
*/ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret + + .p2align 4 + L(return_vec_1): +@@ -297,7 +320,7 @@ L(loop_4x_vec): + VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3 + vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3 + VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -324,7 +347,7 @@ L(loop_4x_vec): + VMOVU VEC_SIZE(%rsi, %rdx), %YMM2 + vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2 + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4 +- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 ++ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4 + vpternlogd $0xfe, %YMM2, %YMM3, %YMM4 + VPTEST %YMM4, %YMM4, %k1 + kmovd %k1, %ecx +@@ -336,14 +359,14 @@ L(loop_4x_vec): + /* Only entry is from L(more_8x_vec). */ + .p2align 4,, 10 + L(8x_last_2x_vec): +- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1 ++ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_2) + /* Naturally aligned to 16 bytes. */ + L(8x_last_1x_vec): + VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1 +- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1 ++ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1 + kmovd %k1, %eax + testl %eax, %eax + jnz L(8x_return_vec_3) +@@ -392,7 +415,9 @@ L(last_1x_vec): + jnz L(return_vec_0_end) + ret + +- .p2align 4,, 10 ++ ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_1_end): + /* Use bsf to save code size. This is necessary to have + L(one_or_less) fit in aligning bytes between. */ +@@ -411,31 +436,8 @@ L(return_vec_1_end): + # endif + ret + +- /* NB: L(one_or_less) fits in alignment padding between +- L(return_vec_1_end) and L(return_vec_0_end). 
*/ +-# ifdef USE_AS_WMEMCMP +-L(one_or_less): +- jb L(zero) +- movl (%rdi), %ecx +- xorl %edx, %edx +- cmpl (%rsi), %ecx +- je L(zero) +- setg %dl +- leal -1(%rdx, %rdx), %eax +- ret +-# else +-L(one_or_less): +- jb L(zero) +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +- ret +-# endif +-L(zero): +- xorl %eax, %eax +- ret +- +- .p2align 4 ++ /* Don't align. Takes 2-fetch blocks either way and aligning ++ will cause code to spill into another cacheline. */ + L(return_vec_0_end): + tzcntl %eax, %eax + addl %edx, %eax +@@ -451,146 +453,7 @@ L(return_vec_0_end): + subl %ecx, %eax + # endif + ret ++ /* 1-byte until next cache line. */ + +- .p2align 4 +-L(less_vec): +- /* Check if one or less CHAR. This is necessary for size == 0 +- but is also faster for size == CHAR_SIZE. */ +- cmpl $1, %edx +- jbe L(one_or_less) +- +- /* Check if loading one VEC from either s1 or s2 could cause a +- page cross. This can have false positives but is by far the +- fastest method. */ +- movl %edi, %eax +- orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(page_cross_less_vec) +- +- /* No page cross possible. */ +- VMOVU (%rsi), %YMM2 +- VPCMP $4, (%rdi), %YMM2, %k1 +- kmovd %k1, %eax +- /* Check if any matches where in bounds. Intentionally not +- storing result in eax to limit dependency chain if it goes to +- L(return_vec_0_lv). */ +- bzhil %edx, %eax, %edx +- jnz L(return_vec_0_lv) +- xorl %eax, %eax +- ret +- +- /* Essentially duplicate of L(return_vec_0). Ends up not costing +- any code as shrinks L(less_vec) by allowing 2-byte encoding of +- the jump and ends up fitting in aligning bytes. As well fits on +- same cache line as L(less_vec) so also saves a line from having +- to be fetched on cold calls to memcmp. 
*/ +- .p2align 4,, 4 +-L(return_vec_0_lv): +- tzcntl %eax, %eax +-# ifdef USE_AS_WMEMCMP +- movl (%rdi, %rax, CHAR_SIZE), %ecx +- xorl %edx, %edx +- cmpl (%rsi, %rax, CHAR_SIZE), %ecx +- /* NB: no partial register stall here because xorl zero idiom +- above. */ +- setg %dl +- leal -1(%rdx, %rdx), %eax +-# else +- movzbl (%rsi, %rax), %ecx +- movzbl (%rdi, %rax), %eax +- subl %ecx, %eax +-# endif +- ret +- +- .p2align 4 +-L(page_cross_less_vec): +- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28 +- bytes. */ +- cmpl $(16 / CHAR_SIZE), %edx +- jae L(between_16_31) +-# ifndef USE_AS_WMEMCMP +- cmpl $8, %edx +- jae L(between_8_15) +- cmpl $4, %edx +- jb L(between_2_3) +- +- /* Load as big endian with overlapping movbe to avoid branches. +- */ +- movbe (%rdi), %eax +- movbe (%rsi), %ecx +- shlq $32, %rax +- shlq $32, %rcx +- movbe -4(%rdi, %rdx), %edi +- movbe -4(%rsi, %rdx), %esi +- orq %rdi, %rax +- orq %rsi, %rcx +- subq %rcx, %rax +- /* edx is guranteed to be positive int32 in range [4, 7]. */ +- cmovne %edx, %eax +- /* ecx is -1 if rcx > rax. Otherwise 0. */ +- sbbl %ecx, %ecx +- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx == +- rax then eax and ecx are zero. If rax < rax then ecx is -1 so +- eax doesn't matter. */ +- orl %ecx, %eax +- ret +- +- .p2align 4,, 8 +-L(between_8_15): +-# endif +- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */ +- vmovq (%rdi), %xmm1 +- vmovq (%rsi), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1 +- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, %xmm1, %xmm2, %k1 +- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +- .p2align 4,, 8 +-L(between_16_31): +- /* From 16 to 31 bytes. No branch when size == 16. */ +- +- /* Use movups to save code size. 
*/ +- vmovdqu (%rsi), %xmm2 +- VPCMP $4, (%rdi), %xmm2, %k1 +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_lv) +- /* Use overlapping loads to avoid branches. */ +- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2 +- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1 +- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx +- kmovd %k1, %eax +- testl %eax, %eax +- jnz L(return_vec_0_end) +- ret +- +-# ifndef USE_AS_WMEMCMP +-L(between_2_3): +- /* Load as big endian to avoid branches. */ +- movzwl (%rdi), %eax +- movzwl (%rsi), %ecx +- shll $8, %eax +- shll $8, %ecx +- bswap %eax +- bswap %ecx +- movzbl -1(%rdi, %rdx), %edi +- movzbl -1(%rsi, %rdx), %esi +- orl %edi, %eax +- orl %esi, %ecx +- /* Subtraction is okay because the upper 8 bits are zero. */ +- subl %ecx, %eax +- ret +-# endif + END (MEMCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-71.patch b/glibc-RHEL-15696-71.patch new file mode 100644 index 0000000..2d018d0 --- /dev/null +++ b/glibc-RHEL-15696-71.patch @@ -0,0 +1,43 @@ +From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001 +From: Jangwoong Kim <6812skiii@gmail.com> +Date: Tue, 14 Dec 2021 21:30:51 +0900 +Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop +Content-type: text/plain; charset=UTF-8 + +The commit: +"Add LLL_MUTEX_READ_LOCK [BZ #28537]" +SHA1: d672a98a1af106bd68deb15576710cd61363f7a6 + +introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop +if atomic load fails. But, "continue" inside of do-while loop +does not skip the evaluation of escape expression, thus CAS +is not skipped. + +Replace do-while with while and skip LLL_MUTEX_TRYLOCK if +LLL_MUTEX_READ_LOCK fails. + +Reviewed-by: H.J. 
Lu +--- + nptl/pthread_mutex_lock.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c +index a633d95e..d96a9933 100644 +--- a/nptl/pthread_mutex_lock.c ++++ b/nptl/pthread_mutex_lock.c +@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex) + break; + } + atomic_spin_nop (); +- if (LLL_MUTEX_READ_LOCK (mutex) != 0) +- continue; + } +- while (LLL_MUTEX_TRYLOCK (mutex) != 0); ++ while (LLL_MUTEX_READ_LOCK (mutex) != 0 ++ || LLL_MUTEX_TRYLOCK (mutex) != 0); + + mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8; + } +-- +GitLab + diff --git a/glibc-RHEL-15696-72.patch b/glibc-RHEL-15696-72.patch new file mode 100644 index 0000000..34f2a61 --- /dev/null +++ b/glibc-RHEL-15696-72.patch @@ -0,0 +1,146 @@ +From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 14:19:15 -0600 +Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896] +Content-type: text/plain; charset=UTF-8 + +In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would +call strcmp-avx2 and wcscmp-avx2 respectively. This would have +no checks around vzeroupper and would trigger spurious +aborts. This commit fixes that. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on +AVX2 machines with and without RTM. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86/Makefile | 5 ++++- + sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++--------- + sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++ + 3 files changed, 48 insertions(+), 10 deletions(-) + create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c + +diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile +index 2d814915..c2111f49 100644 +--- a/sysdeps/x86/Makefile ++++ b/sysdeps/x86/Makefile +@@ -28,7 +28,9 @@ tests += \ + tst-strcpy-rtm \ + tst-strlen-rtm \ + tst-strncmp-rtm \ +- tst-strrchr-rtm ++ tst-strrchr-rtm \ ++ tst-wcsncmp-rtm \ ++# tests + + CFLAGS-tst-memchr-rtm.c += -mrtm + CFLAGS-tst-memcmp-rtm.c += -mrtm +@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm + CFLAGS-tst-strlen-rtm.c += -mrtm + CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error + CFLAGS-tst-strrchr-rtm.c += -mrtm ++CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error + endif + + ifneq ($(enable-cet),no) +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4d0004b5..4e9f094f 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -19,18 +19,32 @@ + #include + #include + ++#ifdef WIDE ++# define CHAR wchar_t ++# define MEMSET wmemset ++# define STRNCMP wcsncmp ++# define TEST_NAME wcsncmp ++#else /* !WIDE */ ++# define CHAR char ++# define MEMSET memset ++# define STRNCMP strncmp ++# define TEST_NAME strncmp ++#endif /* !WIDE */ ++ ++ ++ + #define LOOP 3000 + #define STRING_SIZE 1024 +-char string1[STRING_SIZE]; +-char string2[STRING_SIZE]; ++CHAR string1[STRING_SIZE]; ++CHAR string2[STRING_SIZE]; + + __attribute__ ((noinline, noclone)) + static int + prepare (void) + { +- memset (string1, 'a', STRING_SIZE - 1); +- memset (string2, 'a', STRING_SIZE - 1); +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ MEMSET (string1, 'a', STRING_SIZE - 1); ++ MEMSET (string2, 'a', STRING_SIZE - 1); ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return EXIT_SUCCESS; + else + return EXIT_FAILURE; +@@ -40,7 +54,7 @@ 
__attribute__ ((noinline, noclone)) + static int + function (void) + { +- if (strncmp (string1, string2, STRING_SIZE) == 0) ++ if (STRNCMP (string1, string2, STRING_SIZE) == 0) + return 0; + else + return 1; +@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone)) + static int + function_overflow (void) + { +- if (strncmp (string1, string2, SIZE_MAX) == 0) ++ if (STRNCMP (string1, string2, SIZE_MAX) == 0) + return 0; + else + return 1; +@@ -59,9 +73,9 @@ function_overflow (void) + static int + do_test (void) + { +- int status = do_test_1 ("strncmp", LOOP, prepare, function); ++ int status = do_test_1 (TEST_NAME, LOOP, prepare, function); + if (status != EXIT_SUCCESS) + return status; +- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow); ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); + return status; + } +diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c +new file mode 100644 +index 00000000..bad3b863 +--- /dev/null ++++ b/sysdeps/x86/tst-wcsncmp-rtm.c +@@ -0,0 +1,21 @@ ++/* Test case for wcsncmp inside a transactionally executing RTM region. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#define WIDE 1 ++#include <wchar.h> ++#include "tst-strncmp-rtm.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-73.patch b/glibc-RHEL-15696-73.patch new file mode 100644 index 0000000..e8cc3a2 --- /dev/null +++ b/glibc-RHEL-15696-73.patch @@ -0,0 +1,37 @@ +From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Fri, 18 Feb 2022 17:00:25 -0600 +Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c +Content-type: text/plain; charset=UTF-8 + +Previously TEST_NAME was passing a function pointer. This didn't fail +because of the -Wno-error flag (to allow for overflow sizes passed +to strncmp/wcsncmp) + +Reviewed-by: H.J. Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index 4e9f094f..aef9866c 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -23,12 +23,12 @@ + # define CHAR wchar_t + # define MEMSET wmemset + # define STRNCMP wcsncmp +-# define TEST_NAME wcsncmp ++# define TEST_NAME "wcsncmp" + #else /* !WIDE */ + # define CHAR char + # define MEMSET memset + # define STRNCMP strncmp +-# define TEST_NAME strncmp ++# define TEST_NAME "strncmp" + #endif /* !WIDE */ + + +-- +GitLab + diff --git a/glibc-RHEL-15696-74.patch b/glibc-RHEL-15696-74.patch new file mode 100644 index 0000000..e5e6842 --- /dev/null +++ b/glibc-RHEL-15696-74.patch @@ -0,0 +1,1798 @@ +From b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:38 -0600 +Subject: [PATCH] x86: Optimize strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Optimizations are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. 
This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only in particular cases, +however, and is generally made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases are improved most for smaller sizes [0, 128] +and go about even for (128, 4096]. The loop page cross logic is +improved so some more significant speedup is seen there as well. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. + +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1592 ++++++++++++++---------- + 1 file changed, 940 insertions(+), 652 deletions(-) + +Conflicts: + sysdeps/x86_64/multiarch/strcmp-avx2.S + (account for sw28896 patches) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 70d8499b..554ffe4c 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -26,35 +26,57 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif ++# define VMOVU vmovdqu ++# define VMOVA vmovdqa + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ ++ /* Compare packed dwords. */ + # define VPCMPEQ vpcmpeqd +-/* Compare packed dwords and store minimum. */ ++ /* Compare packed dwords and store minimum. */ + # define VPMINU vpminud +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ ++ /* Compare packed bytes. */ + # define VPCMPEQ vpcmpeqb +-/* Compare packed bytes and store minimum. */ ++ /* Compare packed bytes and store minimum. 
*/ + # define VPMINU vpminub +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ + # ifndef VZEROUPPER + # define VZEROUPPER vzeroupper + # endif + ++# if defined USE_AS_STRNCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ ++# define xmmZERO xmm15 ++# define ymmZERO ymm15 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif +@@ -79,783 +101,1049 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section SECTION(.text),"ax",@progbits +-ENTRY (STRCMP) ++ .section SECTION(.text), "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx ++# endif + cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ + movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ ++ ++ /* Multiplying length by sizeof(wchar_t) can result in overflow. ++ Check if that is possible. All cases where overflow are possible ++ are cases where length is large enough that it can never be a ++ bound on valid memory so just use wcscmp. 
*/ + shrq $56, %rcx +- jnz OVERFLOW_STRCMP +-# endif +- /* Convert units: from wide to byte char. */ +- shl $2, %RDX_LP ++ jnz __wcscmp_avx2 ++ ++ leaq (, %rdx, 4), %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP + # endif ++ vpxor %xmmZERO, %xmmZERO, %xmmZERO + movl %edi, %eax +- xorl %edx, %edx +- /* Make %xmm7 (%ymm7) all zeros in this function. */ +- vpxor %xmm7, %xmm7, %xmm7 + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ +- vmovdqu (%rdi), %ymm1 +- VPCMPEQ (%rsi), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- je L(next_3_vectors) +- tzcntl %ecx, %edx ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (%rdi), %ymm0 ++ /* 1s where s1 and s2 equal. */ ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s at null CHAR. */ ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ /* 1s where s1 and s2 equal AND not null CHAR. */ ++ vpandn %ymm1, %ymm2, %ymm1 ++ ++ /* All 1s -> keep going, any 0s -> return. */ ++ vpmovmskb %ymm1, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $VEC_SIZE, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* All 1s represents all equals. incl will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. 
*/ ++ incl %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + L(return_vzeroupper): + ZERO_UPPER_VEC_REGISTERS_RETURN + +- .p2align 4 +-L(return_vec_size): +- tzcntl %ecx, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 8 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ VZEROUPPER_RETURN ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_avx2 ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ ++ jnbe __strcmp_avx2 ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif ++L(ret1): ++ ret + # endif +- VZEROUPPER_RETURN + +- .p2align 4 +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ /* rdx must be > CHAR_PER_VEC so save to subtract w.o fear of ++ overflow. */ ++ addq $-VEC_SIZE, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): + VZEROUPPER_RETURN + +- .p2align 4 +-L(return_3_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). 
*/ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++L(return_vec_3): ++ salq $32, %rcx ++# endif ++ ++L(return_vec_2): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + VZEROUPPER_RETURN ++# endif ++ ++ .p2align 4,, 10 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. 
*/ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_2) ++ ++ VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_3) + +- .p2align 4 +-L(next_3_vectors): +- vmovdqu VEC_SIZE(%rdi), %ymm6 +- VPCMPEQ VEC_SIZE(%rsi), %ymm6, %ymm3 +- VPMINU %ymm6, %ymm3, %ymm3 +- VPCMPEQ %ymm7, %ymm3, %ymm3 +- vpmovmskb %ymm3, %ecx +- testl %ecx, %ecx +- jne L(return_vec_size) +- vmovdqu (VEC_SIZE * 2)(%rdi), %ymm5 +- vmovdqu (VEC_SIZE * 3)(%rdi), %ymm4 +- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm5, %ymm2 +- VPMINU %ymm5, %ymm2, %ymm2 +- VPCMPEQ %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm2, %ymm2 +- vpmovmskb %ymm2, %ecx +- testl %ecx, %ecx +- jne L(return_2_vec_size) +- VPMINU %ymm4, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. +- 2) On mistmach, offset is before the mistmatched index. 
++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ /* any non-zero positive value that doesn't inference with 0x1. + */ +- subq %rdx, %r11 +- jbe L(zero) +-# endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) ++ movl $2, %r8d + ++# else ++ xorl %r8d, %r8d ++# endif ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ ++# ifdef USE_AS_STRNCMP ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++# endif ++L(prepare_loop_no_len): ++ ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++ addq %rdi, %rsi ++ ++# ifdef USE_AS_STRNCMP ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) +-# endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. 
*/ +- vmovdqa (%rax), %ymm0 +- vmovdqa VEC_SIZE(%rax), %ymm3 +- VPCMPEQ (%rdx), %ymm0, %ymm4 +- VPCMPEQ VEC_SIZE(%rdx), %ymm3, %ymm1 +- VPMINU %ymm0, %ymm4, %ymm4 +- VPMINU %ymm3, %ymm1, %ymm1 +- vmovdqa (VEC_SIZE * 2)(%rax), %ymm2 +- VPMINU %ymm1, %ymm4, %ymm0 +- vmovdqa (VEC_SIZE * 3)(%rax), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm2, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm3, %ymm6 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPMINU %ymm5, %ymm0, %ymm0 +- VPMINU %ymm6, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- +- /* Test each mask (32 bits) individually because for VEC_SIZE +- == 32 is not possible to OR the four masks and keep all bits +- in a 64-bit integer register, differing from SSE2 strcmp +- where ORing is possible. */ +- vpmovmskb %ymm0, %ecx ++ subq $(VEC_SIZE * 4), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %ymm0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %ymm2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ /* ymm1 all 1s where s1 and s2 equal. All 0s otherwise. */ ++ VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 ++ ++ VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ ++ ++ /* If any mismatches or null CHAR then 0 CHAR, otherwise non- ++ zero. */ ++ vpand %ymm0, %ymm1, %ymm1 ++ ++ ++ vpand %ymm2, %ymm3, %ymm3 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ ++ VPMINU %ymm1, %ymm3, %ymm3 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ ++ /* Reduce all 0 CHARs for the 4x VEC into ymm7. */ ++ VPMINU %ymm3, %ymm7, %ymm7 ++ ++ /* If any 0 CHAR then done. 
*/ ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jz L(loop) ++ ++ /* Find which VEC has the mismatch of end of string. */ ++ VPCMPEQ %ymm1, %ymmZERO, %ymm1 ++ vpmovmskb %ymm1, %ecx + testl %ecx, %ecx +- je L(loop) +- VPCMPEQ %ymm7, %ymm4, %ymm0 +- vpmovmskb %ymm0, %edi +- testl %edi, %edi +- je L(test_vec) +- tzcntl %edi, %ecx ++ jnz L(return_vec_0_end) ++ ++ ++ VPCMPEQ %ymm3, %ymmZERO, %ymm3 ++ vpmovmskb %ymm3, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_1_end) ++ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ subq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++ VPCMPEQ %ymm5, %ymmZERO, %ymm5 ++ vpmovmskb %ymm5, %ecx ++ testl %ecx, %ecx ++ jnz L(return_vec_2_end) ++ ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++ tzcntl %LOOP_REG, %LOOP_REG ++ ++# ifdef USE_AS_STRNCMP ++ subl $-(VEC_SIZE), %LOOP_REG ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %ecx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_vec): + # ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) ++ .p2align 4,, 2 ++L(ret_zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN + # endif +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- vpmovmskb %ymm1, %ecx +- testl %ecx, %ecx +- je L(test_2_vec) +- tzcntl %ecx, %edi ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++L(return_vec_1_end): ++ salq $32, %rcx ++# endif ++L(return_vec_0_end): ++# ifndef USE_AS_STRNCMP ++ tzcntl %ecx, %ecx ++# else ++ tzcntq %rcx, %rcx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (%rsi, %rcx), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax ++# endif ++L(ret6): ++ VZEROUPPER_RETURN ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(test_2_vec): ++ .p2align 4,, 10 ++L(return_vec_2_end): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- vpmovmskb %ymm5, %ecx +- testl %ecx, %ecx +- je L(test_3_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx ++ je L(ret11) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret11): + VZEROUPPER_RETURN + +- .p2align 4 +-L(test_3_vec): ++ ++ /* Page cross in rsi in next 4x VEC. */ ++ ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ ++ ++ /* Optimistically rsi and rdi and both aligned inwhich case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) ++ ++ /* Check if we can safely load a VEC. 
*/ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) ++ ++ VMOVA (%rdi), %ymm0 ++ VPCMPEQ (%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 ++ VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ ++ movl $-1, %r10d ++ shlxl %esi, %r10d, %r10d ++ notl %ecx ++ + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. 
*/ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) +-# endif +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- vpmovmskb %ymm6, %esi +- tzcntl %esi, %ecx ++ cmpq %rax, %rdx ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + VZEROUPPER_RETURN + +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 +-# endif +- +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. 
*/ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) +- +- vmovdqu (%rax, %r10), %ymm2 +- vmovdqu VEC_SIZE(%rax, %r10), %ymm3 +- VPCMPEQ (%rdx, %r10), %ymm2, %ymm0 +- VPCMPEQ VEC_SIZE(%rdx, %r10), %ymm3, %ymm1 +- VPMINU %ymm2, %ymm0, %ymm0 +- VPMINU %ymm3, %ymm1, %ymm1 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm1, %ymm1 +- +- vpmovmskb %ymm0, %edi +- vpmovmskb %ymm1, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi +- +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrq %cl, %rdi +- +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- vmovdqu (VEC_SIZE * 2)(%rax, %r10), %ymm2 +- vmovdqu (VEC_SIZE * 3)(%rax, %r10), %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rdx, %r10), %ymm2, %ymm5 +- VPMINU %ymm2, %ymm5, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rdx, %r10), %ymm3, %ymm6 +- VPCMPEQ %ymm7, %ymm5, %ymm5 +- VPMINU %ymm3, %ymm6, %ymm6 +- VPCMPEQ %ymm7, %ymm6, %ymm6 +- +- vpmovmskb %ymm5, %edi +- vpmovmskb %ymm6, %esi +- +- salq $32, %rsi +- xorq %rsi, %rdi + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. 
*/ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* Skip ECX bytes. */ +- shrq %cl, %rdi +- /* R8 has number of bytes skipped. */ +- movl %ecx, %r8d +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi +- +- testq %rdi, %rdi ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVU VEC_SIZE(%rdi), %ymm0 ++ VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_1_end) ++ + # ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) +-# else +- je L(back_to_loop) ++ cmpq $(VEC_SIZE * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif +- tzcntq %rdi, %rcx +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx ++ ++ subl $-(VEC_SIZE * 4), %eax ++ ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 ++ VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ jnz L(return_vec_page_cross_1) ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ /* Must check length here as length might proclude reading next ++ page. */ ++ cmpq %rax, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# endif ++ ++ /* Finish the loop. 
*/ ++ VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 ++ ++ VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 ++ VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ vpand %ymm4, %ymm5, %ymm5 ++ vpand %ymm6, %ymm7, %ymm7 ++ VPMINU %ymm5, %ymm7, %ymm7 ++ VPCMPEQ %ymm7, %ymmZERO, %ymm7 ++ vpmovmskb %ymm7, %LOOP_REG ++ testl %LOOP_REG, %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(VEC_SIZE * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ VZEROUPPER_RETURN + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ jmp L(loop_skip_page_cross_check) + # endif +- VZEROUPPER_RETURN + ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ leal -VEC_SIZE(%rax, %rcx), %ecx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_in_loop_page_cross) ++# else ++ addl %eax, %ecx + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. 
*/ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ movl VEC_OFFSET(%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx + subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++L(ret9): ++ VZEROUPPER_RETURN ++ ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif ++ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ xorl %r8d, %r8d + # endif +- /* Check null char. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. */ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ /* Check if less than 1x VEC till page cross. 
*/ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ ++ .p2align 4,, 10 ++L(page_cross_loop): ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ incl %ecx ++ ++ jnz L(check_ret_vec_page_cross) ++ addl $VEC_SIZE, %OFFSET_REG ++# ifdef USE_AS_STRNCMP ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif +- VZEROUPPER_RETURN ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ ++ VMOVU (%rdi, %OFFSET_REG64), %ymm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ VPCMPEQ %ymm0, %ymmZERO, %ymm2 ++ vpandn %ymm1, %ymm2, %ymm1 ++ vpmovmskb %ymm1, %ecx ++ ++# ifdef USE_AS_STRNCMP ++ leal VEC_SIZE(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++ addq %rdi, %rdx ++# endif ++ incl %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. 
*/ +- movl $0, %eax ++ movl (%rdi, %rcx), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- VZEROUPPER_RETURN ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ VZEROUPPER_RETURN + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ incl %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + VZEROUPPER_RETURN ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- VZEROUPPER_RETURN ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++ /* Find largest load size we can use. 
*/ ++ cmpl $16, %eax ++ ja L(less_16_till_page) ++ ++ VMOVU (%rdi), %xmm0 ++ VPCMPEQ (%rsi), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ movl $16, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif +- tzcntl %ecx, %edx ++ ++ VMOVU (%rdi, %OFFSET_REG64), %xmm0 ++ VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incw %cx ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ addl $16, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi + # endif +-# ifdef USE_AS_WCSCMP ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ ret + # endif +- VZEROUPPER_RETURN + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. +- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. 
*/ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- vmovdqu (%rdi, %rdx), %ymm1 +- VPCMPEQ (%rsi, %rdx), %ymm1, %ymm0 +- VPMINU %ymm1, %ymm0, %ymm0 +- VPCMPEQ %ymm7, %ymm0, %ymm0 +- vpmovmskb %ymm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) + +- addl $VEC_SIZE, %edx ++ .p2align 4,, 10 ++L(less_16_till_page): ++ /* Find largest load size we can use. */ ++ cmpl $24, %eax ++ ja L(less_8_till_page) + +- addl $VEC_SIZE, %eax +-# ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- vmovdqu (%rdi, %rdx), %xmm1 +- VPCMPEQ (%rsi, %rdx), %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $8, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif ++ movl $24, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ ++ ++ ++ vmovq (%rdi, %OFFSET_REG64), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ incb %cl ++ jnz L(check_ret_vec_page_cross) ++ + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) +-# endif +- +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. 
NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %xmm1 +- vmovq (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 8 bits are valid. */ +- andl $0xff, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ addl $8, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(VEC_SIZE * 4), %rdx + +- addl $8, %edx +- addl $8, %eax ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): ++# ifdef USE_AS_WCSCMP ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addq %rdi, %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. ++ */ + # endif ++ testl %eax, %eax ++ jnz L(prepare_loop_no_len) ++ ret + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %xmm1 +- vmovd (%rsi, %rdx), %xmm0 +- VPCMPEQ %xmm0, %xmm1, %xmm0 +- VPMINU %xmm1, %xmm0, %xmm0 +- VPCMPEQ %xmm7, %xmm0, %xmm0 +- vpmovmskb %xmm0, %ecx +- /* Only last 4 bits are valid. */ +- andl $0xf, %ecx +- testl %ecx, %ecx +- jne L(last_vector) ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ ++# else ++ ++ /* Find largest load size we can use. 
*/ ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $28, %OFFSET_REG ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ ++ ++ vmovd (%rdi, %OFFSET_REG64), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64), %xmm1 ++ VPCMPEQ %xmm0, %xmmZERO, %xmm2 ++ VPCMPEQ %xmm1, %xmm0, %xmm1 ++ vpandn %xmm1, %xmm2, %xmm1 ++ vpmovmskb %ymm1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) ++ ++# ifdef USE_AS_STRNCMP ++ addl $4, %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(VEC_SIZE * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. 
*/ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax +- VZEROUPPER_RETURN +-END (STRCMP) ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). */ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(VEC_SIZE * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax ++ ret ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-75.patch b/glibc-RHEL-15696-75.patch new file mode 100644 index 0000000..4bd0cd4 --- /dev/null +++ b/glibc-RHEL-15696-75.patch @@ -0,0 +1,1992 @@ +From 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 10 Jan 2022 15:35:39 -0600 +Subject: [PATCH] x86: Optimize strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Optimization are primarily to the loop logic and how the page cross +logic interacts with the loop. + +The page cross logic is at times more expensive for short strings near +the end of a page but not crossing the page. This is done to retest +the page cross conditions with a non-faulty check and to improve the +logic for entering the loop afterwards. This is only particular cases, +however, and is general made up for by more than 10x improvements on +the transition from the page cross -> loop case. + +The non-page cross cases as well are nearly universally improved. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass. 
+ +Signed-off-by: Noah Goldstein +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 1712 +++++++++++++----------- + 1 file changed, 919 insertions(+), 793 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 6f5c4bf9..99d8409a 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -26,54 +26,69 @@ + + # define PAGE_SIZE 4096 + +-/* VEC_SIZE = Number of bytes in a ymm register */ ++ /* VEC_SIZE = Number of bytes in a ymm register. */ + # define VEC_SIZE 32 ++# define CHAR_PER_VEC (VEC_SIZE / SIZE_OF_CHAR) + +-/* Shift for dividing by (VEC_SIZE * 4). */ +-# define DIVIDE_BY_VEC_4_SHIFT 7 +-# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT) +-# endif +- +-# define VMOVU vmovdqu64 +-# define VMOVA vmovdqa64 ++# define VMOVU vmovdqu64 ++# define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-/* Compare packed dwords. */ +-# define VPCMP vpcmpd ++# define TESTEQ subl $0xff, ++ /* Compare packed dwords. */ ++# define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd +-# define SHIFT_REG32 r8d +-# define SHIFT_REG64 r8 +-/* 1 dword char == 4 bytes. */ ++ /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else +-/* Compare packed bytes. */ +-# define VPCMP vpcmpb ++# define TESTEQ incl ++ /* Compare packed bytes. */ ++# define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb +-# define SHIFT_REG32 ecx +-# define SHIFT_REG64 rcx +-/* 1 byte char == 1 byte. */ ++ /* 1 byte char == 1 byte. 
*/ + # define SIZE_OF_CHAR 1 + # endif + ++# ifdef USE_AS_STRNCMP ++# define LOOP_REG r9d ++# define LOOP_REG64 r9 ++ ++# define OFFSET_REG8 r9b ++# define OFFSET_REG r9d ++# define OFFSET_REG64 r9 ++# else ++# define LOOP_REG edx ++# define LOOP_REG64 rdx ++ ++# define OFFSET_REG8 dl ++# define OFFSET_REG edx ++# define OFFSET_REG64 rdx ++# endif ++ ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++# define VEC_OFFSET 0 ++# else ++# define VEC_OFFSET (-VEC_SIZE) ++# endif ++ + # define XMMZERO xmm16 +-# define XMM0 xmm17 +-# define XMM1 xmm18 ++# define XMM0 xmm17 ++# define XMM1 xmm18 + + # define YMMZERO ymm16 +-# define YMM0 ymm17 +-# define YMM1 ymm18 +-# define YMM2 ymm19 +-# define YMM3 ymm20 +-# define YMM4 ymm21 +-# define YMM5 ymm22 +-# define YMM6 ymm23 +-# define YMM7 ymm24 +-# define YMM8 ymm25 +-# define YMM9 ymm26 +-# define YMM10 ymm27 ++# define YMM0 ymm17 ++# define YMM1 ymm18 ++# define YMM2 ymm19 ++# define YMM3 ymm20 ++# define YMM4 ymm21 ++# define YMM5 ymm22 ++# define YMM6 ymm23 ++# define YMM7 ymm24 ++# define YMM8 ymm25 ++# define YMM9 ymm26 ++# define YMM10 ymm27 + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -96,985 +111,1096 @@ + the maximum offset is reached before a difference is found, zero is + returned. */ + +- .section .text.evex,"ax",@progbits +-ENTRY (STRCMP) ++ .section .text.evex, "ax", @progbits ++ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP +- /* Check for simple cases (0 or 1) in offset. */ +- cmp $1, %RDX_LP +- je L(char0) +- jb L(zero) +-# ifdef USE_AS_WCSCMP +-# ifndef __ILP32__ +- movq %rdx, %rcx +- /* Check if length could overflow when multiplied by +- sizeof(wchar_t). Checking top 8 bits will cover all potential +- overflow cases as well as redirect cases where its impossible to +- length to bound a valid memory region. In these cases just use +- 'wcscmp'. */ +- shrq $56, %rcx +- jnz __wcscmp_evex +-# endif +- /* Convert units: from wide to byte char. 
*/ +- shl $2, %RDX_LP ++# ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %rdx + # endif +- /* Register %r11 tracks the maximum offset. */ +- mov %RDX_LP, %R11_LP ++ cmp $1, %RDX_LP ++ /* Signed comparison intentional. We use this branch to also ++ test cases where length >= 2^63. These very large sizes can be ++ handled with strcmp as there is no way for that length to ++ actually bound the buffer. */ ++ jle L(one_or_less) + # endif + movl %edi, %eax +- xorl %edx, %edx +- /* Make %XMMZERO (%YMMZERO) all zeros in this function. */ +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO + orl %esi, %eax +- andl $(PAGE_SIZE - 1), %eax +- cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax +- jg L(cross_page) +- /* Start comparing 4 vectors. */ ++ /* Shift out the bits irrelivant to page boundary ([63:12]). */ ++ sall $20, %eax ++ /* Check if s1 or s2 may cross a page in next 4x VEC loads. */ ++ cmpl $((PAGE_SIZE -(VEC_SIZE * 4)) << 20), %eax ++ ja L(page_cross) ++ ++L(no_page_cross): ++ /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %YMM0 +- +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ + VPCMP $0, (%rsi), %YMM0, %k1{%k2} +- + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(next_3_vectors) +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx) is after the maximum +- offset (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $CHAR_PER_VEC, %rdx ++ jbe L(vec_0_test_len) + # endif ++ ++ /* TESTEQ is `incl` for strcmp/strncmp and `subl $0xff` for ++ wcscmp/wcsncmp. */ ++ ++ /* All 1s represents all equals. TESTEQ will overflow to zero in ++ all equals case. Otherwise 1s will carry until position of first ++ mismatch. 
*/ ++ TESTEQ %ecx ++ jz L(more_3x_vec) ++ ++ .p2align 4,, 4 ++L(return_vec_0): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- je L(return) +-L(wcscmp_return): ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret0) + setl %al + negl %eax + orl $1, %eax +-L(return): + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret0): + ret + +-L(return_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after +- the maximum offset (%r11). */ +- addq $VEC_SIZE, %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ .p2align 4,, 4 ++L(vec_0_test_len): ++ notl %ecx ++ bzhil %edx, %ecx, %eax ++ jnz L(return_vec_0) ++ /* Align if will cross fetch block. */ ++ .p2align 4,, 2 ++L(ret_zero): + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif +-# else ++ ret ++ ++ .p2align 4,, 5 ++L(one_or_less): ++ jb L(ret_zero) + # ifdef USE_AS_WCSCMP ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). */ ++ jnbe __wcscmp_evex ++ movl (%rdi), %edx + xorl %eax, %eax +- movl VEC_SIZE(%rdi, %rdx), %ecx +- cmpl VEC_SIZE(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (%rsi), %edx ++ je L(ret1) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl VEC_SIZE(%rdi, %rdx), %eax +- movzbl VEC_SIZE(%rsi, %rdx), %edx +- subl %edx, %eax ++ /* 'nbe' covers the case where length is negative (large ++ unsigned). 
*/ ++ jnbe __strcmp_evex ++ movzbl (%rdi), %eax ++ movzbl (%rsi), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret1): + ret ++# endif + +-L(return_2_vec_size): +- tzcntl %ecx, %edx ++ .p2align 4,, 10 ++L(return_vec_1): ++ tzcntl %ecx, %ecx ++# ifdef USE_AS_STRNCMP ++ /* rdx must be > CHAR_PER_VEC so its safe to subtract without ++ worrying about underflow. */ ++ addq $-CHAR_PER_VEC, %rdx ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret2) ++ setl %al ++ negl %eax ++ orl $1, %eax ++# else ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif ++L(ret2): ++ ret ++ ++ .p2align 4,, 10 + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 2), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++L(return_vec_3): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_2): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +-L(return_3_vec_size): +- tzcntl %ecx, %edx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edx +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is +- after the maximum offset (%r11). */ +- addq $(VEC_SIZE * 3), %rdx +- cmpq %r11, %rdx +- jae L(zero) +-# ifdef USE_AS_WCSCMP ++ cmpq %rcx, %rdx ++ jbe L(ret_zero) ++# endif ++ ++# ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 2)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax +-# endif ++ cmpl (VEC_SIZE * 2)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret3) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else ++ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++# endif ++L(ret3): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_3): ++ tzcntl %ecx, %ecx + # ifdef USE_AS_WCSCMP ++ movl (VEC_SIZE * 3)(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx +- cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ cmpl (VEC_SIZE * 3)(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret4) ++ setl %al ++ negl %eax ++ orl $1, %eax + # else +- movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax +- movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx +- subl %edx, %eax ++ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax ++ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ subl %ecx, %eax + # endif +-# endif ++L(ret4): + ret ++# endif + +- .p2align 4 +-L(next_3_vectors): +- VMOVU VEC_SIZE(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ ++ /* 32 byte align here ensures the main loop is ideally aligned ++ for DSB. */ ++ .p2align 5 ++L(more_3x_vec): ++ /* Safe to compare 4x vectors. */ ++ VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at VEC_SIZE(%rsi). 
*/ +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1) ++ ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) + # endif +- jne L(return_vec_size) + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- jne L(return_2_vec_size) ++ TESTEQ %ecx ++ jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 +- /* Each bit set in K2 represents a non-null CHAR in YMM0. */ + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */ + VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} + kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_3) ++ ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero) ++# endif ++ ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d ++ + # else +- incl %ecx ++ xorl %r8d, %r8d + # endif +- jne L(return_3_vec_size) +-L(main_loop_header): +- leaq (VEC_SIZE * 4)(%rdi), %rdx +- movl $PAGE_SIZE, %ecx +- /* Align load via RAX. */ +- andq $-(VEC_SIZE * 4), %rdx +- subq %rdi, %rdx +- leaq (%rdi, %rdx), %rax ++ ++ /* The prepare labels are various entry points from the page ++ cross logic. */ ++L(prepare_loop): ++ + # ifdef USE_AS_STRNCMP +- /* Starting from this point, the maximum offset, or simply the +- 'offset', DECREASES by the same amount when base pointers are +- moved forward. Return 0 when: +- 1) On match: offset <= the matched vector index. 
+- 2) On mistmach, offset is before the mistmatched index. +- */ +- subq %rdx, %r11 +- jbe L(zero) ++# ifdef USE_AS_WCSCMP ++L(prepare_loop_no_len): ++ movl %edi, %ecx ++ andl $(VEC_SIZE * 4 - 1), %ecx ++ shrl $2, %ecx ++ leaq (CHAR_PER_VEC * 2)(%rdx, %rcx), %rdx ++# else ++ /* Store N + (VEC_SIZE * 4) and place check at the begining of ++ the loop. */ ++ leaq (VEC_SIZE * 2)(%rdi, %rdx), %rdx ++L(prepare_loop_no_len): ++# endif ++# else ++L(prepare_loop_no_len): + # endif +- addq %rsi, %rdx +- movq %rdx, %rsi +- andl $(PAGE_SIZE - 1), %esi +- /* Number of bytes before page crossing. */ +- subq %rsi, %rcx +- /* Number of VEC_SIZE * 4 blocks before page crossing. */ +- shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx +- /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */ +- movl %ecx, %esi +- jmp L(loop_start) + ++ /* Align s1 and adjust s2 accordingly. */ ++ subq %rdi, %rsi ++ andq $-(VEC_SIZE * 4), %rdi ++L(prepare_loop_readj): ++ addq %rdi, %rsi ++# if (defined USE_AS_STRNCMP) && !(defined USE_AS_WCSCMP) ++ subq %rdi, %rdx ++# endif ++ ++L(prepare_loop_aligned): ++ /* eax stores distance from rsi to next page cross. These cases ++ need to be handled specially as the 4x loop could potentially ++ read memory past the length of s1 or s2 and across a page ++ boundary. */ ++ movl $-(VEC_SIZE * 4), %eax ++ subl %esi, %eax ++ andl $(PAGE_SIZE - 1), %eax ++ ++ vpxorq %YMMZERO, %YMMZERO, %YMMZERO ++ ++ /* Loop 4x comparisons at a time. */ + .p2align 4 + L(loop): ++ ++ /* End condition for strncmp. */ + # ifdef USE_AS_STRNCMP +- /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease +- the maximum offset (%r11) by the same amount. */ +- subq $(VEC_SIZE * 4), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 4), %rdx ++ jbe L(ret_zero) + # endif +- addq $(VEC_SIZE * 4), %rax +- addq $(VEC_SIZE * 4), %rdx +-L(loop_start): +- testl %esi, %esi +- leal -1(%esi), %esi +- je L(loop_cross_page) +-L(back_to_loop): +- /* Main loop, comparing 4 vectors are a time. 
*/ +- VMOVA (%rax), %YMM0 +- VMOVA VEC_SIZE(%rax), %YMM2 +- VMOVA (VEC_SIZE * 2)(%rax), %YMM4 +- VMOVA (VEC_SIZE * 3)(%rax), %YMM6 ++ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ ++ /* Check if rsi loads will cross a page boundary. */ ++ addl $-(VEC_SIZE * 4), %eax ++ jnb L(page_cross_during_loop) ++ ++ /* Loop entry after handling page cross during loop. */ ++L(loop_skip_page_cross_check): ++ VMOVA (VEC_SIZE * 0)(%rdi), %YMM0 ++ VMOVA (VEC_SIZE * 1)(%rdi), %YMM2 ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + + VPMINU %YMM0, %YMM2, %YMM8 + VPMINU %YMM4, %YMM6, %YMM9 + +- /* A zero CHAR in YMM8 means that there is a null CHAR. */ +- VPMINU %YMM8, %YMM9, %YMM8 ++ /* A zero CHAR in YMM9 means that there is a null CHAR. */ ++ VPMINU %YMM8, %YMM9, %YMM9 + + /* Each bit set in K1 represents a non-null CHAR in YMM8. */ +- VPTESTM %YMM8, %YMM8, %k1 ++ VPTESTM %YMM9, %YMM9, %k1 + +- /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */ +- vpxorq (%rdx), %YMM0, %YMM1 +- vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3 +- vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5 +- vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7 ++ vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 ++ vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while ++ oring with YMM1. Result is stored in YMM6. */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 + +- vporq %YMM1, %YMM3, %YMM9 +- vporq %YMM5, %YMM7, %YMM10 ++ /* Or together YMM3, YMM5, and YMM6. */ ++ vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + +- /* A non-zero CHAR in YMM9 represents a mismatch. */ +- vporq %YMM9, %YMM10, %YMM9 + +- /* Each bit cleared in K0 represents a mismatch or a null CHAR. */ +- VPCMP $0, %YMMZERO, %YMM9, %k0{%k1} +- kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(loop) ++ /* A non-zero CHAR in YMM6 represents a mismatch. 
*/ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG + +- /* Each bit set in K1 represents a non-null CHAR in YMM0. */ ++ TESTEQ %LOOP_REG ++ jz L(loop) ++ ++ ++ /* Find which VEC has the mismatch of end of string. */ + VPTESTM %YMM0, %YMM0, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM0 and (%rdx). */ + VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_vec) +- tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) + +- .p2align 4 +-L(test_vec): +-# ifdef USE_AS_STRNCMP +- /* The first vector matched. Return 0 if the maximum offset +- (%r11) <= VEC_SIZE. */ +- cmpq $VEC_SIZE, %r11 +- jbe L(zero) +-# endif +- /* Each bit set in K1 represents a non-null CHAR in YMM2. */ + VPTESTM %YMM2, %YMM2, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM2 and VEC_SIZE(%rdx). */ + VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx +-# else +- incl %ecx +-# endif +- je L(test_2_vec) +- tzcntl %ecx, %edi +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %edi +-# endif +-# ifdef USE_AS_STRNCMP +- addq $VEC_SIZE, %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl VEC_SIZE(%rsi, %rdi), %ecx +- cmpl VEC_SIZE(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl VEC_SIZE(%rax, %rdi), %eax +- movzbl VEC_SIZE(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif +-# endif +- ret ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- .p2align 4 +-L(test_2_vec): ++ ++ /* Handle VEC 2 and 3 without branches. */ ++L(return_vec_2_3_end): + # ifdef USE_AS_STRNCMP +- /* The first 2 vectors matched. Return 0 if the maximum offset +- (%r11) <= 2 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 2), %r11 +- jbe L(zero) ++ subq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM4. */ ++ + VPTESTM %YMM4, %YMM4, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM4 and (VEC_SIZE * 2)(%rdx). */ + VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} + kmovd %k0, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ TESTEQ %ecx ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %LOOP_REG ++ orl %ecx, %LOOP_REG + # else +- incl %ecx ++ salq $CHAR_PER_VEC, %LOOP_REG64 ++ orq %rcx, %LOOP_REG64 ++# endif ++L(return_vec_3_end): ++ /* LOOP_REG contains matches for null/mismatch from the loop. If ++ VEC 0,1,and 2 all have no null and no mismatches then mismatch ++ must entirely be from VEC 3 which is fully represented by ++ LOOP_REG. 
*/ ++# if CHAR_PER_VEC <= 16 ++ tzcntl %LOOP_REG, %LOOP_REG ++# else ++ tzcntq %LOOP_REG64, %LOOP_REG64 ++# endif ++# ifdef USE_AS_STRNCMP ++ cmpq %LOOP_REG64, %rdx ++ jbe L(ret_zero_end) + # endif +- je L(test_3_vec) +- tzcntl %ecx, %edi ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edi ++ movl (VEC_SIZE * 2)(%rdi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ xorl %eax, %eax ++ cmpl (VEC_SIZE * 2)(%rsi, %LOOP_REG64, SIZE_OF_CHAR), %ecx ++ je L(ret5) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax ++ movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret5): ++ ret ++ + # ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rdi +- cmpq %rdi, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ .p2align 4,, 2 ++L(ret_zero_end): + xorl %eax, %eax +- movl (%rsi, %rdi), %ecx +- cmpl (%rdx, %rdi), %ecx +- jne L(wcscmp_return) ++ ret ++# endif ++ ++ ++ /* The L(return_vec_N_end) differ from L(return_vec_N) in that ++ they use the value of `r8` to negate the return value. This is ++ because the page cross logic can swap `rdi` and `rsi`. 
*/ ++ .p2align 4,, 10 ++# ifdef USE_AS_STRNCMP ++L(return_vec_1_end): ++# if CHAR_PER_VEC <= 16 ++ sall $CHAR_PER_VEC, %ecx + # else +- movzbl (%rax, %rdi), %eax +- movzbl (%rdx, %rdi), %edx +- subl %edx, %eax ++ salq $CHAR_PER_VEC, %rcx + # endif ++# endif ++L(return_vec_0_end): ++# if (CHAR_PER_VEC <= 16) || !(defined USE_AS_STRNCMP) ++ tzcntl %ecx, %ecx + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx +- cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx +- subl %edx, %eax +-# endif ++ tzcntq %rcx, %rcx + # endif +- ret + +- .p2align 4 +-L(test_3_vec): + # ifdef USE_AS_STRNCMP +- /* The first 3 vectors matched. Return 0 if the maximum offset +- (%r11) <= 3 * VEC_SIZE. */ +- cmpq $(VEC_SIZE * 3), %r11 +- jbe L(zero) ++ cmpq %rcx, %rdx ++ jbe L(ret_zero_end) + # endif +- /* Each bit set in K1 represents a non-null CHAR in YMM6. */ +- VPTESTM %YMM6, %YMM6, %k1 +- /* Each bit cleared in K0 represents a mismatch or a null CHAR +- in YMM6 and (VEC_SIZE * 3)(%rdx). */ +- VPCMP $0, %YMMZERO, %YMM7, %k0{%k1} +- kmovd %k0, %ecx ++ + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret6) ++ setl %al ++ negl %eax ++ /* This is the non-zero case for `eax` so just xorl with `r8d` ++ flip is `rdi` and `rsi` where swapped. */ ++ xorl %r8d, %eax + # else +- incl %ecx ++ movzbl (%rdi, %rcx), %eax ++ movzbl (%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ /* Flip `eax` if `rdi` and `rsi` where swapped in page cross ++ logic. Subtract `r8d` after xor for zero case. */ ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret6): ++ ret ++ ++# ifndef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_vec_1_end): + tzcntl %ecx, %ecx +-# ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
*/ +- sall $2, %ecx +-# endif +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 3), %rcx +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_SIZE(%rdi, %rcx, SIZE_OF_CHAR), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %esi +- cmpl (%rdx, %rcx), %esi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif +-# else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 3)(%rsi, %rcx), %esi +- cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi +- jne L(wcscmp_return) ++ cmpl VEC_SIZE(%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret7) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +- movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx +- subl %edx, %eax ++ movzbl VEC_SIZE(%rdi, %rcx), %eax ++ movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif +-# endif ++L(ret7): + ret +- +- .p2align 4 +-L(loop_cross_page): +- xorl %r10d, %r10d +- movq %rdx, %rcx +- /* Align load via RDX. We load the extra ECX bytes which should +- be ignored. */ +- andl $((VEC_SIZE * 4) - 1), %ecx +- /* R10 is -RCX. */ +- subq %rcx, %r10 +- +- /* This works only if VEC_SIZE * 2 == 64. */ +-# if (VEC_SIZE * 2) != 64 +-# error (VEC_SIZE * 2) != 64 + # endif + +- /* Check if the first VEC_SIZE * 2 bytes should be ignored. */ +- cmpl $(VEC_SIZE * 2), %ecx +- jge L(loop_cross_page_2_vec) + +- VMOVU (%rax, %r10), %YMM2 +- VMOVU VEC_SIZE(%rax, %r10), %YMM3 ++ /* Page cross in rsi in next 4x VEC. */ + +- /* Each bit set in K2 represents a non-null CHAR in YMM2. */ +- VPTESTM %YMM2, %YMM2, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM2 and 32 bytes at (%rdx, %r10). */ +- VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. 
*/ +- andl $0xff, %r9d +-# endif ++ /* TODO: Improve logic here. */ ++ .p2align 4,, 10 ++L(page_cross_during_loop): ++ /* eax contains [distance_from_page - (VEC_SIZE * 4)]. */ + +- /* Each bit set in K4 represents a non-null CHAR in YMM3. */ +- VPTESTM %YMM3, %YMM3, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */ +- VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi +-# endif ++ /* Optimistically rsi and rdi and both aligned in which case we ++ don't need any logic here. */ ++ cmpl $-(VEC_SIZE * 4), %eax ++ /* Don't adjust eax before jumping back to loop and we will ++ never hit page cross case again. */ ++ je L(loop_skip_page_cross_check) + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi +- /* NB: Divide shift count by 4 since each bit in K1 represent 4 +- bytes. */ +- movl %ecx, %SHIFT_REG32 +- sarl $2, %SHIFT_REG32 +- +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Check if we can safely load a VEC. */ ++ cmpl $-(VEC_SIZE * 3), %eax ++ jle L(less_1x_vec_till_page_cross) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi +-# endif ++ VMOVA (%rdi), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_0_end) ++ ++ /* if distance >= 2x VEC then eax > -(VEC_SIZE * 2). */ ++ cmpl $-(VEC_SIZE * 2), %eax ++ jg L(more_2x_vec_till_page_cross) ++ ++ .p2align 4,, 4 ++L(less_1x_vec_till_page_cross): ++ subl $-(VEC_SIZE * 4), %eax ++ /* Guranteed safe to read from rdi - VEC_SIZE here. 
The only ++ concerning case is first iteration if incoming s1 was near start ++ of a page and s2 near end. If s1 was near the start of the page ++ we already aligned up to nearest VEC_SIZE * 4 so gurnateed safe ++ to read back -VEC_SIZE. If rdi is truly at the start of a page ++ here, it means the previous page (rdi - VEC_SIZE) has already ++ been loaded earlier so must be valid. */ ++ VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} ++ ++ /* Mask of potentially valid bits. The lower bits can be out of ++ range comparisons (but safe regarding page crosses). */ + +- /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */ +- shrxq %SHIFT_REG64, %rdi, %rdi +- testq %rdi, %rdi +- je L(loop_cross_page_2_vec) +- tzcntq %rdi, %rcx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx ++ movl $-1, %r10d ++ movl %esi, %ecx ++ andl $(VEC_SIZE - 1), %ecx ++ shrl $2, %ecx ++ shlxl %ecx, %r10d, %ecx ++ movzbl %cl, %r10d ++# else ++ movl $-1, %ecx ++ shlxl %esi, %ecx, %r10d + # endif ++ ++ kmovd %k1, %ecx ++ notl %ecx ++ ++ + # ifdef USE_AS_STRNCMP +- cmpq %rcx, %r11 +- jbe L(zero) + # ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx + # else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax ++ cmpq %rax, %rdx + # endif ++ jbe L(return_page_cross_end_check) ++# endif ++ movl %eax, %OFFSET_REG ++ ++ /* Readjust eax before potentially returning to the loop. 
*/ ++ addl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ ++ andl %r10d, %ecx ++ jz L(loop_skip_page_cross_check) ++ ++ .p2align 4,, 3 ++L(return_page_cross_end): ++ tzcntl %ecx, %ecx ++ ++# if (defined USE_AS_STRNCMP) || (defined USE_AS_WCSCMP) ++ leal -VEC_SIZE(%OFFSET_REG64, %rcx, SIZE_OF_CHAR), %ecx ++L(return_page_cross_cmp_mem): + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ addl %OFFSET_REG, %ecx ++# endif ++# ifdef USE_AS_WCSCMP ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret8) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax ++# else ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret8): + ret + +- .p2align 4 +-L(loop_cross_page_2_vec): +- /* The first VEC_SIZE * 2 bytes match or are ignored. */ +- VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0 +- VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1 ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 10 ++L(return_page_cross_end_check): ++ tzcntl %ecx, %ecx ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_WCSCMP ++ sall $2, %edx ++# endif ++ cmpl %ecx, %edx ++ ja L(return_page_cross_cmp_mem) ++ xorl %eax, %eax ++ ret ++# endif ++ + ++ .p2align 4,, 10 ++L(more_2x_vec_till_page_cross): ++ /* If more 2x vec till cross we will complete a full loop ++ iteration here. */ ++ ++ VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2} +- kmovd %k1, %r9d +- /* Don't use subl since it is the lower 16/32 bits of RDI +- below. */ +- notl %r9d +-# ifdef USE_AS_WCSCMP +- /* Only last 8 bits are valid. 
*/ +- andl $0xff, %r9d +-# endif ++ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_1_end) + +- VPTESTM %YMM1, %YMM1, %k4 +- /* Each bit cleared in K3 represents a mismatch or a null CHAR +- in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */ +- VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4} +- kmovd %k3, %edi +- /* Must use notl %edi here as lower bits are for CHAR +- comparisons potentially out of range thus can be 0 without +- indicating mismatch. */ +- notl %edi +-# ifdef USE_AS_WCSCMP +- /* Don't use subl since it is the upper 8 bits of EDI below. */ +- andl $0xff, %edi ++# ifdef USE_AS_STRNCMP ++ cmpq $(CHAR_PER_VEC * 2), %rdx ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +-# ifdef USE_AS_WCSCMP +- /* NB: Each bit in EDI/R9D represents 4-byte element. */ +- sall $8, %edi ++ subl $-(VEC_SIZE * 4), %eax + +- /* Each bit in EDI represents a null CHAR or a mismatch. */ +- orl %r9d, %edi +-# else +- salq $32, %rdi ++ /* Safe to include comparisons from lower bytes. */ ++ VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_0) ++ ++ VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(return_vec_page_cross_1) + +- /* Each bit in RDI represents a null CHAR or a mismatch. */ +- orq %r9, %rdi ++# ifdef USE_AS_STRNCMP ++ /* Must check length here as length might proclude reading next ++ page. */ ++# ifdef USE_AS_WCSCMP ++ movl %eax, %r11d ++ shrl $2, %r11d ++ cmpq %r11, %rdx ++# else ++ cmpq %rax, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) + # endif + +- xorl %r8d, %r8d +- /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */ +- subl $(VEC_SIZE * 2), %ecx +- jle 1f +- /* R8 has number of bytes skipped. 
*/ +- movl %ecx, %r8d +-# ifdef USE_AS_WCSCMP +- /* NB: Divide shift count by 4 since each bit in RDI represent 4 +- bytes. */ +- sarl $2, %ecx +- /* Skip ECX bytes. */ +- shrl %cl, %edi ++ /* Finish the loop. */ ++ VMOVA (VEC_SIZE * 2)(%rdi), %YMM4 ++ VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 ++ VPMINU %YMM4, %YMM6, %YMM9 ++ VPTESTM %YMM9, %YMM9, %k1 ++ ++ vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 ++ /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). */ ++ vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 ++ ++ VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ kmovd %k0, %LOOP_REG ++ TESTEQ %LOOP_REG ++ jnz L(return_vec_2_3_end) ++ ++ /* Best for code size to include ucond-jmp here. Would be faster ++ if this case is hot to duplicate the L(return_vec_2_3_end) code ++ as fall-through and have jump back to loop on mismatch ++ comparison. */ ++ subq $-(VEC_SIZE * 4), %rdi ++ subq $-(VEC_SIZE * 4), %rsi ++ addl $(PAGE_SIZE - VEC_SIZE * 8), %eax ++# ifdef USE_AS_STRNCMP ++ subq $(CHAR_PER_VEC * 4), %rdx ++ ja L(loop_skip_page_cross_check) ++L(ret_zero_in_loop_page_cross): ++ xorl %eax, %eax ++ ret + # else +- /* Skip ECX bytes. */ +- shrq %cl, %rdi ++ jmp L(loop_skip_page_cross_check) + # endif +-1: +- /* Before jumping back to the loop, set ESI to the number of +- VEC_SIZE * 4 blocks before page crossing. */ +- movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi + +- testq %rdi, %rdi +-# ifdef USE_AS_STRNCMP +- /* At this point, if %rdi value is 0, it already tested +- VEC_SIZE*4+%r10 byte starting from %rax. This label +- checks whether strncmp maximum offset reached or not. */ +- je L(string_nbyte_offset_check) ++ ++ .p2align 4,, 10 ++L(return_vec_page_cross_0): ++ addl $-VEC_SIZE, %eax ++L(return_vec_page_cross_1): ++ tzcntl %ecx, %ecx ++# if defined USE_AS_STRNCMP || defined USE_AS_WCSCMP ++ leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx ++# ifdef USE_AS_STRNCMP ++# ifdef USE_AS_WCSCMP ++ /* Must divide ecx instead of multiply rdx due to overflow. 
*/ ++ movl %ecx, %eax ++ shrl $2, %eax ++ cmpq %rax, %rdx ++# else ++ cmpq %rcx, %rdx ++# endif ++ jbe L(ret_zero_in_loop_page_cross) ++# endif + # else +- je L(back_to_loop) ++ addl %eax, %ecx + # endif +- tzcntq %rdi, %rcx ++ + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %ecx +-# endif +- addq %r10, %rcx +- /* Adjust for number of bytes skipped. */ +- addq %r8, %rcx +-# ifdef USE_AS_STRNCMP +- addq $(VEC_SIZE * 2), %rcx +- subq %rcx, %r11 +- jbe L(zero) +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi ++ movl VEC_OFFSET(%rdi, %rcx), %edx + xorl %eax, %eax +- movl (%rsi, %rcx), %edi +- cmpl (%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (%rax, %rcx), %eax +- movzbl (%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ cmpl VEC_OFFSET(%rsi, %rcx), %edx ++ je L(ret9) ++ setl %al ++ negl %eax ++ xorl %r8d, %eax + # else +-# ifdef USE_AS_WCSCMP +- movq %rax, %rsi +- xorl %eax, %eax +- movl (VEC_SIZE * 2)(%rsi, %rcx), %edi +- cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi +- jne L(wcscmp_return) +-# else +- movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax +- movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx +- subl %edx, %eax +-# endif ++ movzbl VEC_OFFSET(%rdi, %rcx), %eax ++ movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret9): + ret + +-# ifdef USE_AS_STRNCMP +-L(string_nbyte_offset_check): +- leaq (VEC_SIZE * 4)(%r10), %r10 +- cmpq %r10, %r11 +- jbe L(zero) +- jmp L(back_to_loop) ++ ++ .p2align 4,, 10 ++L(page_cross): ++# ifndef USE_AS_STRNCMP ++ /* If both are VEC aligned we don't need any special logic here. ++ Only valid for strcmp where stop condition is guranteed to be ++ reachable by just reading memory. */ ++ testl $((VEC_SIZE - 1) << 20), %eax ++ jz L(no_page_cross) + # endif + +- .p2align 4 +-L(cross_page_loop): +- /* Check one byte/dword at a time. 
*/ ++ movl %edi, %eax ++ movl %esi, %ecx ++ andl $(PAGE_SIZE - 1), %eax ++ andl $(PAGE_SIZE - 1), %ecx ++ ++ xorl %OFFSET_REG, %OFFSET_REG ++ ++ /* Check which is closer to page cross, s1 or s2. */ ++ cmpl %eax, %ecx ++ jg L(page_cross_s2) ++ ++ /* The previous page cross check has false positives. Check for ++ true positive as page cross logic is very expensive. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %eax ++ jbe L(no_page_cross) ++ ++ ++ /* Set r8 to not interfere with normal return value (rdi and rsi ++ did not swap). */ + # ifdef USE_AS_WCSCMP +- cmpl %ecx, %eax ++ /* any non-zero positive value that doesn't inference with 0x1. ++ */ ++ movl $2, %r8d + # else +- subl %ecx, %eax ++ xorl %r8d, %r8d + # endif +- jne L(different) +- addl $SIZE_OF_CHAR, %edx +- cmpl $(VEC_SIZE * 4), %edx +- je L(main_loop_header) ++ ++ /* Check if less than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jg L(less_1x_vec_till_page) ++ ++ ++ /* If more than 1x VEC till page cross, loop throuh safely ++ loadable memory until within 1x VEC of page cross. */ ++ .p2align 4,, 8 ++L(page_cross_loop): ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ kmovd %k1, %ecx ++ TESTEQ %ecx ++ jnz L(check_ret_vec_page_cross) ++ addl $CHAR_PER_VEC, %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross) + # endif ++ addl $VEC_SIZE, %eax ++ jl L(page_cross_loop) ++ + # ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx ++ shrl $2, %eax + # endif +- /* Check null CHAR. */ +- testl %eax, %eax +- jne L(cross_page_loop) +- /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED +- comparisons. 
*/ +- subl %ecx, %eax +-# ifndef USE_AS_WCSCMP +-L(different): ++ ++ ++ subl %eax, %OFFSET_REG ++ /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed ++ to not cross page so is safe to load. Since we have already ++ loaded at least 1 VEC from rsi it is also guranteed to be safe. ++ */ ++ VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 ++ VPTESTM %YMM0, %YMM0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ ++ kmovd %k1, %ecx ++# ifdef USE_AS_STRNCMP ++ leal CHAR_PER_VEC(%OFFSET_REG64), %eax ++ cmpq %rax, %rdx ++ jbe L(check_ret_vec_page_cross2) ++# ifdef USE_AS_WCSCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++# else ++ addq %rdi, %rdx ++# endif + # endif +- ret ++ TESTEQ %ecx ++ jz L(prepare_loop_no_len) + ++ .p2align 4,, 4 ++L(ret_vec_page_cross): ++# ifndef USE_AS_STRNCMP ++L(check_ret_vec_page_cross): ++# endif ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++L(ret_vec_page_cross_cont): + # ifdef USE_AS_WCSCMP +- .p2align 4 +-L(different): +- /* Use movl to avoid modifying EFLAGS. 
*/ +- movl $0, %eax ++ movl (%rdi, %rcx, SIZE_OF_CHAR), %edx ++ xorl %eax, %eax ++ cmpl (%rsi, %rcx, SIZE_OF_CHAR), %edx ++ je L(ret12) + setl %al + negl %eax +- orl $1, %eax +- ret ++ xorl %r8d, %eax ++# else ++ movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax ++ movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ subl %ecx, %eax ++ xorl %r8d, %eax ++ subl %r8d, %eax + # endif ++L(ret12): ++ ret ++ + + # ifdef USE_AS_STRNCMP +- .p2align 4 +-L(zero): ++ .p2align 4,, 10 ++L(check_ret_vec_page_cross2): ++ TESTEQ %ecx ++L(check_ret_vec_page_cross): ++ tzcntl %ecx, %ecx ++ addl %OFFSET_REG, %ecx ++ cmpq %rcx, %rdx ++ ja L(ret_vec_page_cross_cont) ++ .p2align 4,, 2 ++L(ret_zero_page_cross): + xorl %eax, %eax + ret ++# endif + +- .p2align 4 +-L(char0): +-# ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi), %ecx +- cmpl (%rsi), %ecx +- jne L(wcscmp_return) +-# else +- movzbl (%rsi), %ecx +- movzbl (%rdi), %eax +- subl %ecx, %eax +-# endif +- ret ++ .p2align 4,, 4 ++L(page_cross_s2): ++ /* Ensure this is a true page cross. */ ++ subl $(PAGE_SIZE - VEC_SIZE * 4), %ecx ++ jbe L(no_page_cross) ++ ++ ++ movl %ecx, %eax ++ movq %rdi, %rcx ++ movq %rsi, %rdi ++ movq %rcx, %rsi ++ ++ /* set r8 to negate return value as rdi and rsi swapped. */ ++# ifdef USE_AS_WCSCMP ++ movl $-4, %r8d ++# else ++ movl $-1, %r8d + # endif ++ xorl %OFFSET_REG, %OFFSET_REG + +- .p2align 4 +-L(last_vector): +- addq %rdx, %rdi +- addq %rdx, %rsi +-# ifdef USE_AS_STRNCMP +- subq %rdx, %r11 ++ /* Check if more than 1x VEC till page cross. */ ++ subl $(VEC_SIZE * 3), %eax ++ jle L(page_cross_loop) ++ ++ .p2align 4,, 6 ++L(less_1x_vec_till_page): ++# ifdef USE_AS_WCSCMP ++ shrl $2, %eax + # endif +- tzcntl %ecx, %edx ++ /* Find largest load size we can use. */ ++ cmpl $(16 / SIZE_OF_CHAR), %eax ++ ja L(less_16_till_page) ++ ++ /* Use 16 byte comparison. 
*/ ++ vmovdqu (%rdi), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */ +- sall $2, %edx ++ subl $0xf, %ecx ++# else ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++ movl $(16 / SIZE_OF_CHAR), %OFFSET_REG + # ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subl %eax, %OFFSET_REG ++# else ++ /* Explicit check for 16 byte alignment. */ ++ subl %eax, %OFFSET_REG ++ jz L(prepare_loop) + # endif ++ vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- xorl %eax, %eax +- movl (%rdi, %rdx), %ecx +- cmpl (%rsi, %rdx), %ecx +- jne L(wcscmp_return) ++ subl $0xf, %ecx + # else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %edx +- subl %edx, %eax ++ incw %cx + # endif ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(16 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (16 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (16 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ ++# ifdef USE_AS_STRNCMP ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case0): ++ xorl %eax, %eax + ret ++# endif + +- /* Comparing on page boundary region requires special treatment: +- It must done one vector at the time, starting with the wider +- ymm vector if possible, if not, with xmm. If fetching 16 bytes +- (xmm) still passes the boundary, byte comparison must be done. 
+- */ +- .p2align 4 +-L(cross_page): +- /* Try one ymm vector at a time. */ +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jg L(cross_page_1_vector) +-L(loop_1_vector): +- VMOVU (%rdi, %rdx), %YMM0 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in YMM0 and 32 bytes at (%rsi, %rdx). */ +- VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2} ++ .p2align 4,, 10 ++L(less_16_till_page): ++ cmpl $(24 / SIZE_OF_CHAR), %eax ++ ja L(less_8_till_page) ++ ++ /* Use 8 byte comparison. */ ++ vmovq (%rdi), %xmm0 ++ vmovq (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xff, %ecx ++ subl $0x3, %ecx + # else +- incl %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) + +- addl $VEC_SIZE, %edx + +- addl $VEC_SIZE, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $(8 / SIZE_OF_CHAR), %rdx ++ jbe L(ret_zero_page_cross_slow_case0) + # endif +- cmpl $(PAGE_SIZE - VEC_SIZE), %eax +- jle L(loop_1_vector) +-L(cross_page_1_vector): +- /* Less than 32 bytes to check, try one xmm vector. */ +- cmpl $(PAGE_SIZE - 16), %eax +- jg L(cross_page_1_xmm) +- VMOVU (%rdi, %rdx), %XMM0 ++ movl $(24 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and 16 bytes at (%rsi, %rdx). 
*/ +- VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2} ++ vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP +- subl $0xf, %ecx ++ subl $0x3, %ecx + # else +- subl $0xffff, %ecx ++ incb %cl + # endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++ + +- addl $16, %edx +-# ifndef USE_AS_WCSCMP +- addl $16, %eax +-# endif + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ addl $(8 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case0) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (8 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (8 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi + # endif ++ jmp L(prepare_loop_aligned) + +-L(cross_page_1_xmm): +-# ifndef USE_AS_WCSCMP +- /* Less than 16 bytes to check, try 8 byte vector. NB: No need +- for wcscmp nor wcsncmp since wide char is 4 bytes. */ +- cmpl $(PAGE_SIZE - 8), %eax +- jg L(cross_page_8bytes) +- vmovq (%rdi, %rdx), %XMM0 +- vmovq (%rsi, %rdx), %XMM1 + +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. */ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} +- kmovb %k1, %ecx ++ ++ ++ .p2align 4,, 10 ++L(less_8_till_page): + # ifdef USE_AS_WCSCMP +- subl $0x3, %ecx ++ /* If using wchar then this is the only check before we reach ++ the page boundary. */ ++ movl (%rdi), %eax ++ movl (%rsi), %ecx ++ cmpl %ecx, %eax ++ jnz L(ret_less_8_wcs) ++# ifdef USE_AS_STRNCMP ++ addq $-(CHAR_PER_VEC * 2), %rdx ++ /* We already checked for len <= 1 so cannot hit that case here. 
++ */ ++# endif ++ testl %eax, %eax ++ jnz L(prepare_loop) ++ ret ++ ++ .p2align 4,, 8 ++L(ret_less_8_wcs): ++ setl %OFFSET_REG8 ++ negl %OFFSET_REG ++ movl %OFFSET_REG, %eax ++ xorl %r8d, %eax ++ ret ++ + # else +- subl $0xff, %ecx +-# endif +- jne L(last_vector) ++ cmpl $28, %eax ++ ja L(less_4_till_page) ++ ++ vmovd (%rdi), %xmm0 ++ vmovd (%rsi), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ kmovd %k1, %ecx ++ subl $0xf, %ecx ++ jnz L(check_ret_vec_page_cross) + +- addl $8, %edx +- addl $8, %eax + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ cmpq $4, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) + # endif ++ movl $(28 / SIZE_OF_CHAR), %OFFSET_REG ++ subl %eax, %OFFSET_REG + +-L(cross_page_8bytes): +- /* Less than 8 bytes to check, try 4 byte vector. */ +- cmpl $(PAGE_SIZE - 4), %eax +- jg L(cross_page_4bytes) +- vmovd (%rdi, %rdx), %XMM0 +- vmovd (%rsi, %rdx), %XMM1 +- +- VPTESTM %YMM0, %YMM0, %k2 +- /* Each bit cleared in K1 represents a mismatch or a null CHAR +- in XMM0 and XMM1. 
*/ +- VPCMP $0, %XMM1, %XMM0, %k1{%k2} ++ vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 ++ vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 ++ VPTESTM %xmm0, %xmm0, %k2 ++ VPCMP $0, %xmm1, %xmm0, %k1{%k2} + kmovd %k1, %ecx +-# ifdef USE_AS_WCSCMP +- subl $0x1, %ecx +-# else + subl $0xf, %ecx +-# endif +- jne L(last_vector) ++ jnz L(check_ret_vec_page_cross) ++# ifdef USE_AS_STRNCMP ++ addl $(4 / SIZE_OF_CHAR), %OFFSET_REG ++ subq %OFFSET_REG64, %rdx ++ jbe L(ret_zero_page_cross_slow_case1) ++ subq $-(CHAR_PER_VEC * 4), %rdx ++ ++ leaq -(VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq -(VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# else ++ leaq (4 - VEC_SIZE * 4)(%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %rdi ++ leaq (4 - VEC_SIZE * 4)(%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %rsi ++# endif ++ jmp L(prepare_loop_aligned) ++ + +- addl $4, %edx + # ifdef USE_AS_STRNCMP +- /* Return 0 if the current offset (%rdx) >= the maximum offset +- (%r11). */ +- cmpq %r11, %rdx +- jae L(zero) ++ .p2align 4,, 2 ++L(ret_zero_page_cross_slow_case1): ++ xorl %eax, %eax ++ ret + # endif + +-L(cross_page_4bytes): +-# endif +- /* Less than 4 bytes to check, try one byte/dword at a time. */ +-# ifdef USE_AS_STRNCMP +- cmpq %r11, %rdx +- jae L(zero) +-# endif +-# ifdef USE_AS_WCSCMP +- movl (%rdi, %rdx), %eax +- movl (%rsi, %rdx), %ecx +-# else +- movzbl (%rdi, %rdx), %eax +- movzbl (%rsi, %rdx), %ecx +-# endif +- testl %eax, %eax +- jne L(cross_page_loop) ++ .p2align 4,, 10 ++L(less_4_till_page): ++ subq %rdi, %rsi ++ /* Extremely slow byte comparison loop. */ ++L(less_4_loop): ++ movzbl (%rdi), %eax ++ movzbl (%rsi, %rdi), %ecx + subl %ecx, %eax ++ jnz L(ret_less_4_loop) ++ testl %ecx, %ecx ++ jz L(ret_zero_4_loop) ++# ifdef USE_AS_STRNCMP ++ decq %rdx ++ jz L(ret_zero_4_loop) ++# endif ++ incq %rdi ++ /* end condition is reach page boundary (rdi is aligned). 
*/ ++ testl $31, %edi ++ jnz L(less_4_loop) ++ leaq -(VEC_SIZE * 4)(%rdi, %rsi), %rsi ++ addq $-(VEC_SIZE * 4), %rdi ++# ifdef USE_AS_STRNCMP ++ subq $-(CHAR_PER_VEC * 4), %rdx ++# endif ++ jmp L(prepare_loop_aligned) ++ ++L(ret_zero_4_loop): ++ xorl %eax, %eax ++ ret ++L(ret_less_4_loop): ++ xorl %r8d, %eax ++ subl %r8d, %eax + ret +-END (STRCMP) ++# endif ++END(STRCMP) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-76.patch b/glibc-RHEL-15696-76.patch new file mode 100644 index 0000000..84d9a6f --- /dev/null +++ b/glibc-RHEL-15696-76.patch @@ -0,0 +1,33 @@ +From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Fri, 4 Feb 2022 11:09:10 -0800 +Subject: [PATCH] x86-64: Fix strcmp-avx2.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:38 2022 -0600 + + x86: Optimize strcmp-avx2.S +--- + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 554ffe4c..04675aa4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -106,7 +106,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-77.patch b/glibc-RHEL-15696-77.patch new file mode 100644 index 0000000..1a1cdae --- /dev/null +++ b/glibc-RHEL-15696-77.patch @@ -0,0 +1,33 @@ +From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Fri, 4 Feb 2022 11:11:08 -0800 +Subject: [PATCH] x86-64: Fix strcmp-evex.S +Content-type: text/plain; charset=UTF-8 + +Change "movl %edx, %rdx" to "movl %edx, %edx" in: + +commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9 +Author: Noah Goldstein +Date: Mon Jan 10 15:35:39 2022 -0600 + + x86: Optimize strcmp-evex.S +--- + sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 99d8409a..ed56af8e 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -116,7 +116,7 @@ ENTRY(STRCMP) + # ifdef USE_AS_STRNCMP + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ +- movl %edx, %rdx ++ movl %edx, %edx + # endif + cmp $1, %RDX_LP + /* Signed comparison intentional. We use this branch to also +-- +GitLab + diff --git a/glibc-RHEL-15696-78.patch b/glibc-RHEL-15696-78.patch new file mode 100644 index 0000000..885b715 --- /dev/null +++ b/glibc-RHEL-15696-78.patch @@ -0,0 +1,459 @@ +From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sun, 6 Feb 2022 00:54:18 -0600 +Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S +Content-type: text/plain; charset=UTF-8 + +No bug. + +Split vec generation into multiple steps. This allows the +broadcast in AVX2 to use 'xmm' registers for the L(less_vec) +case. This saves an expensive lane-cross instruction and removes +the need for 'vzeroupper'. + +For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for +byte broadcast. + +Results for memset-avx2 small (geomean of N = 20 benchset runs). + +size, New Time, Old Time, New / Old + 0, 4.100, 3.831, 0.934 + 1, 5.074, 4.399, 0.867 + 2, 4.433, 4.411, 0.995 + 4, 4.487, 4.415, 0.984 + 8, 4.454, 4.396, 0.987 + 16, 4.502, 4.443, 0.987 + +All relevant string/wcsmbs tests are passing. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/memset.S | 21 ++- + .../multiarch/memset-avx2-unaligned-erms.S | 18 +- + .../multiarch/memset-avx512-unaligned-erms.S | 18 +- + .../multiarch/memset-evex-unaligned-erms.S | 18 +- + .../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++------- + 5 files changed, 152 insertions(+), 87 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 8672b030..27debd2b 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -28,17 +28,22 @@ + #define VMOVU movups + #define VMOVA movaps + +-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- punpcklbw %xmm0, %xmm0; \ +- punpcklwd %xmm0, %xmm0; \ +- pshufd $0, %xmm0, %xmm0 ++ pxor %xmm1, %xmm1; \ ++ pshufb %xmm1, %xmm0; \ ++ movq r, %rax + +-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- movq r, %rax; \ +- pshufd $0, %xmm0, %xmm0 ++ pshufd $0, %xmm0, %xmm0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + #define SECTION(p) p + +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index 1af668af..c0bf2875 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -10,15 +10,18 @@ + # define VMOVU vmovdqu + # define VMOVA vmovdqa + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastb %xmm0, %ymm0 ++ movq r, %rax; + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- vmovd d, %xmm0; \ +- movq r, %rax; \ +- vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ 
MEMSET_SET_VEC0_AND_SET_RETURN(d, r) ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0 ++# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0 ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0 ++# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0 + + # ifndef SECTION + # define SECTION(p) p##.avx +@@ -30,5 +33,6 @@ + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif + ++# define USE_XMM_LESS_VEC + # include "memset-vec-unaligned-erms.S" + #endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index f14d6f84..5241216a 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex512 + # define MEMSET_SYMBOL(p,s) p##_avx512_##s +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 64b09e77..63700215 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -15,13 +15,19 @@ + + # define VZEROUPPER + +-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastb d, %VEC0 ++# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastb d, %VEC0; \ ++ movq r, %rax + +-# define 
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \ +- movq r, %rax; \ +- vpbroadcastd d, %VEC0 ++# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ ++ vpbroadcastd d, %VEC0; \ ++ movq r, %rax ++ ++# define MEMSET_VDUP_TO_VEC0_HIGH() ++# define MEMSET_VDUP_TO_VEC0_LOW() ++ ++# define WMEMSET_VDUP_TO_VEC0_HIGH() ++# define WMEMSET_VDUP_TO_VEC0_LOW() + + # define SECTION(p) p##.evex + # define MEMSET_SYMBOL(p,s) p##_evex_##s +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index f08b7323..a67f9833 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -58,8 +58,10 @@ + #ifndef MOVQ + # if VEC_SIZE > 16 + # define MOVQ vmovq ++# define MOVD vmovd + # else + # define MOVQ movq ++# define MOVD movd + # endif + #endif + +@@ -72,9 +74,17 @@ + #if defined USE_WITH_EVEX || defined USE_WITH_AVX512 + # define END_REG rcx + # define LOOP_REG rdi ++# define LESS_VEC_REG rax + #else + # define END_REG rdi + # define LOOP_REG rdx ++# define LESS_VEC_REG rdi ++#endif ++ ++#ifdef USE_XMM_LESS_VEC ++# define XMM_SMALL 1 ++#else ++# define XMM_SMALL 0 + #endif + + #define PAGE_SIZE 4096 +@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) + + ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + shl $2, %RDX_LP +- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) +- jmp L(entry_from_bzero) ++ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) ++ WMEMSET_VDUP_TO_VEC0_LOW() ++ cmpq $VEC_SIZE, %rdx ++ jb L(less_vec_no_vdup) ++ WMEMSET_VDUP_TO_VEC0_HIGH() ++ jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + +@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + #endif + + ENTRY (MEMSET_SYMBOL (__memset, unaligned)) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx +@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH() ++L(entry_from_wmemset): + cmpq $(VEC_SIZE * 2), %rdx + ja L(more_2x_vec) + /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ +@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + # endif + + ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6) +- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi) ++ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + mov %edx, %edx + # endif + cmp $VEC_SIZE, %RDX_LP + jb L(less_vec) ++ MEMSET_VDUP_TO_VEC0_HIGH () + cmp $(VEC_SIZE * 2), %RDX_LP + ja L(stosb_more_2x_vec) +- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. +- */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + VZEROUPPER_RETURN + #endif + +- .p2align 4,, 10 ++ .p2align 4,, 4 + L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE +- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx) +- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx) ++ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) + #else + VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi) + VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi) +@@ -212,6 +228,7 @@ L(last_2x_vec): + #ifdef USE_LESS_VEC_MASK_STORE + .p2align 4,, 10 + L(less_vec): ++L(less_vec_no_vdup): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -262,28 +279,18 @@ L(stosb_more_2x_vec): + /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x] + and (4x, 8x] jump to target. */ + L(more_2x_vec): +- +- /* Two different methods of setting up pointers / compare. 
The +- two methods are based on the fact that EVEX/AVX512 mov +- instructions take more bytes then AVX2/SSE2 mov instructions. As +- well that EVEX/AVX512 machines also have fast LEA_BID. Both +- setup and END_REG to avoid complex address mode. For EVEX/AVX512 +- this saves code size and keeps a few targets in one fetch block. +- For AVX2/SSE2 this helps prevent AGU bottlenecks. */ +-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 +- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + +- LOOP_4X_OFFSET) with LEA_BID. */ +- +- /* END_REG is rcx for EVEX/AVX512. */ +- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG +-#endif +- +- /* Stores to first 2x VEC before cmp as any path forward will +- require it. */ +- VMOVU %VEC(0), (%rax) +- VMOVU %VEC(0), VEC_SIZE(%rax) ++ /* Store next 2x vec regardless. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi) + + ++ /* Two different methods of setting up pointers / compare. The two ++ methods are based on the fact that EVEX/AVX512 mov instructions take ++ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512 ++ machines also have fast LEA_BID. Both setup and END_REG to avoid complex ++ address mode. For EVEX/AVX512 this saves code size and keeps a few ++ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU ++ bottlenecks. */ + #if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512) + /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */ + addq %rdx, %END_REG +@@ -292,6 +299,15 @@ L(more_2x_vec): + cmpq $(VEC_SIZE * 4), %rdx + jbe L(last_2x_vec) + ++ ++#if defined USE_WITH_EVEX || defined USE_WITH_AVX512 ++ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with ++ LEA_BID. */ ++ ++ /* END_REG is rcx for EVEX/AVX512. */ ++ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG ++#endif ++ + /* Store next 2x vec regardless. 
*/ + VMOVU %VEC(0), (VEC_SIZE * 2)(%rax) + VMOVU %VEC(0), (VEC_SIZE * 3)(%rax) +@@ -355,65 +371,93 @@ L(stosb_local): + /* Define L(less_vec) only if not otherwise defined. */ + .p2align 4 + L(less_vec): ++ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to ++ xmm). This is only does anything for AVX2. */ ++ MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_no_vdup): + #endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +- jae L(between_32_63) ++ jge L(between_32_63) + #endif + #if VEC_SIZE > 16 + cmpl $16, %edx +- jae L(between_16_31) ++ jge L(between_16_31) ++#endif ++#ifndef USE_XMM_LESS_VEC ++ MOVQ %XMM0, %rcx + #endif +- MOVQ %XMM0, %rdi + cmpl $8, %edx +- jae L(between_8_15) ++ jge L(between_8_15) + cmpl $4, %edx +- jae L(between_4_7) ++ jge L(between_4_7) + cmpl $1, %edx +- ja L(between_2_3) +- jb L(return) +- movb %sil, (%rax) +- VZEROUPPER_RETURN ++ jg L(between_2_3) ++ jl L(between_0_0) ++ movb %sil, (%LESS_VEC_REG) ++L(between_0_0): ++ ret + +- /* Align small targets only if not doing so would cross a fetch +- line. */ ++ /* Align small targets only if not doing so would cross a fetch line. ++ */ + #if VEC_SIZE > 32 + .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) + /* From 32 to 63. No branch when size == 32. */ + L(between_32_63): +- VMOVU %YMM0, (%rax) +- VMOVU %YMM0, -32(%rax, %rdx) ++ VMOVU %YMM0, (%LESS_VEC_REG) ++ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx) + VZEROUPPER_RETURN + #endif + + #if VEC_SIZE >= 32 +- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE) ++ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1) + L(between_16_31): + /* From 16 to 31. No branch when size == 16. */ +- VMOVU %XMM0, (%rax) +- VMOVU %XMM0, -16(%rax, %rdx) +- VZEROUPPER_RETURN ++ VMOVU %XMM0, (%LESS_VEC_REG) ++ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx) ++ ret + #endif + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. 
++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1) + L(between_8_15): + /* From 8 to 15. No branch when size == 8. */ +- movq %rdi, (%rax) +- movq %rdi, -8(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVQ %XMM0, (%rdi) ++ MOVQ %XMM0, -8(%rdi, %rdx) ++#else ++ movq %rcx, (%LESS_VEC_REG) ++ movq %rcx, -8(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE) ++ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2. ++ */ ++ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1) + L(between_4_7): + /* From 4 to 7. No branch when size == 4. */ +- movl %edi, (%rax) +- movl %edi, -4(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ MOVD %XMM0, (%rdi) ++ MOVD %XMM0, -4(%rdi, %rdx) ++#else ++ movl %ecx, (%LESS_VEC_REG) ++ movl %ecx, -4(%LESS_VEC_REG, %rdx) ++#endif ++ ret + +- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE) ++ /* 4 * XMM_SMALL for the third mov for AVX2. */ ++ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1) + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ +- movw %di, (%rax) +- movb %dil, -1(%rax, %rdx) +- VZEROUPPER_RETURN ++#ifdef USE_XMM_LESS_VEC ++ movb %sil, (%rdi) ++ movb %sil, 1(%rdi) ++ movb %sil, -1(%rdi, %rdx) ++#else ++ movw %cx, (%LESS_VEC_REG) ++ movb %sil, -1(%LESS_VEC_REG, %rdx) ++#endif ++ ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-79.patch b/glibc-RHEL-15696-79.patch new file mode 100644 index 0000000..91e850f --- /dev/null +++ b/glibc-RHEL-15696-79.patch @@ -0,0 +1,40 @@ +From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Mon, 7 Feb 2022 00:32:23 -0600 +Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2 + Only) +Content-type: text/plain; charset=UTF-8 + +commit b62ace2740a106222e124cc86956448fa07abf4d +Author: Noah Goldstein +Date: Sun Feb 6 00:54:18 2022 -0600 + + x86: Improve vec generation in memset-vec-unaligned-erms.S + +Revert usage of 'pshufb' in broadcast logic as it is an SSSE3 +instruction and memset.S is restricted to only SSE2 instructions. +--- + sysdeps/x86_64/memset.S | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 27debd2b..4cb4aa71 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -30,9 +30,10 @@ + + # define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +- pxor %xmm1, %xmm1; \ +- pshufb %xmm1, %xmm0; \ +- movq r, %rax ++ movq r, %rax; \ ++ punpcklbw %xmm0, %xmm0; \ ++ punpcklwd %xmm0, %xmm0; \ ++ pshufd $0, %xmm0, %xmm0 + + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ +-- +GitLab + diff --git a/glibc-RHEL-15696-8.patch b/glibc-RHEL-15696-8.patch new file mode 100644 index 0000000..5cf7633 --- /dev/null +++ b/glibc-RHEL-15696-8.patch @@ -0,0 +1,218 @@ +From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Mon, 21 Jan 2019 11:36:36 -0800 +Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter + [BZ# 24097] +Content-type: text/plain; charset=UTF-8 + +On x32, the size_t parameter may be passed in the lower 32 bits of a +64-bit register with the non-zero upper 32 bits. The string/memory +functions written in assembly can only use the lower 32 bits of a +64-bit register as length or must clear the upper 32 bits before using +the full 64-bit register for length. + +This patch fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On +x86-64, libc.so is the same with and without the fix. + + [BZ# 24097] + CVE-2019-6488 + * sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length. + Clear the upper 32 bits of RSI register. + * sysdeps/x86_64/strlen.S: Use RSI_LP for length. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen + and tst-size_t-wcsnlen. + * sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file. + * sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise. +--- + sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++-- + sysdeps/x86_64/strlen.S | 12 ++--- + sysdeps/x86_64/x32/Makefile | 4 +- + sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++ + sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++ + 5 files changed, 106 insertions(+), 11 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c + create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c + +Conflicts: + ChangeLog + (removed) + +diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S +index fb2418cd..645e0446 100644 +--- a/sysdeps/x86_64/multiarch/strlen-avx2.S ++++ b/sysdeps/x86_64/multiarch/strlen-avx2.S +@@ -42,12 +42,15 @@ + ENTRY (STRLEN) + # ifdef USE_AS_STRNLEN + /* Check for zero length. */ +- testq %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jz L(zero) + # ifdef USE_AS_WCSLEN +- shl $2, %rsi ++ shl $2, %RSI_LP ++# elif defined __ILP32__ ++ /* Clear the upper 32 bits. 
*/ ++ movl %esi, %esi + # endif +- movq %rsi, %r8 ++ mov %RSI_LP, %R8_LP + # endif + movl %edi, %ecx + movq %rdi, %rdx +diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S +index 01cb5fa8..f845f3d4 100644 +--- a/sysdeps/x86_64/strlen.S ++++ b/sysdeps/x86_64/strlen.S +@@ -59,21 +59,21 @@ ENTRY(strlen) + + #ifdef AS_STRNLEN + /* Do not read anything when n==0. */ +- test %rsi, %rsi ++ test %RSI_LP, %RSI_LP + jne L(n_nonzero) + xor %rax, %rax + ret + L(n_nonzero): + # ifdef AS_WCSLEN +- shlq $2, %rsi ++ shl $2, %RSI_LP + # endif + + /* Initialize long lived registers. */ + +- add %rdi, %rsi +- mov %rsi, %r10 +- and $-64, %r10 +- mov %rsi, %r11 ++ add %RDI_LP, %RSI_LP ++ mov %RSI_LP, %R10_LP ++ and $-64, %R10_LP ++ mov %RSI_LP, %R11_LP + #endif + + pxor %xmm0, %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 2a9e20a9..1557724b 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,10 +8,10 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen + endif + + ifeq ($(subdir),wcsmbs) + tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \ +- tst-size_t-wcsncmp ++ tst-size_t-wcsncmp tst-size_t-wcsnlen + endif +diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +new file mode 100644 +index 00000000..690a4a8a +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c +@@ -0,0 +1,72 @@ ++/* Test strnlen with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifdef WIDE ++# define TEST_NAME "wcsnlen" ++#else ++# define TEST_NAME "strnlen" ++#endif /* WIDE */ ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# define STRNLEN wcsnlen ++# define CHAR wchar_t ++#else ++# define STRNLEN strnlen ++# define CHAR char ++#endif /* WIDE */ ++ ++IMPL (STRNLEN, 1) ++ ++typedef size_t (*proto_t) (const CHAR *, size_t); ++ ++static size_t ++__attribute__ ((noinline, noclone)) ++do_strnlen (parameter_t a, parameter_t b) ++{ ++ return CALL (&a, a.p, b.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ size_t size = page_size / sizeof (CHAR); ++ parameter_t src = { { 0 }, buf2 }; ++ parameter_t c = { { size }, (void *) (uintptr_t) 'a' }; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ size_t res = do_strnlen (src, c); ++ if (res != size) ++ { ++ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x", ++ impl->name, res, size); ++ ret = 1; ++ } ++ } ++ ++ return ret ? EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +new file mode 100644 +index 00000000..093b4bbe +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c +@@ -0,0 +1,20 @@ ++/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register. 
++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define WIDE 1 ++#include "tst-size_t-strnlen.c" +-- +GitLab + diff --git a/glibc-RHEL-15696-80.patch b/glibc-RHEL-15696-80.patch new file mode 100644 index 0000000..53a3e7e --- /dev/null +++ b/glibc-RHEL-15696-80.patch @@ -0,0 +1,753 @@ +From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 7 Feb 2022 05:55:15 -0800 +Subject: [PATCH] x86-64: Optimize bzero +Content-type: text/plain; charset=UTF-8 + +memset with zero as the value to set is by far the majority value (99%+ +for Python3 and GCC). + +bzero can be slightly more optimized for this case by using a zero-idiom +xor for broadcasting the set value to a register (vector or GPR). 
+ +Co-developed-by: Noah Goldstein +--- + sysdeps/generic/ifunc-init.h | 5 +- + sysdeps/x86_64/memset.S | 8 + + sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++------- + sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++ + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++ + .../memset-avx2-unaligned-erms-rtm.S | 1 + + .../multiarch/memset-avx2-unaligned-erms.S | 6 + + .../multiarch/memset-avx512-unaligned-erms.S | 3 + + .../multiarch/memset-evex-unaligned-erms.S | 3 + + .../multiarch/memset-sse2-unaligned-erms.S | 1 + + .../multiarch/memset-vec-unaligned-erms.S | 110 +++++++--- + 11 files changed, 384 insertions(+), 106 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/bzero.c + +Conflicts: + sysdeps/generic/ifunc-init.h + (needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq)) + sysdeps/x86_64/multiarch/Makefile + (file ordering) + +diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h +index 241e4161..f7a72375 100644 +--- a/sysdeps/generic/ifunc-init.h ++++ b/sysdeps/generic/ifunc-init.h +@@ -50,5 +50,8 @@ + '___' as the optimized implementation and + '_ifunc_selector' as the IFUNC selector. */ + #define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME) +-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name) ++#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name) ++#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name) ++/* Default is to use OPTIMIZE2. 
*/ ++#define OPTIMIZE(name) OPTIMIZE2(name) + #define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector) +diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S +index 4cb4aa71..a1353f89 100644 +--- a/sysdeps/x86_64/memset.S ++++ b/sysdeps/x86_64/memset.S +@@ -35,6 +35,9 @@ + punpcklwd %xmm0, %xmm0; \ + pshufd $0, %xmm0, %xmm0 + ++# define BZERO_ZERO_VEC0() \ ++ pxor %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + movd d, %xmm0; \ + pshufd $0, %xmm0, %xmm0; \ +@@ -53,6 +56,10 @@ + # define MEMSET_SYMBOL(p,s) memset + #endif + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) __bzero ++#endif ++ + #ifndef WMEMSET_SYMBOL + # define WMEMSET_CHK_SYMBOL(p,s) p + # define WMEMSET_SYMBOL(p,s) __wmemset +@@ -63,6 +70,7 @@ + libc_hidden_builtin_def (memset) + + #if IS_IN (libc) ++weak_alias (__bzero, bzero) + libc_hidden_def (__wmemset) + weak_alias (__wmemset, wmemset) + libc_hidden_weak (wmemset) +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 26be4095..37d8d6f0 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -1,85 +1,130 @@ + ifeq ($(subdir),string) + +-sysdep_routines += strncat-c stpncpy-c strncpy-c \ +- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \ +- strcmp-sse4_2 strcmp-avx2 \ +- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \ +- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \ +- memrchr-sse2 memrchr-avx2 \ +- memcmp-sse2 \ +- memcmp-avx2-movbe \ +- memcmp-sse4 memcpy-ssse3 \ +- memmove-ssse3 \ +- memcpy-ssse3-back \ +- memmove-ssse3-back \ +- memmove-avx512-no-vzeroupper \ +- strcasecmp_l-sse2 strcasecmp_l-ssse3 \ +- strcasecmp_l-sse4_2 strcasecmp_l-avx \ +- strncase_l-sse2 strncase_l-ssse3 \ +- strncase_l-sse4_2 strncase_l-avx \ +- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \ +- strrchr-sse2 strrchr-avx2 \ +- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \ +- strcat-avx2 strncat-avx2 \ +- strcat-ssse3 
strncat-ssse3\ +- strcpy-avx2 strncpy-avx2 \ +- strcpy-sse2 stpcpy-sse2 \ +- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \ +- strcpy-sse2-unaligned strncpy-sse2-unaligned \ +- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \ +- stpcpy-avx2 stpncpy-avx2 \ +- strcat-sse2 \ +- strcat-sse2-unaligned strncat-sse2-unaligned \ +- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \ +- strcspn-sse2 strpbrk-sse2 strspn-sse2 \ +- strcspn-c strpbrk-c strspn-c varshift \ +- memset-avx512-no-vzeroupper \ +- memmove-sse2-unaligned-erms \ +- memmove-avx-unaligned-erms \ +- memmove-avx512-unaligned-erms \ +- memset-sse2-unaligned-erms \ +- memset-avx2-unaligned-erms \ +- memset-avx512-unaligned-erms \ +- memchr-avx2-rtm \ +- memcmp-avx2-movbe-rtm \ +- memmove-avx-unaligned-erms-rtm \ +- memrchr-avx2-rtm \ +- memset-avx2-unaligned-erms-rtm \ +- rawmemchr-avx2-rtm \ +- strchr-avx2-rtm \ +- strcmp-avx2-rtm \ +- strchrnul-avx2-rtm \ +- stpcpy-avx2-rtm \ +- stpncpy-avx2-rtm \ +- strcat-avx2-rtm \ +- strcpy-avx2-rtm \ +- strlen-avx2-rtm \ +- strncat-avx2-rtm \ +- strncmp-avx2-rtm \ +- strncpy-avx2-rtm \ +- strnlen-avx2-rtm \ +- strrchr-avx2-rtm \ +- memchr-evex \ +- memcmp-evex-movbe \ +- memmove-evex-unaligned-erms \ +- memrchr-evex \ +- memset-evex-unaligned-erms \ +- rawmemchr-evex \ +- stpcpy-evex \ +- stpncpy-evex \ +- strcat-evex \ +- strchr-evex \ +- strchrnul-evex \ +- strcmp-evex \ +- strcpy-evex \ +- strlen-evex \ +- strncat-evex \ +- strncmp-evex \ +- strncpy-evex \ +- strnlen-evex \ +- strrchr-evex \ +- memchr-evex-rtm \ +- rawmemchr-evex-rtm ++sysdep_routines += \ ++ bzero \ ++ memchr-avx2 \ ++ memchr-avx2-rtm \ ++ memchr-evex \ ++ memchr-evex-rtm \ ++ memchr-sse2 \ ++ memcmp-avx2-movbe \ ++ memcmp-avx2-movbe-rtm \ ++ memcmp-evex-movbe \ ++ memcmp-sse2 \ ++ memcmp-sse4 \ ++ memcmp-ssse3 \ ++ memcpy-ssse3 \ ++ memcpy-ssse3-back \ ++ memmove-avx-unaligned-erms \ ++ memmove-avx-unaligned-erms-rtm \ ++ memmove-avx512-no-vzeroupper \ ++ memmove-avx512-unaligned-erms \ 
++ memmove-evex-unaligned-erms \ ++ memmove-sse2-unaligned-erms \ ++ memmove-ssse3 \ ++ memmove-ssse3-back \ ++ memrchr-avx2 \ ++ memrchr-avx2-rtm \ ++ memrchr-evex \ ++ memrchr-sse2 \ ++ memset-avx2-unaligned-erms \ ++ memset-avx2-unaligned-erms-rtm \ ++ memset-avx512-no-vzeroupper \ ++ memset-avx512-unaligned-erms \ ++ memset-evex-unaligned-erms \ ++ memset-sse2-unaligned-erms \ ++ rawmemchr-avx2 \ ++ rawmemchr-avx2-rtm \ ++ rawmemchr-evex \ ++ rawmemchr-evex-rtm \ ++ rawmemchr-sse2 \ ++ stpcpy-avx2 \ ++ stpcpy-avx2-rtm \ ++ stpcpy-evex \ ++ stpcpy-sse2 \ ++ stpcpy-sse2-unaligned \ ++ stpcpy-ssse3 \ ++ stpncpy-avx2 \ ++ stpncpy-avx2-rtm \ ++ stpncpy-c \ ++ stpncpy-evex \ ++ stpncpy-sse2-unaligned \ ++ stpncpy-ssse3 \ ++ strcasecmp_l-avx \ ++ strcasecmp_l-sse2 \ ++ strcasecmp_l-sse4_2 \ ++ strcasecmp_l-ssse3 \ ++ strcat-avx2 \ ++ strcat-avx2-rtm \ ++ strcat-evex \ ++ strcat-sse2 \ ++ strcat-sse2-unaligned \ ++ strcat-ssse3 \ ++ strchr-avx2 \ ++ strchr-avx2-rtm \ ++ strchr-evex \ ++ strchr-sse2 \ ++ strchr-sse2-no-bsf \ ++ strchrnul-avx2 \ ++ strchrnul-avx2-rtm \ ++ strchrnul-evex \ ++ strchrnul-sse2 \ ++ strcmp-avx2 \ ++ strcmp-avx2-rtm \ ++ strcmp-evex \ ++ strcmp-sse2 \ ++ strcmp-sse2-unaligned \ ++ strcmp-sse4_2 \ ++ strcmp-ssse3 \ ++ strcpy-avx2 \ ++ strcpy-avx2-rtm \ ++ strcpy-evex \ ++ strcpy-sse2 \ ++ strcpy-sse2-unaligned \ ++ strcpy-ssse3 \ ++ strcspn-c \ ++ strcspn-sse2 \ ++ strlen-avx2 \ ++ strlen-avx2-rtm \ ++ strlen-evex \ ++ strlen-sse2 \ ++ strncase_l-avx \ ++ strncase_l-sse2 \ ++ strncase_l-sse4_2 \ ++ strncase_l-ssse3 \ ++ strncat-avx2 \ ++ strncat-avx2-rtm \ ++ strncat-c \ ++ strncat-evex \ ++ strncat-sse2-unaligned \ ++ strncat-ssse3 \ ++ strncmp-avx2 \ ++ strncmp-avx2-rtm \ ++ strncmp-evex \ ++ strncmp-sse2 \ ++ strncmp-sse4_2 \ ++ strncmp-ssse3 \ ++ strncpy-avx2 \ ++ strncpy-avx2-rtm \ ++ strncpy-c \ ++ strncpy-evex \ ++ strncpy-sse2-unaligned \ ++ strncpy-ssse3 \ ++ strnlen-avx2 \ ++ strnlen-avx2-rtm \ ++ strnlen-evex \ ++ strnlen-sse2 \ ++ 
strpbrk-c \ ++ strpbrk-sse2 \ ++ strrchr-avx2 \ ++ strrchr-avx2-rtm \ ++ strrchr-evex \ ++ strrchr-sse2 \ ++ strspn-c \ ++ strspn-sse2 \ ++ strstr-sse2-unaligned \ ++ varshift \ ++# sysdep_routines + CFLAGS-varshift.c += -msse4 + CFLAGS-strcspn-c.c += -msse4 + CFLAGS-strpbrk-c.c += -msse4 +diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c +new file mode 100644 +index 00000000..58a14b2c +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/bzero.c +@@ -0,0 +1,106 @@ ++/* Multiple versions of bzero. ++ All versions must be listed in ifunc-impl-list.c. ++ Copyright (C) 2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++/* Define multiple versions only for the definition in libc. 
*/ ++#if IS_IN (libc) ++# define __bzero __redirect___bzero ++# include ++# undef __bzero ++ ++# define SYMBOL_NAME __bzero ++# include ++ ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned) ++ attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms) ++ attribute_hidden; ++ ++static inline void * ++IFUNC_SELECTOR (void) ++{ ++ const struct cpu_features* cpu_features = __get_cpu_features (); ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F) ++ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx512_unaligned_erms); ++ ++ return OPTIMIZE1 (avx512_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW) ++ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (evex_unaligned_erms); ++ ++ return OPTIMIZE1 (evex_unaligned); ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, 
ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms_rtm); ++ ++ return OPTIMIZE1 (avx2_unaligned_rtm); ++ } ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (avx2_unaligned_erms); ++ ++ return OPTIMIZE1 (avx2_unaligned); ++ } ++ } ++ ++ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS)) ++ return OPTIMIZE1 (sse2_unaligned_erms); ++ ++ return OPTIMIZE1 (sse2_unaligned); ++} ++ ++libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ()); ++ ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 8be0d78a..c963d391 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + __memset_avx512_no_vzeroupper) + ) + ++ /* Support sysdeps/x86_64/multiarch/bzero.c. */ ++ IFUNC_IMPL (i, name, bzero, ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, 1, ++ __bzero_sse2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ CPU_FEATURE_USABLE (AVX2), ++ __bzero_avx2_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __bzero_avx2_unaligned_erms_rtm) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_evex_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, 
bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned_erms) ++ IFUNC_IMPL_ADD (array, i, bzero, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW) ++ && CPU_FEATURE_USABLE (BMI2)), ++ __bzero_avx512_unaligned) ++ ) ++ + /* Support sysdeps/x86_64/multiarch/rawmemchr.c. */ + IFUNC_IMPL (i, name, rawmemchr, + IFUNC_IMPL_ADD (array, i, rawmemchr, +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +index 8ac3e479..5a5ee6f6 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S +@@ -5,6 +5,7 @@ + + #define SECTION(p) p##.avx.rtm + #define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm ++#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm + #define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm + + #include "memset-avx2-unaligned-erms.S" +diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +index c0bf2875..a093a283 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S +@@ -14,6 +14,9 @@ + vmovd d, %xmm0; \ + movq r, %rax; + ++# define BZERO_ZERO_VEC0() \ ++ vpxor %xmm0, %xmm0, %xmm0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + MEMSET_SET_VEC0_AND_SET_RETURN(d, r) + +@@ -29,6 +32,9 @@ + # ifndef MEMSET_SYMBOL + # define MEMSET_SYMBOL(p,s) p##_avx2_##s + # endif ++# ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) p##_avx2_##s ++# endif + # ifndef WMEMSET_SYMBOL + # define WMEMSET_SYMBOL(p,s) p##_avx2_##s + # endif +diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +index 5241216a..727c9213 100644 +--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S ++++ 
b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +index 63700215..5d8fa78f 100644 +--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S +@@ -19,6 +19,9 @@ + vpbroadcastb d, %VEC0; \ + movq r, %rax + ++# define BZERO_ZERO_VEC0() \ ++ vpxorq %XMM0, %XMM0, %XMM0 ++ + # define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \ + vpbroadcastd d, %VEC0; \ + movq r, %rax +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 56b81f5c..8f579ad6 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -22,6 +22,7 @@ + + #if IS_IN (libc) + # define MEMSET_SYMBOL(p,s) p##_sse2_##s ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) + # define WMEMSET_SYMBOL(p,s) p##_sse2_##s + + # ifdef SHARED +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index a67f9833..06f5f5d7 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -26,6 +26,10 @@ + + #include + ++#ifndef BZERO_SYMBOL ++# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s) ++#endif ++ + #ifndef MEMSET_CHK_SYMBOL + # define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s) + #endif +@@ -87,6 +91,18 @@ + # define XMM_SMALL 0 + #endif + ++#ifdef USE_LESS_VEC_MASK_STORE ++# define SET_REG64 rcx ++# define SET_REG32 ecx ++# define SET_REG16 cx ++# define SET_REG8 cl ++#else ++# define SET_REG64 rsi ++# define SET_REG32 esi ++# define 
SET_REG16 si ++# define SET_REG8 sil ++#endif ++ + #define PAGE_SIZE 4096 + + /* Macro to calculate size of small memset block for aligning +@@ -96,18 +112,6 @@ + + #ifndef SECTION + # error SECTION is not defined! +-#endif +- +- .section SECTION(.text),"ax",@progbits +-#if VEC_SIZE == 16 && IS_IN (libc) +-ENTRY (__bzero) +- mov %RDI_LP, %RAX_LP /* Set return value. */ +- mov %RSI_LP, %RDX_LP /* Set n. */ +- xorl %esi, %esi +- pxor %XMM0, %XMM0 +- jmp L(entry_from_bzero) +-END (__bzero) +-weak_alias (__bzero, bzero) + #endif + + #if IS_IN (libc) +@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned)) + WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi) + WMEMSET_VDUP_TO_VEC0_LOW() + cmpq $VEC_SIZE, %rdx +- jb L(less_vec_no_vdup) ++ jb L(less_vec_from_wmemset) + WMEMSET_VDUP_TO_VEC0_HIGH() + jmp L(entry_from_wmemset) + END (WMEMSET_SYMBOL (__wmemset, unaligned)) + #endif + ++ENTRY (BZERO_SYMBOL(__bzero, unaligned)) ++#if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++#ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++#ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++#endif ++#if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++#endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned)) ++ + #if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned)) + cmp %RDX_LP, %RCX_LP +@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned)) + /* Clear the upper 32 bits. 
*/ + mov %edx, %edx + # endif +-L(entry_from_bzero): + cmpq $VEC_SIZE, %rdx + jb L(less_vec) + MEMSET_VDUP_TO_VEC0_HIGH() +@@ -187,6 +215,31 @@ END (__memset_erms) + END (MEMSET_SYMBOL (__memset, erms)) + # endif + ++ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6) ++# if VEC_SIZE > 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ mov %RDI_LP, %RAX_LP ++ mov %RSI_LP, %RDX_LP ++# ifndef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++ cmp $VEC_SIZE, %RDX_LP ++ jb L(less_vec_no_vdup) ++# ifdef USE_LESS_VEC_MASK_STORE ++ xorl %esi, %esi ++# endif ++# if VEC_SIZE <= 16 ++ BZERO_ZERO_VEC0 () ++# endif ++ cmp $(VEC_SIZE * 2), %RDX_LP ++ ja L(stosb_more_2x_vec) ++ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */ ++ VMOVU %VEC(0), (%rdi) ++ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx) ++ VZEROUPPER_RETURN ++END (BZERO_SYMBOL(__bzero, unaligned_erms)) ++ + # if defined SHARED && IS_IN (libc) + ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms)) + cmp %RDX_LP, %RCX_LP +@@ -229,6 +282,7 @@ L(last_2x_vec): + .p2align 4,, 10 + L(less_vec): + L(less_vec_no_vdup): ++L(less_vec_from_wmemset): + /* Less than 1 VEC. */ + # if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64 + # error Unsupported VEC_SIZE! +@@ -374,8 +428,11 @@ L(less_vec): + /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to + xmm). This is only does anything for AVX2. 
*/ + MEMSET_VDUP_TO_VEC0_LOW () ++L(less_vec_from_wmemset): ++#if VEC_SIZE > 16 + L(less_vec_no_vdup): + #endif ++#endif + L(cross_page): + #if VEC_SIZE > 32 + cmpl $32, %edx +@@ -386,7 +443,10 @@ L(cross_page): + jge L(between_16_31) + #endif + #ifndef USE_XMM_LESS_VEC +- MOVQ %XMM0, %rcx ++ MOVQ %XMM0, %SET_REG64 ++#endif ++#if VEC_SIZE <= 16 ++L(less_vec_no_vdup): + #endif + cmpl $8, %edx + jge L(between_8_15) +@@ -395,7 +455,7 @@ L(cross_page): + cmpl $1, %edx + jg L(between_2_3) + jl L(between_0_0) +- movb %sil, (%LESS_VEC_REG) ++ movb %SET_REG8, (%LESS_VEC_REG) + L(between_0_0): + ret + +@@ -428,8 +488,8 @@ L(between_8_15): + MOVQ %XMM0, (%rdi) + MOVQ %XMM0, -8(%rdi, %rdx) + #else +- movq %rcx, (%LESS_VEC_REG) +- movq %rcx, -8(%LESS_VEC_REG, %rdx) ++ movq %SET_REG64, (%LESS_VEC_REG) ++ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -442,8 +502,8 @@ L(between_4_7): + MOVD %XMM0, (%rdi) + MOVD %XMM0, -4(%rdi, %rdx) + #else +- movl %ecx, (%LESS_VEC_REG) +- movl %ecx, -4(%LESS_VEC_REG, %rdx) ++ movl %SET_REG32, (%LESS_VEC_REG) ++ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx) + #endif + ret + +@@ -452,12 +512,12 @@ L(between_4_7): + L(between_2_3): + /* From 2 to 3. No branch when size == 2. 
*/ + #ifdef USE_XMM_LESS_VEC +- movb %sil, (%rdi) +- movb %sil, 1(%rdi) +- movb %sil, -1(%rdi, %rdx) ++ movb %SET_REG8, (%rdi) ++ movb %SET_REG8, 1(%rdi) ++ movb %SET_REG8, -1(%rdi, %rdx) + #else +- movw %cx, (%LESS_VEC_REG) +- movb %sil, -1(%LESS_VEC_REG, %rdx) ++ movw %SET_REG16, (%LESS_VEC_REG) ++ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx) + #endif + ret + END (MEMSET_SYMBOL (__memset, unaligned_erms)) +-- +GitLab + diff --git a/glibc-RHEL-15696-81.patch b/glibc-RHEL-15696-81.patch new file mode 100644 index 0000000..960a4cc --- /dev/null +++ b/glibc-RHEL-15696-81.patch @@ -0,0 +1,33 @@ +From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Sat, 12 Feb 2022 00:45:00 -0600 +Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +Remove setting the .text section for the code. This commit +adds that back. +--- + sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +index 06f5f5d7..4fb475c0 100644 +--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S +@@ -114,6 +114,7 @@ + # error SECTION is not defined! 
+ #endif + ++ .section SECTION(.text), "ax", @progbits + #if IS_IN (libc) + # if defined SHARED + ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned)) +-- +GitLab + diff --git a/glibc-RHEL-15696-82.patch b/glibc-RHEL-15696-82.patch new file mode 100644 index 0000000..23ee46e --- /dev/null +++ b/glibc-RHEL-15696-82.patch @@ -0,0 +1,90 @@ +From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Tue, 15 Feb 2022 20:27:21 -0600 +Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895] +Content-type: text/plain; charset=UTF-8 + +Logic can read before the start of `s1` / `s2` if both `s1` and `s2` +are near the start of a page. To avoid having the result contimated by +these comparisons the `strcmp` variants would mask off these +comparisons. This was missing in the `strncmp` variants causing +the bug. This commit adds the masking to `strncmp` so that out of +range comparisons don't affect the result. + +test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as +well a full xcheck on x86_64 linux. +Reviewed-by: H.J. Lu +--- + string/test-strncmp.c | 23 +++++++++++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 + + sysdeps/x86_64/multiarch/strcmp-evex.S | 1 + + 3 files changed, 25 insertions(+) + +diff --git a/string/test-strncmp.c b/string/test-strncmp.c +index 927a6daa..e61fffd9 100644 +--- a/string/test-strncmp.c ++++ b/string/test-strncmp.c +@@ -403,6 +403,28 @@ check2 (void) + free (s2); + } + ++static void ++check4 (void) ++{ ++ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of ++ the end of the page. 2) For there to be no mismatch/null byte before the ++ first page cross. 3) For length (`n`) to be large enough for one string to ++ cross the page. And 4) for there to be either mismatch/null bytes before ++ the start of the strings. 
*/ ++ ++ size_t size = 10; ++ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1); ++ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa)); ++ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed)); ++ int exp_result; ++ ++ STRCPY (s1, L ("tst-tlsmod%")); ++ STRCPY (s2, L ("tst-tls-manydynamic73mod")); ++ exp_result = SIMPLE_STRNCMP (s1, s2, size); ++ FOR_EACH_IMPL (impl, 0) ++ check_result (impl, s1, s2, size, exp_result); ++} ++ + static void + check3 (void) + { +@@ -445,6 +467,7 @@ test_main (void) + check1 (); + check2 (); + check3 (); ++ check4 (); + + printf ("%23s", ""); + FOR_EACH_IMPL (impl, 0) +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 04675aa4..179cc0e3 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -661,6 +661,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx), %ecx + cmpl %ecx, %edx +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index ed56af8e..0dfa62bd 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -689,6 +689,7 @@ L(ret8): + # ifdef USE_AS_STRNCMP + .p2align 4,, 10 + L(return_page_cross_end_check): ++ andl %r10d, %ecx + tzcntl %ecx, %ecx + leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx + # ifdef USE_AS_WCSCMP +-- +GitLab + diff --git a/glibc-RHEL-15696-83.patch b/glibc-RHEL-15696-83.patch new file mode 100644 index 0000000..e7475a8 --- /dev/null +++ b/glibc-RHEL-15696-83.patch @@ -0,0 +1,77 @@ +From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 15:50:33 -0500 +Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ + #28896] +Content-type: text/plain; charset=UTF-8 + +Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not 
+__wcscmp_avx2. + +commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87 +Author: Noah Goldstein +Date: Sun Jan 9 16:02:21 2022 -0600 + + x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755] + +Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set +to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which +can cause spurious aborts. + +This change will need to be backported. + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +- + 2 files changed, 16 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c +index aef9866c..ba6543be 100644 +--- a/sysdeps/x86/tst-strncmp-rtm.c ++++ b/sysdeps/x86/tst-strncmp-rtm.c +@@ -70,6 +70,16 @@ function_overflow (void) + return 1; + } + ++__attribute__ ((noinline, noclone)) ++static int ++function_overflow2 (void) ++{ ++ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0) ++ return 0; ++ else ++ return 1; ++} ++ + static int + do_test (void) + { +@@ -77,5 +87,10 @@ do_test (void) + if (status != EXIT_SUCCESS) + return status; + status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow); ++ if (status != EXIT_SUCCESS) ++ return status; ++ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2); ++ if (status != EXIT_SUCCESS) ++ return status; + return status; + } +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 179cc0e3..782f9472 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -122,7 +122,7 @@ ENTRY(STRCMP) + are cases where length is large enough that it can never be a + bound on valid memory so just use wcscmp. 
*/ + shrq $56, %rcx +- jnz __wcscmp_avx2 ++ jnz OVERFLOW_STRCMP + + leaq (, %rdx, 4), %rdx + # endif +-- +GitLab + diff --git a/glibc-RHEL-15696-84.patch b/glibc-RHEL-15696-84.patch new file mode 100644 index 0000000..e998eff --- /dev/null +++ b/glibc-RHEL-15696-84.patch @@ -0,0 +1,27 @@ +From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Sat, 5 Feb 2022 11:06:01 -0800 +Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ)) +Content-type: text/plain; charset=UTF-8 + +--- + sysdeps/x86/sysdep.h | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h +index a70bb3a2..49b0efe2 100644 +--- a/sysdeps/x86/sysdep.h ++++ b/sysdeps/x86/sysdep.h +@@ -111,7 +111,8 @@ enum cf_protection_level + /* Local label name for asm code. */ + #ifndef L + /* ELF-like local names start with `.L'. */ +-# define L(name) .L##name ++# define LOCAL_LABEL(name) .L##name ++# define L(name) LOCAL_LABEL(name) + #endif + + #define atom_text_section .section ".text.atom", "ax" +-- +GitLab + diff --git a/glibc-RHEL-15696-85.patch b/glibc-RHEL-15696-85.patch new file mode 100644 index 0000000..18f8a47 --- /dev/null +++ b/glibc-RHEL-15696-85.patch @@ -0,0 +1,108 @@ +From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001 +From: "H.J. 
Lu" +Date: Sat, 5 Feb 2022 11:52:33 -0800 +Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per + line +Content-type: text/plain; charset=UTF-8 + +Conflicts: + sysdeps/x86_64/multiarch/Makefile + (test order changed) + +--- + sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------ + 1 file changed, 48 insertions(+), 30 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 37d8d6f0..8c9e7812 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4 + endif + + ifeq ($(subdir),wcsmbs) +-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \ +- wmemcmp-avx2-movbe \ +- wmemchr-sse2 wmemchr-avx2 \ +- wcscmp-sse2 wcscmp-avx2 \ +- wcsncmp-sse2 wcsncmp-avx2 \ +- wcscpy-ssse3 wcscpy-c \ +- wcschr-sse2 wcschr-avx2 \ +- wcsrchr-sse2 wcsrchr-avx2 \ +- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \ +- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \ +- wcschr-avx2-rtm \ +- wcscmp-avx2-rtm \ +- wcslen-avx2-rtm \ +- wcsncmp-avx2-rtm \ +- wcsnlen-avx2-rtm \ +- wcsrchr-avx2-rtm \ +- wmemchr-avx2-rtm \ +- wmemcmp-avx2-movbe-rtm \ +- wcschr-evex \ +- wcscmp-evex \ +- wcslen-evex \ +- wcsncmp-evex \ +- wcsnlen-evex \ +- wcsrchr-evex \ +- wmemchr-evex \ +- wmemcmp-evex-movbe \ +- wmemchr-evex-rtm ++sysdep_routines += \ ++ wcschr-avx2 \ ++ wcschr-avx2-rtm \ ++ wcschr-evex \ ++ wcschr-sse2 \ ++ wcscmp-avx2 \ ++ wcscmp-avx2-rtm \ ++ wcscmp-evex \ ++ wcscmp-sse2 \ ++ wcscpy-c \ ++ wcscpy-ssse3 \ ++ wcslen-avx2 \ ++ wcslen-avx2-rtm \ ++ wcslen-evex \ ++ wcslen-sse2 \ ++ wcslen-sse4_1 \ ++ wcsncmp-avx2 \ ++ wcsncmp-avx2-rtm \ ++ wcsncmp-evex \ ++ wcsncmp-sse2 \ ++ wcsnlen-avx2 \ ++ wcsnlen-avx2-rtm \ ++ wcsnlen-c \ ++ wcsnlen-evex \ ++ wcsnlen-sse4_1 \ ++ wcsrchr-avx2 \ ++ wcsrchr-avx2-rtm \ ++ wcsrchr-evex \ ++ wcsrchr-sse2 \ ++ wmemchr-avx2 \ ++ wmemchr-avx2-rtm \ ++ wmemchr-evex \ ++ wmemchr-evex-rtm \ ++ wmemchr-sse2 \ ++ 
wmemcmp-avx2-movbe \ ++ wmemcmp-avx2-movbe-rtm \ ++ wmemcmp-c \ ++ wmemcmp-evex-movbe \ ++ wmemcmp-sse4 \ ++ wmemcmp-ssse3 \ ++# sysdep_routines + endif + + ifeq ($(subdir),debug) +-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \ +- memmove_chk-nonshared memset_chk-nonshared \ +- wmemset_chk-nonshared ++sysdep_routines += \ ++ memcpy_chk-nonshared \ ++ memmove_chk-nonshared \ ++ mempcpy_chk-nonshared \ ++ memset_chk-nonshared \ ++ wmemset_chk-nonshared \ ++# sysdep_routines + endif +-- +GitLab + diff --git a/glibc-RHEL-15696-86.patch b/glibc-RHEL-15696-86.patch new file mode 100644 index 0000000..d4fb42f --- /dev/null +++ b/glibc-RHEL-15696-86.patch @@ -0,0 +1,36 @@ +From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Thu, 10 Feb 2022 11:52:50 -0800 +Subject: [PATCH] x86-64: Remove bzero weak alias in SS2 memset +Content-type: text/plain; charset=UTF-8 + +commit 3d9f171bfb5325bd5f427e9fc386453358c6e840 +Author: H.J. Lu +Date: Mon Feb 7 05:55:15 2022 -0800 + + x86-64: Optimize bzero + +added the optimized bzero. Remove bzero weak alias in SS2 memset to +avoid undefined __bzero in memset-sse2-unaligned-erms. 
+--- + sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +index 8f579ad6..af51362b 100644 +--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S ++++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S +@@ -31,9 +31,7 @@ + # endif + + # undef weak_alias +-# define weak_alias(original, alias) \ +- .weak bzero; bzero = __bzero +- ++# define weak_alias(original, alias) + # undef strong_alias + # define strong_alias(ignored1, ignored2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-87.patch b/glibc-RHEL-15696-87.patch new file mode 100644 index 0000000..4882613 --- /dev/null +++ b/glibc-RHEL-15696-87.patch @@ -0,0 +1,29 @@ +From bf92893a14ebc161b08b28acc24fa06ae6be19cb Mon Sep 17 00:00:00 2001 +From: Adhemerval Zanella +Date: Thu, 10 Feb 2022 11:23:24 -0300 +Subject: [PATCH] x86_64: Remove bcopy optimizations +Content-type: text/plain; charset=UTF-8 + +The symbols is not present in current POSIX specification and compiler +already generates memmove call. +--- + sysdeps/x86_64/multiarch/bcopy.S | 7 ------- + 1 file changed, 7 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/bcopy.S + +diff --git a/sysdeps/x86_64/multiarch/bcopy.S b/sysdeps/x86_64/multiarch/bcopy.S +deleted file mode 100644 +index 639f02bd..00000000 +--- a/sysdeps/x86_64/multiarch/bcopy.S ++++ /dev/null +@@ -1,7 +0,0 @@ +-#include +- +- .text +-ENTRY(bcopy) +- xchg %rdi, %rsi +- jmp __libc_memmove /* Branch to IFUNC memmove. 
*/ +-END(bcopy) +-- +GitLab + diff --git a/glibc-RHEL-15696-88.patch b/glibc-RHEL-15696-88.patch new file mode 100644 index 0000000..d075f80 --- /dev/null +++ b/glibc-RHEL-15696-88.patch @@ -0,0 +1,372 @@ +From a6fbf4d51e9ba8063c4f8331564892ead9c67344 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:16 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-avx2 and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -53 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks Original / New: 1.00 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strchr-avx2.S | 204 +++++++++++++------------ + 1 file changed, 107 insertions(+), 97 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S +index 5884726b..89dd2bf7 100644 +--- a/sysdeps/x86_64/multiarch/strchr-avx2.S ++++ b/sysdeps/x86_64/multiarch/strchr-avx2.S +@@ -48,13 +48,13 @@ + # define PAGE_SIZE 4096 + + .section SECTION(.text),"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. */ + vmovd %esi, %xmm0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax + VPBROADCAST %xmm0, %ymm0 +- vpxor %xmm9, %xmm9, %xmm9 ++ vpxor %xmm1, %xmm1, %xmm1 + + /* Check if we cross page boundary with one vector load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -62,37 +62,29 @@ ENTRY (STRCHR) + + /* Check the first VEC_SIZE bytes. Search for both CHAR and the + null byte. 
*/ +- vmovdqu (%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqu (%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rdi, %rax), %CHAR_REG +- jne L(zero) +-# endif +- addq %rdi, %rax +- VZEROUPPER_RETURN +- +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x4): +- tzcntl %eax, %eax +- addq $(VEC_SIZE * 3 + 1), %rdi +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ ++ /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ + jne L(zero) + # endif + addq %rdi, %rax +- VZEROUPPER_RETURN ++L(return_vzeroupper): ++ ZERO_UPPER_VEC_REGISTERS_RETURN + + # ifndef USE_AS_STRCHRNUL + L(zero): +@@ -103,7 +95,8 @@ L(zero): + + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + incq %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -113,9 +106,10 @@ L(first_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. 
*/ ++ bsfl %eax, %eax + addq $(VEC_SIZE + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -125,9 +119,10 @@ L(first_vec_x2): + addq %rdi, %rax + VZEROUPPER_RETURN + +- .p2align 4 ++ .p2align 4,, 8 + L(first_vec_x3): +- tzcntl %eax, %eax ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax + addq $(VEC_SIZE * 2 + 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ +@@ -137,6 +132,21 @@ L(first_vec_x3): + addq %rdi, %rax + VZEROUPPER_RETURN + ++ .p2align 4,, 10 ++L(first_vec_x4): ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $(VEC_SIZE * 3 + 1), %rdi ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax), %CHAR_REG ++ jne L(zero) ++# endif ++ addq %rdi, %rax ++ VZEROUPPER_RETURN ++ ++ ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE - 1. This is the same number of +@@ -146,90 +156,92 @@ L(aligned_more): + L(cross_page_continue): + /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time + since data is only aligned to VEC_SIZE. 
*/ +- vmovdqa 1(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa 1(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x1) + +- vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x2) + +- vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x3) + +- vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(first_vec_x4) +- /* Align data to VEC_SIZE * 4 - 1. */ +- addq $(VEC_SIZE * 4 + 1), %rdi +- andq $-(VEC_SIZE * 4), %rdi ++ /* Align data to VEC_SIZE * 4 - 1. */ ++ incq %rdi ++ orq $(VEC_SIZE * 4 - 1), %rdi + .p2align 4 + L(loop_4x_vec): + /* Compare 4 * VEC at a time forward. */ +- vmovdqa (%rdi), %ymm5 +- vmovdqa (VEC_SIZE)(%rdi), %ymm6 +- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7 +- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8 ++ vmovdqa 1(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm7 + + /* Leaves only CHARS matching esi as 0. 
*/ +- vpxor %ymm5, %ymm0, %ymm1 + vpxor %ymm6, %ymm0, %ymm2 + vpxor %ymm7, %ymm0, %ymm3 +- vpxor %ymm8, %ymm0, %ymm4 + +- VPMINU %ymm1, %ymm5, %ymm1 + VPMINU %ymm2, %ymm6, %ymm2 + VPMINU %ymm3, %ymm7, %ymm3 +- VPMINU %ymm4, %ymm8, %ymm4 + +- VPMINU %ymm1, %ymm2, %ymm5 +- VPMINU %ymm3, %ymm4, %ymm6 ++ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm6 ++ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm7 ++ ++ vpxor %ymm6, %ymm0, %ymm4 ++ vpxor %ymm7, %ymm0, %ymm5 ++ ++ VPMINU %ymm4, %ymm6, %ymm4 ++ VPMINU %ymm5, %ymm7, %ymm5 + +- VPMINU %ymm5, %ymm6, %ymm6 ++ VPMINU %ymm2, %ymm3, %ymm6 ++ VPMINU %ymm4, %ymm5, %ymm7 + +- VPCMPEQ %ymm6, %ymm9, %ymm6 +- vpmovmskb %ymm6, %ecx ++ VPMINU %ymm6, %ymm7, %ymm7 ++ ++ VPCMPEQ %ymm7, %ymm1, %ymm7 ++ vpmovmskb %ymm7, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- +- VPCMPEQ %ymm1, %ymm9, %ymm1 +- vpmovmskb %ymm1, %eax ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpmovmskb %ymm2, %eax + testl %eax, %eax + jnz L(last_vec_x0) + + +- VPCMPEQ %ymm5, %ymm9, %ymm2 +- vpmovmskb %ymm2, %eax ++ VPCMPEQ %ymm3, %ymm1, %ymm3 ++ vpmovmskb %ymm3, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMPEQ %ymm3, %ymm9, %ymm3 +- vpmovmskb %ymm3, %eax ++ VPCMPEQ %ymm4, %ymm1, %ymm4 ++ vpmovmskb %ymm4, %eax + /* rcx has combined result from all 4 VEC. It will only be used + if the first 3 other VEC all did not contain a match. */ + salq $32, %rcx + orq %rcx, %rax + tzcntq %rax, %rax +- subq $(VEC_SIZE * 2), %rdi ++ subq $(VEC_SIZE * 2 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -239,10 +251,11 @@ L(loop_4x_vec): + VZEROUPPER_RETURN + + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x0): +- tzcntl %eax, %eax +- addq $-(VEC_SIZE * 4), %rdi ++ /* Use bsf to save code size. */ ++ bsfl %eax, %eax ++ addq $-(VEC_SIZE * 4 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. 
*/ + cmp (%rdi, %rax), %CHAR_REG +@@ -251,16 +264,11 @@ L(last_vec_x0): + addq %rdi, %rax + VZEROUPPER_RETURN + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- VZEROUPPER_RETURN +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(last_vec_x1): + tzcntl %eax, %eax +- subq $(VEC_SIZE * 3), %rdi ++ subq $(VEC_SIZE * 3 - 1), %rdi + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (%rdi, %rax), %CHAR_REG +@@ -269,18 +277,23 @@ L(last_vec_x1): + addq %rdi, %rax + VZEROUPPER_RETURN + ++# ifndef USE_AS_STRCHRNUL ++L(zero_end): ++ xorl %eax, %eax ++ VZEROUPPER_RETURN ++# endif + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi to VEC_SIZE - 1. */ + orq $(VEC_SIZE - 1), %rdi +- vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8 +- VPCMPEQ %ymm8, %ymm0, %ymm1 +- VPCMPEQ %ymm8, %ymm9, %ymm2 +- vpor %ymm1, %ymm2, %ymm1 +- vpmovmskb %ymm1, %eax ++ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm2 ++ VPCMPEQ %ymm2, %ymm0, %ymm3 ++ VPCMPEQ %ymm2, %ymm1, %ymm2 ++ vpor %ymm3, %ymm2, %ymm3 ++ vpmovmskb %ymm3, %eax + /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT + so no need to manually mod edx. */ + sarxl %edx, %eax, %eax +@@ -291,13 +304,10 @@ L(cross_page_boundary): + xorl %ecx, %ecx + /* Found CHAR or the null byte. 
*/ + cmp (%rdx, %rax), %CHAR_REG +- leaq (%rdx, %rax), %rax +- cmovne %rcx, %rax +-# else +- addq %rdx, %rax ++ jne L(zero_end) + # endif +-L(return_vzeroupper): +- ZERO_UPPER_VEC_REGISTERS_RETURN ++ addq %rdx, %rax ++ VZEROUPPER_RETURN + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-89.patch b/glibc-RHEL-15696-89.patch new file mode 100644 index 0000000..45ee946 --- /dev/null +++ b/glibc-RHEL-15696-89.patch @@ -0,0 +1,343 @@ +From ec285ea90415458225623ddc0492ae3f705af043 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:18 -0500 +Subject: [PATCH] x86: Code cleanup in strchr-evex and comment justifying + branch +Content-type: text/plain; charset=UTF-8 + +Small code cleanup for size: -81 bytes. + +Add comment justifying using a branch to do NULL/non-null return. + +All string/memory tests pass and no regressions in benchtests. + +geometric_mean(N=20) of all benchmarks New / Original: .985 +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strchr-evex.S | 146 ++++++++++++++----------- + 1 file changed, 80 insertions(+), 66 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S +index 7f9d4ee4..0b49e0ac 100644 +--- a/sysdeps/x86_64/multiarch/strchr-evex.S ++++ b/sysdeps/x86_64/multiarch/strchr-evex.S +@@ -30,6 +30,7 @@ + # ifdef USE_AS_WCSCHR + # define VPBROADCAST vpbroadcastd + # define VPCMP vpcmpd ++# define VPTESTN vptestnmd + # define VPMINU vpminud + # define CHAR_REG esi + # define SHIFT_REG ecx +@@ -37,6 +38,7 @@ + # else + # define VPBROADCAST vpbroadcastb + # define VPCMP vpcmpb ++# define VPTESTN vptestnmb + # define VPMINU vpminub + # define CHAR_REG sil + # define SHIFT_REG edx +@@ -61,13 +63,11 @@ + # define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) + + .section .text.evex,"ax",@progbits +-ENTRY (STRCHR) ++ENTRY_P2ALIGN (STRCHR, 5) + /* Broadcast CHAR to YMM0. 
*/ + VPBROADCAST %esi, %YMM0 + movl %edi, %eax + andl $(PAGE_SIZE - 1), %eax +- vpxorq %XMMZERO, %XMMZERO, %XMMZERO +- + /* Check if we cross page boundary with one vector load. + Otherwise it is safe to use an unaligned load. */ + cmpl $(PAGE_SIZE - VEC_SIZE), %eax +@@ -81,49 +81,35 @@ ENTRY (STRCHR) + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jz L(aligned_more) + tzcntl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ /* NB: Use a branch instead of cmovcc here. The expectation is ++ that with strchr the user will branch based on input being ++ null. Since this branch will be 100% predictive of the user ++ branch a branch miss here should save what otherwise would ++ be branch miss in the user code. Otherwise using a branch 1) ++ saves code size and 2) is faster in highly predictable ++ environments. */ ++ jne L(zero) ++# endif + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of bytes. + */ + leaq (%rdi, %rax, CHAR_SIZE), %rax + # else + addq %rdi, %rax +-# endif +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (%rax), %CHAR_REG +- jne L(zero) + # endif + ret + +- /* .p2align 5 helps keep performance more consistent if ENTRY() +- alignment % 32 was either 16 or 0. As well this makes the +- alignment % 32 of the loop_4x_vec fixed which makes tuning it +- easier. */ +- .p2align 5 +-L(first_vec_x3): +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Found CHAR or the null byte. */ +- cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero) +-# endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. 
*/ +- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax +- ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero): +- xorl %eax, %eax +- ret +-# endif + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x4): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -144,9 +130,18 @@ L(first_vec_x4): + leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++# ifndef USE_AS_STRCHRNUL ++L(zero): ++ xorl %eax, %eax ++ ret ++# endif ++ ++ + .p2align 4 + L(first_vec_x1): +- tzcntl %eax, %eax ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Found CHAR or the null byte. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -158,7 +153,7 @@ L(first_vec_x1): + leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax + ret + +- .p2align 4 ++ .p2align 4,, 10 + L(first_vec_x2): + # ifndef USE_AS_STRCHRNUL + /* Check to see if first match was CHAR (k0) or null (k1). */ +@@ -179,6 +174,21 @@ L(first_vec_x2): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + ++ .p2align 4,, 10 ++L(first_vec_x3): ++ /* Use bsf here to save 1-byte keeping keeping the block in 1x ++ fetch block. eax guranteed non-zero. */ ++ bsfl %eax, %eax ++# ifndef USE_AS_STRCHRNUL ++ /* Found CHAR or the null byte. */ ++ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ jne L(zero) ++# endif ++ /* NB: Multiply sizeof char type (1 or 4) to get the number of ++ bytes. */ ++ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax ++ ret ++ + .p2align 4 + L(aligned_more): + /* Align data to VEC_SIZE. */ +@@ -195,7 +205,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x1) +@@ -206,7 +216,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. 
*/ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x2) + +@@ -215,7 +225,7 @@ L(cross_page_continue): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(first_vec_x3) +@@ -224,7 +234,7 @@ L(cross_page_continue): + /* Each bit in K0 represents a CHAR in YMM1. */ + VPCMP $0, %YMM1, %YMM0, %k0 + /* Each bit in K1 represents a CHAR in YMM1. */ +- VPCMP $0, %YMM1, %YMMZERO, %k1 ++ VPTESTN %YMM1, %YMM1, %k1 + kortestd %k0, %k1 + jnz L(first_vec_x4) + +@@ -265,33 +275,33 @@ L(loop_4x_vec): + VPMINU %YMM3, %YMM4, %YMM4 + VPMINU %YMM2, %YMM4, %YMM4{%k4}{z} + +- VPCMP $0, %YMMZERO, %YMM4, %k1 ++ VPTESTN %YMM4, %YMM4, %k1 + kmovd %k1, %ecx + subq $-(VEC_SIZE * 4), %rdi + testl %ecx, %ecx + jz L(loop_4x_vec) + +- VPCMP $0, %YMMZERO, %YMM1, %k0 ++ VPTESTN %YMM1, %YMM1, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x1) + +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax + testl %eax, %eax + jnz L(last_vec_x2) + +- VPCMP $0, %YMMZERO, %YMM3, %k0 ++ VPTESTN %YMM3, %YMM3, %k0 + kmovd %k0, %eax + /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */ + # ifdef USE_AS_WCSCHR + sall $8, %ecx + orl %ecx, %eax +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # else + salq $32, %rcx + orq %rcx, %rax +- tzcntq %rax, %rax ++ bsfq %rax, %rax + # endif + # ifndef USE_AS_STRCHRNUL + /* Check if match was CHAR or null. */ +@@ -303,28 +313,28 @@ L(loop_4x_vec): + leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax + ret + +-# ifndef USE_AS_STRCHRNUL +-L(zero_end): +- xorl %eax, %eax +- ret ++ .p2align 4,, 8 ++L(last_vec_x1): ++ bsfl %eax, %eax ++# ifdef USE_AS_WCSCHR ++ /* NB: Multiply wchar_t count by 4 to get the number of bytes. 
++ */ ++ leaq (%rdi, %rax, CHAR_SIZE), %rax ++# else ++ addq %rdi, %rax + # endif + +- .p2align 4 +-L(last_vec_x1): +- tzcntl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ +- cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG ++ cmp (%rax), %CHAR_REG + jne L(zero_end) + # endif +- /* NB: Multiply sizeof char type (1 or 4) to get the number of +- bytes. */ +- leaq (%rdi, %rax, CHAR_SIZE), %rax ++ + ret + +- .p2align 4 ++ .p2align 4,, 8 + L(last_vec_x2): +- tzcntl %eax, %eax ++ bsfl %eax, %eax + # ifndef USE_AS_STRCHRNUL + /* Check if match was null. */ + cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG +@@ -336,7 +346,7 @@ L(last_vec_x2): + ret + + /* Cold case for crossing page with first load. */ +- .p2align 4 ++ .p2align 4,, 8 + L(cross_page_boundary): + movq %rdi, %rdx + /* Align rdi. */ +@@ -346,9 +356,9 @@ L(cross_page_boundary): + vpxorq %YMM1, %YMM0, %YMM2 + VPMINU %YMM2, %YMM1, %YMM2 + /* Each bit in K0 represents a CHAR or a null byte in YMM1. */ +- VPCMP $0, %YMMZERO, %YMM2, %k0 ++ VPTESTN %YMM2, %YMM2, %k0 + kmovd %k0, %eax +- /* Remove the leading bits. */ ++ /* Remove the leading bits. */ + # ifdef USE_AS_WCSCHR + movl %edx, %SHIFT_REG + /* NB: Divide shift count by 4 since each bit in K1 represent 4 +@@ -360,20 +370,24 @@ L(cross_page_boundary): + /* If eax is zero continue. */ + testl %eax, %eax + jz L(cross_page_continue) +- tzcntl %eax, %eax +-# ifndef USE_AS_STRCHRNUL +- /* Check to see if match was CHAR or null. */ +- cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG +- jne L(zero_end) +-# endif ++ bsfl %eax, %eax ++ + # ifdef USE_AS_WCSCHR + /* NB: Multiply wchar_t count by 4 to get the number of + bytes. */ + leaq (%rdx, %rax, CHAR_SIZE), %rax + # else + addq %rdx, %rax ++# endif ++# ifndef USE_AS_STRCHRNUL ++ /* Check to see if match was CHAR or null. 
*/ ++ cmp (%rax), %CHAR_REG ++ je L(cross_page_ret) ++L(zero_end): ++ xorl %eax, %eax ++L(cross_page_ret): + # endif + ret + + END (STRCHR) +-# endif ++#endif +-- +GitLab + diff --git a/glibc-RHEL-15696-9.patch b/glibc-RHEL-15696-9.patch new file mode 100644 index 0000000..5aa3e7b --- /dev/null +++ b/glibc-RHEL-15696-9.patch @@ -0,0 +1,206 @@ +From 3f635fb43389b54f682fc9ed2acc0b2aaf4a923d Mon Sep 17 00:00:00 2001 +From: "H.J. Lu" +Date: Mon, 4 Feb 2019 06:31:01 -0800 +Subject: [PATCH] x86-64 memcmp: Use unsigned Jcc instructions on size [BZ + #24155] +Content-type: text/plain; charset=UTF-8 + +Since the size argument is unsigned. we should use unsigned Jcc +instructions, instead of signed, to check size. + +Tested on x86-64 and x32, with and without --disable-multi-arch. + + [BZ #24155] + CVE-2019-7309 + * NEWS: Updated for CVE-2019-7309. + * sysdeps/x86_64/memcmp.S: Use RDX_LP for size. Clear the + upper 32 bits of RDX register for x32. Use unsigned Jcc + instructions, instead of signed. + * sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp-2. + * sysdeps/x86_64/x32/tst-size_t-memcmp-2.c: New test. +--- + sysdeps/x86_64/memcmp.S | 20 +++--- + sysdeps/x86_64/x32/Makefile | 3 +- + sysdeps/x86_64/x32/tst-size_t-memcmp-2.c | 79 ++++++++++++++++++++++++ + 3 files changed, 93 insertions(+), 9 deletions(-) + create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp-2.c + +Conflicts: + ChangeLog + (removed) + NEWS + (removed) + +diff --git a/sysdeps/x86_64/memcmp.S b/sysdeps/x86_64/memcmp.S +index bcb4a2e8..45918d37 100644 +--- a/sysdeps/x86_64/memcmp.S ++++ b/sysdeps/x86_64/memcmp.S +@@ -21,14 +21,18 @@ + + .text + ENTRY (memcmp) +- test %rdx, %rdx ++#ifdef __ILP32__ ++ /* Clear the upper 32 bits. */ ++ movl %edx, %edx ++#endif ++ test %RDX_LP, %RDX_LP + jz L(finz) + cmpq $1, %rdx +- jle L(finr1b) ++ jbe L(finr1b) + subq %rdi, %rsi + movq %rdx, %r10 + cmpq $32, %r10 +- jge L(gt32) ++ jae L(gt32) + /* Handle small chunks and last block of less than 32 bytes. 
*/ + L(small): + testq $1, %r10 +@@ -156,7 +160,7 @@ L(A32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + /* Pre-unroll to be ready for unrolled 64B loop. */ + testq $32, %rdi + jz L(A64) +@@ -178,7 +182,7 @@ L(A64): + movq %r11, %r10 + andq $-64, %r10 + cmpq %r10, %rdi +- jge L(mt32) ++ jae L(mt32) + + L(A64main): + movdqu (%rdi,%rsi), %xmm0 +@@ -216,7 +220,7 @@ L(mt32): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(A32main): + movdqu (%rdi,%rsi), %xmm0 +@@ -254,7 +258,7 @@ L(ATR): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + testq $16, %rdi + jz L(ATR32) + +@@ -325,7 +329,7 @@ L(ATR64main): + movq %r11, %r10 + andq $-32, %r10 + cmpq %r10, %rdi +- jge L(mt16) ++ jae L(mt16) + + L(ATR32res): + movdqa (%rdi,%rsi), %xmm0 +diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile +index 1557724b..87489565 100644 +--- a/sysdeps/x86_64/x32/Makefile ++++ b/sysdeps/x86_64/x32/Makefile +@@ -8,7 +8,8 @@ endif + ifeq ($(subdir),string) + tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \ + tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \ +- tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen ++ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen \ ++ tst-size_t-memcmp-2 + endif + + ifeq ($(subdir),wcsmbs) +diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +new file mode 100644 +index 00000000..d8ae1a08 +--- /dev/null ++++ b/sysdeps/x86_64/x32/tst-size_t-memcmp-2.c +@@ -0,0 +1,79 @@ ++/* Test memcmp with size_t in the lower 32 bits of 64-bit register. ++ Copyright (C) 2019 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. 
++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define TEST_MAIN ++#ifdef WIDE ++# define TEST_NAME "wmemcmp" ++#else ++# define TEST_NAME "memcmp" ++#endif ++ ++#include "test-size_t.h" ++ ++#ifdef WIDE ++# include ++# include ++ ++# define MEMCMP wmemcmp ++# define CHAR wchar_t ++#else ++# define MEMCMP memcmp ++# define CHAR char ++#endif ++ ++IMPL (MEMCMP, 1) ++ ++typedef int (*proto_t) (const CHAR *, const CHAR *, size_t); ++ ++static int ++__attribute__ ((noinline, noclone)) ++do_memcmp (parameter_t a, parameter_t b) ++{ ++ return CALL (&b, a.p, b.p, a.len); ++} ++ ++static int ++test_main (void) ++{ ++ test_init (); ++ ++ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 }; ++ parameter_t src = { { 0 }, buf2 }; ++ ++ memcpy (buf1, buf2, page_size); ++ ++ CHAR *p = (CHAR *) buf1; ++ p[page_size / sizeof (CHAR) - 1] = (CHAR) 1; ++ ++ int ret = 0; ++ FOR_EACH_IMPL (impl, 0) ++ { ++ src.fn = impl->fn; ++ int res = do_memcmp (dest, src); ++ if (res >= 0) ++ { ++ error (0, 0, "Wrong result in function %s: %i >= 0", ++ impl->name, res); ++ ret = 1; ++ } ++ } ++ ++ return ret ? 
EXIT_FAILURE : EXIT_SUCCESS; ++} ++ ++#include +-- +GitLab + diff --git a/glibc-RHEL-15696-90.patch b/glibc-RHEL-15696-90.patch new file mode 100644 index 0000000..11835aa --- /dev/null +++ b/glibc-RHEL-15696-90.patch @@ -0,0 +1,147 @@ +From 30d627d477d7255345a4b713cf352ac32d644d61 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:22 -0500 +Subject: [PATCH] x86: Optimize strcspn and strpbrk in strcspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2/strlen; New / Original: .928 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcspn-c.c | 83 +++++++++++++--------------- + 1 file changed, 37 insertions(+), 46 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-c.c b/sysdeps/x86_64/multiarch/strcspn-c.c +index 857af104..6cce4296 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-c.c ++++ b/sysdeps/x86_64/multiarch/strcspn-c.c +@@ -85,83 +85,74 @@ STRCSPN_SSE42 (const char *s, const char *a) + RETURN (NULL, strlen (s)); + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (unsigned int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. 
*/ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return STRCSPN_SSE2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return STRCSPN_SSE2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return STRCSPN_SSE2 (s, a); + } + +- offset = (int) ((size_t) s & 15); ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); + + value = __m128i_shift_right (value, offset); + +- int length = _mm_cmpistri (mask, value, 0x2); ++ unsigned int length = _mm_cmpistri (mask, value, 0x2); + /* No need to check ZFlag since ZFlag is always 1. 
*/ +- int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); + if (cflag) + RETURN ((char *) (s + length), length); + /* Find where the NULL terminator is. */ +- int index = _mm_cmpistri (value, value, 0x3a); ++ unsigned int index = _mm_cmpistri (value, value, 0x3a); + if (index < 16 - offset) + RETURN (NULL, index); + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x2); +- int cflag = _mm_cmpistrc (mask, value, 0x2); +- int zflag = _mm_cmpistrz (mask, value, 0x2); ++ unsigned int index = _mm_cmpistri (mask, value, 0x2); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x2); ++ unsigned int zflag = _mm_cmpistrz (mask, value, 0x2); + if (cflag) + RETURN ((char *) (aligned + index), (size_t) (aligned + index - s)); + if (zflag) +-- +GitLab + diff --git a/glibc-RHEL-15696-91.patch b/glibc-RHEL-15696-91.patch new file mode 100644 index 0000000..de3c8ec --- /dev/null +++ b/glibc-RHEL-15696-91.patch @@ -0,0 +1,147 @@ +From 412d10343168b05b8cf6c3683457cf9711d28046 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:24 -0500 +Subject: [PATCH] x86: Optimize strspn in strspn-c.c +Content-type: text/plain; charset=UTF-8 + +Use _mm_cmpeq_epi8 and _mm_movemask_epi8 to get strlen instead of +_mm_cmpistri. Also change offset to unsigned to avoid unnecessary +sign extensions. + +geometric_mean(N=20) of all benchmarks that dont fallback on +sse2; New / Original: .901 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/strspn-c.c | 86 +++++++++++++---------------- + 1 file changed, 39 insertions(+), 47 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strspn-c.c b/sysdeps/x86_64/multiarch/strspn-c.c +index 4554cff0..87c5e4bf 100644 +--- a/sysdeps/x86_64/multiarch/strspn-c.c ++++ b/sysdeps/x86_64/multiarch/strspn-c.c +@@ -63,81 +63,73 @@ __strspn_sse42 (const char *s, const char *a) + return 0; + + const char *aligned; +- __m128i mask; +- int offset = (int) ((size_t) a & 15); ++ __m128i mask, maskz, zero; ++ unsigned int maskz_bits; ++ unsigned int offset = (int) ((size_t) a & 15); ++ zero = _mm_set1_epi8 (0); + if (offset != 0) + { + /* Load masks. */ + aligned = (const char *) ((size_t) a & -16L); + __m128i mask0 = _mm_load_si128 ((__m128i *) aligned); +- +- mask = __m128i_shift_right (mask0, offset); ++ maskz = _mm_cmpeq_epi8 (mask0, zero); + + /* Find where the NULL terminator is. */ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16 - offset) +- { +- /* There is no NULL terminator. */ +- __m128i mask1 = _mm_load_si128 ((__m128i *) (aligned + 16)); +- int index = _mm_cmpistri (mask1, mask1, 0x3a); +- length += index; +- +- /* Don't use SSE4.2 if the length of A > 16. */ +- if (length > 16) +- return __strspn_sse2 (s, a); +- +- if (index != 0) +- { +- /* Combine mask0 and mask1. We could play games with +- palignr, but frankly this data should be in L1 now +- so do the merge via an unaligned load. */ +- mask = _mm_loadu_si128 ((__m128i *) a); +- } +- } ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) ++ { ++ mask = __m128i_shift_right (mask0, offset); ++ offset = (unsigned int) ((size_t) s & 15); ++ if (offset) ++ goto start_unaligned; ++ ++ aligned = s; ++ goto start_loop; ++ } + } +- else +- { +- /* A is aligned. */ +- mask = _mm_load_si128 ((__m128i *) a); + +- /* Find where the NULL terminator is. 
*/ +- int length = _mm_cmpistri (mask, mask, 0x3a); +- if (length == 16) +- { +- /* There is no NULL terminator. Don't use SSE4.2 if the length +- of A > 16. */ +- if (a[16] != 0) +- return __strspn_sse2 (s, a); +- } ++ /* A is aligned. */ ++ mask = _mm_loadu_si128 ((__m128i *) a); ++ ++ /* Find where the NULL terminator is. */ ++ maskz = _mm_cmpeq_epi8 (mask, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz); ++ if (maskz_bits == 0) ++ { ++ /* There is no NULL terminator. Don't use SSE4.2 if the length ++ of A > 16. */ ++ if (a[16] != 0) ++ return __strspn_sse2 (s, a); + } ++ aligned = s; ++ offset = (unsigned int) ((size_t) s & 15); + +- offset = (int) ((size_t) s & 15); + if (offset != 0) + { ++ start_unaligned: + /* Check partial string. */ + aligned = (const char *) ((size_t) s & -16L); + __m128i value = _mm_load_si128 ((__m128i *) aligned); ++ __m128i adj_value = __m128i_shift_right (value, offset); + +- value = __m128i_shift_right (value, offset); +- +- int length = _mm_cmpistri (mask, value, 0x12); ++ unsigned int length = _mm_cmpistri (mask, adj_value, 0x12); + /* No need to check CFlag since it is always 1. */ + if (length < 16 - offset) + return length; + /* Find where the NULL terminator is. 
*/ +- int index = _mm_cmpistri (value, value, 0x3a); +- if (index < 16 - offset) ++ maskz = _mm_cmpeq_epi8 (value, zero); ++ maskz_bits = _mm_movemask_epi8 (maskz) >> offset; ++ if (maskz_bits != 0) + return length; + aligned += 16; + } +- else +- aligned = s; + ++start_loop: + while (1) + { + __m128i value = _mm_load_si128 ((__m128i *) aligned); +- int index = _mm_cmpistri (mask, value, 0x12); +- int cflag = _mm_cmpistrc (mask, value, 0x12); ++ unsigned int index = _mm_cmpistri (mask, value, 0x12); ++ unsigned int cflag = _mm_cmpistrc (mask, value, 0x12); + if (cflag) + return (size_t) (aligned + index - s); + aligned += 16; +-- +GitLab + diff --git a/glibc-RHEL-15696-92.patch b/glibc-RHEL-15696-92.patch new file mode 100644 index 0000000..f19914e --- /dev/null +++ b/glibc-RHEL-15696-92.patch @@ -0,0 +1,175 @@ +From fe28e7d9d9535ebab4081d195c553b4fbf39d9ae Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:26 -0500 +Subject: [PATCH] x86: Remove strcspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .678 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + .../{strcspn-sse2.S => strcspn-sse2.c} | 6 +- + sysdeps/x86_64/strcspn.S | 122 ------------------ + 2 files changed, 3 insertions(+), 125 deletions(-) + rename sysdeps/x86_64/multiarch/{strcspn-sse2.S => strcspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strcspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strcspn-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strcspn-sse2.S b/sysdeps/x86_64/multiarch/strcspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strcspn-sse2.S +rename to sysdeps/x86_64/multiarch/strcspn-sse2.c +index 8a0c69d7..32debee4 100644 +--- a/sysdeps/x86_64/multiarch/strcspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strcspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strcspn_sse2 ++# define STRCSPN __strcspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strcspn) ++# define libc_hidden_builtin_def(STRCSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strcspn.S b/sysdeps/x86_64/strcspn.S +deleted file mode 100644 +index 7f9202d6..00000000 +--- a/sysdeps/x86_64/strcspn.S ++++ /dev/null +@@ -1,122 +0,0 @@ +-/* strcspn (str, ss) -- Return the length of the initial segment of STR +- which contains no characters from SS. +- For AMD x86-64. +- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +-#include "asm-syntax.h" +- +- .text +-ENTRY (strcspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup skipset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from skipset */ +- testb %cl, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 1(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 2(%rax), %cl /* get byte from skipset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- +- movb 3(%rax), %cl /* get byte from skipset */ +- addq $4, %rax /* increment skipset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in skipset table */ +- testb $0xff, %cl /* is NUL char? 
*/ +- jnz L(2) /* no => process next dword from skipset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the skipset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the skipset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(4) /* yes => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- je L(5) /* yes => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* yes => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- cmpb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jne L(3) /* no => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove skipset */ +- cfi_adjust_cfa_offset(-256) +-#ifdef USE_AS_STRPBRK +- xorl %edx,%edx +- orb %cl, %cl /* was last character NUL? 
*/ +- cmovzq %rdx, %rax /* Yes: return NULL */ +-#else +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +-#endif +- ret +-END (strcspn) +-libc_hidden_builtin_def (strcspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-93.patch b/glibc-RHEL-15696-93.patch new file mode 100644 index 0000000..45c8527 --- /dev/null +++ b/glibc-RHEL-15696-93.patch @@ -0,0 +1,55 @@ +From 653358535280a599382cb6c77538a187dac6a87f Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:27 -0500 +Subject: [PATCH] x86: Remove strpbrk-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster (see strcspn commit). + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} | 7 +++---- + sysdeps/x86_64/strpbrk.S | 3 --- + 2 files changed, 3 insertions(+), 7 deletions(-) + rename sysdeps/x86_64/multiarch/{strpbrk-sse2.S => strpbrk-sse2.c} (87%) + delete mode 100644 sysdeps/x86_64/strpbrk.S + +Conflicts: + sysdeps/x86_64/multiarch/strpbrk-sse2.S + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strpbrk-sse2.S b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +similarity index 87% +rename from sysdeps/x86_64/multiarch/strpbrk-sse2.S +rename to sysdeps/x86_64/multiarch/strpbrk-sse2.c +index 3c6a74db..ec0b6fda 100644 +--- a/sysdeps/x86_64/multiarch/strpbrk-sse2.S ++++ b/sysdeps/x86_64/multiarch/strpbrk-sse2.c +@@ -19,11 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strcspn __strpbrk_sse2 ++# define STRPBRK __strpbrk_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strpbrk) ++# define libc_hidden_builtin_def(STRPBRK) + #endif + +-#define USE_AS_STRPBRK +-#include ++#include +diff --git a/sysdeps/x86_64/strpbrk.S b/sysdeps/x86_64/strpbrk.S +deleted file mode 100644 +index 21888a5b..00000000 +--- a/sysdeps/x86_64/strpbrk.S ++++ /dev/null +@@ -1,3 
+0,0 @@ +-#define strcspn strpbrk +-#define USE_AS_STRPBRK +-#include +-- +GitLab + diff --git a/glibc-RHEL-15696-94.patch b/glibc-RHEL-15696-94.patch new file mode 100644 index 0000000..2fa86da --- /dev/null +++ b/glibc-RHEL-15696-94.patch @@ -0,0 +1,168 @@ +From 9c8a6ad620b49a27120ecdd7049c26bf05900397 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:29 -0500 +Subject: [PATCH] x86: Remove strspn-sse2.S and use the generic implementation +Content-type: text/plain; charset=UTF-8 + +The generic implementation is faster. + +geometric_mean(N=20) of all benchmarks New / Original: .710 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + .../{strspn-sse2.S => strspn-sse2.c} | 6 +- + sysdeps/x86_64/strspn.S | 115 ------------------ + 2 files changed, 3 insertions(+), 118 deletions(-) + rename sysdeps/x86_64/multiarch/{strspn-sse2.S => strspn-sse2.c} (89%) + delete mode 100644 sysdeps/x86_64/strspn.S + +Conflicts: + sysdeps/x86_64/multiarch/strspn-sse2.c + (copyright header) + +diff --git a/sysdeps/x86_64/multiarch/strspn-sse2.S b/sysdeps/x86_64/multiarch/strspn-sse2.c +similarity index 89% +rename from sysdeps/x86_64/multiarch/strspn-sse2.S +rename to sysdeps/x86_64/multiarch/strspn-sse2.c +index 4686cdd5..ab0dae40 100644 +--- a/sysdeps/x86_64/multiarch/strspn-sse2.S ++++ b/sysdeps/x86_64/multiarch/strspn-sse2.c +@@ -19,10 +19,10 @@ + #if IS_IN (libc) + + # include +-# define strspn __strspn_sse2 ++# define STRSPN __strspn_sse2 + + # undef libc_hidden_builtin_def +-# define libc_hidden_builtin_def(strspn) ++# define libc_hidden_builtin_def(STRSPN) + #endif + +-#include ++#include +diff --git a/sysdeps/x86_64/strspn.S b/sysdeps/x86_64/strspn.S +deleted file mode 100644 +index 635f1bc6..00000000 +--- a/sysdeps/x86_64/strspn.S ++++ /dev/null +@@ -1,115 +0,0 @@ +-/* strspn (str, ss) -- Return the length of the initial segment of STR +- which contains only characters from SS. +- For AMD x86-64. 
+- Copyright (C) 1994-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- Contributed by Ulrich Drepper . +- Bug fixes by Alan Modra . +- Adopted for x86-64 by Andreas Jaeger . +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- . */ +- +-#include +- +- .text +-ENTRY (strspn) +- +- movq %rdi, %rdx /* Save SRC. */ +- +- /* First we create a table with flags for all possible characters. +- For the ASCII (7bit/8bit) or ISO-8859-X character sets which are +- supported by the C string functions we have 256 characters. +- Before inserting marks for the stop characters we clear the whole +- table. */ +- movq %rdi, %r8 /* Save value. */ +- subq $256, %rsp /* Make space for 256 bytes. */ +- cfi_adjust_cfa_offset(256) +- movl $32, %ecx /* 32*8 bytes = 256 bytes. */ +- movq %rsp, %rdi +- xorl %eax, %eax /* We store 0s. */ +- cld +- rep +- stosq +- +- movq %rsi, %rax /* Setup stopset. */ +- +-/* For understanding the following code remember that %rcx == 0 now. +- Although all the following instruction only modify %cl we always +- have a correct zero-extended 64-bit value in %rcx. */ +- +- .p2align 4 +-L(2): movb (%rax), %cl /* get byte from stopset */ +- testb %cl, %cl /* is NUL char? 
*/ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 1(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 2(%rax), %cl /* get byte from stopset */ +- testb $0xff, %cl /* is NUL char? */ +- jz L(1) /* yes => start compare loop */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- +- movb 3(%rax), %cl /* get byte from stopset */ +- addq $4, %rax /* increment stopset pointer */ +- movb %cl, (%rsp,%rcx) /* set corresponding byte in stopset table */ +- testb $0xff, %cl /* is NUL char? */ +- jnz L(2) /* no => process next dword from stopset */ +- +-L(1): leaq -4(%rdx), %rax /* prepare loop */ +- +- /* We use a neat trick for the following loop. Normally we would +- have to test for two termination conditions +- 1. a character in the stopset was found +- and +- 2. the end of the string was found +- But as a sign that the character is in the stopset we store its +- value in the table. But the value of NUL is NUL so the loop +- terminates for NUL in every case. */ +- +- .p2align 4 +-L(3): addq $4, %rax /* adjust pointer for full loop round */ +- +- movb (%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(4) /* no => return */ +- +- movb 1(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(5) /* no => return */ +- +- movb 2(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? */ +- jz L(6) /* no => return */ +- +- movb 3(%rax), %cl /* get byte from string */ +- testb %cl, (%rsp,%rcx) /* is it contained in skipset? 
*/ +- jnz L(3) /* yes => start loop again */ +- +- incq %rax /* adjust pointer */ +-L(6): incq %rax +-L(5): incq %rax +- +-L(4): addq $256, %rsp /* remove stopset */ +- cfi_adjust_cfa_offset(-256) +- subq %rdx, %rax /* we have to return the number of valid +- characters, so compute distance to first +- non-valid character */ +- ret +-END (strspn) +-libc_hidden_builtin_def (strspn) +-- +GitLab + diff --git a/glibc-RHEL-15696-95.patch b/glibc-RHEL-15696-95.patch new file mode 100644 index 0000000..cf21b96 --- /dev/null +++ b/glibc-RHEL-15696-95.patch @@ -0,0 +1,122 @@ +From 670b54bc585ea4a94f3b2e9272ba44aa6b730b73 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:36 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte no with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .894 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/strcmp.S | 64 +++++++++++++++++++---------------------- + 1 file changed, 29 insertions(+), 35 deletions(-) + +diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S +index aa6df898..f454ce5b 100644 +--- a/sysdeps/x86_64/strcmp.S ++++ b/sysdeps/x86_64/strcmp.S +@@ -78,9 +78,8 @@ ENTRY2 (__strcasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strcasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strcasecmp, strcasecmp) +@@ -97,9 +96,8 @@ ENTRY2 (__strncasecmp) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. 
*/ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END2 (__strncasecmp) + # ifndef NO_NOLOCALE_ALIAS + weak_alias (__strncasecmp, strncasecmp) +@@ -149,22 +147,22 @@ ENTRY (STRCMP) + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-.Lbelowupper: +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-.Ltopupper: +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-.Ltouppermask: ++.Llcase_min: ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++.Llcase_max: ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++.Lcase_add: + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa .Lbelowupper(%rip), %xmm5 +-# define UCLOW_reg %xmm5 +- movdqa .Ltopupper(%rip), %xmm6 +-# define UCHIGH_reg %xmm6 +- movdqa .Ltouppermask(%rip), %xmm7 +-# define LCQWORD_reg %xmm7 ++ movdqa .Llcase_min(%rip), %xmm5 ++# define LCASE_MIN_reg %xmm5 ++ movdqa .Llcase_max(%rip), %xmm6 ++# define LCASE_MAX_reg %xmm6 ++ movdqa .Lcase_add(%rip), %xmm7 ++# define CASE_ADD_reg %xmm7 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache) /* rsi: 16-byte load will cross cache line */ +@@ -175,22 +173,18 @@ ENTRY (STRCMP) + movhpd 8(%rdi), %xmm1 + movhpd 8(%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm8; \ +- movdqa UCHIGH_reg, %xmm9; \ +- movdqa reg2, %xmm10; \ +- movdqa UCHIGH_reg, %xmm11; \ +- pcmpgtb UCLOW_reg, %xmm8; \ +- pcmpgtb reg1, %xmm9; \ +- pcmpgtb UCLOW_reg, %xmm10; \ +- pcmpgtb reg2, %xmm11; \ +- pand %xmm9, %xmm8; \ +- pand %xmm11, %xmm10; \ +- pand LCQWORD_reg, %xmm8; \ +- pand LCQWORD_reg, %xmm10; \ +- por %xmm8, reg1; \ +- por %xmm10, reg2 +- TOLOWER (%xmm1, %xmm2) ++# define TOLOWER(reg1, reg2) \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ movdqa LCASE_MIN_reg, %xmm9; \ ++ paddb reg1, %xmm8; \ ++ paddb reg2, %xmm9; \ ++ pcmpgtb LCASE_MAX_reg, 
%xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm9; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm9; \ ++ paddb %xmm8, reg1; \ ++ paddb %xmm9, reg2 ++ TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +-- +GitLab + diff --git a/glibc-RHEL-15696-96.patch b/glibc-RHEL-15696-96.patch new file mode 100644 index 0000000..2d3b891 --- /dev/null +++ b/glibc-RHEL-15696-96.patch @@ -0,0 +1,143 @@ +From d154758e618ec9324f5d339c46db0aa27e8b1226 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:38 -0500 +Subject: [PATCH] x86: Optimize str{n}casecmp TOLOWER logic in strcmp-sse42.S +Content-type: text/plain; charset=UTF-8 + +Slightly faster method of doing TOLOWER that saves an +instruction. + +Also replace the hard coded 5-byte NOP with .p2align 4. On builds with +CET enabled this misaligned entry to strcasecmp. + +geometric_mean(N=40) of all benchmarks New / Original: .920 + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 83 +++++++++++-------------- + 1 file changed, 35 insertions(+), 48 deletions(-) + +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index d8fdeb3a..59e8ddfc 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -89,9 +89,8 @@ ENTRY (GLABEL(__strcasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RDX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 + END (GLABEL(__strcasecmp)) + /* FALLTHROUGH to strcasecmp_l. */ + #endif +@@ -100,9 +99,8 @@ ENTRY (GLABEL(__strncasecmp)) + movq __libc_tsd_LOCALE@gottpoff(%rip),%rax + mov %fs:(%rax),%RCX_LP + +- // XXX 5 byte should be before the function +- /* 5-byte NOP. */ +- .byte 0x0f,0x1f,0x44,0x00,0x00 ++ /* Either 1 or 5 bytes (dependeing if CET is enabled).
*/ ++ .p2align 4 + END (GLABEL(__strncasecmp)) + /* FALLTHROUGH to strncasecmp_l. */ + #endif +@@ -170,27 +168,22 @@ STRCMP_SSE42: + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + .section .rodata.cst16,"aM",@progbits,16 + .align 16 +-LABEL(belowupper): +- .quad 0x4040404040404040 +- .quad 0x4040404040404040 +-LABEL(topupper): +-# ifdef USE_AVX +- .quad 0x5a5a5a5a5a5a5a5a +- .quad 0x5a5a5a5a5a5a5a5a +-# else +- .quad 0x5b5b5b5b5b5b5b5b +- .quad 0x5b5b5b5b5b5b5b5b +-# endif +-LABEL(touppermask): ++LABEL(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++LABEL(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++LABEL(case_add): + .quad 0x2020202020202020 + .quad 0x2020202020202020 + .previous +- movdqa LABEL(belowupper)(%rip), %xmm4 +-# define UCLOW_reg %xmm4 +- movdqa LABEL(topupper)(%rip), %xmm5 +-# define UCHIGH_reg %xmm5 +- movdqa LABEL(touppermask)(%rip), %xmm6 +-# define LCQWORD_reg %xmm6 ++ movdqa LABEL(lcase_min)(%rip), %xmm4 ++# define LCASE_MIN_reg %xmm4 ++ movdqa LABEL(lcase_max)(%rip), %xmm5 ++# define LCASE_MAX_reg %xmm5 ++ movdqa LABEL(case_add)(%rip), %xmm6 ++# define CASE_ADD_reg %xmm6 + #endif + cmp $0x30, %ecx + ja LABEL(crosscache)/* rsi: 16-byte load will cross cache line */ +@@ -201,32 +194,26 @@ LABEL(touppermask): + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L + # ifdef USE_AVX + # define TOLOWER(reg1, reg2) \ +- vpcmpgtb UCLOW_reg, reg1, %xmm7; \ +- vpcmpgtb UCHIGH_reg, reg1, %xmm8; \ +- vpcmpgtb UCLOW_reg, reg2, %xmm9; \ +- vpcmpgtb UCHIGH_reg, reg2, %xmm10; \ +- vpandn %xmm7, %xmm8, %xmm8; \ +- vpandn %xmm9, %xmm10, %xmm10; \ +- vpand LCQWORD_reg, %xmm8, %xmm8; \ +- vpand LCQWORD_reg, %xmm10, %xmm10; \ +- vpor reg1, %xmm8, reg1; \ +- vpor reg2, %xmm10, reg2 ++ vpaddb LCASE_MIN_reg, reg1, %xmm7; \ ++ vpaddb LCASE_MIN_reg, reg2, %xmm8; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ ++ vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ ++ vpandn CASE_ADD_reg, %xmm7, %xmm7; \ ++ 
vpandn CASE_ADD_reg, %xmm8, %xmm8; \ ++ vpaddb %xmm7, reg1, reg1; \ ++ vpaddb %xmm8, reg2, reg2 + # else + # define TOLOWER(reg1, reg2) \ +- movdqa reg1, %xmm7; \ +- movdqa UCHIGH_reg, %xmm8; \ +- movdqa reg2, %xmm9; \ +- movdqa UCHIGH_reg, %xmm10; \ +- pcmpgtb UCLOW_reg, %xmm7; \ +- pcmpgtb reg1, %xmm8; \ +- pcmpgtb UCLOW_reg, %xmm9; \ +- pcmpgtb reg2, %xmm10; \ +- pand %xmm8, %xmm7; \ +- pand %xmm10, %xmm9; \ +- pand LCQWORD_reg, %xmm7; \ +- pand LCQWORD_reg, %xmm9; \ +- por %xmm7, reg1; \ +- por %xmm9, reg2 ++ movdqa LCASE_MIN_reg, %xmm7; \ ++ movdqa LCASE_MIN_reg, %xmm8; \ ++ paddb reg1, %xmm7; \ ++ paddb reg2, %xmm8; \ ++ pcmpgtb LCASE_MAX_reg, %xmm7; \ ++ pcmpgtb LCASE_MAX_reg, %xmm8; \ ++ pandn CASE_ADD_reg, %xmm7; \ ++ pandn CASE_ADD_reg, %xmm8; \ ++ paddb %xmm7, reg1; \ ++ paddb %xmm8, reg2 + # endif + TOLOWER (%xmm1, %xmm2) + #else +-- +GitLab + diff --git a/glibc-RHEL-15696-97.patch b/glibc-RHEL-15696-97.patch new file mode 100644 index 0000000..9592795 --- /dev/null +++ b/glibc-RHEL-15696-97.patch @@ -0,0 +1,759 @@ +From bbf81222343fed5cd704001a2ae0d86c71544151 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:12 -0500 +Subject: [PATCH] x86: Add AVX2 optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks AVX2 / SSE42: .702 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 4 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 28 +++ + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 12 + + .../x86_64/multiarch/strcasecmp_l-avx2-rtm.S | 15 ++ + sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-avx2.S | 237 +++++++++++++++--- + .../x86_64/multiarch/strncase_l-avx2-rtm.S | 16 ++ + sysdeps/x86_64/multiarch/strncase_l-avx2.S | 27 ++ + 8 files changed, 331 insertions(+), 31 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx2.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 8c9e7812..711ecf2e 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -51,6 +51,8 @@ sysdep_routines += \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ + strcasecmp_l-avx \ ++ strcasecmp_l-avx2 \ ++ strcasecmp_l-avx2-rtm \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -89,6 +91,8 @@ sysdep_routines += \ + strlen-evex \ + strlen-sse2 \ + strncase_l-avx \ ++ strncase_l-avx2 \ ++ strncase_l-avx2-rtm \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index c963d391..d873e1be 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_avx) +@@ -431,6 +438,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strcasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strcasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strcasecmp_l_avx) +@@ -558,6 +572,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_avx) +@@ -572,6 +593,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. 
*/ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ CPU_FEATURE_USABLE (AVX2), ++ __strncasecmp_l_avx2) ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX2) ++ && CPU_FEATURE_USABLE (RTM)), ++ __strncasecmp_l_avx2_rtm) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (AVX), + __strncasecmp_l_avx) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6a4bb078..926508c4 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -23,12 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) + { + const struct cpu_features* cpu_features = __get_cpu_features (); + ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) ++ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) ++ { ++ if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) ++ return OPTIMIZE (avx2_rtm); ++ ++ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)) ++ return OPTIMIZE (avx2); ++ } ++ + if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) + return OPTIMIZE (avx); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +new file mode 100644 +index 00000000..09957fc3 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2-rtm.S +@@ -0,0 +1,15 @@ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ 
ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++ ++#include "strcasecmp_l-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +new file mode 100644 +index 00000000..e2762f2a +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-avx2.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
*/ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-avx2.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S +index 782f9472..28cc98b6 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S ++++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S +@@ -20,6 +20,10 @@ + + # include + ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif ++ + # ifndef STRCMP + # define STRCMP __strcmp_avx2 + # endif +@@ -74,13 +78,88 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_avx2 ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_avx2 ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ + # define xmmZERO xmm15 + # define ymmZERO ymm15 + ++# define LCASE_MIN_ymm %ymm10 ++# define LCASE_MAX_ymm %ymm11 ++# define CASE_ADD_ymm %ymm12 ++ ++# define LCASE_MIN_xmm %xmm10 ++# define LCASE_MAX_xmm %xmm11 ++# define CASE_ADD_xmm %xmm12 ++ ++ /* r11 is never use elsewhere so this is safe to maintain. 
*/ ++# define TOLOWER_BASE %r11 ++ + # ifndef SECTION + # define SECTION(p) p##.avx + # endif + ++# ifdef USE_AS_STRCASECMP_L ++# define REG(x, y) x ## y ++# define TOLOWER(reg1_in, reg1_out, reg2_in, reg2_out, ext) \ ++ vpaddb REG(LCASE_MIN_, ext), reg1_in, REG(%ext, 8); \ ++ vpaddb REG(LCASE_MIN_, ext), reg2_in, REG(%ext, 9); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpcmpgtb REG(LCASE_MAX_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 8), REG(%ext, 8); \ ++ vpandn REG(CASE_ADD_, ext), REG(%ext, 9), REG(%ext, 9); \ ++ vpaddb REG(%ext, 8), reg1_in, reg1_out; \ ++ vpaddb REG(%ext, 9), reg2_in, reg2_out ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_ymm(...) TOLOWER(__VA_ARGS__, ymm) ++# define TOLOWER_xmm(...) TOLOWER(__VA_ARGS__, xmm) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, scratch_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, scratch_reg, s2_reg, s2_reg, ext); \ ++ VPCMPEQ scratch_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, scratch_reg, reg_out, ext) \ ++ VMOVU s2_mem, reg_out; \ ++ CMP_R1_R2(s1_reg, reg_out, scratch_reg, reg_out, ext) ++ ++# define CMP_R1_R2_ymm(...) CMP_R1_R2(__VA_ARGS__, ymm) ++# define CMP_R1_R2_xmm(...) CMP_R1_R2(__VA_ARGS__, xmm) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_S2(__VA_ARGS__, ymm) ++# define CMP_R1_S2_xmm(...) CMP_R1_S2(__VA_ARGS__, xmm) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_ymm(...) ++# define TOLOWER_xmm(...) ++ ++# define CMP_R1_R2_ymm(s1_reg, s2_reg, scratch_reg, reg_out) \ ++ VPCMPEQ s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_xmm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++ ++# define CMP_R1_S2_ymm(...) CMP_R1_R2_ymm(__VA_ARGS__) ++# define CMP_R1_S2_xmm(...) CMP_R1_R2_xmm(__VA_ARGS__) ++# endif ++ + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. + strcmp/strncmp have to use UNSIGNED comparison for elements. +@@ -102,8 +181,49 @@ + returned. 
*/ + + .section SECTION(.text), "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifndef GLABEL ++# define GLABEL(...) __VA_ARGS__ ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (GLABEL(STRCASECMP)) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (GLABEL(STRCASECMP)) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. 
*/ + movl %edx, %edx +@@ -128,6 +248,30 @@ ENTRY(STRCMP) + # endif + # endif + vpxor %xmmZERO, %xmmZERO, %xmmZERO ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++ .quad 0x3f3f3f3f3f3f3f3f ++L(lcase_max): ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++ .quad 0x9999999999999999 ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa L(lcase_min)(%rip), LCASE_MIN_ymm ++ vmovdqa L(lcase_max)(%rip), LCASE_MAX_ymm ++ vmovdqa L(case_add)(%rip), CASE_ADD_ymm ++# endif + movl %edi, %eax + orl %esi, %eax + sall $20, %eax +@@ -138,8 +282,10 @@ ENTRY(STRCMP) + L(no_page_cross): + /* Safe to compare 4x vectors. */ + VMOVU (%rdi), %ymm0 +- /* 1s where s1 and s2 equal. */ +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ /* 1s where s1 and s2 equal. Just VPCMPEQ if its not strcasecmp. ++ Otherwise converts ymm0 and load from rsi to lower. ymm2 is ++ scratch and ymm1 is the return. */ ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + /* 1s at null CHAR. */ + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + /* 1s where s1 and s2 equal AND not null CHAR. */ +@@ -172,6 +318,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -192,6 +340,10 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. 
*/ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) + # ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large +@@ -211,6 +363,8 @@ L(one_or_less): + jnbe __strcmp_avx2 + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -238,6 +392,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -269,6 +425,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -289,6 +447,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -299,7 +459,7 @@ L(ret4): + L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -312,7 +472,7 @@ L(more_3x_vec): + # endif + + VMOVU (VEC_SIZE * 2)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 2)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -320,7 +480,7 @@ L(more_3x_vec): + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %ymm0 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 3)(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -395,12 +555,10 @@ L(loop_skip_page_cross_check): + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + + /* ymm1 all 1s where s1 and s2 equal. 
All 0s otherwise. */ +- VPCMPEQ (VEC_SIZE * 0)(%rsi), %ymm0, %ymm1 +- +- VPCMPEQ (VEC_SIZE * 1)(%rsi), %ymm2, %ymm3 +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 +- ++ CMP_R1_S2_ymm (%ymm0, (VEC_SIZE * 0)(%rsi), %ymm3, %ymm1) ++ CMP_R1_S2_ymm (%ymm2, (VEC_SIZE * 1)(%rsi), %ymm5, %ymm3) ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + + /* If any mismatches or null CHAR then 0 CHAR, otherwise non- + zero. */ +@@ -469,6 +627,8 @@ L(return_vec_2_3_end): + # else + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2 - VEC_OFFSET)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -512,6 +672,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -534,6 +696,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -560,6 +724,8 @@ L(return_vec_2_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -587,7 +753,7 @@ L(page_cross_during_loop): + jle L(less_1x_vec_till_page_cross) + + VMOVA (%rdi), %ymm0 +- VPCMPEQ (%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -609,7 +775,7 @@ L(less_1x_vec_till_page_cross): + here, it means the previous page (rdi - VEC_SIZE) has already + been loaded earlier so must be valid. 
*/ + VMOVU -VEC_SIZE(%rdi, %rax), %ymm0 +- VPCMPEQ -VEC_SIZE(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -VEC_SIZE(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -651,6 +817,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -677,7 +845,7 @@ L(more_2x_vec_till_page_cross): + iteration here. */ + + VMOVU VEC_SIZE(%rdi), %ymm0 +- VPCMPEQ VEC_SIZE(%rsi), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, VEC_SIZE(%rsi), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -693,7 +861,7 @@ L(more_2x_vec_till_page_cross): + + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 2)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 2)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -701,7 +869,7 @@ L(more_2x_vec_till_page_cross): + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %ymm0 +- VPCMPEQ -(VEC_SIZE * 1)(%rsi, %rax), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, -(VEC_SIZE * 1)(%rsi, %rax), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -719,8 +887,8 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 2)(%rdi), %ymm4 + VMOVA (VEC_SIZE * 3)(%rdi), %ymm6 + +- VPCMPEQ (VEC_SIZE * 2)(%rsi), %ymm4, %ymm5 +- VPCMPEQ (VEC_SIZE * 3)(%rsi), %ymm6, %ymm7 ++ CMP_R1_S2_ymm (%ymm4, (VEC_SIZE * 2)(%rsi), %ymm7, %ymm5) ++ CMP_R1_S2_ymm (%ymm6, (VEC_SIZE * 3)(%rsi), %ymm13, %ymm7) + vpand %ymm4, %ymm5, %ymm5 + vpand %ymm6, %ymm7, %ymm7 + VPMINU %ymm5, %ymm7, %ymm7 +@@ -771,6 +939,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl 
VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -826,7 +996,7 @@ L(page_cross): + L(page_cross_loop): + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -844,11 +1014,11 @@ L(page_cross_loop): + subl %eax, %OFFSET_REG + /* OFFSET_REG has distance to page cross - VEC_SIZE. Guranteed + to not cross page so is safe to load. Since we have already +- loaded at least 1 VEC from rsi it is also guranteed to be safe. +- */ ++ loaded at least 1 VEC from rsi it is also guranteed to be ++ safe. */ + + VMOVU (%rdi, %OFFSET_REG64), %ymm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %ymm0, %ymm1 ++ CMP_R1_S2_ymm (%ymm0, (%rsi, %OFFSET_REG64), %ymm2, %ymm1) + VPCMPEQ %ymm0, %ymmZERO, %ymm2 + vpandn %ymm1, %ymm2, %ymm1 + vpmovmskb %ymm1, %ecx +@@ -881,6 +1051,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -934,7 +1106,7 @@ L(less_1x_vec_till_page): + ja L(less_16_till_page) + + VMOVU (%rdi), %xmm0 +- VPCMPEQ (%rsi), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -952,7 +1124,7 @@ L(less_1x_vec_till_page): + # endif + + VMOVU (%rdi, %OFFSET_REG64), %xmm0 +- VPCMPEQ (%rsi, %OFFSET_REG64), %xmm0, %xmm1 ++ CMP_R1_S2_xmm (%xmm0, (%rsi, %OFFSET_REG64), %xmm2, %xmm1) + VPCMPEQ %xmm0, %xmmZERO, %xmm2 + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx +@@ -990,7 +1162,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn 
%xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1010,7 +1182,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64), %xmm0 + vmovq (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + incb %cl +@@ -1066,7 +1238,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1085,7 +1257,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64), %xmm0 + vmovd (%rsi, %OFFSET_REG64), %xmm1 + VPCMPEQ %xmm0, %xmmZERO, %xmm2 +- VPCMPEQ %xmm1, %xmm0, %xmm1 ++ CMP_R1_R2_xmm (%xmm0, %xmm1, %xmm3, %xmm1) + vpandn %xmm1, %xmm2, %xmm1 + vpmovmskb %ymm1, %ecx + subl $0xf, %ecx +@@ -1119,7 +1291,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ -1146,5 +1320,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +new file mode 100644 +index 00000000..58c05dcf +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2-rtm.S +@@ -0,0 +1,16 @@ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2_rtm ++#endif ++ ++#define _GLABEL(x) x ## _rtm ++#define GLABEL(x) _GLABEL(x) ++ ++#define ZERO_UPPER_VEC_REGISTERS_RETURN \ ++ ZERO_UPPER_VEC_REGISTERS_RETURN_XTEST ++ ++#define VZEROUPPER_RETURN jmp L(return_vzeroupper) ++ ++#define SECTION(p) p##.avx.rtm ++#define OVERFLOW_STRCMP __strcasecmp_l_avx2_rtm ++ ++#include "strncase_l-avx2.S" +diff 
--git a/sysdeps/x86_64/multiarch/strncase_l-avx2.S b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +new file mode 100644 +index 00000000..48c0aa21 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-avx2.S +@@ -0,0 +1,27 @@ ++/* strncasecmp_l optimized with AVX2. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_avx2 ++#endif ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcasecmp_l_avx2 ++#endif ++#include "strcmp-avx2.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-98.patch b/glibc-RHEL-15696-98.patch new file mode 100644 index 0000000..9941bcc --- /dev/null +++ b/glibc-RHEL-15696-98.patch @@ -0,0 +1,814 @@ +From 84e7c46df4086873eae28a1fb87d2cf5388b1e16 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Thu, 24 Mar 2022 18:56:13 -0500 +Subject: [PATCH] x86: Add EVEX optimized str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +geometric_mean(N=40) of all benchmarks EVEX / SSE42: .621 + +All string/memory tests pass. +Reviewed-by: H.J. 
Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 + + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 + + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 5 + + sysdeps/x86_64/multiarch/strcasecmp_l-evex.S | 23 ++ + sysdeps/x86_64/multiarch/strcmp-evex.S | 290 ++++++++++++++++--- + sysdeps/x86_64/multiarch/strncase_l-evex.S | 25 ++ + 6 files changed, 321 insertions(+), 40 deletions(-) + create mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-evex.S + create mode 100644 sysdeps/x86_64/multiarch/strncase_l-evex.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 711ecf2e..359712c1 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -53,6 +53,7 @@ sysdep_routines += \ + strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ ++ strcasecmp_l-evex \ + strcasecmp_l-sse2 \ + strcasecmp_l-sse4_2 \ + strcasecmp_l-ssse3 \ +@@ -93,6 +94,7 @@ sysdep_routines += \ + strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ ++ strncase_l-evex \ + strncase_l-sse2 \ + strncase_l-sse4_2 \ + strncase_l-ssse3 \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index d873e1be..1dedc637 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -418,6 +418,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. */ + IFUNC_IMPL (i, name, strcasecmp, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_avx2) +@@ -438,6 +442,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strcasecmp_l.c. 
*/ + IFUNC_IMPL (i, name, strcasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strcasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strcasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (AVX2), + __strcasecmp_l_avx2) +@@ -572,6 +580,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_avx2) +@@ -593,6 +605,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/x86_64/multiarch/strncase_l.c. */ + IFUNC_IMPL (i, name, strncasecmp_l, ++ IFUNC_IMPL_ADD (array, i, strncasecmp, ++ (CPU_FEATURE_USABLE (AVX512VL) ++ && CPU_FEATURE_USABLE (AVX512BW)), ++ __strncasecmp_l_evex) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (AVX2), + __strncasecmp_l_avx2) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 926508c4..6dd49a21 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -25,6 +25,7 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; ++extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; + + static inline void * + IFUNC_SELECTOR (void) +@@ -34,6 +35,10 @@ IFUNC_SELECTOR (void) + if (CPU_FEATURE_USABLE_P (cpu_features, AVX2) + && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load)) + { ++ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL) ++ && CPU_FEATURE_USABLE_P 
(cpu_features, AVX512BW)) ++ return OPTIMIZE (evex); ++ + if (CPU_FEATURE_USABLE_P (cpu_features, RTM)) + return OPTIMIZE (avx2_rtm); + +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +new file mode 100644 +index 00000000..58642db7 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strcasecmp_l-evex.S +@@ -0,0 +1,23 @@ ++/* strcasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#ifndef STRCMP ++# define STRCMP __strcasecmp_l_evex ++#endif ++#define USE_AS_STRCASECMP_L ++#include "strcmp-evex.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S +index 0dfa62bd..b81b5775 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-evex.S ++++ b/sysdeps/x86_64/multiarch/strcmp-evex.S +@@ -19,6 +19,9 @@ + #if IS_IN (libc) + + # include <sysdep.h> ++# if defined USE_AS_STRCASECMP_L ++# include "locale-defines.h" ++# endif + + # ifndef STRCMP + # define STRCMP __strcmp_evex +@@ -34,19 +37,29 @@ + # define VMOVA vmovdqa64 + + # ifdef USE_AS_WCSCMP +-# define TESTEQ subl $0xff, ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __wcscmp_evex ++# endif ++ ++# define TESTEQ subl $0xff, + /* Compare packed dwords. 
*/ + # define VPCMP vpcmpd + # define VPMINU vpminud + # define VPTESTM vptestmd ++# define VPTESTNM vptestnmd + /* 1 dword char == 4 bytes. */ + # define SIZE_OF_CHAR 4 + # else ++# ifndef OVERFLOW_STRCMP ++# define OVERFLOW_STRCMP __strcmp_evex ++# endif ++ + # define TESTEQ incl + /* Compare packed bytes. */ + # define VPCMP vpcmpb + # define VPMINU vpminub + # define VPTESTM vptestmb ++# define VPTESTNM vptestnmb + /* 1 byte char == 1 byte. */ + # define SIZE_OF_CHAR 1 + # endif +@@ -73,11 +86,16 @@ + # define VEC_OFFSET (-VEC_SIZE) + # endif + +-# define XMMZERO xmm16 + # define XMM0 xmm17 + # define XMM1 xmm18 + +-# define YMMZERO ymm16 ++# define XMM10 xmm27 ++# define XMM11 xmm28 ++# define XMM12 xmm29 ++# define XMM13 xmm30 ++# define XMM14 xmm31 ++ ++ + # define YMM0 ymm17 + # define YMM1 ymm18 + # define YMM2 ymm19 +@@ -89,6 +107,87 @@ + # define YMM8 ymm25 + # define YMM9 ymm26 + # define YMM10 ymm27 ++# define YMM11 ymm28 ++# define YMM12 ymm29 ++# define YMM13 ymm30 ++# define YMM14 ymm31 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define BYTE_LOOP_REG OFFSET_REG ++# else ++# define BYTE_LOOP_REG ecx ++# endif ++ ++# ifdef USE_AS_STRCASECMP_L ++# ifdef USE_AS_STRNCMP ++# define STRCASECMP __strncasecmp_evex ++# define LOCALE_REG rcx ++# define LOCALE_REG_LP RCX_LP ++# define STRCASECMP_NONASCII __strncasecmp_l_nonascii ++# else ++# define STRCASECMP __strcasecmp_evex ++# define LOCALE_REG rdx ++# define LOCALE_REG_LP RDX_LP ++# define STRCASECMP_NONASCII __strcasecmp_l_nonascii ++# endif ++# endif ++ ++# define LCASE_MIN_YMM %YMM12 ++# define LCASE_MAX_YMM %YMM13 ++# define CASE_ADD_YMM %YMM14 ++ ++# define LCASE_MIN_XMM %XMM12 ++# define LCASE_MAX_XMM %XMM13 ++# define CASE_ADD_XMM %XMM14 ++ ++ /* NB: wcsncmp uses r11 but strcasecmp is never used in ++ conjunction with wcscmp. 
*/ ++# define TOLOWER_BASE %r11 ++ ++# ifdef USE_AS_STRCASECMP_L ++# define _REG(x, y) x ## y ++# define REG(x, y) _REG(x, y) ++# define TOLOWER(reg1, reg2, ext) \ ++ vpsubb REG(LCASE_MIN_, ext), reg1, REG(%ext, 10); \ ++ vpsubb REG(LCASE_MIN_, ext), reg2, REG(%ext, 11); \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 10), %k5; \ ++ vpcmpub $1, REG(LCASE_MAX_, ext), REG(%ext, 11), %k6; \ ++ vpaddb reg1, REG(CASE_ADD_, ext), reg1{%k5}; \ ++ vpaddb reg2, REG(CASE_ADD_, ext), reg2{%k6} ++ ++# define TOLOWER_gpr(src, dst) movl (TOLOWER_BASE, src, 4), dst ++# define TOLOWER_YMM(...) TOLOWER(__VA_ARGS__, YMM) ++# define TOLOWER_XMM(...) TOLOWER(__VA_ARGS__, XMM) ++ ++# define CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) \ ++ TOLOWER (s1_reg, s2_reg, ext); \ ++ VPCMP $0, s1_reg, s2_reg, reg_out ++ ++# define CMP_R1_S2(s1_reg, s2_mem, s2_reg, reg_out, ext) \ ++ VMOVU s2_mem, s2_reg; \ ++ CMP_R1_R2(s1_reg, s2_reg, reg_out, ext) ++ ++# define CMP_R1_R2_YMM(...) CMP_R1_R2(__VA_ARGS__, YMM) ++# define CMP_R1_R2_XMM(...) CMP_R1_R2(__VA_ARGS__, XMM) ++ ++# define CMP_R1_S2_YMM(...) CMP_R1_S2(__VA_ARGS__, YMM) ++# define CMP_R1_S2_XMM(...) CMP_R1_S2(__VA_ARGS__, XMM) ++ ++# else ++# define TOLOWER_gpr(...) ++# define TOLOWER_YMM(...) ++# define TOLOWER_XMM(...) ++ ++# define CMP_R1_R2_YMM(s1_reg, s2_reg, reg_out) \ ++ VPCMP $0, s2_reg, s1_reg, reg_out ++ ++# define CMP_R1_R2_XMM(...) CMP_R1_R2_YMM(__VA_ARGS__) ++ ++# define CMP_R1_S2_YMM(s1_reg, s2_mem, unused, reg_out) \ ++ VPCMP $0, s2_mem, s1_reg, reg_out ++ ++# define CMP_R1_S2_XMM(...) CMP_R1_S2_YMM(__VA_ARGS__) ++# endif + + /* Warning! + wcscmp/wcsncmp have to use SIGNED comparison for elements. +@@ -112,8 +211,45 @@ + returned. 
*/ + + .section .text.evex, "ax", @progbits +-ENTRY(STRCMP) ++ .align 16 ++ .type STRCMP, @function ++ .globl STRCMP ++ .hidden STRCMP ++ ++# ifdef USE_AS_STRCASECMP_L ++ENTRY (STRCASECMP) ++ movq __libc_tsd_LOCALE@gottpoff(%rip), %rax ++ mov %fs:(%rax), %LOCALE_REG_LP ++ ++ /* Either 1 or 5 bytes (dependeing if CET is enabled). */ ++ .p2align 4 ++END (STRCASECMP) ++ /* FALLTHROUGH to strcasecmp/strncasecmp_l. */ ++# endif ++ ++ .p2align 4 ++STRCMP: ++ cfi_startproc ++ _CET_ENDBR ++ CALL_MCOUNT ++ ++# if defined USE_AS_STRCASECMP_L ++ /* We have to fall back on the C implementation for locales with ++ encodings not matching ASCII for single bytes. */ ++# if LOCALE_T___LOCALES != 0 || LC_CTYPE != 0 ++ mov LOCALE_T___LOCALES + LC_CTYPE * LP_SIZE(%LOCALE_REG), %RAX_LP ++# else ++ mov (%LOCALE_REG), %RAX_LP ++# endif ++ testl $1, LOCALE_DATA_VALUES + _NL_CTYPE_NONASCII_CASE * SIZEOF_VALUES(%rax) ++ jne STRCASECMP_NONASCII ++ leaq _nl_C_LC_CTYPE_tolower + 128 * 4(%rip), TOLOWER_BASE ++# endif ++ + # ifdef USE_AS_STRNCMP ++ /* Don't overwrite LOCALE_REG (rcx) until we have pass ++ L(one_or_less). Otherwise we might use the wrong locale in ++ the OVERFLOW_STRCMP (strcasecmp_l). */ + # ifdef __ILP32__ + /* Clear the upper 32 bits. */ + movl %edx, %edx +@@ -125,6 +261,32 @@ ENTRY(STRCMP) + actually bound the buffer. 
*/ + jle L(one_or_less) + # endif ++ ++# if defined USE_AS_STRCASECMP_L ++ .section .rodata.cst32, "aM", @progbits, 32 ++ .align 32 ++L(lcase_min): ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++ .quad 0x4141414141414141 ++L(lcase_max): ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++ .quad 0x1a1a1a1a1a1a1a1a ++L(case_add): ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .quad 0x2020202020202020 ++ .previous ++ ++ vmovdqa64 L(lcase_min)(%rip), LCASE_MIN_YMM ++ vmovdqa64 L(lcase_max)(%rip), LCASE_MAX_YMM ++ vmovdqa64 L(case_add)(%rip), CASE_ADD_YMM ++# endif ++ + movl %edi, %eax + orl %esi, %eax + /* Shift out the bits irrelivant to page boundary ([63:12]). */ +@@ -139,7 +301,7 @@ L(no_page_cross): + VPTESTM %YMM0, %YMM0, %k2 + /* Each bit cleared in K1 represents a mismatch or a null CHAR + in YMM0 and 32 bytes at (%rsi). */ +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP + cmpq $CHAR_PER_VEC, %rdx +@@ -169,6 +331,8 @@ L(return_vec_0): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret0): +@@ -188,11 +352,15 @@ L(ret_zero): + + .p2align 4,, 5 + L(one_or_less): ++# ifdef USE_AS_STRCASECMP_L ++ /* Set locale argument for strcasecmp. */ ++ movq %LOCALE_REG, %rdx ++# endif + jb L(ret_zero) +-# ifdef USE_AS_WCSCMP + /* 'nbe' covers the case where length is negative (large + unsigned). */ +- jnbe __wcscmp_evex ++ jnbe OVERFLOW_STRCMP ++# ifdef USE_AS_WCSCMP + movl (%rdi), %edx + xorl %eax, %eax + cmpl (%rsi), %edx +@@ -201,11 +369,10 @@ L(one_or_less): + negl %eax + orl $1, %eax + # else +- /* 'nbe' covers the case where length is negative (large +- unsigned). 
*/ +- jnbe __strcmp_evex + movzbl (%rdi), %eax + movzbl (%rsi), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret1): +@@ -233,6 +400,8 @@ L(return_vec_1): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret2): +@@ -270,6 +439,8 @@ L(return_vec_2): + # else + movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 2)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret3): +@@ -290,6 +461,8 @@ L(return_vec_3): + # else + movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax + movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + # endif + L(ret4): +@@ -303,7 +476,7 @@ L(more_3x_vec): + /* Safe to compare 4x vectors. */ + VMOVU (VEC_SIZE)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1) +@@ -315,14 +488,14 @@ L(more_3x_vec): + + VMOVU (VEC_SIZE * 2)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 2)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_2) + + VMOVU (VEC_SIZE * 3)(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (VEC_SIZE * 3)(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_3) +@@ -381,7 +554,6 @@ L(prepare_loop_aligned): + subl %esi, %eax + andl $(PAGE_SIZE - 1), %eax + +- vpxorq %YMMZERO, %YMMZERO, %YMMZERO + + /* Loop 4x comparisons at a time. */ + .p2align 4 +@@ -413,22 +585,35 @@ L(loop_skip_page_cross_check): + /* A zero CHAR in YMM9 means that there is a null CHAR. 
*/ + VPMINU %YMM8, %YMM9, %YMM9 + +- /* Each bit set in K1 represents a non-null CHAR in YMM8. */ ++ /* Each bit set in K1 represents a non-null CHAR in YMM9. */ + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 0)(%rsi), %YMM0, %YMM1 + vpxorq (VEC_SIZE * 1)(%rsi), %YMM2, %YMM3 + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* Ternary logic to xor (VEC_SIZE * 3)(%rsi) with YMM6 while + oring with YMM1. Result is stored in YMM6. */ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM1, %YMM6 +- ++# else ++ VMOVU (VEC_SIZE * 0)(%rsi), %YMM1 ++ TOLOWER_YMM (%YMM0, %YMM1) ++ VMOVU (VEC_SIZE * 1)(%rsi), %YMM3 ++ TOLOWER_YMM (%YMM2, %YMM3) ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM0, %YMM1, %YMM1 ++ vpxorq %YMM2, %YMM3, %YMM3 ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM1, %YMM6 ++# endif + /* Or together YMM3, YMM5, and YMM6. */ + vpternlogd $0xfe, %YMM3, %YMM5, %YMM6 + + + /* A non-zero CHAR in YMM6 represents a mismatch. */ +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + + TESTEQ %LOOP_REG +@@ -437,13 +622,13 @@ L(loop_skip_page_cross_check): + + /* Find which VEC has the mismatch of end of string. 
*/ + VPTESTM %YMM0, %YMM0, %k1 +- VPCMP $0, %YMMZERO, %YMM1, %k0{%k1} ++ VPTESTNM %YMM1, %YMM1, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) + + VPTESTM %YMM2, %YMM2, %k1 +- VPCMP $0, %YMMZERO, %YMM3, %k0{%k1} ++ VPTESTNM %YMM3, %YMM3, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -457,7 +642,7 @@ L(return_vec_2_3_end): + # endif + + VPTESTM %YMM4, %YMM4, %k1 +- VPCMP $0, %YMMZERO, %YMM5, %k0{%k1} ++ VPTESTNM %YMM5, %YMM5, %k0{%k1} + kmovd %k0, %ecx + TESTEQ %ecx + # if CHAR_PER_VEC <= 16 +@@ -493,6 +678,8 @@ L(return_vec_3_end): + # else + movzbl (VEC_SIZE * 2)(%rdi, %LOOP_REG64), %eax + movzbl (VEC_SIZE * 2)(%rsi, %LOOP_REG64), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -545,6 +732,8 @@ L(return_vec_0_end): + # else + movzbl (%rdi, %rcx), %eax + movzbl (%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + /* Flip `eax` if `rdi` and `rsi` where swapped in page cross + logic. Subtract `r8d` after xor for zero case. */ +@@ -569,6 +758,8 @@ L(return_vec_1_end): + # else + movzbl VEC_SIZE(%rdi, %rcx), %eax + movzbl VEC_SIZE(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -598,7 +789,7 @@ L(page_cross_during_loop): + + VMOVA (%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_0_end) +@@ -619,8 +810,7 @@ L(less_1x_vec_till_page_cross): + been loaded earlier so must be valid. */ + VMOVU -VEC_SIZE(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -VEC_SIZE(%rsi, %rax), %YMM0, %k1{%k2} +- ++ CMP_R1_S2_YMM (%YMM0, -VEC_SIZE(%rsi, %rax), %YMM1, %k1){%k2} + /* Mask of potentially valid bits. The lower bits can be out of + range comparisons (but safe regarding page crosses). 
*/ + +@@ -642,6 +832,8 @@ L(less_1x_vec_till_page_cross): + + # ifdef USE_AS_STRNCMP + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -679,6 +871,8 @@ L(return_page_cross_cmp_mem): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -709,7 +903,7 @@ L(more_2x_vec_till_page_cross): + + VMOVA VEC_SIZE(%rdi), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, VEC_SIZE(%rsi), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_1_end) +@@ -724,14 +918,14 @@ L(more_2x_vec_till_page_cross): + /* Safe to include comparisons from lower bytes. */ + VMOVU -(VEC_SIZE * 2)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 2)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_0) + + VMOVU -(VEC_SIZE * 1)(%rdi, %rax), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, -(VEC_SIZE * 1)(%rsi, %rax), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(return_vec_page_cross_1) +@@ -740,6 +934,8 @@ L(more_2x_vec_till_page_cross): + /* Must check length here as length might proclude reading next + page. */ + # ifdef USE_AS_WCSCMP ++ /* NB: strcasecmp not used with WCSCMP so this access to r11 is ++ safe. */ + movl %eax, %r11d + shrl $2, %r11d + cmpq %r11, %rdx +@@ -754,12 +950,19 @@ L(more_2x_vec_till_page_cross): + VMOVA (VEC_SIZE * 3)(%rdi), %YMM6 + VPMINU %YMM4, %YMM6, %YMM9 + VPTESTM %YMM9, %YMM9, %k1 +- ++# ifndef USE_AS_STRCASECMP_L + vpxorq (VEC_SIZE * 2)(%rsi), %YMM4, %YMM5 + /* YMM6 = YMM5 | ((VEC_SIZE * 3)(%rsi) ^ YMM6). 
*/ + vpternlogd $0xde, (VEC_SIZE * 3)(%rsi), %YMM5, %YMM6 +- +- VPCMP $0, %YMMZERO, %YMM6, %k0{%k1} ++# else ++ VMOVU (VEC_SIZE * 2)(%rsi), %YMM5 ++ TOLOWER_YMM (%YMM4, %YMM5) ++ VMOVU (VEC_SIZE * 3)(%rsi), %YMM7 ++ TOLOWER_YMM (%YMM6, %YMM7) ++ vpxorq %YMM4, %YMM5, %YMM5 ++ vpternlogd $0xde, %YMM7, %YMM5, %YMM6 ++# endif ++ VPTESTNM %YMM6, %YMM6, %k0{%k1} + kmovd %k0, %LOOP_REG + TESTEQ %LOOP_REG + jnz L(return_vec_2_3_end) +@@ -815,6 +1018,8 @@ L(return_vec_page_cross_1): + # else + movzbl VEC_OFFSET(%rdi, %rcx), %eax + movzbl VEC_OFFSET(%rsi, %rcx), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -871,7 +1076,7 @@ L(page_cross): + L(page_cross_loop): + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + kmovd %k1, %ecx + TESTEQ %ecx + jnz L(check_ret_vec_page_cross) +@@ -895,7 +1100,7 @@ L(page_cross_loop): + */ + VMOVU (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0 + VPTESTM %YMM0, %YMM0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM0, %k1{%k2} ++ CMP_R1_S2_YMM (%YMM0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %YMM1, %k1){%k2} + + kmovd %k1, %ecx + # ifdef USE_AS_STRNCMP +@@ -930,6 +1135,8 @@ L(ret_vec_page_cross_cont): + # else + movzbl (%rdi, %rcx, SIZE_OF_CHAR), %eax + movzbl (%rsi, %rcx, SIZE_OF_CHAR), %ecx ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %ecx) + subl %ecx, %eax + xorl %r8d, %eax + subl %r8d, %eax +@@ -989,7 +1196,7 @@ L(less_1x_vec_till_page): + /* Use 16 byte comparison. 
*/ + vmovdqu (%rdi), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1009,7 +1216,7 @@ L(less_1x_vec_till_page): + # endif + vmovdqu (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0, %k1{%k2} ++ CMP_R1_S2_XMM (%xmm0, (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0xf, %ecx +@@ -1048,7 +1255,7 @@ L(less_16_till_page): + vmovq (%rdi), %xmm0 + vmovq (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1068,7 +1275,7 @@ L(less_16_till_page): + vmovq (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovq (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + # ifdef USE_AS_WCSCMP + subl $0x3, %ecx +@@ -1128,7 +1335,7 @@ L(ret_less_8_wcs): + vmovd (%rdi), %xmm0 + vmovd (%rsi), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1143,7 +1350,7 @@ L(ret_less_8_wcs): + vmovd (%rdi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm0 + vmovd (%rsi, %OFFSET_REG64, SIZE_OF_CHAR), %xmm1 + VPTESTM %xmm0, %xmm0, %k2 +- VPCMP $0, %xmm1, %xmm0, %k1{%k2} ++ CMP_R1_R2_XMM (%xmm0, %xmm1, %k1){%k2} + kmovd %k1, %ecx + subl $0xf, %ecx + jnz L(check_ret_vec_page_cross) +@@ -1176,7 +1383,9 @@ L(less_4_till_page): + L(less_4_loop): + movzbl (%rdi), %eax + movzbl (%rsi, %rdi), %ecx +- subl %ecx, %eax ++ TOLOWER_gpr (%rax, %eax) ++ TOLOWER_gpr (%rcx, %BYTE_LOOP_REG) ++ subl %BYTE_LOOP_REG, %eax + jnz L(ret_less_4_loop) + testl %ecx, %ecx + jz L(ret_zero_4_loop) +@@ 
-1203,5 +1412,6 @@ L(ret_less_4_loop): + subl %r8d, %eax + ret + # endif +-END(STRCMP) ++ cfi_endproc ++ .size STRCMP, .-STRCMP + #endif +diff --git a/sysdeps/x86_64/multiarch/strncase_l-evex.S b/sysdeps/x86_64/multiarch/strncase_l-evex.S +new file mode 100644 +index 00000000..8a5af369 +--- /dev/null ++++ b/sysdeps/x86_64/multiarch/strncase_l-evex.S +@@ -0,0 +1,25 @@ ++/* strncasecmp_l optimized with EVEX. ++ Copyright (C) 2017-2022 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ <https://www.gnu.org/licenses/>. */ ++ ++#ifndef STRCMP ++# define STRCMP __strncasecmp_l_evex ++#endif ++#define OVERFLOW_STRCMP __strcasecmp_l_evex ++#define USE_AS_STRCASECMP_L ++#define USE_AS_STRNCMP ++#include "strcmp-evex.S" +-- +GitLab + diff --git a/glibc-RHEL-15696-99.patch b/glibc-RHEL-15696-99.patch new file mode 100644 index 0000000..06d5d53 --- /dev/null +++ b/glibc-RHEL-15696-99.patch @@ -0,0 +1,913 @@ +From 305769b2a15c2e96f9e1b5195d3c4e0d6f0f4b68 Mon Sep 17 00:00:00 2001 +From: Noah Goldstein +Date: Wed, 23 Mar 2022 16:57:46 -0500 +Subject: [PATCH] x86: Remove AVX str{n}casecmp +Content-type: text/plain; charset=UTF-8 + +The rationale is: + +1. SSE42 has nearly identical logic so any benefit is minimal (3.4% + regression on Tigerlake using SSE42 versus AVX across the + benchtest suite). +2. 
AVX2 version covers the majority of targets that previously + preferred it. +3. The targets where AVX would still be best (SnB and IVB) are + becoming outdated. + +All in all, the saving in code size is worth it. + +All string/memory tests pass. +Reviewed-by: H.J. Lu +--- + sysdeps/x86_64/multiarch/Makefile | 2 - + sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 - + sysdeps/x86_64/multiarch/ifunc-strcasecmp.h | 4 - + sysdeps/x86_64/multiarch/strcasecmp_l-avx.S | 22 -- + sysdeps/x86_64/multiarch/strcmp-sse42.S | 240 +++++++++----------- + sysdeps/x86_64/multiarch/strncase_l-avx.S | 22 -- + 6 files changed, 105 insertions(+), 197 deletions(-) + delete mode 100644 sysdeps/x86_64/multiarch/strcasecmp_l-avx.S + delete mode 100644 sysdeps/x86_64/multiarch/strncase_l-avx.S + +diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile +index 359712c1..bca82e38 100644 +--- a/sysdeps/x86_64/multiarch/Makefile ++++ b/sysdeps/x86_64/multiarch/Makefile +@@ -50,7 +50,6 @@ sysdep_routines += \ + stpncpy-evex \ + stpncpy-sse2-unaligned \ + stpncpy-ssse3 \ +- strcasecmp_l-avx \ + strcasecmp_l-avx2 \ + strcasecmp_l-avx2-rtm \ + strcasecmp_l-evex \ +@@ -91,7 +90,6 @@ sysdep_routines += \ + strlen-avx2-rtm \ + strlen-evex \ + strlen-sse2 \ +- strncase_l-avx \ + strncase_l-avx2 \ + strncase_l-avx2-rtm \ + strncase_l-evex \ +diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +index 1dedc637..14314367 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c +@@ -429,9 +429,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_sse42) +@@ -453,9 +450,6 @@ __libc_ifunc_impl_list (const 
char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strcasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strcasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strcasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strcasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strcasecmp_l_sse42) +@@ -591,9 +585,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_sse42) +@@ -616,9 +607,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + (CPU_FEATURE_USABLE (AVX2) + && CPU_FEATURE_USABLE (RTM)), + __strncasecmp_l_avx2_rtm) +- IFUNC_IMPL_ADD (array, i, strncasecmp_l, +- CPU_FEATURE_USABLE (AVX), +- __strncasecmp_l_avx) + IFUNC_IMPL_ADD (array, i, strncasecmp_l, + CPU_FEATURE_USABLE (SSE4_2), + __strncasecmp_l_sse42) +diff --git a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +index 6dd49a21..34cfbb8f 100644 +--- a/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h ++++ b/sysdeps/x86_64/multiarch/ifunc-strcasecmp.h +@@ -22,7 +22,6 @@ + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden; +-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden; + extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden; +@@ -46,9 +45,6 @@ IFUNC_SELECTOR (void) + return OPTIMIZE (avx2); + } + +- if (CPU_FEATURE_USABLE_P (cpu_features, AVX)) +- return OPTIMIZE (avx); +- + if (CPU_FEATURE_USABLE_P 
(cpu_features, SSE4_2) + && !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2)) + return OPTIMIZE (sse42); +diff --git a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S b/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S +deleted file mode 100644 +index 56a03547..00000000 +--- a/sysdeps/x86_64/multiarch/strcasecmp_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strcasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. +- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- <http://www.gnu.org/licenses/>. 
*/ +- +-#define STRCMP_SSE42 __strcasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRCASECMP_L +-#include "strcmp-sse42.S" +diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S +index 59e8ddfc..0a42b7a4 100644 +--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S ++++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S +@@ -42,13 +42,8 @@ + # define UPDATE_STRNCMP_COUNTER + #endif + +-#ifdef USE_AVX +-# define SECTION avx +-# define GLABEL(l) l##_avx +-#else +-# define SECTION sse4.2 +-# define GLABEL(l) l##_sse42 +-#endif ++#define SECTION sse4.2 ++#define GLABEL(l) l##_sse42 + + #define LABEL(l) .L##l + +@@ -106,21 +101,7 @@ END (GLABEL(__strncasecmp)) + #endif + + +-#ifdef USE_AVX +-# define movdqa vmovdqa +-# define movdqu vmovdqu +-# define pmovmskb vpmovmskb +-# define pcmpistri vpcmpistri +-# define psubb vpsubb +-# define pcmpeqb vpcmpeqb +-# define psrldq vpsrldq +-# define pslldq vpslldq +-# define palignr vpalignr +-# define pxor vpxor +-# define D(arg) arg, arg +-#else +-# define D(arg) arg +-#endif ++#define arg arg + + STRCMP_SSE42: + cfi_startproc +@@ -192,18 +173,7 @@ LABEL(case_add): + movdqu (%rdi), %xmm1 + movdqu (%rsi), %xmm2 + #if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L +-# ifdef USE_AVX +-# define TOLOWER(reg1, reg2) \ +- vpaddb LCASE_MIN_reg, reg1, %xmm7; \ +- vpaddb LCASE_MIN_reg, reg2, %xmm8; \ +- vpcmpgtb LCASE_MAX_reg, %xmm7, %xmm7; \ +- vpcmpgtb LCASE_MAX_reg, %xmm8, %xmm8; \ +- vpandn CASE_ADD_reg, %xmm7, %xmm7; \ +- vpandn CASE_ADD_reg, %xmm8, %xmm8; \ +- vpaddb %xmm7, reg1, reg1; \ +- vpaddb %xmm8, reg2, reg2 +-# else +-# define TOLOWER(reg1, reg2) \ ++# define TOLOWER(reg1, reg2) \ + movdqa LCASE_MIN_reg, %xmm7; \ + movdqa LCASE_MIN_reg, %xmm8; \ + paddb reg1, %xmm7; \ +@@ -214,15 +184,15 @@ LABEL(case_add): + pandn CASE_ADD_reg, %xmm8; \ + paddb %xmm7, reg1; \ + paddb %xmm8, reg2 +-# endif ++ + TOLOWER (%xmm1, %xmm2) + #else + # define TOLOWER(reg1, reg2) + #endif +- pxor %xmm0, 
D(%xmm0) /* clear %xmm0 for null char checks */ +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ +- pcmpeqb %xmm2, D(%xmm1) /* compare first 16 bytes for equality */ +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char checks */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ ++ pcmpeqb %xmm2, %xmm1 /* compare first 16 bytes for equality */ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %edx + sub $0xffff, %edx /* if first 16 bytes are same, edx == 0xffff */ + jnz LABEL(less16bytes)/* If not, find different value or null char */ +@@ -246,7 +216,7 @@ LABEL(crosscache): + xor %r8d, %r8d + and $0xf, %ecx /* offset of rsi */ + and $0xf, %eax /* offset of rdi */ +- pxor %xmm0, D(%xmm0) /* clear %xmm0 for null char check */ ++ pxor %xmm0, %xmm0 /* clear %xmm0 for null char check */ + cmp %eax, %ecx + je LABEL(ashr_0) /* rsi and rdi relative offset same */ + ja LABEL(bigger) +@@ -260,7 +230,7 @@ LABEL(bigger): + sub %rcx, %r9 + lea LABEL(unaligned_table)(%rip), %r10 + movslq (%r10, %r9,4), %r9 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? */ + lea (%r10, %r9), %r10 + _CET_NOTRACK jmp *%r10 /* jump to corresponding case */ + +@@ -273,15 +243,15 @@ LABEL(bigger): + LABEL(ashr_0): + + movdqa (%rsi), %xmm1 +- pcmpeqb %xmm1, D(%xmm0) /* Any null chars? */ ++ pcmpeqb %xmm1, %xmm0 /* Any null chars? 
*/ + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L +- pcmpeqb (%rdi), D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb (%rdi), %xmm1 /* compare 16 bytes for equality */ + #else + movdqa (%rdi), %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm2, D(%xmm1) /* compare 16 bytes for equality */ ++ pcmpeqb %xmm2, %xmm1 /* compare 16 bytes for equality */ + #endif +- psubb %xmm0, D(%xmm1) /* packed sub of comparison results*/ ++ psubb %xmm0, %xmm1 /* packed sub of comparison results*/ + pmovmskb %xmm1, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -361,10 +331,10 @@ LABEL(ashr_0_exit_use): + */ + .p2align 4 + LABEL(ashr_1): +- pslldq $15, D(%xmm2) /* shift first string to align with second */ ++ pslldq $15, %xmm2 /* shift first string to align with second */ + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) /* compare 16 bytes for equality */ +- psubb %xmm0, D(%xmm2) /* packed sub of comparison results*/ ++ pcmpeqb %xmm1, %xmm2 /* compare 16 bytes for equality */ ++ psubb %xmm0, %xmm2 /* packed sub of comparison results*/ + pmovmskb %xmm2, %r9d + shr %cl, %edx /* adjust 0xffff for offset */ + shr %cl, %r9d /* adjust for 16-byte offset */ +@@ -392,7 +362,7 @@ LABEL(loop_ashr_1_use): + + LABEL(nibble_ashr_1_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -411,7 +381,7 @@ LABEL(nibble_ashr_1_restart_use): + jg LABEL(nibble_ashr_1_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $1, -16(%rdi, %rdx), D(%xmm0) ++ palignr $1, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -431,7 +401,7 @@ LABEL(nibble_ashr_1_restart_use): + LABEL(nibble_ashr_1_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $1, 
D(%xmm0) ++ psrldq $1, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -449,10 +419,10 @@ LABEL(nibble_ashr_1_use): + */ + .p2align 4 + LABEL(ashr_2): +- pslldq $14, D(%xmm2) ++ pslldq $14, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -480,7 +450,7 @@ LABEL(loop_ashr_2_use): + + LABEL(nibble_ashr_2_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -499,7 +469,7 @@ LABEL(nibble_ashr_2_restart_use): + jg LABEL(nibble_ashr_2_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $2, -16(%rdi, %rdx), D(%xmm0) ++ palignr $2, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -519,7 +489,7 @@ LABEL(nibble_ashr_2_restart_use): + LABEL(nibble_ashr_2_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $2, D(%xmm0) ++ psrldq $2, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -537,10 +507,10 @@ LABEL(nibble_ashr_2_use): + */ + .p2align 4 + LABEL(ashr_3): +- pslldq $13, D(%xmm2) ++ pslldq $13, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -568,7 +538,7 @@ LABEL(loop_ashr_3_use): + + LABEL(nibble_ashr_3_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -587,7 +557,7 @@ 
LABEL(nibble_ashr_3_restart_use): + jg LABEL(nibble_ashr_3_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $3, -16(%rdi, %rdx), D(%xmm0) ++ palignr $3, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -607,7 +577,7 @@ LABEL(nibble_ashr_3_restart_use): + LABEL(nibble_ashr_3_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $3, D(%xmm0) ++ psrldq $3, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -625,10 +595,10 @@ LABEL(nibble_ashr_3_use): + */ + .p2align 4 + LABEL(ashr_4): +- pslldq $12, D(%xmm2) ++ pslldq $12, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -657,7 +627,7 @@ LABEL(loop_ashr_4_use): + + LABEL(nibble_ashr_4_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -676,7 +646,7 @@ LABEL(nibble_ashr_4_restart_use): + jg LABEL(nibble_ashr_4_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $4, -16(%rdi, %rdx), D(%xmm0) ++ palignr $4, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -696,7 +666,7 @@ LABEL(nibble_ashr_4_restart_use): + LABEL(nibble_ashr_4_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $4, D(%xmm0) ++ psrldq $4, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -714,10 +684,10 @@ LABEL(nibble_ashr_4_use): + */ + .p2align 4 + LABEL(ashr_5): +- pslldq $11, D(%xmm2) ++ pslldq $11, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb 
%xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -746,7 +716,7 @@ LABEL(loop_ashr_5_use): + + LABEL(nibble_ashr_5_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -766,7 +736,7 @@ LABEL(nibble_ashr_5_restart_use): + + movdqa (%rdi, %rdx), %xmm0 + +- palignr $5, -16(%rdi, %rdx), D(%xmm0) ++ palignr $5, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -786,7 +756,7 @@ LABEL(nibble_ashr_5_restart_use): + LABEL(nibble_ashr_5_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $5, D(%xmm0) ++ psrldq $5, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -804,10 +774,10 @@ LABEL(nibble_ashr_5_use): + */ + .p2align 4 + LABEL(ashr_6): +- pslldq $10, D(%xmm2) ++ pslldq $10, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -836,7 +806,7 @@ LABEL(loop_ashr_6_use): + + LABEL(nibble_ashr_6_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -855,7 +825,7 @@ LABEL(nibble_ashr_6_restart_use): + jg LABEL(nibble_ashr_6_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $6, -16(%rdi, %rdx), D(%xmm0) ++ palignr $6, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -875,7 +845,7 @@ LABEL(nibble_ashr_6_restart_use): + LABEL(nibble_ashr_6_use): + sub $0x1000, %r10 + movdqa 
-16(%rdi, %rdx), %xmm0 +- psrldq $6, D(%xmm0) ++ psrldq $6, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -893,10 +863,10 @@ LABEL(nibble_ashr_6_use): + */ + .p2align 4 + LABEL(ashr_7): +- pslldq $9, D(%xmm2) ++ pslldq $9, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -925,7 +895,7 @@ LABEL(loop_ashr_7_use): + + LABEL(nibble_ashr_7_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -944,7 +914,7 @@ LABEL(nibble_ashr_7_restart_use): + jg LABEL(nibble_ashr_7_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $7, -16(%rdi, %rdx), D(%xmm0) ++ palignr $7, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a,(%rsi,%rdx), %xmm0 + #else +@@ -964,7 +934,7 @@ LABEL(nibble_ashr_7_restart_use): + LABEL(nibble_ashr_7_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $7, D(%xmm0) ++ psrldq $7, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -982,10 +952,10 @@ LABEL(nibble_ashr_7_use): + */ + .p2align 4 + LABEL(ashr_8): +- pslldq $8, D(%xmm2) ++ pslldq $8, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1014,7 +984,7 @@ LABEL(loop_ashr_8_use): + + LABEL(nibble_ashr_8_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ 
-1033,7 +1003,7 @@ LABEL(nibble_ashr_8_restart_use): + jg LABEL(nibble_ashr_8_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $8, -16(%rdi, %rdx), D(%xmm0) ++ palignr $8, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1053,7 +1023,7 @@ LABEL(nibble_ashr_8_restart_use): + LABEL(nibble_ashr_8_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $8, D(%xmm0) ++ psrldq $8, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1071,10 +1041,10 @@ LABEL(nibble_ashr_8_use): + */ + .p2align 4 + LABEL(ashr_9): +- pslldq $7, D(%xmm2) ++ pslldq $7, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1104,7 +1074,7 @@ LABEL(loop_ashr_9_use): + LABEL(nibble_ashr_9_restart_use): + movdqa (%rdi, %rdx), %xmm0 + +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1123,7 +1093,7 @@ LABEL(nibble_ashr_9_restart_use): + jg LABEL(nibble_ashr_9_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $9, -16(%rdi, %rdx), D(%xmm0) ++ palignr $9, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1143,7 +1113,7 @@ LABEL(nibble_ashr_9_restart_use): + LABEL(nibble_ashr_9_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $9, D(%xmm0) ++ psrldq $9, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1161,10 +1131,10 @@ LABEL(nibble_ashr_9_use): + */ + .p2align 4 + LABEL(ashr_10): +- pslldq $6, D(%xmm2) ++ pslldq $6, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- 
psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1193,7 +1163,7 @@ LABEL(loop_ashr_10_use): + + LABEL(nibble_ashr_10_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1212,7 +1182,7 @@ LABEL(nibble_ashr_10_restart_use): + jg LABEL(nibble_ashr_10_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $10, -16(%rdi, %rdx), D(%xmm0) ++ palignr $10, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1232,7 +1202,7 @@ LABEL(nibble_ashr_10_restart_use): + LABEL(nibble_ashr_10_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $10, D(%xmm0) ++ psrldq $10, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1250,10 +1220,10 @@ LABEL(nibble_ashr_10_use): + */ + .p2align 4 + LABEL(ashr_11): +- pslldq $5, D(%xmm2) ++ pslldq $5, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1282,7 +1252,7 @@ LABEL(loop_ashr_11_use): + + LABEL(nibble_ashr_11_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1301,7 +1271,7 @@ LABEL(nibble_ashr_11_restart_use): + jg LABEL(nibble_ashr_11_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $11, -16(%rdi, %rdx), D(%xmm0) ++ palignr $11, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1321,7 +1291,7 @@ 
LABEL(nibble_ashr_11_restart_use): + LABEL(nibble_ashr_11_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $11, D(%xmm0) ++ psrldq $11, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1339,10 +1309,10 @@ LABEL(nibble_ashr_11_use): + */ + .p2align 4 + LABEL(ashr_12): +- pslldq $4, D(%xmm2) ++ pslldq $4, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1371,7 +1341,7 @@ LABEL(loop_ashr_12_use): + + LABEL(nibble_ashr_12_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1390,7 +1360,7 @@ LABEL(nibble_ashr_12_restart_use): + jg LABEL(nibble_ashr_12_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $12, -16(%rdi, %rdx), D(%xmm0) ++ palignr $12, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1410,7 +1380,7 @@ LABEL(nibble_ashr_12_restart_use): + LABEL(nibble_ashr_12_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $12, D(%xmm0) ++ psrldq $12, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1428,10 +1398,10 @@ LABEL(nibble_ashr_12_use): + */ + .p2align 4 + LABEL(ashr_13): +- pslldq $3, D(%xmm2) ++ pslldq $3, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1461,7 +1431,7 @@ LABEL(loop_ashr_13_use): + + LABEL(nibble_ashr_13_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), 
%xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1480,7 +1450,7 @@ LABEL(nibble_ashr_13_restart_use): + jg LABEL(nibble_ashr_13_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $13, -16(%rdi, %rdx), D(%xmm0) ++ palignr $13, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1500,7 +1470,7 @@ LABEL(nibble_ashr_13_restart_use): + LABEL(nibble_ashr_13_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $13, D(%xmm0) ++ psrldq $13, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1518,10 +1488,10 @@ LABEL(nibble_ashr_13_use): + */ + .p2align 4 + LABEL(ashr_14): +- pslldq $2, D(%xmm2) ++ pslldq $2, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1551,7 +1521,7 @@ LABEL(loop_ashr_14_use): + + LABEL(nibble_ashr_14_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1570,7 +1540,7 @@ LABEL(nibble_ashr_14_restart_use): + jg LABEL(nibble_ashr_14_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $14, -16(%rdi, %rdx), D(%xmm0) ++ palignr $14, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1590,7 +1560,7 @@ LABEL(nibble_ashr_14_restart_use): + LABEL(nibble_ashr_14_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $14, D(%xmm0) ++ psrldq $14, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +@@ -1608,10 +1578,10 @@ 
LABEL(nibble_ashr_14_use): + */ + .p2align 4 + LABEL(ashr_15): +- pslldq $1, D(%xmm2) ++ pslldq $1, %xmm2 + TOLOWER (%xmm1, %xmm2) +- pcmpeqb %xmm1, D(%xmm2) +- psubb %xmm0, D(%xmm2) ++ pcmpeqb %xmm1, %xmm2 ++ psubb %xmm0, %xmm2 + pmovmskb %xmm2, %r9d + shr %cl, %edx + shr %cl, %r9d +@@ -1643,7 +1613,7 @@ LABEL(loop_ashr_15_use): + + LABEL(nibble_ashr_15_restart_use): + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1662,7 +1632,7 @@ LABEL(nibble_ashr_15_restart_use): + jg LABEL(nibble_ashr_15_use) + + movdqa (%rdi, %rdx), %xmm0 +- palignr $15, -16(%rdi, %rdx), D(%xmm0) ++ palignr $15, -16(%rdi, %rdx), %xmm0 + #if !defined USE_AS_STRCASECMP_L && !defined USE_AS_STRNCASECMP_L + pcmpistri $0x1a, (%rsi,%rdx), %xmm0 + #else +@@ -1682,7 +1652,7 @@ LABEL(nibble_ashr_15_restart_use): + LABEL(nibble_ashr_15_use): + sub $0x1000, %r10 + movdqa -16(%rdi, %rdx), %xmm0 +- psrldq $15, D(%xmm0) ++ psrldq $15, %xmm0 + pcmpistri $0x3a,%xmm0, %xmm0 + #if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L + cmp %r11, %rcx +diff --git a/sysdeps/x86_64/multiarch/strncase_l-avx.S b/sysdeps/x86_64/multiarch/strncase_l-avx.S +deleted file mode 100644 +index 0c4e525b..00000000 +--- a/sysdeps/x86_64/multiarch/strncase_l-avx.S ++++ /dev/null +@@ -1,22 +0,0 @@ +-/* strncasecmp_l optimized with AVX. +- Copyright (C) 2017-2018 Free Software Foundation, Inc. +- This file is part of the GNU C Library. +- +- The GNU C Library is free software; you can redistribute it and/or +- modify it under the terms of the GNU Lesser General Public +- License as published by the Free Software Foundation; either +- version 2.1 of the License, or (at your option) any later version. 
+- +- The GNU C Library is distributed in the hope that it will be useful, +- but WITHOUT ANY WARRANTY; without even the implied warranty of +- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +- Lesser General Public License for more details. +- +- You should have received a copy of the GNU Lesser General Public +- License along with the GNU C Library; if not, see +- <http://www.gnu.org/licenses/>. */ +- +-#define STRCMP_SSE42 __strncasecmp_l_avx +-#define USE_AVX 1 +-#define USE_AS_STRNCASECMP_L +-#include "strcmp-sse42.S" +-- +GitLab + diff --git a/glibc.spec b/glibc.spec index a5ff930..2d5c641 100644 --- a/glibc.spec +++ b/glibc.spec @@ -1,6 +1,6 @@ %define glibcsrcdir glibc-2.28 %define glibcversion 2.28 -%define glibcrelease 245%{?dist} +%define glibcrelease 246%{?dist} # Pre-release tarballs are pulled in from git using a command that is # effectively: # @@ -1065,6 +1065,116 @@ Patch877: glibc-RHEL-16825-1.patch Patch878: glibc-RHEL-16825-2.patch Patch879: glibc-RHEL-16825-3.patch Patch880: glibc-RHEL-16825-4.patch +Patch881: glibc-RHEL-15696-1.patch +Patch882: glibc-RHEL-15696-2.patch +Patch883: glibc-RHEL-15696-3.patch +Patch884: glibc-RHEL-15696-4.patch +Patch885: glibc-RHEL-15696-5.patch +Patch886: glibc-RHEL-15696-6.patch +Patch887: glibc-RHEL-15696-7.patch +Patch888: glibc-RHEL-15696-8.patch +Patch889: glibc-RHEL-15696-9.patch +Patch890: glibc-RHEL-15696-10.patch +Patch891: glibc-RHEL-15696-11.patch +Patch892: glibc-RHEL-15696-12.patch +Patch893: glibc-RHEL-15696-13.patch +Patch894: glibc-RHEL-15696-14.patch +Patch895: glibc-RHEL-15696-15.patch +Patch896: glibc-RHEL-15696-16.patch +Patch897: glibc-RHEL-15696-17.patch +Patch898: glibc-RHEL-15696-18.patch +Patch899: glibc-RHEL-15696-19.patch +Patch900: glibc-RHEL-15696-20.patch +Patch901: glibc-RHEL-15696-21.patch +Patch902: glibc-RHEL-15696-22.patch +Patch903: glibc-RHEL-15696-23.patch +Patch904: glibc-RHEL-15696-24.patch +Patch905: glibc-RHEL-15696-25.patch +Patch906: glibc-RHEL-15696-26.patch +Patch907:
glibc-RHEL-15696-27.patch +Patch908: glibc-RHEL-15696-28.patch +Patch909: glibc-RHEL-15696-29.patch +Patch910: glibc-RHEL-15696-30.patch +Patch911: glibc-RHEL-15696-31.patch +Patch912: glibc-RHEL-15696-32.patch +Patch913: glibc-RHEL-15696-33.patch +Patch914: glibc-RHEL-15696-34.patch +Patch915: glibc-RHEL-15696-35.patch +Patch916: glibc-RHEL-15696-36.patch +Patch917: glibc-RHEL-15696-37.patch +Patch918: glibc-RHEL-15696-38.patch +Patch919: glibc-RHEL-15696-39.patch +Patch920: glibc-RHEL-15696-40.patch +Patch921: glibc-RHEL-15696-41.patch +Patch922: glibc-RHEL-15696-42.patch +Patch923: glibc-RHEL-15696-43.patch +Patch924: glibc-RHEL-15696-44.patch +Patch925: glibc-RHEL-15696-45.patch +Patch926: glibc-RHEL-15696-46.patch +Patch927: glibc-RHEL-15696-47.patch +Patch928: glibc-RHEL-15696-48.patch +Patch929: glibc-RHEL-15696-49.patch +Patch930: glibc-RHEL-15696-50.patch +Patch931: glibc-RHEL-15696-51.patch +Patch932: glibc-RHEL-15696-52.patch +Patch933: glibc-RHEL-15696-53.patch +Patch934: glibc-RHEL-15696-54.patch +Patch935: glibc-RHEL-15696-55.patch +Patch936: glibc-RHEL-15696-56.patch +Patch937: glibc-RHEL-15696-57.patch +Patch938: glibc-RHEL-15696-58.patch +Patch939: glibc-RHEL-15696-59.patch +Patch940: glibc-RHEL-15696-60.patch +Patch941: glibc-RHEL-15696-61.patch +Patch942: glibc-RHEL-15696-62.patch +Patch943: glibc-RHEL-15696-63.patch +Patch944: glibc-RHEL-15696-64.patch +Patch945: glibc-RHEL-15696-65.patch +Patch946: glibc-RHEL-15696-66.patch +Patch947: glibc-RHEL-15696-67.patch +Patch948: glibc-RHEL-15696-68.patch +Patch949: glibc-RHEL-15696-69.patch +Patch950: glibc-RHEL-15696-70.patch +Patch951: glibc-RHEL-15696-71.patch +Patch952: glibc-RHEL-15696-72.patch +Patch953: glibc-RHEL-15696-73.patch +Patch954: glibc-RHEL-15696-74.patch +Patch955: glibc-RHEL-15696-75.patch +Patch956: glibc-RHEL-15696-76.patch +Patch957: glibc-RHEL-15696-77.patch +Patch958: glibc-RHEL-15696-78.patch +Patch959: glibc-RHEL-15696-79.patch +Patch960: glibc-RHEL-15696-80.patch +Patch961: 
glibc-RHEL-15696-81.patch +Patch962: glibc-RHEL-15696-82.patch +Patch963: glibc-RHEL-15696-83.patch +Patch964: glibc-RHEL-15696-84.patch +Patch965: glibc-RHEL-15696-85.patch +Patch966: glibc-RHEL-15696-86.patch +Patch967: glibc-RHEL-15696-87.patch +Patch968: glibc-RHEL-15696-88.patch +Patch969: glibc-RHEL-15696-89.patch +Patch970: glibc-RHEL-15696-90.patch +Patch971: glibc-RHEL-15696-91.patch +Patch972: glibc-RHEL-15696-92.patch +Patch973: glibc-RHEL-15696-93.patch +Patch974: glibc-RHEL-15696-94.patch +Patch975: glibc-RHEL-15696-95.patch +Patch976: glibc-RHEL-15696-96.patch +Patch977: glibc-RHEL-15696-97.patch +Patch978: glibc-RHEL-15696-98.patch +Patch979: glibc-RHEL-15696-99.patch +Patch980: glibc-RHEL-15696-100.patch +Patch981: glibc-RHEL-15696-101.patch +Patch982: glibc-RHEL-15696-102.patch +Patch983: glibc-RHEL-15696-103.patch +Patch984: glibc-RHEL-15696-104.patch +Patch985: glibc-RHEL-15696-105.patch +Patch986: glibc-RHEL-15696-106.patch +Patch987: glibc-RHEL-15696-107.patch +Patch988: glibc-RHEL-15696-108.patch +Patch989: glibc-RHEL-15696-109.patch +Patch990: glibc-RHEL-15696-110.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2896,6 +3006,9 @@ fi %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Thu Dec 14 2023 DJ Delorie <dj@redhat.com> - 2.28-246 +- Include CentOS Hyperscaler SIG patches backported by Intel (RHEL-15696) + * Fri Dec 8 2023 Florian Weimer <fweimer@redhat.com> - 2.28-245 - Improve compatibility between underlinking and IFUNC resolvers (RHEL-16825)