3411 lines
87 KiB
Diff
3411 lines
87 KiB
Diff
From 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77 Mon Sep 17 00:00:00 2001
|
|
From: "H.J. Lu" <hjl.tools@gmail.com>
|
|
Date: Fri, 5 Mar 2021 06:24:52 -0800
|
|
Subject: [PATCH] x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
|
|
Content-type: text/plain; charset=UTF-8
|
|
|
|
Update ifunc-avx2.h, strchr.c, strcmp.c, strncmp.c and wcsnlen.c to
|
|
select the function optimized with 256-bit EVEX instructions using
|
|
YMM16-YMM31 registers to avoid RTM abort with usable AVX512VL, AVX512BW
|
|
and BMI2 since VZEROUPPER isn't needed at function exit.
|
|
|
|
For strcmp/strncmp, prefer AVX2 strcmp/strncmp if Prefer_AVX2_STRCMP
|
|
is set.
|
|
---
|
|
sysdeps/x86_64/multiarch/Makefile | 21 +-
|
|
sysdeps/x86_64/multiarch/ifunc-avx2.h | 14 +-
|
|
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 81 ++
|
|
sysdeps/x86_64/multiarch/memchr-evex.S | 381 +++++++
|
|
sysdeps/x86_64/multiarch/memrchr-evex.S | 337 +++++++
|
|
sysdeps/x86_64/multiarch/rawmemchr-evex.S | 4 +
|
|
sysdeps/x86_64/multiarch/strchr-evex.S | 335 +++++++
|
|
sysdeps/x86_64/multiarch/strchr.c | 14 +-
|
|
sysdeps/x86_64/multiarch/strchrnul-evex.S | 3 +
|
|
sysdeps/x86_64/multiarch/strcmp-evex.S | 1043 ++++++++++++++++++++
|
|
sysdeps/x86_64/multiarch/strcmp.c | 15 +-
|
|
sysdeps/x86_64/multiarch/strlen-evex.S | 436 ++++++++
|
|
sysdeps/x86_64/multiarch/strncmp-evex.S | 3 +
|
|
sysdeps/x86_64/multiarch/strncmp.c | 15 +-
|
|
sysdeps/x86_64/multiarch/strnlen-evex.S | 4 +
|
|
sysdeps/x86_64/multiarch/strrchr-evex.S | 265 +++++
|
|
sysdeps/x86_64/multiarch/wcschr-evex.S | 3 +
|
|
sysdeps/x86_64/multiarch/wcscmp-evex.S | 4 +
|
|
sysdeps/x86_64/multiarch/wcslen-evex.S | 4 +
|
|
sysdeps/x86_64/multiarch/wcsncmp-evex.S | 5 +
|
|
sysdeps/x86_64/multiarch/wcsnlen-evex.S | 5 +
|
|
sysdeps/x86_64/multiarch/wcsnlen.c | 14 +-
|
|
sysdeps/x86_64/multiarch/wcsrchr-evex.S | 3 +
|
|
sysdeps/x86_64/multiarch/wmemchr-evex.S | 4 +
|
|
24 files changed, 2996 insertions(+), 17 deletions(-)
|
|
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/memrchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strchrnul-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strlen-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strncmp-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strnlen-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcschr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcscmp-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcslen-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcsncmp-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcsnlen-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex.S
|
|
|
|
Conflicts:
|
|
sysdeps/x86_64/multiarch/wcsnlen.c
|
|
(account for missing upstream macros)
|
|
|
|
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
|
index 9477538a..5ce85882 100644
|
|
--- a/sysdeps/x86_64/multiarch/Makefile
|
|
+++ b/sysdeps/x86_64/multiarch/Makefile
|
|
@@ -39,7 +39,17 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
|
memmove-avx512-unaligned-erms \
|
|
memset-sse2-unaligned-erms \
|
|
memset-avx2-unaligned-erms \
|
|
- memset-avx512-unaligned-erms
|
|
+ memset-avx512-unaligned-erms \
|
|
+ memchr-evex \
|
|
+ memrchr-evex \
|
|
+ rawmemchr-evex \
|
|
+ strchr-evex \
|
|
+ strchrnul-evex \
|
|
+ strcmp-evex \
|
|
+ strlen-evex \
|
|
+ strncmp-evex \
|
|
+ strnlen-evex \
|
|
+ strrchr-evex
|
|
CFLAGS-varshift.c += -msse4
|
|
CFLAGS-strcspn-c.c += -msse4
|
|
CFLAGS-strpbrk-c.c += -msse4
|
|
@@ -56,7 +66,14 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
|
wcschr-sse2 wcschr-avx2 \
|
|
wcsrchr-sse2 wcsrchr-avx2 \
|
|
wcsnlen-sse4_1 wcsnlen-c \
|
|
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2
|
|
+ wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
|
|
+ wcschr-evex \
|
|
+ wcscmp-evex \
|
|
+ wcslen-evex \
|
|
+ wcsncmp-evex \
|
|
+ wcsnlen-evex \
|
|
+ wcsrchr-evex \
|
|
+ wmemchr-evex
|
|
endif
|
|
|
|
ifeq ($(subdir),debug)
|
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
index 5c88640a..7081b0c9 100644
|
|
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
|
@@ -21,16 +21,24 @@
|
|
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
- return OPTIMIZE (avx2);
|
|
+ {
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
+ return OPTIMIZE (evex);
|
|
+
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
+ return OPTIMIZE (avx2);
|
|
+ }
|
|
|
|
return OPTIMIZE (sse2);
|
|
}
|
|
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
index fe13505c..bd7d9f19 100644
|
|
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
|
@@ -43,6 +43,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, memchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__memchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, memchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __memchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
|
|
@@ -121,6 +126,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, memrchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__memrchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, memrchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
+ __memrchr_evex)
|
|
+
|
|
IFUNC_IMPL_ADD (array, i, memrchr, 1, __memrchr_sse2))
|
|
|
|
#ifdef SHARED
|
|
@@ -179,6 +189,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, rawmemchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__rawmemchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __rawmemchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
|
@@ -186,6 +201,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strlen,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strlen_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strlen,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
+ __strlen_evex)
|
|
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
|
|
@@ -193,6 +212,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strnlen,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strnlen_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strnlen,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
+ __strnlen_evex)
|
|
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/stpncpy.c. */
|
|
@@ -255,6 +278,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __strchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2_no_bsf)
|
|
IFUNC_IMPL_ADD (array, i, strchr, 1, __strchr_sse2))
|
|
|
|
@@ -263,6 +291,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strchrnul,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strchrnul_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strchrnul,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __strchrnul_evex)
|
|
IFUNC_IMPL_ADD (array, i, strchrnul, 1, __strchrnul_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/strrchr.c. */
|
|
@@ -270,6 +303,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strrchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strrchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strrchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
+ __strrchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, strrchr, 1, __strrchr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/strcmp.c. */
|
|
@@ -277,6 +314,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strcmp,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strcmp_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strcmp,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __strcmp_evex)
|
|
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSE4_2),
|
|
__strcmp_sse42)
|
|
IFUNC_IMPL_ADD (array, i, strcmp, CPU_FEATURE_USABLE (SSSE3),
|
|
@@ -370,6 +412,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcschr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcschr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcschr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcschr_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcschr, 1, __wcschr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wcsrchr.c. */
|
|
@@ -377,6 +424,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcsrchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcsrchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcsrchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcsrchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcsrchr, 1, __wcsrchr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wcscmp.c. */
|
|
@@ -384,6 +436,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcscmp,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcscmp_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcscmp,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcscmp_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcscmp, 1, __wcscmp_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wcsncmp.c. */
|
|
@@ -391,6 +448,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcsncmp,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcsncmp_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcsncmp,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcsncmp_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcsncmp, 1, __wcsncmp_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wcscpy.c. */
|
|
@@ -404,6 +466,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcslen,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcslen_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcslen,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcslen_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
|
@@ -411,6 +478,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wcsnlen_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wcsnlen_evex)
|
|
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
|
CPU_FEATURE_USABLE (SSE4_1),
|
|
__wcsnlen_sse4_1)
|
|
@@ -421,6 +493,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, wmemchr,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__wmemchr_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, wmemchr,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)
|
|
+ && CPU_FEATURE_USABLE (BMI2)),
|
|
+ __wmemchr_evex)
|
|
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
|
|
|
|
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
|
|
@@ -568,6 +645,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
|
IFUNC_IMPL_ADD (array, i, strncmp,
|
|
CPU_FEATURE_USABLE (AVX2),
|
|
__strncmp_avx2)
|
|
+ IFUNC_IMPL_ADD (array, i, strncmp,
|
|
+ (CPU_FEATURE_USABLE (AVX512VL)
|
|
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
|
+ __strncmp_evex)
|
|
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSE4_2),
|
|
__strncmp_sse42)
|
|
IFUNC_IMPL_ADD (array, i, strncmp, CPU_FEATURE_USABLE (SSSE3),
|
|
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..6dd5d67b
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
|
@@ -0,0 +1,381 @@
|
|
+/* memchr/wmemchr optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# ifndef MEMCHR
|
|
+# define MEMCHR __memchr_evex
|
|
+# endif
|
|
+
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+# define VPBROADCAST vpbroadcastd
|
|
+# define VPCMP vpcmpd
|
|
+# define SHIFT_REG r8d
|
|
+# else
|
|
+# define VPBROADCAST vpbroadcastb
|
|
+# define VPCMP vpcmpb
|
|
+# define SHIFT_REG ecx
|
|
+# endif
|
|
+
|
|
+# define XMMMATCH xmm16
|
|
+# define YMMMATCH ymm16
|
|
+# define YMM1 ymm17
|
|
+# define YMM2 ymm18
|
|
+# define YMM3 ymm19
|
|
+# define YMM4 ymm20
|
|
+# define YMM5 ymm21
|
|
+# define YMM6 ymm22
|
|
+
|
|
+# define VEC_SIZE 32
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (MEMCHR)
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ /* Check for zero length. */
|
|
+ test %RDX_LP, %RDX_LP
|
|
+ jz L(zero)
|
|
+# endif
|
|
+ movl %edi, %ecx
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ shl $2, %RDX_LP
|
|
+# else
|
|
+# ifdef __ILP32__
|
|
+ /* Clear the upper 32 bits. */
|
|
+ movl %edx, %edx
|
|
+# endif
|
|
+# endif
|
|
+ /* Broadcast CHAR to YMMMATCH. */
|
|
+ VPBROADCAST %esi, %YMMMATCH
|
|
+ /* Check if we may cross page boundary with one vector load. */
|
|
+ andl $(2 * VEC_SIZE - 1), %ecx
|
|
+ cmpl $VEC_SIZE, %ecx
|
|
+ ja L(cros_page_boundary)
|
|
+
|
|
+ /* Check the first VEC_SIZE bytes. */
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ jnz L(first_vec_x0_check)
|
|
+ /* Adjust length and check the end of data. */
|
|
+ subq $VEC_SIZE, %rdx
|
|
+ jbe L(zero)
|
|
+# else
|
|
+ jnz L(first_vec_x0)
|
|
+# endif
|
|
+
|
|
+ /* Align data for aligned loads in the loop. */
|
|
+ addq $VEC_SIZE, %rdi
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ /* Adjust length. */
|
|
+ addq %rcx, %rdx
|
|
+
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+ jmp L(more_4x_vec)
|
|
+
|
|
+ .p2align 4
|
|
+L(cros_page_boundary):
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+# ifdef USE_AS_WMEMCHR
|
|
	/* NB: Divide shift count by 4 since each bit in K1 represents 4
|
|
+ bytes. */
|
|
+ movl %ecx, %SHIFT_REG
|
|
+ sarl $2, %SHIFT_REG
|
|
+# endif
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ /* Remove the leading bytes. */
|
|
+ sarxl %SHIFT_REG, %eax, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(aligned_more)
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rdx
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ addq %rdi, %rax
|
|
+ addq %rcx, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(aligned_more):
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
|
	   instead of "(rdx + rcx) - VEC_SIZE" to avoid possible addition
|
|
+ overflow. */
|
|
+ negq %rcx
|
|
+ addq $VEC_SIZE, %rcx
|
|
+
|
|
+ /* Check the end of data. */
|
|
+ subq %rcx, %rdx
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+
|
|
+L(more_4x_vec):
|
|
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
+ since data is only aligned to VEC_SIZE. */
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x3)
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+
|
|
+ /* Align data to 4 * VEC_SIZE. */
|
|
+ movq %rdi, %rcx
|
|
+ andl $(4 * VEC_SIZE - 1), %ecx
|
|
+ andq $-(4 * VEC_SIZE), %rdi
|
|
+
|
|
+# ifndef USE_AS_RAWMEMCHR
|
|
+ /* Adjust length. */
|
|
+ addq %rcx, %rdx
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_4x_vec):
|
|
+ /* Compare 4 * VEC at a time forward. */
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
|
|
+ kord %k1, %k2, %k5
|
|
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
|
|
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
|
|
+
|
|
+ kord %k3, %k4, %k6
|
|
+ kortestd %k5, %k6
|
|
+ jnz L(4x_vec_end)
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+
|
|
+# ifdef USE_AS_RAWMEMCHR
|
|
+ jmp L(loop_4x_vec)
|
|
+# else
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ ja L(loop_4x_vec)
|
|
+
|
|
+L(last_4x_vec_or_less):
|
|
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
+ addl $(VEC_SIZE * 2), %edx
|
|
+ jle L(last_2x_vec)
|
|
+
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+ jnz L(first_vec_x2_check)
|
|
+ subl $VEC_SIZE, %edx
|
|
+ jle L(zero)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+ jnz L(first_vec_x3_check)
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_2x_vec):
|
|
+ addl $(VEC_SIZE * 2), %edx
|
|
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+ jnz L(first_vec_x0_check)
|
|
+ subl $VEC_SIZE, %edx
|
|
+ jle L(zero)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1_check)
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x0_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rdx
|
|
+ jbe L(zero)
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x1_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rdx
|
|
+ jbe L(zero)
|
|
+ addq $VEC_SIZE, %rax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rdx
|
|
+ jbe L(zero)
|
|
+ addq $(VEC_SIZE * 2), %rax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x3_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rdx
|
|
+ jbe L(zero)
|
|
+ addq $(VEC_SIZE * 3), %rax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x0):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x1):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq $VEC_SIZE, %rax
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq $(VEC_SIZE * 2), %rax
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(4x_vec_end):
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+ kmovd %k2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+ kmovd %k3, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2)
|
|
+ kmovd %k4, %eax
|
|
+ testl %eax, %eax
|
|
+L(first_vec_x3):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WMEMCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq $(VEC_SIZE * 3), %rax
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+END (MEMCHR)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/memrchr-evex.S b/sysdeps/x86_64/multiarch/memrchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..16bf8e02
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/memrchr-evex.S
|
|
@@ -0,0 +1,337 @@
|
|
+/* memrchr optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# define VMOVA vmovdqa64
|
|
+
|
|
+# define YMMMATCH ymm16
|
|
+
|
|
+# define VEC_SIZE 32
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (__memrchr_evex)
|
|
+ /* Broadcast CHAR to YMMMATCH. */
|
|
+ vpbroadcastb %esi, %YMMMATCH
|
|
+
|
|
+ sub $VEC_SIZE, %RDX_LP
|
|
+ jbe L(last_vec_or_less)
|
|
+
|
|
+ add %RDX_LP, %RDI_LP
|
|
+
|
|
+ /* Check the last VEC_SIZE bytes. */
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x0)
|
|
+
|
|
+ subq $(VEC_SIZE * 4), %rdi
|
|
+ movl %edi, %ecx
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ jz L(aligned_more)
|
|
+
|
|
+ /* Align data for aligned loads in the loop. */
|
|
+ addq $VEC_SIZE, %rdi
|
|
+ addq $VEC_SIZE, %rdx
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+ subq %rcx, %rdx
|
|
+
|
|
+ .p2align 4
|
|
+L(aligned_more):
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+
|
|
+ /* Check the last 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
+ since data is only aligned to VEC_SIZE. */
|
|
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x3)
|
|
+
|
|
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
|
|
+ kmovd %k2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x2)
|
|
+
|
|
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
|
|
+ kmovd %k3, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x1)
|
|
+
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k4
|
|
+ kmovd %k4, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x0)
|
|
+
|
|
+ /* Align data to 4 * VEC_SIZE for loop with fewer branches.
|
|
+ There are some overlaps with above if data isn't aligned
|
|
+ to 4 * VEC_SIZE. */
|
|
+ movl %edi, %ecx
|
|
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
|
+ jz L(loop_4x_vec)
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+ addq $(VEC_SIZE * 4), %rdx
|
|
+ andq $-(VEC_SIZE * 4), %rdi
|
|
+ subq %rcx, %rdx
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_4x_vec):
|
|
+ /* Compare 4 * VEC at a time forward. */
|
|
+ subq $(VEC_SIZE * 4), %rdi
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
|
|
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
|
|
+ kord %k1, %k2, %k5
|
|
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
|
|
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
|
|
+
|
|
+ kord %k3, %k4, %k6
|
|
+ kortestd %k5, %k6
|
|
+ jz L(loop_4x_vec)
|
|
+
|
|
+ /* There is a match. */
|
|
+ kmovd %k4, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x3)
|
|
+
|
|
+ kmovd %k3, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x2)
|
|
+
|
|
+ kmovd %k2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x1)
|
|
+
|
|
+ kmovd %k1, %eax
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_4x_vec_or_less):
|
|
+ addl $(VEC_SIZE * 4), %edx
|
|
+ cmpl $(VEC_SIZE * 2), %edx
|
|
+ jbe L(last_2x_vec)
|
|
+
|
|
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x3)
|
|
+
|
|
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k2
|
|
+ kmovd %k2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x2)
|
|
+
|
|
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k3
|
|
+ kmovd %k3, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x1_check)
|
|
+ cmpl $(VEC_SIZE * 3), %edx
|
|
+ jbe L(zero)
|
|
+
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k4
|
|
+ kmovd %k4, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(zero)
|
|
+ bsrl %eax, %eax
|
|
+ subq $(VEC_SIZE * 4), %rdx
|
|
+ addq %rax, %rdx
|
|
+ jl L(zero)
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_2x_vec):
|
|
+ vpcmpb $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x3_check)
|
|
+ cmpl $VEC_SIZE, %edx
|
|
+ jbe L(zero)
|
|
+
|
|
+ vpcmpb $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(zero)
|
|
+ bsrl %eax, %eax
|
|
+ subq $(VEC_SIZE * 2), %rdx
|
|
+ addq %rax, %rdx
|
|
+ jl L(zero)
|
|
+ addl $(VEC_SIZE * 2), %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x0):
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x1):
|
|
+ bsrl %eax, %eax
|
|
+ addl $VEC_SIZE, %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x2):
|
|
+ bsrl %eax, %eax
|
|
+ addl $(VEC_SIZE * 2), %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x3):
|
|
+ bsrl %eax, %eax
|
|
+ addl $(VEC_SIZE * 3), %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x1_check):
|
|
+ bsrl %eax, %eax
|
|
+ subq $(VEC_SIZE * 3), %rdx
|
|
+ addq %rax, %rdx
|
|
+ jl L(zero)
|
|
+ addl $VEC_SIZE, %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_x3_check):
|
|
+ bsrl %eax, %eax
|
|
+ subq $VEC_SIZE, %rdx
|
|
+ addq %rax, %rdx
|
|
+ jl L(zero)
|
|
+ addl $(VEC_SIZE * 3), %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_or_less_aligned):
|
|
+ movl %edx, %ecx
|
|
+
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
|
|
+
|
|
+ movl $1, %edx
|
|
+ /* Support rdx << 32. */
|
|
+ salq %cl, %rdx
|
|
+ subq $1, %rdx
|
|
+
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ /* Remove the trailing bytes. */
|
|
+ andl %edx, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(zero)
|
|
+
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_or_less):
|
|
+ addl $VEC_SIZE, %edx
|
|
+
|
|
+ /* Check for zero length. */
|
|
+ testl %edx, %edx
|
|
+ jz L(zero)
|
|
+
|
|
+ movl %edi, %ecx
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ jz L(last_vec_or_less_aligned)
|
|
+
|
|
+ movl %ecx, %esi
|
|
+ movl %ecx, %r8d
|
|
+ addl %edx, %esi
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+ subl $VEC_SIZE, %esi
|
|
+ ja L(last_vec_2x_aligned)
|
|
+
|
|
+ /* Check the last VEC. */
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ /* Remove the leading and trailing bytes. */
|
|
+ sarl %cl, %eax
|
|
+ movl %edx, %ecx
|
|
+
|
|
+ movl $1, %edx
|
|
+ sall %cl, %edx
|
|
+ subl $1, %edx
|
|
+
|
|
+ andl %edx, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(zero)
|
|
+
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+ addq %r8, %rax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vec_2x_aligned):
|
|
+ movl %esi, %ecx
|
|
+
|
|
+ /* Check the last VEC. */
|
|
+ vpcmpb $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
|
+
|
|
+ movl $1, %edx
|
|
+ sall %cl, %edx
|
|
+ subl $1, %edx
|
|
+
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ /* Remove the trailing bytes. */
|
|
+ andl %edx, %eax
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(last_vec_x1)
|
|
+
|
|
+ /* Check the second last VEC. */
|
|
+ vpcmpb $0, (%rdi), %YMMMATCH, %k1
|
|
+
|
|
+ movl %r8d, %ecx
|
|
+
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ /* Remove the leading bytes. Must use unsigned right shift for
|
|
+ bsrl below. */
|
|
+ shrl %cl, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(zero)
|
|
+
|
|
+ bsrl %eax, %eax
|
|
+ addq %rdi, %rax
|
|
+ addq %r8, %rax
|
|
+ ret
|
|
+END (__memrchr_evex)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex.S b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..ec942b77
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex.S
|
|
@@ -0,0 +1,4 @@
|
|
+#define MEMCHR __rawmemchr_evex
|
|
+#define USE_AS_RAWMEMCHR 1
|
|
+
|
|
+#include "memchr-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..ddc86a70
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
|
|
@@ -0,0 +1,335 @@
|
|
+/* strchr/strchrnul optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# ifndef STRCHR
|
|
+# define STRCHR __strchr_evex
|
|
+# endif
|
|
+
|
|
+# define VMOVU vmovdqu64
|
|
+# define VMOVA vmovdqa64
|
|
+
|
|
+# ifdef USE_AS_WCSCHR
|
|
+# define VPBROADCAST vpbroadcastd
|
|
+# define VPCMP vpcmpd
|
|
+# define VPMINU vpminud
|
|
+# define CHAR_REG esi
|
|
+# define SHIFT_REG r8d
|
|
+# else
|
|
+# define VPBROADCAST vpbroadcastb
|
|
+# define VPCMP vpcmpb
|
|
+# define VPMINU vpminub
|
|
+# define CHAR_REG sil
|
|
+# define SHIFT_REG ecx
|
|
+# endif
|
|
+
|
|
+# define XMMZERO xmm16
|
|
+
|
|
+# define YMMZERO ymm16
|
|
+# define YMM0 ymm17
|
|
+# define YMM1 ymm18
|
|
+# define YMM2 ymm19
|
|
+# define YMM3 ymm20
|
|
+# define YMM4 ymm21
|
|
+# define YMM5 ymm22
|
|
+# define YMM6 ymm23
|
|
+# define YMM7 ymm24
|
|
+# define YMM8 ymm25
|
|
+
|
|
+# define VEC_SIZE 32
|
|
+# define PAGE_SIZE 4096
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (STRCHR)
|
|
+ movl %edi, %ecx
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ xorl %edx, %edx
|
|
+# endif
|
|
+
|
|
+ /* Broadcast CHAR to YMM0. */
|
|
+ VPBROADCAST %esi, %YMM0
|
|
+
|
|
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
+
|
|
+ /* Check if we cross page boundary with one vector load. */
|
|
+ andl $(PAGE_SIZE - 1), %ecx
|
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
|
+ ja L(cross_page_boundary)
|
|
+
|
|
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
|
+ null bytes. */
|
|
+ VMOVU (%rdi), %YMM1
|
|
+
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ ktestd %k0, %k0
|
|
+ jz L(more_vecs)
|
|
+ kmovd %k0, %eax
|
|
+ tzcntl %eax, %eax
|
|
+ /* Found CHAR or the null byte. */
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(more_vecs):
|
|
+ /* Align data for aligned loads in the loop. */
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+L(aligned_more):
|
|
+
|
|
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
+ since data is only aligned to VEC_SIZE. */
|
|
+ VMOVA VEC_SIZE(%rdi), %YMM1
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ VMOVA VEC_SIZE(%rdi), %YMM1
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2)
|
|
+
|
|
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ ktestd %k0, %k0
|
|
+ jz L(prep_loop_4x)
|
|
+
|
|
+ kmovd %k0, %eax
|
|
+ tzcntl %eax, %eax
|
|
+ /* Found CHAR or the null byte. */
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x0):
|
|
+ tzcntl %eax, %eax
|
|
+ /* Found CHAR or the null byte. */
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x1):
|
|
+ tzcntl %eax, %eax
|
|
+ /* Found CHAR or the null byte. */
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq VEC_SIZE(%rdi, %rax), %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2):
|
|
+ tzcntl %eax, %eax
|
|
+ /* Found CHAR or the null byte. */
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+L(prep_loop_4x):
|
|
+ /* Align data to 4 * VEC_SIZE. */
|
|
+ andq $-(VEC_SIZE * 4), %rdi
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_4x_vec):
|
|
+ /* Compare 4 * VEC at a time forward. */
|
|
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
|
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
|
|
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
|
+ VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
|
|
+
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM5
|
|
+ vpxorq %YMM2, %YMM0, %YMM6
|
|
+ vpxorq %YMM3, %YMM0, %YMM7
|
|
+ vpxorq %YMM4, %YMM0, %YMM8
|
|
+
|
|
+ VPMINU %YMM5, %YMM1, %YMM5
|
|
+ VPMINU %YMM6, %YMM2, %YMM6
|
|
+ VPMINU %YMM7, %YMM3, %YMM7
|
|
+ VPMINU %YMM8, %YMM4, %YMM8
|
|
+
|
|
+ VPMINU %YMM5, %YMM6, %YMM1
|
|
+ VPMINU %YMM7, %YMM8, %YMM2
|
|
+
|
|
+ VPMINU %YMM1, %YMM2, %YMM1
|
|
+
|
|
+ /* Each bit in K0 represents a CHAR or a null byte. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+
|
|
+ ktestd %k0, %k0
|
|
+ jz L(loop_4x_vec)
|
|
+
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM5, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
|
|
+ VPCMP $0, %YMMZERO, %YMM6, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
|
|
+ VPCMP $0, %YMMZERO, %YMM7, %k2
|
|
+ /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
|
|
+ VPCMP $0, %YMMZERO, %YMM8, %k3
|
|
+
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Each bit in K2/K3 represents 4-byte element. */
|
|
+ kshiftlw $8, %k3, %k1
|
|
+# else
|
|
+ kshiftlq $32, %k3, %k1
|
|
+# endif
|
|
+
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ korq %k1, %k2, %k1
|
|
+ kmovq %k1, %rax
|
|
+
|
|
+ tzcntq %rax, %rax
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ /* Cold case for crossing page with first load. */
|
|
+ .p2align 4
|
|
+L(cross_page_boundary):
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+
|
|
+ VMOVA (%rdi), %YMM1
|
|
+
|
|
+ /* Leaves only CHARS matching esi as 0. */
|
|
+ vpxorq %YMM1, %YMM0, %YMM2
|
|
+ VPMINU %YMM2, %YMM1, %YMM2
|
|
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Divide shift count by 4 since each bit in K1 represents 4
|
|
+ bytes. */
|
|
+ movl %ecx, %SHIFT_REG
|
|
+ sarl $2, %SHIFT_REG
|
|
+# endif
|
|
+
|
|
+ /* Remove the leading bits. */
|
|
+ sarxl %SHIFT_REG, %eax, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+ jz L(aligned_more)
|
|
+ tzcntl %eax, %eax
|
|
+ addq %rcx, %rdi
|
|
+# ifdef USE_AS_WCSCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq (%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ addq %rdi, %rax
|
|
+# endif
|
|
+# ifndef USE_AS_STRCHRNUL
|
|
+ cmp (%rax), %CHAR_REG
|
|
+ cmovne %rdx, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+END (STRCHR)
|
|
+# endif
|
|
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
|
|
index 32954713..be05e197 100644
|
|
--- a/sysdeps/x86_64/multiarch/strchr.c
|
|
+++ b/sysdeps/x86_64/multiarch/strchr.c
|
|
@@ -29,16 +29,24 @@
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_no_bsf) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
- return OPTIMIZE (avx2);
|
|
+ {
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
+ return OPTIMIZE (evex);
|
|
+
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
+ return OPTIMIZE (avx2);
|
|
+ }
|
|
|
|
if (CPU_FEATURES_ARCH_P (cpu_features, Slow_BSF))
|
|
return OPTIMIZE (sse2_no_bsf);
|
|
diff --git a/sysdeps/x86_64/multiarch/strchrnul-evex.S b/sysdeps/x86_64/multiarch/strchrnul-evex.S
|
|
new file mode 100644
|
|
index 00000000..064fe7ca
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strchrnul-evex.S
|
|
@@ -0,0 +1,3 @@
|
|
+#define STRCHR __strchrnul_evex
|
|
+#define USE_AS_STRCHRNUL 1
|
|
+#include "strchr-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
new file mode 100644
|
|
index 00000000..459eeed0
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
|
@@ -0,0 +1,1043 @@
|
|
+/* strcmp/wcscmp/strncmp/wcsncmp optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# ifndef STRCMP
|
|
+# define STRCMP __strcmp_evex
|
|
+# endif
|
|
+
|
|
+# define PAGE_SIZE 4096
|
|
+
|
|
+/* VEC_SIZE = Number of bytes in a ymm register */
|
|
+# define VEC_SIZE 32
|
|
+
|
|
+/* Shift for dividing by (VEC_SIZE * 4). */
|
|
+# define DIVIDE_BY_VEC_4_SHIFT 7
|
|
+# if (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
|
|
+# error (VEC_SIZE * 4) != (1 << DIVIDE_BY_VEC_4_SHIFT)
|
|
+# endif
|
|
+
|
|
+# define VMOVU vmovdqu64
|
|
+# define VMOVA vmovdqa64
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+/* Compare packed dwords. */
|
|
+# define VPCMP vpcmpd
|
|
+# define SHIFT_REG32 r8d
|
|
+# define SHIFT_REG64 r8
|
|
+/* 1 dword char == 4 bytes. */
|
|
+# define SIZE_OF_CHAR 4
|
|
+# else
|
|
+/* Compare packed bytes. */
|
|
+# define VPCMP vpcmpb
|
|
+# define SHIFT_REG32 ecx
|
|
+# define SHIFT_REG64 rcx
|
|
+/* 1 byte char == 1 byte. */
|
|
+# define SIZE_OF_CHAR 1
|
|
+# endif
|
|
+
|
|
+# define XMMZERO xmm16
|
|
+# define XMM0 xmm17
|
|
+# define XMM1 xmm18
|
|
+
|
|
+# define YMMZERO ymm16
|
|
+# define YMM0 ymm17
|
|
+# define YMM1 ymm18
|
|
+# define YMM2 ymm19
|
|
+# define YMM3 ymm20
|
|
+# define YMM4 ymm21
|
|
+# define YMM5 ymm22
|
|
+# define YMM6 ymm23
|
|
+# define YMM7 ymm24
|
|
+
|
|
+/* Warning!
|
|
+ wcscmp/wcsncmp have to use SIGNED comparison for elements.
|
|
+ strcmp/strncmp have to use UNSIGNED comparison for elements.
|
|
+*/
|
|
+
|
|
+/* The main idea of the string comparison (byte or dword) using 256-bit
|
|
+ EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
|
|
+ latter can be on either packed bytes or dwords depending on
|
|
+ USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
|
|
+ matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
|
|
+ KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
|
|
+ are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
|
|
+ instructions. Main loop (away from page boundary) compares 4
|
|
+ vectors at a time, effectively comparing 4 x VEC_SIZE bytes (128
|
|
+ bytes) on each loop.
|
|
+
|
|
+ The routine strncmp/wcsncmp (enabled by defining USE_AS_STRNCMP) logic
|
|
+ is the same as strcmp, except that a maximum offset is tracked. If
|
|
+ the maximum offset is reached before a difference is found, zero is
|
|
+ returned. */
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (STRCMP)
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Check for simple cases (0 or 1) in offset. */
|
|
+ cmp $1, %RDX_LP
|
|
+ je L(char0)
|
|
+ jb L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* Convert units: from wide to byte char. */
|
|
+ shl $2, %RDX_LP
|
|
+# endif
|
|
+ /* Register %r11 tracks the maximum offset. */
|
|
+ mov %RDX_LP, %R11_LP
|
|
+# endif
|
|
+ movl %edi, %eax
|
|
+ xorl %edx, %edx
|
|
+ /* Make %XMMZERO (%YMMZERO) all zeros in this function. */
|
|
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
+ orl %esi, %eax
|
|
+ andl $(PAGE_SIZE - 1), %eax
|
|
+ cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
|
|
+ jg L(cross_page)
|
|
+ /* Start comparing 4 vectors. */
|
|
+ VMOVU (%rdi), %YMM0
|
|
+ VMOVU (%rsi), %YMM1
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
|
+ VPCMP $4, %YMM0, %YMM1, %k0
|
|
+
|
|
+ /* Check for NULL in YMM0. */
|
|
+ VPCMP $0, %YMMZERO, %YMM0, %k1
|
|
+ /* Check for NULL in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
|
+ kord %k1, %k2, %k1
|
|
+
|
|
+ /* Each bit in K1 represents:
|
|
+ 1. A mismatch in YMM0 and YMM1. Or
|
|
+ 2. A NULL in YMM0 or YMM1.
|
|
+ */
|
|
+ kord %k0, %k1, %k1
|
|
+
|
|
+ ktestd %k1, %k1
|
|
+ je L(next_3_vectors)
|
|
+ kmovd %k1, %ecx
|
|
+ tzcntl %ecx, %edx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the mismatched index (%rdx) is after the maximum
|
|
+ offset (%r11). */
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi, %rdx), %ecx
|
|
+ cmpl (%rsi, %rdx), %ecx
|
|
+ je L(return)
|
|
+L(wcscmp_return):
|
|
+ setl %al
|
|
+ negl %eax
|
|
+ orl $1, %eax
|
|
+L(return):
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(return_vec_size):
|
|
+ kmovd %k1, %ecx
|
|
+ tzcntl %ecx, %edx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the mismatched index (%rdx + VEC_SIZE) is after
|
|
+ the maximum offset (%r11). */
|
|
+ addq $VEC_SIZE, %rdx
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi, %rdx), %ecx
|
|
+ cmpl (%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl VEC_SIZE(%rdi, %rdx), %ecx
|
|
+ cmpl VEC_SIZE(%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl VEC_SIZE(%rdi, %rdx), %eax
|
|
+ movzbl VEC_SIZE(%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(return_2_vec_size):
|
|
+ kmovd %k1, %ecx
|
|
+ tzcntl %ecx, %edx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the mismatched index (%rdx + 2 * VEC_SIZE) is
|
|
+ after the maximum offset (%r11). */
|
|
+ addq $(VEC_SIZE * 2), %rdx
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi, %rdx), %ecx
|
|
+ cmpl (%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (VEC_SIZE * 2)(%rdi, %rdx), %ecx
|
|
+ cmpl (VEC_SIZE * 2)(%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (VEC_SIZE * 2)(%rdi, %rdx), %eax
|
|
+ movzbl (VEC_SIZE * 2)(%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(return_3_vec_size):
|
|
+ kmovd %k1, %ecx
|
|
+ tzcntl %ecx, %edx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the mismatched index (%rdx + 3 * VEC_SIZE) is
|
|
+ after the maximum offset (%r11). */
|
|
+ addq $(VEC_SIZE * 3), %rdx
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi, %rdx), %ecx
|
|
+ cmpl (%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (VEC_SIZE * 3)(%rdi, %rdx), %ecx
|
|
+ cmpl (VEC_SIZE * 3)(%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (VEC_SIZE * 3)(%rdi, %rdx), %eax
|
|
+ movzbl (VEC_SIZE * 3)(%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(next_3_vectors):
|
|
+ VMOVU VEC_SIZE(%rdi), %YMM0
|
|
+ VMOVU VEC_SIZE(%rsi), %YMM1
|
|
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
|
+ VPCMP $4, %YMM0, %YMM1, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM0, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ ktestd %k1, %k1
|
|
+ jne L(return_vec_size)
|
|
+
|
|
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
|
|
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
|
|
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
|
|
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
|
|
+ VPCMP $4, %YMM2, %YMM4, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM4, %k2
|
|
+ /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ ktestd %k1, %k1
|
|
+ jne L(return_2_vec_size)
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
|
|
+ VPCMP $4, %YMM3, %YMM5, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM3, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM5, %k2
|
|
+ /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ ktestd %k1, %k1
|
|
+ jne L(return_3_vec_size)
|
|
+L(main_loop_header):
|
|
+ leaq (VEC_SIZE * 4)(%rdi), %rdx
|
|
+ movl $PAGE_SIZE, %ecx
|
|
+ /* Align load via RAX. */
|
|
+ andq $-(VEC_SIZE * 4), %rdx
|
|
+ subq %rdi, %rdx
|
|
+ leaq (%rdi, %rdx), %rax
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Starting from this point, the maximum offset, or simply the
|
|
+ 'offset', DECREASES by the same amount when base pointers are
|
|
+ moved forward. Return 0 when:
|
|
+ 1) On match: offset <= the matched vector index.
|
|
+ 2) On mismatch, offset is before the mismatched index.
|
|
+ */
|
|
+ subq %rdx, %r11
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ addq %rsi, %rdx
|
|
+ movq %rdx, %rsi
|
|
+ andl $(PAGE_SIZE - 1), %esi
|
|
+ /* Number of bytes before page crossing. */
|
|
+ subq %rsi, %rcx
|
|
+ /* Number of VEC_SIZE * 4 blocks before page crossing. */
|
|
+ shrq $DIVIDE_BY_VEC_4_SHIFT, %rcx
|
|
+ /* ESI: Number of VEC_SIZE * 4 blocks before page crossing. */
|
|
+ movl %ecx, %esi
|
|
+ jmp L(loop_start)
|
|
+
|
|
+ .p2align 4
|
|
+L(loop):
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Base pointers are moved forward by 4 * VEC_SIZE. Decrease
|
|
+ the maximum offset (%r11) by the same amount. */
|
|
+ subq $(VEC_SIZE * 4), %r11
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ addq $(VEC_SIZE * 4), %rax
|
|
+ addq $(VEC_SIZE * 4), %rdx
|
|
+L(loop_start):
|
|
+ testl %esi, %esi
|
|
+ leal -1(%esi), %esi
|
|
+ je L(loop_cross_page)
|
|
+L(back_to_loop):
|
|
+ /* Main loop, comparing 4 vectors at a time. */
|
|
+ VMOVA (%rax), %YMM0
|
|
+ VMOVA VEC_SIZE(%rax), %YMM2
|
|
+ VMOVA (VEC_SIZE * 2)(%rax), %YMM4
|
|
+ VMOVA (VEC_SIZE * 3)(%rax), %YMM6
|
|
+ VMOVU (%rdx), %YMM1
|
|
+ VMOVU VEC_SIZE(%rdx), %YMM3
|
|
+ VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
|
|
+ VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
|
|
+
|
|
+ VPCMP $4, %YMM0, %YMM1, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM0, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
|
|
+ YMM1. */
|
|
+ kord %k0, %k1, %k4
|
|
+
|
|
+ VPCMP $4, %YMM2, %YMM3, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM3, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
|
|
+ YMM3. */
|
|
+ kord %k0, %k1, %k5
|
|
+
|
|
+ VPCMP $4, %YMM4, %YMM5, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM4, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM5, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
|
|
+ YMM5. */
|
|
+ kord %k0, %k1, %k6
|
|
+
|
|
+ VPCMP $4, %YMM6, %YMM7, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM6, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM7, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
|
|
+ YMM7. */
|
|
+ kord %k0, %k1, %k7
|
|
+
|
|
+ kord %k4, %k5, %k0
|
|
+ kord %k6, %k7, %k1
|
|
+
|
|
+ /* Test each mask (32 bits) individually because for VEC_SIZE
|
|
+ == 32 it is not possible to OR the four masks and keep all bits
|
|
+ in a 64-bit integer register, differing from SSE2 strcmp
|
|
+ where ORing is possible. */
|
|
+ kortestd %k0, %k1
|
|
+ je L(loop)
|
|
+ ktestd %k4, %k4
|
|
+ je L(test_vec)
|
|
+ kmovd %k4, %edi
|
|
+ tzcntl %edi, %ecx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %ecx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ cmpq %rcx, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %edi
|
|
+ cmpl (%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %edi
|
|
+ cmpl (%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(test_vec):
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* The first vector matched. Return 0 if the maximum offset
|
|
+ (%r11) <= VEC_SIZE. */
|
|
+ cmpq $VEC_SIZE, %r11
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ ktestd %k5, %k5
|
|
+ je L(test_2_vec)
|
|
+ kmovd %k5, %ecx
|
|
+ tzcntl %ecx, %edi
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edi
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ addq $VEC_SIZE, %rdi
|
|
+ cmpq %rdi, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rdi), %ecx
|
|
+ cmpl (%rdx, %rdi), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rdi), %eax
|
|
+ movzbl (%rdx, %rdi), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl VEC_SIZE(%rsi, %rdi), %ecx
|
|
+ cmpl VEC_SIZE(%rdx, %rdi), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl VEC_SIZE(%rax, %rdi), %eax
|
|
+ movzbl VEC_SIZE(%rdx, %rdi), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(test_2_vec):
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* The first 2 vectors matched. Return 0 if the maximum offset
|
|
+ (%r11) <= 2 * VEC_SIZE. */
|
|
+ cmpq $(VEC_SIZE * 2), %r11
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ ktestd %k6, %k6
|
|
+ je L(test_3_vec)
|
|
+ kmovd %k6, %ecx
|
|
+ tzcntl %ecx, %edi
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edi
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ addq $(VEC_SIZE * 2), %rdi
|
|
+ cmpq %rdi, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rdi), %ecx
|
|
+ cmpl (%rdx, %rdi), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rdi), %eax
|
|
+ movzbl (%rdx, %rdi), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (VEC_SIZE * 2)(%rsi, %rdi), %ecx
|
|
+ cmpl (VEC_SIZE * 2)(%rdx, %rdi), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (VEC_SIZE * 2)(%rax, %rdi), %eax
|
|
+ movzbl (VEC_SIZE * 2)(%rdx, %rdi), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(test_3_vec):
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* The first 3 vectors matched. Return 0 if the maximum offset
|
|
+ (%r11) <= 3 * VEC_SIZE. */
|
|
+ cmpq $(VEC_SIZE * 3), %r11
|
|
+ jbe L(zero)
|
|
+# endif
|
|
+ kmovd %k7, %esi
|
|
+ tzcntl %esi, %ecx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %ecx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ addq $(VEC_SIZE * 3), %rcx
|
|
+ cmpq %rcx, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %esi
|
|
+ cmpl (%rdx, %rcx), %esi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (VEC_SIZE * 3)(%rsi, %rcx), %esi
|
|
+ cmpl (VEC_SIZE * 3)(%rdx, %rcx), %esi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (VEC_SIZE * 3)(%rax, %rcx), %eax
|
|
+ movzbl (VEC_SIZE * 3)(%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_cross_page):
|
|
+ xorl %r10d, %r10d
|
|
+ movq %rdx, %rcx
|
|
+ /* Align load via RDX. We load the extra ECX bytes which should
|
|
+ be ignored. */
|
|
+ andl $((VEC_SIZE * 4) - 1), %ecx
|
|
+ /* R10 is -RCX. */
|
|
+ subq %rcx, %r10
|
|
+
|
|
+ /* This works only if VEC_SIZE * 2 == 64. */
|
|
+# if (VEC_SIZE * 2) != 64
|
|
+# error (VEC_SIZE * 2) != 64
|
|
+# endif
|
|
+
|
|
+ /* Check if the first VEC_SIZE * 2 bytes should be ignored. */
|
|
+ cmpl $(VEC_SIZE * 2), %ecx
|
|
+ jge L(loop_cross_page_2_vec)
|
|
+
|
|
+ VMOVU (%rax, %r10), %YMM2
|
|
+ VMOVU VEC_SIZE(%rax, %r10), %YMM3
|
|
+ VMOVU (%rdx, %r10), %YMM4
|
|
+ VMOVU VEC_SIZE(%rdx, %r10), %YMM5
|
|
+
|
|
+ VPCMP $4, %YMM4, %YMM2, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM4, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
|
|
+ YMM4. */
|
|
+ kord %k0, %k1, %k1
|
|
+
|
|
+ VPCMP $4, %YMM5, %YMM3, %k3
|
|
+ VPCMP $0, %YMMZERO, %YMM3, %k4
|
|
+ VPCMP $0, %YMMZERO, %YMM5, %k5
|
|
+ kord %k4, %k5, %k4
|
|
+ /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
|
|
+ YMM5. */
|
|
+ kord %k3, %k4, %k3
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Each bit in K1/K3 represents 4-byte element. */
|
|
+ kshiftlw $8, %k3, %k2
|
|
+ /* NB: Divide shift count by 4 since each bit in K1 represents 4
|
|
+ bytes. */
|
|
+ movl %ecx, %SHIFT_REG32
|
|
+ sarl $2, %SHIFT_REG32
|
|
+# else
|
|
+ kshiftlq $32, %k3, %k2
|
|
+# endif
|
|
+
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ korq %k1, %k2, %k1
|
|
+ kmovq %k1, %rdi
|
|
+
|
|
+ /* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
|
|
+ shrxq %SHIFT_REG64, %rdi, %rdi
|
|
+ testq %rdi, %rdi
|
|
+ je L(loop_cross_page_2_vec)
|
|
+ tzcntq %rdi, %rcx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %ecx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ cmpq %rcx, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %edi
|
|
+ cmpl (%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %edi
|
|
+ cmpl (%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_cross_page_2_vec):
|
|
+ /* The first VEC_SIZE * 2 bytes match or are ignored. */
|
|
+ VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
|
|
+ VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
|
|
+ VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
|
|
+ VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
|
|
+
|
|
+ VPCMP $4, %YMM0, %YMM2, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM0, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM2, %k2
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
|
|
+ YMM2. */
|
|
+ kord %k0, %k1, %k1
|
|
+
|
|
+ VPCMP $4, %YMM1, %YMM3, %k3
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k4
|
|
+ VPCMP $0, %YMMZERO, %YMM3, %k5
|
|
+ kord %k4, %k5, %k4
|
|
+ /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
|
|
+ YMM3. */
|
|
+ kord %k3, %k4, %k3
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Each bit in K1/K3 represents 4-byte element. */
|
|
+ kshiftlw $8, %k3, %k2
|
|
+# else
|
|
+ kshiftlq $32, %k3, %k2
|
|
+# endif
|
|
+
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ korq %k1, %k2, %k1
|
|
+ kmovq %k1, %rdi
|
|
+
|
|
+ xorl %r8d, %r8d
|
|
+ /* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
|
|
+ subl $(VEC_SIZE * 2), %ecx
|
|
+ jle 1f
|
|
+ /* R8 has number of bytes skipped. */
|
|
+ movl %ecx, %r8d
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
|
+ bytes. */
|
|
+ sarl $2, %ecx
|
|
+# endif
|
|
+ /* Skip ECX bytes. */
|
|
+ shrq %cl, %rdi
|
|
+1:
|
|
+ /* Before jumping back to the loop, set ESI to the number of
|
|
+ VEC_SIZE * 4 blocks before page crossing. */
|
|
+ movl $(PAGE_SIZE / (VEC_SIZE * 4) - 1), %esi
|
|
+
|
|
+ testq %rdi, %rdi
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* At this point, if %rdi value is 0, it already tested
|
|
+ VEC_SIZE*4+%r10 byte starting from %rax. This label
|
|
+ checks whether strncmp maximum offset reached or not. */
|
|
+ je L(string_nbyte_offset_check)
|
|
+# else
|
|
+ je L(back_to_loop)
|
|
+# endif
|
|
+ tzcntq %rdi, %rcx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %ecx
|
|
+# endif
|
|
+ addq %r10, %rcx
|
|
+ /* Adjust for number of bytes skipped. */
|
|
+ addq %r8, %rcx
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ addq $(VEC_SIZE * 2), %rcx
|
|
+ subq %rcx, %r11
|
|
+ jbe L(zero)
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (%rsi, %rcx), %edi
|
|
+ cmpl (%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rax, %rcx), %eax
|
|
+ movzbl (%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# else
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movq %rax, %rsi
|
|
+ xorl %eax, %eax
|
|
+ movl (VEC_SIZE * 2)(%rsi, %rcx), %edi
|
|
+ cmpl (VEC_SIZE * 2)(%rdx, %rcx), %edi
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (VEC_SIZE * 2)(%rax, %rcx), %eax
|
|
+ movzbl (VEC_SIZE * 2)(%rdx, %rcx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+# ifdef USE_AS_STRNCMP
|
|
+L(string_nbyte_offset_check):
|
|
+ leaq (VEC_SIZE * 4)(%r10), %r10
|
|
+ cmpq %r10, %r11
|
|
+ jbe L(zero)
|
|
+ jmp L(back_to_loop)
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(cross_page_loop):
|
|
+ /* Check one byte/dword at a time. */
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ cmpl %ecx, %eax
|
|
+# else
|
|
+ subl %ecx, %eax
|
|
+# endif
|
|
+ jne L(different)
|
|
+ addl $SIZE_OF_CHAR, %edx
|
|
+ cmpl $(VEC_SIZE * 4), %edx
|
|
+ je L(main_loop_header)
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movl (%rdi, %rdx), %eax
|
|
+ movl (%rsi, %rdx), %ecx
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %ecx
|
|
+# endif
|
|
+ /* Check null char. */
|
|
+ testl %eax, %eax
|
|
+ jne L(cross_page_loop)
|
|
+ /* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
|
|
+ comparisons. */
|
|
+ subl %ecx, %eax
|
|
+# ifndef USE_AS_WCSCMP
|
|
+L(different):
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ .p2align 4
|
|
+L(different):
|
|
+ /* Use movl to avoid modifying EFLAGS. */
|
|
+ movl $0, %eax
|
|
+ setl %al
|
|
+ negl %eax
|
|
+ orl $1, %eax
|
|
+ ret
|
|
+# endif
|
|
+
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ .p2align 4
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(char0):
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi), %ecx
|
|
+ cmpl (%rsi), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rsi), %ecx
|
|
+ movzbl (%rdi), %eax
|
|
+ subl %ecx, %eax
|
|
+# endif
|
|
+ ret
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(last_vector):
|
|
+ addq %rdx, %rdi
|
|
+ addq %rdx, %rsi
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ subq %rdx, %r11
|
|
+# endif
|
|
+ tzcntl %ecx, %edx
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %edx
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ xorl %eax, %eax
|
|
+ movl (%rdi, %rdx), %ecx
|
|
+ cmpl (%rsi, %rdx), %ecx
|
|
+ jne L(wcscmp_return)
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %edx
|
|
+ subl %edx, %eax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ /* Comparing on page boundary region requires special treatment:
|
|
+ It must done one vector at the time, starting with the wider
|
|
+ ymm vector if possible, if not, with xmm. If fetching 16 bytes
|
|
+ (xmm) still passes the boundary, byte comparison must be done.
|
|
+ */
|
|
+ .p2align 4
|
|
+L(cross_page):
|
|
+ /* Try one ymm vector at a time. */
|
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
+ jg L(cross_page_1_vector)
|
|
+L(loop_1_vector):
|
|
+ VMOVU (%rdi, %rdx), %YMM0
|
|
+ VMOVU (%rsi, %rdx), %YMM1
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
|
+ VPCMP $4, %YMM0, %YMM1, %k0
|
|
+ VPCMP $0, %YMMZERO, %YMM0, %k1
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ kmovd %k1, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jne L(last_vector)
|
|
+
|
|
+ addl $VEC_SIZE, %edx
|
|
+
|
|
+ addl $VEC_SIZE, %eax
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
|
|
+ (%r11). */
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
+ jle L(loop_1_vector)
|
|
+L(cross_page_1_vector):
|
|
+ /* Less than 32 bytes to check, try one xmm vector. */
|
|
+ cmpl $(PAGE_SIZE - 16), %eax
|
|
+ jg L(cross_page_1_xmm)
|
|
+ VMOVU (%rdi, %rdx), %XMM0
|
|
+ VMOVU (%rsi, %rdx), %XMM1
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
|
+ VPCMP $4, %XMM0, %XMM1, %k0
|
|
+ VPCMP $0, %XMMZERO, %XMM0, %k1
|
|
+ VPCMP $0, %XMMZERO, %XMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
|
+ korw %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ korw %k0, %k1, %k1
|
|
+ kmovw %k1, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jne L(last_vector)
|
|
+
|
|
+ addl $16, %edx
|
|
+# ifndef USE_AS_WCSCMP
|
|
+ addl $16, %eax
|
|
+# endif
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
|
|
+ (%r11). */
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+
|
|
+L(cross_page_1_xmm):
|
|
+# ifndef USE_AS_WCSCMP
|
|
+ /* Less than 16 bytes to check, try 8 byte vector. NB: No need
|
|
+ for wcscmp nor wcsncmp since wide char is 4 bytes. */
|
|
+ cmpl $(PAGE_SIZE - 8), %eax
|
|
+ jg L(cross_page_8bytes)
|
|
+ vmovq (%rdi, %rdx), %XMM0
|
|
+ vmovq (%rsi, %rdx), %XMM1
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
|
+ VPCMP $4, %XMM0, %XMM1, %k0
|
|
+ VPCMP $0, %XMMZERO, %XMM0, %k1
|
|
+ VPCMP $0, %XMMZERO, %XMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ kmovd %k1, %ecx
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* Only last 2 bits are valid. */
|
|
+ andl $0x3, %ecx
|
|
+# else
|
|
+ /* Only last 8 bits are valid. */
|
|
+ andl $0xff, %ecx
|
|
+# endif
|
|
+
|
|
+ testl %ecx, %ecx
|
|
+ jne L(last_vector)
|
|
+
|
|
+ addl $8, %edx
|
|
+ addl $8, %eax
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
|
|
+ (%r11). */
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+
|
|
+L(cross_page_8bytes):
|
|
+ /* Less than 8 bytes to check, try 4 byte vector. */
|
|
+ cmpl $(PAGE_SIZE - 4), %eax
|
|
+ jg L(cross_page_4bytes)
|
|
+ vmovd (%rdi, %rdx), %XMM0
|
|
+ vmovd (%rsi, %rdx), %XMM1
|
|
+
|
|
+ /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
|
+ VPCMP $4, %XMM0, %XMM1, %k0
|
|
+ VPCMP $0, %XMMZERO, %XMM0, %k1
|
|
+ VPCMP $0, %XMMZERO, %XMM1, %k2
|
|
+ /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
|
+ kord %k1, %k2, %k1
|
|
+ /* Each bit in K1 represents a NULL or a mismatch. */
|
|
+ kord %k0, %k1, %k1
|
|
+ kmovd %k1, %ecx
|
|
+
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ /* Only the last bit is valid. */
|
|
+ andl $0x1, %ecx
|
|
+# else
|
|
+ /* Only last 4 bits are valid. */
|
|
+ andl $0xf, %ecx
|
|
+# endif
|
|
+
|
|
+ testl %ecx, %ecx
|
|
+ jne L(last_vector)
|
|
+
|
|
+ addl $4, %edx
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ /* Return 0 if the current offset (%rdx) >= the maximum offset
|
|
+ (%r11). */
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+
|
|
+L(cross_page_4bytes):
|
|
+# endif
|
|
+ /* Less than 4 bytes to check, try one byte/dword at a time. */
|
|
+# ifdef USE_AS_STRNCMP
|
|
+ cmpq %r11, %rdx
|
|
+ jae L(zero)
|
|
+# endif
|
|
+# ifdef USE_AS_WCSCMP
|
|
+ movl (%rdi, %rdx), %eax
|
|
+ movl (%rsi, %rdx), %ecx
|
|
+# else
|
|
+ movzbl (%rdi, %rdx), %eax
|
|
+ movzbl (%rsi, %rdx), %ecx
|
|
+# endif
|
|
+ testl %eax, %eax
|
|
+ jne L(cross_page_loop)
|
|
+ subl %ecx, %eax
|
|
+ ret
|
|
+END (STRCMP)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/strcmp.c b/sysdeps/x86_64/multiarch/strcmp.c
|
|
index 3f433fbc..c5f38510 100644
|
|
--- a/sysdeps/x86_64/multiarch/strcmp.c
|
|
+++ b/sysdeps/x86_64/multiarch/strcmp.c
|
|
@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
- return OPTIMIZE (avx2);
|
|
+ {
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
|
|
+ return OPTIMIZE (evex);
|
|
+
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
+ return OPTIMIZE (avx2);
|
|
+ }
|
|
|
|
if (CPU_FEATURES_ARCH_P (cpu_features, Fast_Unaligned_Load))
|
|
return OPTIMIZE (sse2_unaligned);
|
|
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
new file mode 100644
|
|
index 00000000..cd022509
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
|
@@ -0,0 +1,436 @@
|
|
+/* strlen/strnlen/wcslen/wcsnlen optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# ifndef STRLEN
|
|
+# define STRLEN __strlen_evex
|
|
+# endif
|
|
+
|
|
+# define VMOVA vmovdqa64
|
|
+
|
|
+# ifdef USE_AS_WCSLEN
|
|
+# define VPCMP vpcmpd
|
|
+# define VPMINU vpminud
|
|
+# define SHIFT_REG r9d
|
|
+# else
|
|
+# define VPCMP vpcmpb
|
|
+# define VPMINU vpminub
|
|
+# define SHIFT_REG ecx
|
|
+# endif
|
|
+
|
|
+# define XMMZERO xmm16
|
|
+# define YMMZERO ymm16
|
|
+# define YMM1 ymm17
|
|
+# define YMM2 ymm18
|
|
+# define YMM3 ymm19
|
|
+# define YMM4 ymm20
|
|
+# define YMM5 ymm21
|
|
+# define YMM6 ymm22
|
|
+
|
|
+# define VEC_SIZE 32
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (STRLEN)
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ /* Check for zero length. */
|
|
+ test %RSI_LP, %RSI_LP
|
|
+ jz L(zero)
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shl $2, %RSI_LP
|
|
+# elif defined __ILP32__
|
|
+ /* Clear the upper 32 bits. */
|
|
+ movl %esi, %esi
|
|
+# endif
|
|
+ mov %RSI_LP, %R8_LP
|
|
+# endif
|
|
+ movl %edi, %ecx
|
|
+ movq %rdi, %rdx
|
|
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
+
|
|
+ /* Check if we may cross page boundary with one vector load. */
|
|
+ andl $(2 * VEC_SIZE - 1), %ecx
|
|
+ cmpl $VEC_SIZE, %ecx
|
|
+ ja L(cros_page_boundary)
|
|
+
|
|
+ /* Check the first VEC_SIZE bytes. Each bit in K0 represents a
|
|
+ null byte. */
|
|
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ jnz L(first_vec_x0_check)
|
|
+ /* Adjust length and check the end of data. */
|
|
+ subq $VEC_SIZE, %rsi
|
|
+ jbe L(max)
|
|
+# else
|
|
+ jnz L(first_vec_x0)
|
|
+# endif
|
|
+
|
|
+ /* Align data for aligned loads in the loop. */
|
|
+ addq $VEC_SIZE, %rdi
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ /* Adjust length. */
|
|
+ addq %rcx, %rsi
|
|
+
|
|
+ subq $(VEC_SIZE * 4), %rsi
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+ jmp L(more_4x_vec)
|
|
+
|
|
+ .p2align 4
|
|
+L(cros_page_boundary):
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
|
+ bytes. */
|
|
+ movl %ecx, %SHIFT_REG
|
|
+ sarl $2, %SHIFT_REG
|
|
+# endif
|
|
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+
|
|
+ /* Remove the leading bytes. */
|
|
+ sarxl %SHIFT_REG, %eax, %eax
|
|
+ testl %eax, %eax
|
|
+ jz L(aligned_more)
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rsi
|
|
+ jbe L(max)
|
|
+# endif
|
|
+ addq %rdi, %rax
|
|
+ addq %rcx, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(aligned_more):
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
|
+ with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
|
+ to void possible addition overflow. */
|
|
+ negq %rcx
|
|
+ addq $VEC_SIZE, %rcx
|
|
+
|
|
+ /* Check the end of data. */
|
|
+ subq %rcx, %rsi
|
|
+ jbe L(max)
|
|
+# endif
|
|
+
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ subq $(VEC_SIZE * 4), %rsi
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+
|
|
+L(more_4x_vec):
|
|
+ /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
|
+ since data is only aligned to VEC_SIZE. */
|
|
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x3)
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ subq $(VEC_SIZE * 4), %rsi
|
|
+ jbe L(last_4x_vec_or_less)
|
|
+# endif
|
|
+
|
|
+ /* Align data to 4 * VEC_SIZE. */
|
|
+ movq %rdi, %rcx
|
|
+ andl $(4 * VEC_SIZE - 1), %ecx
|
|
+ andq $-(4 * VEC_SIZE), %rdi
|
|
+
|
|
+# ifdef USE_AS_STRNLEN
|
|
+ /* Adjust length. */
|
|
+ addq %rcx, %rsi
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(loop_4x_vec):
|
|
+ /* Compare 4 * VEC at a time forward. */
|
|
+ VMOVA (%rdi), %YMM1
|
|
+ VMOVA VEC_SIZE(%rdi), %YMM2
|
|
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
|
|
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
|
|
+
|
|
+ VPMINU %YMM1, %YMM2, %YMM5
|
|
+ VPMINU %YMM3, %YMM4, %YMM6
|
|
+
|
|
+ VPMINU %YMM5, %YMM6, %YMM5
|
|
+ VPCMP $0, %YMM5, %YMMZERO, %k0
|
|
+ ktestd %k0, %k0
|
|
+ jnz L(4x_vec_end)
|
|
+
|
|
+ addq $(VEC_SIZE * 4), %rdi
|
|
+
|
|
+# ifndef USE_AS_STRNLEN
|
|
+ jmp L(loop_4x_vec)
|
|
+# else
|
|
+ subq $(VEC_SIZE * 4), %rsi
|
|
+ ja L(loop_4x_vec)
|
|
+
|
|
+L(last_4x_vec_or_less):
|
|
+ /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
|
+ addl $(VEC_SIZE * 2), %esi
|
|
+ jle L(last_2x_vec)
|
|
+
|
|
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2_check)
|
|
+ subl $VEC_SIZE, %esi
|
|
+ jle L(max)
|
|
+
|
|
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x3_check)
|
|
+ movq %r8, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(last_2x_vec):
|
|
+ addl $(VEC_SIZE * 2), %esi
|
|
+
|
|
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0_check)
|
|
+ subl $VEC_SIZE, %esi
|
|
+ jle L(max)
|
|
+
|
|
+ VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1_check)
|
|
+ movq %r8, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x0_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rsi
|
|
+ jbe L(max)
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x1_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rsi
|
|
+ jbe L(max)
|
|
+ addq $VEC_SIZE, %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rsi
|
|
+ jbe L(max)
|
|
+ addq $(VEC_SIZE * 2), %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x3_check):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ /* Check the end of data. */
|
|
+ cmpq %rax, %rsi
|
|
+ jbe L(max)
|
|
+ addq $(VEC_SIZE * 3), %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(max):
|
|
+ movq %r8, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(zero):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+# endif
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x0):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x1):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ addq $VEC_SIZE, %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec_x2):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ addq $(VEC_SIZE * 2), %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(4x_vec_end):
|
|
+ VPCMP $0, %YMM1, %YMMZERO, %k0
|
|
+ kmovd %k0, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x0)
|
|
+ VPCMP $0, %YMM2, %YMMZERO, %k1
|
|
+ kmovd %k1, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x1)
|
|
+ VPCMP $0, %YMM3, %YMMZERO, %k2
|
|
+ kmovd %k2, %eax
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec_x2)
|
|
+ VPCMP $0, %YMM4, %YMMZERO, %k3
|
|
+ kmovd %k3, %eax
|
|
+L(first_vec_x3):
|
|
+ tzcntl %eax, %eax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ sall $2, %eax
|
|
+# endif
|
|
+ addq $(VEC_SIZE * 3), %rax
|
|
+ addq %rdi, %rax
|
|
+ subq %rdx, %rax
|
|
+# ifdef USE_AS_WCSLEN
|
|
+ shrq $2, %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+END (STRLEN)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/strncmp-evex.S b/sysdeps/x86_64/multiarch/strncmp-evex.S
|
|
new file mode 100644
|
|
index 00000000..a1d53e8c
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strncmp-evex.S
|
|
@@ -0,0 +1,3 @@
|
|
+#define STRCMP __strncmp_evex
|
|
+#define USE_AS_STRNCMP 1
|
|
+#include "strcmp-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/strncmp.c b/sysdeps/x86_64/multiarch/strncmp.c
|
|
index 686d654f..4c15542f 100644
|
|
--- a/sysdeps/x86_64/multiarch/strncmp.c
|
|
+++ b/sysdeps/x86_64/multiarch/strncmp.c
|
|
@@ -30,16 +30,25 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse42) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
- return OPTIMIZE (avx2);
|
|
+ {
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
|
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_AVX2_STRCMP))
|
|
+ return OPTIMIZE (evex);
|
|
+
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
+ return OPTIMIZE (avx2);
|
|
+ }
|
|
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_2)
|
|
&& !CPU_FEATURES_ARCH_P (cpu_features, Slow_SSE4_2))
|
|
diff --git a/sysdeps/x86_64/multiarch/strnlen-evex.S b/sysdeps/x86_64/multiarch/strnlen-evex.S
|
|
new file mode 100644
|
|
index 00000000..722022f3
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strnlen-evex.S
|
|
@@ -0,0 +1,4 @@
|
|
+#define STRLEN __strnlen_evex
|
|
+#define USE_AS_STRNLEN 1
|
|
+
|
|
+#include "strlen-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..f920b5a5
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
@@ -0,0 +1,265 @@
|
|
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
|
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
|
+ This file is part of the GNU C Library.
|
|
+
|
|
+ The GNU C Library is free software; you can redistribute it and/or
|
|
+ modify it under the terms of the GNU Lesser General Public
|
|
+ License as published by the Free Software Foundation; either
|
|
+ version 2.1 of the License, or (at your option) any later version.
|
|
+
|
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
+ Lesser General Public License for more details.
|
|
+
|
|
+ You should have received a copy of the GNU Lesser General Public
|
|
+ License along with the GNU C Library; if not, see
|
|
+ <https://www.gnu.org/licenses/>. */
|
|
+
|
|
+#if IS_IN (libc)
|
|
+
|
|
+# include <sysdep.h>
|
|
+
|
|
+# ifndef STRRCHR
|
|
+# define STRRCHR __strrchr_evex
|
|
+# endif
|
|
+
|
|
+# define VMOVU vmovdqu64
|
|
+# define VMOVA vmovdqa64
|
|
+
|
|
+# ifdef USE_AS_WCSRCHR
|
|
+# define VPBROADCAST vpbroadcastd
|
|
+# define VPCMP vpcmpd
|
|
+# define SHIFT_REG r8d
|
|
+# else
|
|
+# define VPBROADCAST vpbroadcastb
|
|
+# define VPCMP vpcmpb
|
|
+# define SHIFT_REG ecx
|
|
+# endif
|
|
+
|
|
+# define XMMZERO xmm16
|
|
+# define YMMZERO ymm16
|
|
+# define YMMMATCH ymm17
|
|
+# define YMM1 ymm18
|
|
+
|
|
+# define VEC_SIZE 32
|
|
+
|
|
+ .section .text.evex,"ax",@progbits
|
|
+ENTRY (STRRCHR)
|
|
+ movl %edi, %ecx
|
|
+ /* Broadcast CHAR to YMMMATCH. */
|
|
+ VPBROADCAST %esi, %YMMMATCH
|
|
+
|
|
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
|
+
|
|
+ /* Check if we may cross page boundary with one vector load. */
|
|
+ andl $(2 * VEC_SIZE - 1), %ecx
|
|
+ cmpl $VEC_SIZE, %ecx
|
|
+ ja L(cros_page_boundary)
|
|
+
|
|
+ VMOVU (%rdi), %YMM1
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %ecx
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ testl %eax, %eax
|
|
+ jnz L(first_vec)
|
|
+
|
|
+ testl %ecx, %ecx
|
|
+ jnz L(return_null)
|
|
+
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+ xorl %edx, %edx
|
|
+ jmp L(aligned_loop)
|
|
+
|
|
+ .p2align 4
|
|
+L(first_vec):
|
|
+ /* Check if there is a null byte. */
|
|
+ testl %ecx, %ecx
|
|
+ jnz L(char_and_nul_in_first_vec)
|
|
+
|
|
+ /* Remember the match and keep searching. */
|
|
+ movl %eax, %edx
|
|
+ movq %rdi, %rsi
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+ jmp L(aligned_loop)
|
|
+
|
|
+ .p2align 4
|
|
+L(cros_page_boundary):
|
|
+ andl $(VEC_SIZE - 1), %ecx
|
|
+ andq $-VEC_SIZE, %rdi
|
|
+
|
|
+# ifdef USE_AS_WCSRCHR
|
|
+ /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
|
+ bytes. */
|
|
+ movl %ecx, %SHIFT_REG
|
|
+ sarl $2, %SHIFT_REG
|
|
+# endif
|
|
+
|
|
+ VMOVA (%rdi), %YMM1
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %edx
|
|
+ kmovd %k1, %eax
|
|
+
|
|
+ shrxl %SHIFT_REG, %edx, %edx
|
|
+ shrxl %SHIFT_REG, %eax, %eax
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Check if there is a CHAR. */
|
|
+ testl %eax, %eax
|
|
+ jnz L(found_char)
|
|
+
|
|
+ testl %edx, %edx
|
|
+ jnz L(return_null)
|
|
+
|
|
+ jmp L(aligned_loop)
|
|
+
|
|
+ .p2align 4
|
|
+L(found_char):
|
|
+ testl %edx, %edx
|
|
+ jnz L(char_and_nul)
|
|
+
|
|
+ /* Remember the match and keep searching. */
|
|
+ movl %eax, %edx
|
|
+ leaq (%rdi, %rcx), %rsi
|
|
+
|
|
+ .p2align 4
|
|
+L(aligned_loop):
|
|
+ VMOVA (%rdi), %YMM1
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %ecx
|
|
+ kmovd %k1, %eax
|
|
+ orl %eax, %ecx
|
|
+ jnz L(char_nor_null)
|
|
+
|
|
+ VMOVA (%rdi), %YMM1
|
|
+ add $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %ecx
|
|
+ kmovd %k1, %eax
|
|
+ orl %eax, %ecx
|
|
+ jnz L(char_nor_null)
|
|
+
|
|
+ VMOVA (%rdi), %YMM1
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %ecx
|
|
+ kmovd %k1, %eax
|
|
+ orl %eax, %ecx
|
|
+ jnz L(char_nor_null)
|
|
+
|
|
+ VMOVA (%rdi), %YMM1
|
|
+ addq $VEC_SIZE, %rdi
|
|
+
|
|
+ /* Each bit in K0 represents a null byte in YMM1. */
|
|
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
|
+ /* Each bit in K1 represents a CHAR in YMM1. */
|
|
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
|
+ kmovd %k0, %ecx
|
|
+ kmovd %k1, %eax
|
|
+ orl %eax, %ecx
|
|
+ jz L(aligned_loop)
|
|
+
|
|
+ .p2align 4
|
|
+L(char_nor_null):
|
|
+ /* Find a CHAR or a null byte in a loop. */
|
|
+ testl %eax, %eax
|
|
+ jnz L(match)
|
|
+L(return_value):
|
|
+ testl %edx, %edx
|
|
+ jz L(return_null)
|
|
+ movl %edx, %eax
|
|
+ movq %rsi, %rdi
|
|
+ bsrl %eax, %eax
|
|
+# ifdef USE_AS_WCSRCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq -VEC_SIZE(%rdi, %rax), %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(match):
|
|
+ /* Find a CHAR. Check if there is a null byte. */
|
|
+ kmovd %k0, %ecx
|
|
+ testl %ecx, %ecx
|
|
+ jnz L(find_nul)
|
|
+
|
|
+ /* Remember the match and keep searching. */
|
|
+ movl %eax, %edx
|
|
+ movq %rdi, %rsi
|
|
+ jmp L(aligned_loop)
|
|
+
|
|
+ .p2align 4
|
|
+L(find_nul):
|
|
+ /* Mask out any matching bits after the null byte. */
|
|
+ movl %ecx, %r8d
|
|
+ subl $1, %r8d
|
|
+ xorl %ecx, %r8d
|
|
+ andl %r8d, %eax
|
|
+ testl %eax, %eax
|
|
+ /* If there is no CHAR here, return the remembered one. */
|
|
+ jz L(return_value)
|
|
+ bsrl %eax, %eax
|
|
+# ifdef USE_AS_WCSRCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq -VEC_SIZE(%rdi, %rax), %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(char_and_nul):
|
|
+ /* Find both a CHAR and a null byte. */
|
|
+ addq %rcx, %rdi
|
|
+ movl %edx, %ecx
|
|
+L(char_and_nul_in_first_vec):
|
|
+ /* Mask out any matching bits after the null byte. */
|
|
+ movl %ecx, %r8d
|
|
+ subl $1, %r8d
|
|
+ xorl %ecx, %r8d
|
|
+ andl %r8d, %eax
|
|
+ testl %eax, %eax
|
|
+ /* Return null pointer if the null byte comes first. */
|
|
+ jz L(return_null)
|
|
+ bsrl %eax, %eax
|
|
+# ifdef USE_AS_WCSRCHR
|
|
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
|
+ leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
|
+# else
|
|
+ leaq -VEC_SIZE(%rdi, %rax), %rax
|
|
+# endif
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+L(return_null):
|
|
+ xorl %eax, %eax
|
|
+ ret
|
|
+
|
|
+END (STRRCHR)
|
|
+#endif
|
|
diff --git a/sysdeps/x86_64/multiarch/wcschr-evex.S b/sysdeps/x86_64/multiarch/wcschr-evex.S
|
|
new file mode 100644
|
|
index 00000000..7cb8f1e4
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcschr-evex.S
|
|
@@ -0,0 +1,3 @@
|
|
+#define STRCHR __wcschr_evex
|
|
+#define USE_AS_WCSCHR 1
|
|
+#include "strchr-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wcscmp-evex.S b/sysdeps/x86_64/multiarch/wcscmp-evex.S
|
|
new file mode 100644
|
|
index 00000000..42e73e51
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcscmp-evex.S
|
|
@@ -0,0 +1,4 @@
|
|
+#define STRCMP __wcscmp_evex
|
|
+#define USE_AS_WCSCMP 1
|
|
+
|
|
+#include "strcmp-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wcslen-evex.S b/sysdeps/x86_64/multiarch/wcslen-evex.S
|
|
new file mode 100644
|
|
index 00000000..bdafa83b
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcslen-evex.S
|
|
@@ -0,0 +1,4 @@
|
|
+#define STRLEN __wcslen_evex
|
|
+#define USE_AS_WCSLEN 1
|
|
+
|
|
+#include "strlen-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-evex.S b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
|
|
new file mode 100644
|
|
index 00000000..8a8e3107
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcsncmp-evex.S
|
|
@@ -0,0 +1,5 @@
|
|
+#define STRCMP __wcsncmp_evex
|
|
+#define USE_AS_STRNCMP 1
|
|
+#define USE_AS_WCSCMP 1
|
|
+
|
|
+#include "strcmp-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-evex.S b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
|
|
new file mode 100644
|
|
index 00000000..24773bb4
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcsnlen-evex.S
|
|
@@ -0,0 +1,5 @@
|
|
+#define STRLEN __wcsnlen_evex
|
|
+#define USE_AS_WCSLEN 1
|
|
+#define USE_AS_STRNLEN 1
|
|
+
|
|
+#include "strlen-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
index b3144c93..84254b83 100644
|
|
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
|
|
@@ -29,16 +29,24 @@
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
|
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
|
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
|
|
|
static inline void *
|
|
IFUNC_SELECTOR (void)
|
|
{
|
|
const struct cpu_features* cpu_features = __get_cpu_features ();
|
|
|
|
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
|
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
|
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
|
- return OPTIMIZE (avx2);
|
|
+ {
|
|
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
|
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
|
+ return OPTIMIZE (evex);
|
|
+
|
|
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
|
+ return OPTIMIZE (avx2);
|
|
+ }
|
|
|
|
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
|
return OPTIMIZE (sse4_1);
|
|
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..c64602f7
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
@@ -0,0 +1,3 @@
|
|
+#define STRRCHR __wcsrchr_evex
|
|
+#define USE_AS_WCSRCHR 1
|
|
+#include "strrchr-evex.S"
|
|
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex.S b/sysdeps/x86_64/multiarch/wmemchr-evex.S
|
|
new file mode 100644
|
|
index 00000000..06cd0f9f
|
|
--- /dev/null
|
|
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex.S
|
|
@@ -0,0 +1,4 @@
|
|
+#define MEMCHR __wmemchr_evex
|
|
+#define USE_AS_WMEMCHR 1
|
|
+
|
|
+#include "memchr-evex.S"
|
|
--
|
|
GitLab
|
|
|