Compare commits

..

8 Commits
c8 ... c8

Author SHA1 Message Date
eabdullin 0e973eb579 import UBI glibc-2.28-251.el8_10.2 2024-05-24 02:13:07 +00:00
eabdullin 641f05b794 import UBI glibc-2.28-251.el8_10.1 2024-05-22 14:49:58 +00:00
eabdullin 97f5644cdb import UBI glibc-2.28-236.el8_9.13 2024-05-07 08:36:02 +00:00
eabdullin 8eea62074d import UBI glibc-2.28-236.el8_9.12 2024-02-20 14:43:54 +00:00
eabdullin 0058656e5f import UBI glibc-2.28-236.el8.7 2023-11-14 20:00:37 +00:00
Andrew Lukoshko fc2ec9e560 import UBI glibc-2.28-225.el8_8.6 2023-10-05 16:02:38 +00:00
CentOS Sources b73861e187 import glibc-2.28-225.el8 2023-05-17 01:33:08 +00:00
CentOS Sources 362627e54e import glibc-2.28-211.el8 2022-11-08 09:12:41 +00:00
336 changed files with 85728 additions and 31 deletions

View File

@ -159,7 +159,8 @@ en_SG/ISO-8859-1 \
en_US.UTF-8/UTF-8 \
en_US/ISO-8859-1 \
en_US.ISO-8859-15/ISO-8859-15 \
en_US@ampm.UTF-8/UTF-8 \
en_US@ampm/UTF-8 \
en_US.UTF-8@ampm/UTF-8 \
en_ZA.UTF-8/UTF-8 \
en_ZA/ISO-8859-1 \
en_ZM/UTF-8 \

View File

@ -448,7 +448,7 @@ fill_archive (struct locarhandle *tmpl_ah,
char fullname[fnamelen + 2 * strlen (d->d_name) + 7];
#ifdef _DIRENT_HAVE_D_TYPE
if (d_type == DT_UNKNOWN)
if (d_type == DT_UNKNOWN || d_type == DT_LNK)
#endif
{
strcpy (stpcpy (stpcpy (fullname, fname), "/"),

View File

@ -0,0 +1,112 @@
commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7
Author: Florian Weimer <fweimer@redhat.com>
Date: Thu Nov 16 19:55:35 2023 +0100
elf: Fix force_first handling in dlclose (bug 30981)
The force_first parameter was ineffective because the dlclose'd
object was not necessarily the first in the maps array. Also
enable force_first handling unconditionally, regardless of namespace.
The initial object in a namespace should be destructed first, too.
The _dl_sort_maps_dfs function had early returns for relocation
dependency processing which broke force_first handling, too, and
this is fixed in this change as well.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
diff --git a/elf/dl-close.c b/elf/dl-close.c
index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
--- a/elf/dl-close.c
+++ b/elf/dl-close.c
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
}
assert (idx == nloaded);
+ /* Put the dlclose'd map first, so that its destructor runs first.
+ The map variable is NULL after a retry. */
+ if (map != NULL)
+ {
+ maps[map->l_idx] = maps[0];
+ maps[map->l_idx]->l_idx = map->l_idx;
+ maps[0] = map;
+ maps[0]->l_idx = 0;
+ }
+
/* Keep track of the lowest index link map we have covered already. */
int done_index = -1;
while (++done_index < nloaded)
@@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
}
}
- /* Sort the entries. We can skip looking for the binary itself which is
- at the front of the search list for the main namespace. */
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
+ /* Sort the entries. Unless retrying, the maps[0] object (the
+ original argument to dlclose) needs to remain first, so that its
+ destructor runs first. */
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
/* Call all termination functions at once. */
bool unload_any = false;
@@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
/* Recheck if we need to retry, release the lock. */
out:
if (dl_close_state == rerun)
- goto retry;
+ {
+ /* The map may have been deallocated. */
+ map = NULL;
+ goto retry;
+ }
dl_close_state = not_pending;
}
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
index aeb79b40b45054c0..c17ac325eca658ef 100644
--- a/elf/dl-sort-maps.c
+++ b/elf/dl-sort-maps.c
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
The below memcpy is not needed in the do_reldeps case here,
since we wrote back to maps[] during DFS traversal. */
if (maps_head == maps)
- return;
+ break;
}
assert (maps_head == maps);
- return;
}
-
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
+ else
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
/* Skipping the first object at maps[0] is not valid in general,
since traversing along object dependency-links may "find" that
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
--- a/elf/dso-sort-tests-1.def
+++ b/elf/dso-sort-tests-1.def
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
# relocation(dynamic) dependencies. While this is technically unspecified, the
# presumed reasonable practical behavior is for the destructor order to respect
# the static DT_NEEDED links (here this means the a->b->c->d order).
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
-# beyond spec to do this is the right thing to do.
+# The older dynamic_sort=1 algorithm originally did not achieve this,
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
+# effectively disabling proper force_first handling.
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
+# handling: the a object is simply moved to the front.
# The below expected outputs are what the two algorithms currently produce
# respectively, for regression testing purposes.
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
# Test that even in the presence of dependency loops involving dlopen'ed
# object, that object is initialized last (and not unloaded prematurely).

View File

@ -0,0 +1,83 @@
commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73
Author: Florian Weimer <fweimer@redhat.com>
Date: Tue Aug 29 08:28:31 2023 +0200
nscd: Skip unusable entries in first pass in prune_cache (bug 30800)
Previously, if an entry was marked unusable for any reason, but had
not timed out yet, the assert would trigger.
One way to get into such state is if a data change is detected during
re-validation of an entry. This causes the entry to be marked as not
usable. If exits nscd soon after that, then the clock jumps
backwards, and nscd restarted, the cache re-validation run after
startup triggers the removed assert.
The change is more complicated than just the removal of the assert
because entries marked as not usable should be garbage-collected in
the second pass. To make this happen, it is necessary to update some
book-keeping data.
Reviewed-by: DJ Delorie <dj@redhat.com>
diff --git a/nscd/cache.c b/nscd/cache.c
index efe4214d953edb30..2fd3f78ebb567bbe 100644
--- a/nscd/cache.c
+++ b/nscd/cache.c
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
serv2str[runp->type], str, dh->timeout);
}
- /* Check whether the entry timed out. */
- if (dh->timeout < now)
+ /* Check whether the entry timed out. Timed out entries
+ will be revalidated. For unusable records, it is still
+ necessary to record that the bucket needs to be scanned
+ again below. */
+ if (dh->timeout < now || !dh->usable)
{
/* This hash bucket could contain entries which need to
be looked at. */
@@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
/* We only have to look at the data of the first entries
since the count information is kept in the data part
which is shared. */
- if (runp->first)
+ if (runp->first && dh->usable)
{
/* At this point there are two choices: we reload the
@@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
{
/* Remove the value. */
dh->usable = false;
-
- /* We definitely have some garbage entries now. */
- any = true;
}
else
{
@@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
time_t timeout = readdfcts[runp->type] (table, runp, dh);
next_timeout = MIN (next_timeout, timeout);
-
- /* If the entry has been replaced, we might need
- cleanup. */
- any |= !dh->usable;
}
}
+
+ /* If the entry has been replaced, we might need cleanup. */
+ any |= !dh->usable;
}
else
- {
- assert (dh->usable);
- next_timeout = MIN (next_timeout, dh->timeout);
- }
+ /* Entry has not timed out and is usable. */
+ next_timeout = MIN (next_timeout, dh->timeout);
run = runp->next;
}

View File

@ -0,0 +1,72 @@
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
Author: Florian Weimer <fweimer@redhat.com>
Date: Fri Oct 20 14:29:50 2023 +0200
elf: ldconfig should skip temporary files created by package managers
This avoids crashes due to partially written files, after a package
update is interrupted.
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
Conflicts:
elf/ldconfig.c
(missing alloca removal downstream)
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,31 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Skip some temporary DSO files. These files may be partially written
+ and lead to ldconfig crashes when examined. */
+static bool
+skip_dso_based_on_name (const char *name, size_t len)
+{
+ /* Skip temporary files created by the prelink program. Files with
+ names like these are never really DSOs we want to look at. */
+ if (len >= sizeof (".#prelink#") - 1)
+ {
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
+ ".#prelink#") == 0)
+ return true;
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
+ return true;
+ }
+ /* Skip temporary files created by RPM. */
+ if (memchr (name, len, ';') != NULL)
+ return true;
+ /* Skip temporary files created by dpkg. */
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ return true;
+ return false;
+}
static void
search_dir (const struct dir_entry *entry)
@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
continue;
size_t len = strlen (direntry->d_name);
- /* Skip temporary files created by the prelink program. Files with
- names like these are never really DSOs we want to look at. */
- if (len >= sizeof (".#prelink#") - 1)
- {
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
- continue;
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
- continue;
- }
+ if (skip_dso_based_on_name (direntry->d_name, len))
+ continue;
len += strlen (entry->path) + 2;
if (len > file_name_len)
{

View File

@ -0,0 +1,61 @@
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
Author: Florian Weimer <fweimer@redhat.com>
Date: Mon Oct 23 12:53:16 2023 +0200
ldconfig: Fixes for skipping temporary files.
Arguments to a memchr call were swapped, causing incorrect skipping
of files.
Files related to dpkg have different names: they actually end in
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
temporary files created by package managers").
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
index 51de08f91fbaf093..fb19dd68d41c07a4 100644
--- a/elf/ldconfig.c
+++ b/elf/ldconfig.c
@@ -771,6 +771,17 @@ struct dlib_entry
struct dlib_entry *next;
};
+/* Return true if the N bytes at NAME end with with the characters in
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
+ Expected to be called with a string literal for SUFFIX. */
+static inline bool
+endswithn (const char *name, size_t n, const char *suffix)
+{
+ return (n >= strlen (suffix)
+ && memcmp (name + n - strlen (suffix), suffix,
+ strlen (suffix)) == 0);
+}
+
/* Skip some temporary DSO files. These files may be partially written
and lead to ldconfig crashes when examined. */
static bool
@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
names like these are never really DSOs we want to look at. */
if (len >= sizeof (".#prelink#") - 1)
{
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
- ".#prelink#") == 0)
+ if (endswithn (name, len, ".#prelink#"))
return true;
if (len >= sizeof (".#prelink#.XXXXXX") - 1
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
return true;
}
/* Skip temporary files created by RPM. */
- if (memchr (name, len, ';') != NULL)
+ if (memchr (name, ';', len) != NULL)
return true;
/* Skip temporary files created by dpkg. */
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
+ if (endswithn (name, len, ".dpkg-new")
+ || endswithn (name, len, ".dpkg-tmp"))
return true;
return false;
}

View File

@ -0,0 +1,259 @@
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:23:59 -0800
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
tst-size_t-wmemchr.
* sysdeps/x86_64/x32/test-size_t.h: New file.
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
---
sysdeps/x86_64/memchr.S | 10 ++--
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
sysdeps/x86_64/x32/Makefile | 8 +++
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
6 files changed, 148 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
Conflicts:
ChangeLog
(removed)
NEWS
(removed)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index feef5d4f..cb320257 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
mov %edi, %ecx
#ifdef USE_AS_WMEMCHR
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %rdx
+ shl $2, %RDX_LP
#else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
punpcklbw %xmm1, %xmm1
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(return_null)
punpcklbw %xmm1, %xmm1
#endif
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index 5f5e7725..c81da19b 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -40,16 +40,20 @@
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz L(null)
# endif
movl %edi, %ecx
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
- shl $2, %rdx
+ shl $2, %RDX_LP
vpbroadcastd %xmm0, %ymm0
# else
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
vpbroadcastb %xmm0, %ymm0
# endif
/* Check if we may cross page boundary with one vector load. */
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index f2ebc24f..7d528889 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
CFLAGS-s_llround.c += -fno-builtin-lround
endif
+
+ifeq ($(subdir),string)
+tests += tst-size_t-memchr
+endif
+
+ifeq ($(subdir),wcsmbs)
+tests += tst-size_t-wmemchr
+endif
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
new file mode 100644
index 00000000..78a94086
--- /dev/null
+++ b/sysdeps/x86_64/x32/test-size_t.h
@@ -0,0 +1,35 @@
+/* Test string/memory functions with size_t in the lower 32 bits of
+ 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#include <string/test-string.h>
+
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
+ field in the lower 32 bits. When the LEN field of 64-bit register
+ is passed to string/memory function as the size_t parameter, only
+ the lower 32 bits can be used. */
+typedef struct
+{
+ union
+ {
+ size_t len;
+ void (*fn) (void);
+ };
+ void *p;
+} parameter_t;
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
new file mode 100644
index 00000000..29a3daf1
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
@@ -0,0 +1,72 @@
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef WIDE
+# define TEST_NAME "memchr"
+#else
+# define TEST_NAME "wmemchr"
+#endif /* WIDE */
+#include "test-size_t.h"
+
+#ifndef WIDE
+# define MEMCHR memchr
+# define CHAR char
+# define UCHAR unsigned char
+#else
+# include <wchar.h>
+# define MEMCHR wmemchr
+# define CHAR wchar_t
+# define UCHAR wchar_t
+#endif /* WIDE */
+
+IMPL (MEMCHR, 1)
+
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
+
+static CHAR *
+__attribute__ ((noinline, noclone))
+do_memchr (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ c.fn = impl->fn;
+ CHAR *res = do_memchr (src, c);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %p != NULL",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
new file mode 100644
index 00000000..877801d6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
@@ -0,0 +1,20 @@
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memchr.c"
--
GitLab

View File

@ -0,0 +1,41 @@
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 9 Jan 2022 16:02:21 -0600
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
Content-type: text/plain; charset=UTF-8
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
__wcscmp_avx2. For x86_64 this covers the entire address range so any
length larger could not possibly be used to bound `s1` or `s2`.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 156c1949..8fb8eedc 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
+ overflow cases as well as redirect cases where its impossible to
+ length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_avx2
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
--
GitLab

View File

@ -0,0 +1,257 @@
From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 25 Mar 2022 17:13:33 -0500
Subject: [PATCH] x86: Small improvements for wcslen
Content-type: text/plain; charset=UTF-8
Just a few QOL changes.
1. Prefer `add` > `lea` as it has high execution units it can run
on.
2. Don't break macro-fusion between `test` and `jcc`
3. Reduce code size by removing gratuitous padding bytes (-90
bytes).
geometric_mean(N=20) of all benchmarks New / Original: 0.959
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
1 file changed, 41 insertions(+), 45 deletions(-)
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
index 9f5f7232..254bb030 100644
--- a/sysdeps/x86_64/wcslen.S
+++ b/sysdeps/x86_64/wcslen.S
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
pxor %xmm0, %xmm0
lea 32(%rdi), %rax
- lea 16(%rdi), %rcx
+ addq $16, %rdi
and $-16, %rax
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
pxor %xmm1, %xmm1
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
pxor %xmm2, %xmm2
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
pxor %xmm3, %xmm3
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm0
pmovmskb %xmm0, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm1
pmovmskb %xmm1, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm2
pmovmskb %xmm2, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
pcmpeqd (%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $16, %rax
test %edx, %edx
- lea 16(%rax), %rax
jnz L(exit)
and $-0x40, %rax
@@ -133,104 +133,100 @@ L(aligned_64_loop):
pminub %xmm0, %xmm2
pcmpeqd %xmm3, %xmm2
pmovmskb %xmm2, %edx
+ addq $64, %rax
test %edx, %edx
- lea 64(%rax), %rax
jz L(aligned_64_loop)
pcmpeqd -64(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $48, %rdi
test %edx, %edx
- lea 48(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm1, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd -32(%rax), %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
jnz L(exit)
pcmpeqd %xmm6, %xmm3
pmovmskb %xmm3, %edx
+ addq $-16, %rdi
test %edx, %edx
- lea -16(%rcx), %rcx
- jnz L(exit)
-
- jmp L(aligned_64_loop)
+ jz L(aligned_64_loop)
.p2align 4
L(exit):
- sub %rcx, %rax
+ sub %rdi, %rax
shr $2, %rax
test %dl, %dl
jz L(exit_high)
- mov %dl, %cl
- and $15, %cl
+ andl $15, %edx
jz L(exit_1)
ret
- .p2align 4
+ /* No align here. Naturally aligned % 16 == 1. */
L(exit_high):
- mov %dh, %ch
- and $15, %ch
+ andl $(15 << 8), %edx
jz L(exit_3)
add $2, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_1):
add $1, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_3):
add $3, %rax
ret
- .p2align 4
+ .p2align 3
L(exit_tail0):
- xor %rax, %rax
+ xorl %eax, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail1):
- mov $1, %rax
+ movl $1, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail2):
- mov $2, %rax
+ movl $2, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail3):
- mov $3, %rax
+ movl $3, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail4):
- mov $4, %rax
+ movl $4, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail5):
- mov $5, %rax
+ movl $5, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail6):
- mov $6, %rax
+ movl $6, %eax
ret
- .p2align 4
+ .p2align 3
L(exit_tail7):
- mov $7, %rax
+ movl $7, %eax
ret
END (__wcslen)
--
GitLab

View File

@ -0,0 +1,964 @@
From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:00 -0500
Subject: [PATCH] x86: Remove memcmp-sse4.S
Content-type: text/plain; charset=UTF-8
Code didn't actually use any sse4 instructions since `ptest` was
removed in:
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Nov 10 16:18:56 2021 -0600
x86: Shrink memcmp-sse4.S code size
The new memcmp-sse2 implementation is also faster.
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
Note there are two regressions preferring SSE2 for Size = 1 and Size =
65.
Size = 1:
size, align0, align1, ret, New Time/Old Time
1, 1, 1, 0, 1.2
1, 1, 1, 1, 1.197
1, 1, 1, -1, 1.2
This is intentional. Size == 1 is significantly less hot based on
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
hotter).
Python3 Size = 1 -> 13.64%
Python3 Size = [4, 8] -> 60.92%
GCC11 Size = 1 -> 1.29%
GCC11 Size = [4, 8] -> 33.86%
size, align0, align1, ret, New Time/Old Time
4, 4, 4, 0, 0.622
4, 4, 4, 1, 0.797
4, 4, 4, -1, 0.805
5, 5, 5, 0, 0.623
5, 5, 5, 1, 0.777
5, 5, 5, -1, 0.802
6, 6, 6, 0, 0.625
6, 6, 6, 1, 0.813
6, 6, 6, -1, 0.788
7, 7, 7, 0, 0.625
7, 7, 7, 1, 0.799
7, 7, 7, -1, 0.795
8, 8, 8, 0, 0.625
8, 8, 8, 1, 0.848
8, 8, 8, -1, 0.914
9, 9, 9, 0, 0.625
Size = 65:
size, align0, align1, ret, New Time/Old Time
65, 0, 0, 0, 1.103
65, 0, 0, 1, 1.216
65, 0, 0, -1, 1.227
65, 65, 0, 0, 1.091
65, 0, 65, 1, 1.19
65, 65, 65, -1, 1.215
This is because A) the checks in range [65, 96] are now unrolled 2x
and B) because smaller values <= 16 are now given a hotter path. By
contrast the SSE4 version has a branch for Size = 80. The unrolled
version has get better performance for returns which need both
comparisons.
size, align0, align1, ret, New Time/Old Time
128, 4, 8, 0, 0.858
128, 4, 8, 1, 0.879
128, 4, 8, -1, 0.888
As well, out of microbenchmark environments that are not full
predictable the branch will have a real-cost.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 2 -
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 ---------------------
4 files changed, 814 deletions(-)
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index bca82e38..b503e4b8 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -11,7 +11,6 @@ sysdep_routines += \
memcmp-avx2-movbe-rtm \
memcmp-evex-movbe \
memcmp-sse2 \
- memcmp-sse4 \
memcmp-ssse3 \
memcpy-ssse3 \
memcpy-ssse3-back \
@@ -174,7 +173,6 @@ sysdep_routines += \
wmemcmp-avx2-movbe-rtm \
wmemcmp-c \
wmemcmp-evex-movbe \
- wmemcmp-sse4 \
wmemcmp-ssse3 \
# sysdep_routines
endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 14314367..450a2917 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
- __memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
__memcmp_ssse3)
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
- __wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
__wmemcmp_ssse3)
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 690dffe8..0bc47a7f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -21,7 +21,6 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
return OPTIMIZE (avx2_movbe);
}
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
return OPTIMIZE (ssse3);
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
deleted file mode 100644
index 50060006..00000000
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ /dev/null
@@ -1,804 +0,0 @@
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
- Copyright (C) 2010-2018 Free Software Foundation, Inc.
- Contributed by Intel Corporation.
- This file is part of the GNU C Library.
-
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
-
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
-
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <http://www.gnu.org/licenses/>. */
-
-#if IS_IN (libc)
-
-# include <sysdep.h>
-
-# ifndef MEMCMP
-# define MEMCMP __memcmp_sse4_1
-# endif
-
-#ifdef USE_AS_WMEMCMP
-# define CMPEQ pcmpeqd
-# define CHAR_SIZE 4
-#else
-# define CMPEQ pcmpeqb
-# define CHAR_SIZE 1
-#endif
-
-
-/* Warning!
- wmemcmp has to use SIGNED comparison for elements.
- memcmp has to use UNSIGNED comparison for elemnts.
-*/
-
- .section .text.sse4.1,"ax",@progbits
-ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- mov %edx, %edx
-# endif
- cmp $79, %RDX_LP
- ja L(79bytesormore)
-
- cmp $CHAR_SIZE, %RDX_LP
- jbe L(firstbyte)
-
- /* N in (CHAR_SIZE, 79) bytes. */
- cmpl $32, %edx
- ja L(more_32_bytes)
-
- cmpl $16, %edx
- jae L(16_to_32_bytes)
-
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(8_to_16_bytes)
-
- cmpl $4, %edx
- jb L(2_to_3_bytes)
-
- movl (%rdi), %eax
- movl (%rsi), %ecx
-
- bswap %eax
- bswap %ecx
-
- shlq $32, %rax
- shlq $32, %rcx
-
- movl -4(%rdi, %rdx), %edi
- movl -4(%rsi, %rdx), %esi
-
- bswap %edi
- bswap %esi
-
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(2_to_3_bytes):
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- subl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(8_to_16_bytes):
- movq (%rdi), %rax
- movq (%rsi), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
- jne L(8_to_16_bytes_done)
-
- movq -8(%rdi, %rdx), %rax
- movq -8(%rsi, %rdx), %rcx
-
- bswap %rax
- bswap %rcx
-
- subq %rcx, %rax
-
-L(8_to_16_bytes_done):
- cmovne %edx, %eax
- sbbl %ecx, %ecx
- orl %ecx, %eax
- ret
-# else
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl 4(%rdi), %ecx
- cmpl 4(%rsi), %ecx
- jne L(8_to_16_bytes_done)
- movl -4(%rdi, %rdx), %ecx
- cmpl -4(%rsi, %rdx), %ecx
- jne L(8_to_16_bytes_done)
- ret
-# endif
-
- .p2align 4,, 3
-L(ret_zero):
- xorl %eax, %eax
-L(zero):
- ret
-
- .p2align 4,, 8
-L(firstbyte):
- jb L(ret_zero)
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi), %ecx
- cmpl (%rsi), %ecx
- je L(zero)
-L(8_to_16_bytes_done):
- setg %al
- leal -1(%rax, %rax), %eax
-# else
- movzbl (%rdi), %eax
- movzbl (%rsi), %ecx
- sub %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_48):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin_32):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl 32(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl 32(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl 32(%rsi, %rax), %ecx
- movzbl 32(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_begin_16):
- addq $16, %rdi
- addq $16, %rsi
-L(vec_return_begin):
- bsfl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(vec_return_end_16):
- subl $16, %edx
-L(vec_return_end):
- bsfl %eax, %eax
- addl %edx, %eax
-# ifdef USE_AS_WMEMCMP
- movl -16(%rdi, %rax), %ecx
- xorl %edx, %edx
- cmpl -16(%rsi, %rax), %ecx
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl -16(%rsi, %rax), %ecx
- movzbl -16(%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4,, 8
-L(more_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm0
- movdqu 16(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- cmpl $64, %edx
- jbe L(32_to_64_bytes)
- movdqu 32(%rdi), %xmm0
- movdqu 32(%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- .p2align 4,, 6
-L(32_to_64_bytes):
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(16_to_32_bytes):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
-
- .p2align 4
-L(79bytesormore):
- movdqu (%rdi), %xmm0
- movdqu (%rsi), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
-
- mov %rsi, %rcx
- and $-16, %rsi
- add $16, %rsi
- sub %rsi, %rcx
-
- sub %rcx, %rdi
- add %rcx, %rdx
- test $0xf, %rdi
- jz L(2aligned)
-
- cmp $128, %rdx
- ja L(128bytesormore)
-
- .p2align 4,, 6
-L(less128bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(last_64_bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormore):
- cmp $256, %rdx
- ja L(unaligned_loop)
-L(less256bytes):
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqu (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqu 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqu 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytes)
-
- cmp $32, %rdx
- ja L(last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(unaligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_unaligned)
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loop):
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loop)
-
- .p2align 4,, 6
-L(loop_tail):
- addq %rdx, %rdi
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- addq %rdx, %rsi
- movdqu (%rsi), %xmm4
- movdqu 16(%rsi), %xmm5
- movdqu 32(%rsi), %xmm6
- movdqu 48(%rsi), %xmm7
-
- CMPEQ %xmm4, %xmm0
- CMPEQ %xmm5, %xmm1
- CMPEQ %xmm6, %xmm2
- CMPEQ %xmm7, %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- ret
-
-L(L2_L3_cache_unaligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_unaligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
-
- movdqu (%rdi), %xmm0
- movdqu 16(%rdi), %xmm1
- movdqu 32(%rdi), %xmm2
- movdqu 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(L2_L3_unaligned_128bytes_loop)
- jmp L(loop_tail)
-
-
- /* This case is for machines which are sensitive for unaligned
- * instructions. */
- .p2align 4
-L(2aligned):
- cmp $128, %rdx
- ja L(128bytesormorein2aligned)
-L(less128bytesin2aligned):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- cmp $96, %rdx
- jb L(32_to_64_bytes)
-
- addq $64, %rdi
- addq $64, %rsi
- subq $64, %rdx
-
- .p2align 4,, 6
-L(aligned_last_64_bytes):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(128bytesormorein2aligned):
- cmp $256, %rdx
- ja L(aligned_loop)
-L(less256bytesin2alinged):
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $64, %rdi
- addq $64, %rsi
-
- movdqa (%rdi), %xmm1
- CMPEQ (%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin)
-
- movdqa 16(%rdi), %xmm1
- CMPEQ 16(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_16)
-
- movdqa 32(%rdi), %xmm1
- CMPEQ 32(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_32)
-
- movdqa 48(%rdi), %xmm1
- CMPEQ 48(%rsi), %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_begin_48)
-
- addq $-128, %rdx
- subq $-64, %rsi
- subq $-64, %rdi
-
- cmp $64, %rdx
- ja L(less128bytesin2aligned)
-
- cmp $32, %rdx
- ja L(aligned_last_64_bytes)
-
- movdqu -32(%rdi, %rdx), %xmm0
- movdqu -32(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end_16)
-
- movdqu -16(%rdi, %rdx), %xmm0
- movdqu -16(%rsi, %rdx), %xmm1
- CMPEQ %xmm0, %xmm1
- pmovmskb %xmm1, %eax
- incw %ax
- jnz L(vec_return_end)
- ret
-
- .p2align 4
-L(aligned_loop):
-# ifdef DATA_CACHE_SIZE_HALF
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
-# else
- mov __x86_data_cache_size_half(%rip), %R8_LP
-# endif
- movq %r8, %r9
- addq %r8, %r8
- addq %r9, %r8
- cmpq %r8, %rdx
- ja L(L2_L3_cache_aligned)
-
- sub $64, %rdx
- .p2align 4
-L(64bytesormore_loopin2aligned):
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
- add $64, %rsi
- add $64, %rdi
- sub $64, %rdx
- ja L(64bytesormore_loopin2aligned)
- jmp L(loop_tail)
-
-L(L2_L3_cache_aligned):
- subq $64, %rdx
- .p2align 4
-L(L2_L3_aligned_128bytes_loop):
- prefetchnta 0x1c0(%rdi)
- prefetchnta 0x1c0(%rsi)
- movdqa (%rdi), %xmm0
- movdqa 16(%rdi), %xmm1
- movdqa 32(%rdi), %xmm2
- movdqa 48(%rdi), %xmm3
-
- CMPEQ (%rsi), %xmm0
- CMPEQ 16(%rsi), %xmm1
- CMPEQ 32(%rsi), %xmm2
- CMPEQ 48(%rsi), %xmm3
-
- pand %xmm0, %xmm1
- pand %xmm2, %xmm3
- pand %xmm1, %xmm3
-
- pmovmskb %xmm3, %eax
- incw %ax
- jnz L(64bytesormore_loop_end)
-
- addq $64, %rsi
- addq $64, %rdi
- subq $64, %rdx
- ja L(L2_L3_aligned_128bytes_loop)
- jmp L(loop_tail)
-
- .p2align 4
-L(64bytesormore_loop_end):
- pmovmskb %xmm0, %ecx
- incw %cx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm1, %ecx
- notw %cx
- sall $16, %ecx
- jnz L(loop_end_ret)
-
- pmovmskb %xmm2, %ecx
- notw %cx
- shlq $32, %rcx
- jnz L(loop_end_ret)
-
- addq $48, %rdi
- addq $48, %rsi
- movq %rax, %rcx
-
- .p2align 4,, 6
-L(loop_end_ret):
- bsfq %rcx, %rcx
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rcx), %eax
- xorl %edx, %edx
- cmpl (%rsi, %rcx), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-END (MEMCMP)
-#endif
--
GitLab

View File

@ -0,0 +1,263 @@
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 15 Apr 2022 12:28:01 -0500
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8
Old code was both inefficient and wasted code size. New code (-62
bytes) and comparable or better performance in the page cross case.
geometric_mean(N=20) of page cross cases New / Original: 0.960
size, align0, align1, ret, New Time/Old Time
1, 4095, 0, 0, 1.001
1, 4095, 0, 1, 0.999
1, 4095, 0, -1, 1.0
2, 4094, 0, 0, 1.0
2, 4094, 0, 1, 1.0
2, 4094, 0, -1, 1.0
3, 4093, 0, 0, 1.0
3, 4093, 0, 1, 1.0
3, 4093, 0, -1, 1.0
4, 4092, 0, 0, 0.987
4, 4092, 0, 1, 1.0
4, 4092, 0, -1, 1.0
5, 4091, 0, 0, 0.984
5, 4091, 0, 1, 1.002
5, 4091, 0, -1, 1.005
6, 4090, 0, 0, 0.993
6, 4090, 0, 1, 1.001
6, 4090, 0, -1, 1.003
7, 4089, 0, 0, 0.991
7, 4089, 0, 1, 1.0
7, 4089, 0, -1, 1.001
8, 4088, 0, 0, 0.875
8, 4088, 0, 1, 0.881
8, 4088, 0, -1, 0.888
9, 4087, 0, 0, 0.872
9, 4087, 0, 1, 0.879
9, 4087, 0, -1, 0.883
10, 4086, 0, 0, 0.878
10, 4086, 0, 1, 0.886
10, 4086, 0, -1, 0.873
11, 4085, 0, 0, 0.878
11, 4085, 0, 1, 0.881
11, 4085, 0, -1, 0.879
12, 4084, 0, 0, 0.873
12, 4084, 0, 1, 0.889
12, 4084, 0, -1, 0.875
13, 4083, 0, 0, 0.873
13, 4083, 0, 1, 0.863
13, 4083, 0, -1, 0.863
14, 4082, 0, 0, 0.838
14, 4082, 0, 1, 0.869
14, 4082, 0, -1, 0.877
15, 4081, 0, 0, 0.841
15, 4081, 0, 1, 0.869
15, 4081, 0, -1, 0.876
16, 4080, 0, 0, 0.988
16, 4080, 0, 1, 0.99
16, 4080, 0, -1, 0.989
17, 4079, 0, 0, 0.978
17, 4079, 0, 1, 0.981
17, 4079, 0, -1, 0.98
18, 4078, 0, 0, 0.981
18, 4078, 0, 1, 0.98
18, 4078, 0, -1, 0.985
19, 4077, 0, 0, 0.977
19, 4077, 0, 1, 0.979
19, 4077, 0, -1, 0.986
20, 4076, 0, 0, 0.977
20, 4076, 0, 1, 0.986
20, 4076, 0, -1, 0.984
21, 4075, 0, 0, 0.977
21, 4075, 0, 1, 0.983
21, 4075, 0, -1, 0.988
22, 4074, 0, 0, 0.983
22, 4074, 0, 1, 0.994
22, 4074, 0, -1, 0.993
23, 4073, 0, 0, 0.98
23, 4073, 0, 1, 0.992
23, 4073, 0, -1, 0.995
24, 4072, 0, 0, 0.989
24, 4072, 0, 1, 0.989
24, 4072, 0, -1, 0.991
25, 4071, 0, 0, 0.99
25, 4071, 0, 1, 0.999
25, 4071, 0, -1, 0.996
26, 4070, 0, 0, 0.993
26, 4070, 0, 1, 0.995
26, 4070, 0, -1, 0.998
27, 4069, 0, 0, 0.993
27, 4069, 0, 1, 0.999
27, 4069, 0, -1, 1.0
28, 4068, 0, 0, 0.997
28, 4068, 0, 1, 1.0
28, 4068, 0, -1, 0.999
29, 4067, 0, 0, 0.996
29, 4067, 0, 1, 0.999
29, 4067, 0, -1, 0.999
30, 4066, 0, 0, 0.991
30, 4066, 0, 1, 1.001
30, 4066, 0, -1, 0.999
31, 4065, 0, 0, 0.988
31, 4065, 0, 1, 0.998
31, 4065, 0, -1, 0.998
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
1 file changed, 61 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 16fc673e..99258cf5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
# ifndef USE_AS_WMEMCMP
cmpl $8, %edx
jae L(between_8_15)
+ /* Fall through for [4, 7]. */
cmpl $4, %edx
- jae L(between_4_7)
+ jb L(between_2_3)
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
/* No ymm register was touched. */
ret
@@ -457,9 +456,33 @@ L(one_or_less):
/* No ymm register was touched. */
ret
+ .p2align 4,, 5
+L(ret_nonzero):
+ sbbl %eax, %eax
+ orl $1, %eax
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ /* No ymm register was touched. */
+ ret
+
.p2align 4
L(between_8_15):
-# endif
+ movbe (%rdi), %rax
+ movbe (%rsi), %rcx
+ subq %rcx, %rax
+ jnz L(ret_nonzero)
+ movbe -8(%rdi, %rdx), %rax
+ movbe -8(%rsi, %rdx), %rcx
+ subq %rcx, %rax
+ /* Fast path for return zero. */
+ jnz L(ret_nonzero)
+ /* No ymm register was touched. */
+ ret
+# else
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
@@ -475,16 +498,13 @@ L(between_8_15):
VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
+# endif
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ .p2align 4,, 10
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
@@ -501,11 +521,17 @@ L(between_16_31):
VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
subl $0xffff, %eax
+ /* Fast path for return zero. */
jnz L(return_vec_0)
/* No ymm register was touched. */
ret
# ifdef USE_AS_WMEMCMP
+ .p2align 4,, 2
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(one_or_less):
jb L(zero)
@@ -520,22 +546,20 @@ L(one_or_less):
# else
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ bswap %eax
+ bswap %ecx
+ shrl %eax
+ shrl %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper bit is zero. */
+ subl %ecx, %eax
/* No ymm register was touched. */
ret
# endif
--
GitLab

View File

@ -0,0 +1,876 @@
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:28 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.741
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
sysdeps/x86_64/wcsrchr.S | 266 +-----------
4 files changed, 338 insertions(+), 443 deletions(-)
Conflicts:
sysdeps/x86_64/wcsrchr.S
(copyright header)
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
index 0ec76fe9..6bb1284b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
@@ -17,7 +17,7 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define strrchr __strrchr_sse2
+# define STRRCHR __strrchr_sse2
# undef weak_alias
# define weak_alias(strrchr, rindex)
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
index d015e953..f26d53b5 100644
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
@@ -17,7 +17,6 @@
<http://www.gnu.org/licenses/>. */
#if IS_IN (libc)
-# define wcsrchr __wcsrchr_sse2
+# define STRRCHR __wcsrchr_sse2
#endif
-
#include "../wcsrchr.S"
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
index aca98e7e..a58cc220 100644
--- a/sysdeps/x86_64/strrchr.S
+++ b/sysdeps/x86_64/strrchr.S
@@ -19,210 +19,360 @@
#include <sysdep.h>
+#ifndef STRRCHR
+# define STRRCHR strrchr
+#endif
+
+#ifdef USE_AS_WCSRCHR
+# define PCMPEQ pcmpeqd
+# define CHAR_SIZE 4
+# define PMINU pminud
+#else
+# define PCMPEQ pcmpeqb
+# define CHAR_SIZE 1
+# define PMINU pminub
+#endif
+
+#define PAGE_SIZE 4096
+#define VEC_SIZE 16
+
.text
-ENTRY (strrchr)
- movd %esi, %xmm1
+ENTRY(STRRCHR)
+ movd %esi, %xmm0
movq %rdi, %rax
- andl $4095, %eax
- punpcklbw %xmm1, %xmm1
- cmpq $4032, %rax
- punpcklwd %xmm1, %xmm1
- pshufd $0, %xmm1, %xmm1
+ andl $(PAGE_SIZE - 1), %eax
+#ifndef USE_AS_WCSRCHR
+ punpcklbw %xmm0, %xmm0
+ punpcklwd %xmm0, %xmm0
+#endif
+ pshufd $0, %xmm0, %xmm0
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
ja L(cross_page)
- movdqu (%rdi), %xmm0
+
+L(cross_page_continue):
+ movups (%rdi), %xmm1
pxor %xmm2, %xmm2
- movdqa %xmm0, %xmm3
- pcmpeqb %xmm1, %xmm0
- pcmpeqb %xmm2, %xmm3
- pmovmskb %xmm0, %ecx
- pmovmskb %xmm3, %edx
- testq %rdx, %rdx
- je L(next_48_bytes)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rcx, %rax
- je L(exit)
- bsrq %rax, %rax
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret0):
ret
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
.p2align 4
-L(next_48_bytes):
- movdqu 16(%rdi), %xmm4
- movdqa %xmm4, %xmm5
- movdqu 32(%rdi), %xmm3
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm2, %xmm5
- movdqu 48(%rdi), %xmm0
- pmovmskb %xmm5, %edx
- movdqa %xmm3, %xmm5
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm2, %xmm5
- pcmpeqb %xmm0, %xmm2
- salq $16, %rdx
- pmovmskb %xmm3, %r8d
- pmovmskb %xmm5, %eax
- pmovmskb %xmm2, %esi
- salq $32, %r8
- salq $32, %rax
- pcmpeqb %xmm1, %xmm0
- orq %rdx, %rax
- movq %rsi, %rdx
- pmovmskb %xmm4, %esi
- salq $48, %rdx
- salq $16, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
- pmovmskb %xmm0, %ecx
- salq $48, %rcx
- orq %rcx, %rsi
- orq %rdx, %rax
- je L(loop_header2)
- leaq -1(%rax), %rcx
- xorq %rax, %rcx
- andq %rcx, %rsi
- je L(exit)
- bsrq %rsi, %rsi
- leaq (%rdi,%rsi), %rax
+L(first_vec_x0_test):
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ testl %eax, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %r8, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
-L(loop_header2):
- testq %rsi, %rsi
- movq %rdi, %rcx
- je L(no_c_found)
-L(loop_header):
- addq $64, %rdi
- pxor %xmm7, %xmm7
- andq $-64, %rdi
- jmp L(loop_entry)
+L(first_vec_x1):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
.p2align 4
-L(loop64):
- testq %rdx, %rdx
- cmovne %rdx, %rsi
- cmovne %rdi, %rcx
- addq $64, %rdi
-L(loop_entry):
- movdqa 32(%rdi), %xmm3
- pxor %xmm6, %xmm6
- movdqa 48(%rdi), %xmm2
- movdqa %xmm3, %xmm0
- movdqa 16(%rdi), %xmm4
- pminub %xmm2, %xmm0
- movdqa (%rdi), %xmm5
- pminub %xmm4, %xmm0
- pminub %xmm5, %xmm0
- pcmpeqb %xmm7, %xmm0
- pmovmskb %xmm0, %eax
- movdqa %xmm5, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %r9d
- movdqa %xmm4, %xmm0
- pcmpeqb %xmm1, %xmm0
- pmovmskb %xmm0, %edx
- movdqa %xmm3, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $16, %rdx
- pmovmskb %xmm0, %r10d
- movdqa %xmm2, %xmm0
- pcmpeqb %xmm1, %xmm0
- salq $32, %r10
- orq %r10, %rdx
- pmovmskb %xmm0, %r8d
- orq %r9, %rdx
- salq $48, %r8
- orq %r8, %rdx
+L(first_vec_x1_test):
+ PCMPEQ %xmm0, %xmm2
+ pmovmskb %xmm2, %eax
testl %eax, %eax
- je L(loop64)
- pcmpeqb %xmm6, %xmm4
- pcmpeqb %xmm6, %xmm3
- pcmpeqb %xmm6, %xmm5
- pmovmskb %xmm4, %eax
- pmovmskb %xmm3, %r10d
- pcmpeqb %xmm6, %xmm2
- pmovmskb %xmm5, %r9d
- salq $32, %r10
- salq $16, %rax
- pmovmskb %xmm2, %r8d
- orq %r10, %rax
- orq %r9, %rax
- salq $48, %r8
- orq %r8, %rax
- leaq -1(%rax), %r8
- xorq %rax, %r8
- andq %r8, %rdx
- cmovne %rdi, %rcx
- cmovne %rdx, %rsi
- bsrq %rsi, %rsi
- leaq (%rcx,%rsi), %rax
+ jz L(first_vec_x0_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm3, %eax
+ leal -1(%rcx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_vec_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+ andq $-VEC_SIZE, %rdi
+
+ movaps VEC_SIZE(%rdi), %xmm2
+ pxor %xmm3, %xmm3
+ PCMPEQ %xmm2, %xmm3
+ pmovmskb %xmm3, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
+
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
+ pxor %xmm4, %xmm4
+ PCMPEQ %xmm3, %xmm4
+ pmovmskb %xmm4, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+
+ addq $VEC_SIZE, %rdi
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ andq $-(VEC_SIZE * 2), %rdi
+ .p2align 4
+L(first_loop):
+ /* Do 2x VEC at a time. */
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
+ macro-fuse with `jz`. */
+ addl %ecx, %eax
+ jz L(first_loop)
+
+ /* Check if there is zero match. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+ /* Check if there was a match in last iteration. */
+ subl %ecx, %eax
+ jnz L(new_match)
+
+L(first_loop_old_match):
+ PCMPEQ %xmm0, %xmm2
+ PCMPEQ %xmm0, %xmm3
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ addl %eax, %ecx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ sall $16, %eax
+ orl %ecx, %eax
+
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4
+L(new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(first_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
+ /* Save minimum state for getting most recent match. We can
+ throw out all previous work. */
.p2align 4
-L(no_c_found):
- movl $1, %esi
- xorl %ecx, %ecx
- jmp L(loop_header)
+L(second_loop_match):
+ movq %rdi, %rsi
+ movaps %xmm4, %xmm2
+ movaps %xmm7, %xmm3
.p2align 4
-L(exit):
- xorl %eax, %eax
+L(second_loop):
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
+ /* Since SSE2 no pminud so wcsrchr needs seperate logic for
+ detecting zero. Note if this is found to be a bottleneck it
+ may be worth adding an SSE4.1 wcsrchr implementation. */
+#ifdef USE_AS_WCSRCHR
+ movaps %xmm5, %xmm6
+ pxor %xmm8, %xmm8
+
+ PCMPEQ %xmm8, %xmm5
+ PCMPEQ %xmm4, %xmm8
+ por %xmm5, %xmm8
+#else
+ movaps %xmm5, %xmm6
+ PMINU %xmm4, %xmm5
+#endif
+
+ movaps %xmm4, %xmm9
+ PCMPEQ %xmm0, %xmm4
+ PCMPEQ %xmm0, %xmm6
+ movaps %xmm6, %xmm7
+ por %xmm4, %xmm6
+#ifndef USE_AS_WCSRCHR
+ pxor %xmm8, %xmm8
+ PCMPEQ %xmm5, %xmm8
+#endif
+
+ pmovmskb %xmm8, %ecx
+ pmovmskb %xmm6, %eax
+
+ addq $(VEC_SIZE * 2), %rdi
+ /* Either null term or new occurence of CHAR. */
+ addl %ecx, %eax
+ jz L(second_loop)
+
+ /* No null term so much be new occurence of CHAR. */
+ testl %ecx, %ecx
+ jz L(second_loop_match)
+
+
+ subl %ecx, %eax
+ jnz L(second_loop_new_match)
+
+L(second_loop_old_match):
+ pmovmskb %xmm2, %ecx
+ pmovmskb %xmm3, %eax
+ sall $16, %eax
+ orl %ecx, %eax
+ bsrl %eax, %eax
+ addq %rsi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
ret
.p2align 4
+L(second_loop_new_match):
+ pxor %xmm6, %xmm6
+ PCMPEQ %xmm9, %xmm6
+ pmovmskb %xmm6, %eax
+ sall $16, %ecx
+ orl %eax, %ecx
+
+ /* We can't reuse either of the old comparisons as since we mask
+ of zeros after first zero (instead of using the full
+ comparison) we can't gurantee no interference between match
+ after end of string and valid match. */
+ pmovmskb %xmm4, %eax
+ pmovmskb %xmm7, %edx
+ sall $16, %edx
+ orl %edx, %eax
+
+ leal -1(%ecx), %edx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(second_loop_old_match)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+ ret
+
+ .p2align 4,, 4
L(cross_page):
- movq %rdi, %rax
- pxor %xmm0, %xmm0
- andq $-64, %rax
- movdqu (%rax), %xmm5
- movdqa %xmm5, %xmm6
- movdqu 16(%rax), %xmm4
- pcmpeqb %xmm1, %xmm5
- pcmpeqb %xmm0, %xmm6
- movdqu 32(%rax), %xmm3
- pmovmskb %xmm6, %esi
- movdqa %xmm4, %xmm6
- movdqu 48(%rax), %xmm2
- pcmpeqb %xmm1, %xmm4
- pcmpeqb %xmm0, %xmm6
- pmovmskb %xmm6, %edx
- movdqa %xmm3, %xmm6
- pcmpeqb %xmm1, %xmm3
- pcmpeqb %xmm0, %xmm6
- pcmpeqb %xmm2, %xmm0
- salq $16, %rdx
- pmovmskb %xmm3, %r9d
- pmovmskb %xmm6, %r8d
- pmovmskb %xmm0, %ecx
- salq $32, %r9
- salq $32, %r8
- pcmpeqb %xmm1, %xmm2
- orq %r8, %rdx
- salq $48, %rcx
- pmovmskb %xmm5, %r8d
- orq %rsi, %rdx
- pmovmskb %xmm4, %esi
- orq %rcx, %rdx
- pmovmskb %xmm2, %ecx
- salq $16, %rsi
- salq $48, %rcx
- orq %r9, %rsi
- orq %r8, %rsi
- orq %rcx, %rsi
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ movaps (%rsi), %xmm1
+ pxor %xmm2, %xmm2
+ PCMPEQ %xmm1, %xmm2
+ pmovmskb %xmm2, %edx
movl %edi, %ecx
- subl %eax, %ecx
- shrq %cl, %rdx
- shrq %cl, %rsi
- testq %rdx, %rdx
- je L(loop_header2)
- leaq -1(%rdx), %rax
- xorq %rdx, %rax
- andq %rax, %rsi
- je L(exit)
- bsrq %rsi, %rax
+ andl $(VEC_SIZE - 1), %ecx
+ sarl %cl, %edx
+ jz L(cross_page_continue)
+ PCMPEQ %xmm0, %xmm1
+ pmovmskb %xmm1, %eax
+ sarl %cl, %eax
+ leal -1(%rdx), %ecx
+ xorl %edx, %ecx
+ andl %ecx, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
addq %rdi, %rax
+#ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+#endif
+L(ret1):
ret
-END (strrchr)
+END(STRRCHR)
-weak_alias (strrchr, rindex)
-libc_hidden_builtin_def (strrchr)
+#ifndef USE_AS_WCSRCHR
+ weak_alias (STRRCHR, rindex)
+ libc_hidden_builtin_def (STRRCHR)
+#endif
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
index 2f388537..ae3cfa7d 100644
--- a/sysdeps/x86_64/wcsrchr.S
+++ b/sysdeps/x86_64/wcsrchr.S
@@ -17,266 +17,12 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
- .text
-ENTRY (wcsrchr)
+#define USE_AS_WCSRCHR 1
+#define NO_PMINU 1
- movd %rsi, %xmm1
- mov %rdi, %rcx
- punpckldq %xmm1, %xmm1
- pxor %xmm2, %xmm2
- punpckldq %xmm1, %xmm1
- and $63, %rcx
- cmp $48, %rcx
- ja L(crosscache)
+#ifndef STRRCHR
+# define STRRCHR wcsrchr
+#endif
- movdqu (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match1)
-
- test %rcx, %rcx
- jnz L(return_null)
-
- and $-16, %rdi
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match1):
- test %rcx, %rcx
- jnz L(prolog_find_zero_1)
-
- mov %rax, %r8
- mov %rdi, %rsi
- and $-16, %rdi
- jmp L(loop)
-
- .p2align 4
-L(crosscache):
- and $15, %rcx
- and $-16, %rdi
- pxor %xmm3, %xmm3
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm3
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm3, %rdx
- pmovmskb %xmm0, %rax
- shr %cl, %rdx
- shr %cl, %rax
- add $16, %rdi
-
- test %rax, %rax
- jnz L(unaligned_match)
-
- test %rdx, %rdx
- jnz L(return_null)
-
- xor %r8, %r8
- jmp L(loop)
-
- .p2align 4
-L(unaligned_match):
- test %rdx, %rdx
- jnz L(prolog_find_zero)
-
- mov %rax, %r8
- lea (%rdi, %rcx), %rsi
-
-/* Loop start on aligned string. */
- .p2align 4
-L(loop):
- movdqa (%rdi), %xmm0
- pcmpeqd %xmm0, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm0
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm0, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm3
- pcmpeqd %xmm3, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm3
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm3, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm4
- pcmpeqd %xmm4, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm4
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm4, %rax
- or %rax, %rcx
- jnz L(matches)
-
- movdqa (%rdi), %xmm5
- pcmpeqd %xmm5, %xmm2
- add $16, %rdi
- pcmpeqd %xmm1, %xmm5
- pmovmskb %xmm2, %rcx
- pmovmskb %xmm5, %rax
- or %rax, %rcx
- jz L(loop)
-
- .p2align 4
-L(matches):
- test %rax, %rax
- jnz L(match)
-L(return_value):
- test %r8, %r8
- jz L(return_null)
- mov %r8, %rax
- mov %rsi, %rdi
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match):
- pmovmskb %xmm2, %rcx
- test %rcx, %rcx
- jnz L(find_zero)
- mov %rax, %r8
- mov %rdi, %rsi
- jmp L(loop)
-
- .p2align 4
-L(find_zero):
- test $15, %cl
- jnz L(find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(find_zero_in_second_wchar)
- test $15, %ch
- jnz L(find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_value)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_value)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_value)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero):
- add %rcx, %rdi
- mov %rdx, %rcx
-L(prolog_find_zero_1):
- test $15, %cl
- jnz L(prolog_find_zero_in_first_wchar)
- test %cl, %cl
- jnz L(prolog_find_zero_in_second_wchar)
- test $15, %ch
- jnz L(prolog_find_zero_in_third_wchar)
-
- and $1 << 13 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %ah
- jnz L(match_fourth_wchar)
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_first_wchar):
- test $1, %rax
- jz L(return_null)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_second_wchar):
- and $1 << 5 - 1, %rax
- jz L(return_null)
-
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(prolog_find_zero_in_third_wchar):
- and $1 << 9 - 1, %rax
- jz L(return_null)
-
- test %ah, %ah
- jnz L(match_third_wchar)
- test $15 << 4, %al
- jnz L(match_second_wchar)
- lea -16(%rdi), %rax
- ret
-
- .p2align 4
-L(match_second_wchar):
- lea -12(%rdi), %rax
- ret
-
- .p2align 4
-L(match_third_wchar):
- lea -8(%rdi), %rax
- ret
-
- .p2align 4
-L(match_fourth_wchar):
- lea -4(%rdi), %rax
- ret
-
- .p2align 4
-L(return_null):
- xor %rax, %rax
- ret
-
-END (wcsrchr)
+#include "../strrchr.S"
--
GitLab

View File

@ -0,0 +1,501 @@
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:29 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.832
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
1 file changed, 269 insertions(+), 157 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index c949410b..3d26fad4 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@
# ifdef USE_AS_WCSRCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMIN vpminud
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMIN vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,196 +45,304 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
- .section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
- movd %esi, %xmm4
- movl %edi, %ecx
+ .section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+ movd %esi, %xmm7
+ movl %edi, %eax
/* Broadcast CHAR to YMM4. */
- VPBROADCAST %xmm4, %ymm4
+ VPBROADCAST %xmm7, %ymm7
vpxor %xmm0, %xmm0, %xmm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ /* Shift here instead of `andl` to save code size (saves a fetch
+ block). */
+ sall $20, %eax
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
+ ja L(cross_page)
+L(page_cross_continue):
vmovdqu (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- addq $VEC_SIZE, %rdi
+ /* Check end of string match. */
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jz L(aligned_more)
+
+ /* Only check match with search CHAR if needed. */
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Check if match before first zero. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+ search CHAR is zero we are correct. Either way `andq
+ -CHAR_SIZE, %rax` gets the correct result. */
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ /* Returns for first vec x1/x2 have hard coded backward search
+ path for earlier matches. */
+ .p2align 4,, 10
+L(first_vec_x1):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ .p2align 4,, 4
+L(first_vec_x0_test):
+ VPCMPEQ %ymm1, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ testl %eax, %eax
+ jz L(ret1)
+ bsrl %eax, %eax
+ addq %r8, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret1):
+ VZEROUPPER_RETURN
+ .p2align 4,, 10
+L(first_vec_x0_x1_test):
+ VPCMPEQ %ymm2, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
+ before returning. */
testl %eax, %eax
- jnz L(first_vec)
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq 1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
- testl %ecx, %ecx
- jnz L(return_null)
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm6
+ vpmovmskb %ymm6, %eax
+ blsmskl %ecx, %ecx
+ /* If no in-range search CHAR match in ymm3 then need to check
+ ymm1/ymm2 for an earlier match (we delay checking search
+ CHAR matches until needed). */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(first_vec):
- /* Check if there is a nul CHAR. */
+L(aligned_more):
+ /* Save original pointer if match was in VEC 0. */
+ movq %rdi, %r8
+
+ /* Align src. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqu 1(%rdi), %ymm2
+ VPCMPEQ %ymm2, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
+ jnz L(first_vec_x1)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
+ VPCMPEQ %ymm3, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
+ /* Save pointer again before realigning. */
+ movq %rdi, %rsi
+ addq $(VEC_SIZE + 1), %rdi
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %edx
- vpmovmskb %ymm3, %eax
- shrl %cl, %edx
- shrl %cl, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
+L(first_aligned_loop):
+ /* Do 2x VEC at a time. Any more and the cost of finding the
+ match outweights loop benefit. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm8
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm8, %ymm0, %ymm8
+ vpor %ymm5, %ymm8, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
+ /* No zero or search CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
+ jz L(first_aligned_loop)
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
+ /* If no zero CHAR then go to second loop (this allows us to
+ throw away all prior work). */
+ vpmovmskb %ymm8, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_prep)
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ /* Search char could be zero so we need to get the true match.
+ */
+ vpmovmskb %ymm5, %eax
+ testl %eax, %eax
+ jnz L(first_aligned_loop_return)
- .p2align 4
-L(aligned_loop):
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- add $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
+ .p2align 4,, 4
+L(first_vec_x1_or_x2):
+ VPCMPEQ %ymm3, %ymm7, %ymm3
+ VPCMPEQ %ymm2, %ymm7, %ymm2
vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
-
- vmovdqa (%rdi), %ymm1
- VPCMPEQ %ymm1, %ymm0, %ymm2
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm1, %ymm4, %ymm3
- vpmovmskb %ymm2, %ecx
- vpmovmskb %ymm3, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
-
- .p2align 4
-L(char_nor_null):
- /* Find a CHAR or a nul CHAR in a loop. */
- testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ vpmovmskb %ymm2, %edx
+ /* Use add for macro-fusion. */
+ addq %rax, %rdx
+ jz L(first_vec_x0_test)
+ /* NB: We could move this shift to before the branch and save a
+ bit of code size / performance on the fall through. The
+ branch leads to the null case which generally seems hotter
+ than char in first 3x VEC. */
+ salq $32, %rax
+ addq %rdx, %rax
+ bsrq %rax, %rax
+ leaq 1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4,, 8
+L(first_aligned_loop_return):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(first_vec_x1_or_x2)
+
+ bsrq %rax, %rax
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %eax
+ andq $-CHAR_SIZE, %rax
# endif
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
+ /* Search char cannot be zero. */
.p2align 4
-L(match):
- /* Find a CHAR. Check if there is a nul CHAR. */
- vpmovmskb %ymm2, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
+L(second_aligned_loop_set_furthest_match):
+ /* Save VEC and pointer from most recent match. */
+L(second_aligned_loop_prep):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ vmovdqu %ymm6, %ymm2
+ vmovdqu %ymm10, %ymm3
.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+L(second_aligned_loop):
+ /* Search 2x at at time. */
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
+
+ VPCMPEQ %ymm4, %ymm7, %ymm6
+ VPMIN %ymm4, %ymm5, %ymm1
+ VPCMPEQ %ymm5, %ymm7, %ymm10
+ vpor %ymm6, %ymm10, %ymm5
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpor %ymm5, %ymm1, %ymm9
+
+ vpmovmskb %ymm9, %eax
+ addq $(VEC_SIZE * 2), %rdi
testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
- VZEROUPPER_RETURN
-
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a nul CHAR. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
- /* Keep the first bit for each matching CHAR for bsr. */
- andl $0x11111111, %ecx
- andl $0x11111111, %eax
-# endif
- /* Mask out any matching bits after the nul CHAR. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
+ jz L(second_aligned_loop)
+ vpmovmskb %ymm1, %ecx
+ testl %ecx, %ecx
+ jz L(second_aligned_loop_set_furthest_match)
+ vpmovmskb %ymm5, %eax
testl %eax, %eax
- /* Return null pointer if the nul CHAR comes first. */
- jz L(return_null)
- bsrl %eax, %eax
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ jnz L(return_new_match)
+
+ /* This is the hot patch. We know CHAR is inbounds and that
+ ymm3/ymm2 have latest match. */
+ .p2align 4,, 4
+L(return_old_match):
+ vpmovmskb %ymm3, %eax
+ vpmovmskb %ymm2, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
VZEROUPPER_RETURN
- .p2align 4
-L(return_null):
- xorl %eax, %eax
+ /* Last iteration also potentially has a match. */
+ .p2align 4,, 8
+L(return_new_match):
+ VPCMPEQ %ymm4, %ymm0, %ymm4
+ vpmovmskb %ymm4, %edx
+ salq $32, %rcx
+ orq %rdx, %rcx
+
+ vpmovmskb %ymm10, %eax
+ vpmovmskb %ymm6, %edx
+ salq $32, %rax
+ orq %rdx, %rax
+ blsmskq %rcx, %rcx
+ andq %rcx, %rax
+ jz L(return_old_match)
+ bsrq %rax, %rax
+ /* Search char cannot be zero so safe to just use lea for
+ wcsrchr. */
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
VZEROUPPER_RETURN
-END (STRRCHR)
+ .p2align 4,, 4
+L(cross_page):
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rsi
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ %ymm1, %ymm0, %ymm6
+ vpmovmskb %ymm6, %ecx
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %ecx, %ecx
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+ VPCMPEQ %ymm1, %ymm7, %ymm1
+ vpmovmskb %ymm1, %eax
+
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %edi, %eax, %eax
+ blsmskl %ecx, %ecx
+ /* Check if any search CHAR match in range. */
+ andl %ecx, %eax
+ jz L(ret2)
+ bsrl %eax, %eax
+ addq %rdi, %rax
+# ifdef USE_AS_WCSRCHR
+ andq $-CHAR_SIZE, %rax
+# endif
+L(ret2):
+ VZEROUPPER_RETURN
+END(STRRCHR)
#endif
--
GitLab

View File

@ -0,0 +1,558 @@
From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 21 Apr 2022 20:52:30 -0500
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
Content-type: text/plain; charset=UTF-8
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the comparisons for the search CHAR.
Geometric Mean of all benchmarks New / Old: 0.755
See email for all results.
Full xcheck passes on x86_64 with and without multiarch enabled.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
1 file changed, 290 insertions(+), 181 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
index f920b5a5..f5b6d755 100644
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
@@ -24,242 +24,351 @@
# define STRRCHR __strrchr_evex
# endif
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
# ifdef USE_AS_WCSRCHR
+# define SHIFT_REG esi
+
+# define kunpck kunpckbw
+# define kmov_2x kmovd
+# define maskz_2x ecx
+# define maskm_2x eax
+# define CHAR_SIZE 4
+# define VPMIN vpminud
+# define VPTESTN vptestnmd
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPCMP vpcmpd
# else
+# define SHIFT_REG edi
+
+# define kunpck kunpckdq
+# define kmov_2x kmovq
+# define maskz_2x rcx
+# define maskm_2x rax
+
+# define CHAR_SIZE 1
+# define VPMIN vpminub
+# define VPTESTN vptestnmb
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPCMP vpcmpb
# endif
# define XMMZERO xmm16
# define YMMZERO ymm16
# define YMMMATCH ymm17
-# define YMM1 ymm18
+# define YMMSAVE ymm18
+
+# define YMM1 ymm19
+# define YMM2 ymm20
+# define YMM3 ymm21
+# define YMM4 ymm22
+# define YMM5 ymm23
+# define YMM6 ymm24
+# define YMM7 ymm25
+# define YMM8 ymm26
-# define VEC_SIZE 32
- .section .text.evex,"ax",@progbits
-ENTRY (STRRCHR)
- movl %edi, %ecx
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+ .section .text.evex, "ax", @progbits
+ENTRY(STRRCHR)
+ movl %edi, %eax
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(cross_page_boundary)
+L(page_cross_continue):
VMOVU (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ /* k0 has a 1 for each zero CHAR in YMM1. */
+ VPTESTN %YMM1, %YMM1, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
-
- addq $VEC_SIZE, %rdi
-
- testl %eax, %eax
- jnz L(first_vec)
-
testl %ecx, %ecx
- jnz L(return_null)
-
- andq $-VEC_SIZE, %rdi
- xorl %edx, %edx
- jmp L(aligned_loop)
-
- .p2align 4
-L(first_vec):
- /* Check if there is a null byte. */
- testl %ecx, %ecx
- jnz L(char_and_nul_in_first_vec)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- movq %rdi, %rsi
- andq $-VEC_SIZE, %rdi
- jmp L(aligned_loop)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ jz L(aligned_more)
+ /* fallthrough: zero CHAR in first VEC. */
+ /* K1 has a 1 for each search CHAR match in YMM1. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Build mask up until first zero CHAR (used to mask of
+ potential search CHAR matches past the end of the string).
+ */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret0)
+ /* Get last match (the `andl` removed any out of bounds
+ matches). */
+ bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rdi, %rax
# endif
+L(ret0):
+ ret
- VMOVA (%rdi), %YMM1
-
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
+ /* Returns for first vec x1/x2/x3 have hard coded backward
+ search path for earlier matches. */
+ .p2align 4,, 6
+L(first_vec_x1):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ blsmskl %ecx, %ecx
+ /* eax non-zero if search CHAR in range. */
+ andl %ecx, %eax
+ jnz L(first_vec_x1_return)
+
+ /* fallthrough: no match in YMM2 then need to check for earlier
+ matches (in YMM1). */
+ .p2align 4,, 4
+L(first_vec_x0_test):
VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %edx
kmovd %k1, %eax
-
- shrxl %SHIFT_REG, %edx, %edx
- shrxl %SHIFT_REG, %eax, %eax
- addq $VEC_SIZE, %rdi
-
- /* Check if there is a CHAR. */
testl %eax, %eax
- jnz L(found_char)
-
- testl %edx, %edx
- jnz L(return_null)
-
- jmp L(aligned_loop)
-
- .p2align 4
-L(found_char):
- testl %edx, %edx
- jnz L(char_and_nul)
-
- /* Remember the match and keep searching. */
- movl %eax, %edx
- leaq (%rdi, %rcx), %rsi
+ jz L(ret1)
+ bsrl %eax, %eax
+# ifdef USE_AS_WCSRCHR
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
+# else
+ addq %rsi, %rax
+# endif
+L(ret1):
+ ret
- .p2align 4
-L(aligned_loop):
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ .p2align 4,, 10
+L(first_vec_x1_or_x2):
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
+ matches between either of them. Otherwise check YMM1. */
+ kortestd %k2, %k3
+ jz L(first_vec_x0_test)
+
+ /* Guranteed that YMM2 and YMM3 are within range so merge the
+ two bitmasks then get last result. */
+ kunpck %k2, %k3, %k3
+ kmovq %k3, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 6
+L(first_vec_x3):
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
+ andl %ecx, %eax
+ jz L(first_vec_x1_or_x2)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- add $VEC_SIZE, %rdi
+ .p2align 4,, 6
+L(first_vec_x0_x1_test):
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check YMM2 for last match first. If no match try YMM1. */
+ testl %eax, %eax
+ jz L(first_vec_x0_test)
+ .p2align 4,, 4
+L(first_vec_x1_return):
+ bsrl %eax, %eax
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
- kmovd %k0, %ecx
+ .p2align 4,, 10
+L(first_vec_x2):
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ blsmskl %ecx, %ecx
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
+ */
+ andl %ecx, %eax
+ jz L(first_vec_x0_x1_test)
+ bsrl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ .p2align 4
+L(aligned_more):
+ /* Need to keep original pointer incase YMM1 has last match. */
+ movq %rdi, %rsi
+ andq $-VEC_SIZE, %rdi
+ VMOVU VEC_SIZE(%rdi), %YMM2
+ VPTESTN %YMM2, %YMM2, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jnz L(char_nor_null)
+ testl %ecx, %ecx
+ jnz L(first_vec_x1)
- VMOVA (%rdi), %YMM1
- addq $VEC_SIZE, %rdi
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
+ VPTESTN %YMM3, %YMM3, %k0
+ kmovd %k0, %ecx
+ testl %ecx, %ecx
+ jnz L(first_vec_x2)
- /* Each bit in K0 represents a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
- /* Each bit in K1 represents a CHAR in YMM1. */
- VPCMP $0, %YMMMATCH, %YMM1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
+ VPTESTN %YMM4, %YMM4, %k0
kmovd %k0, %ecx
- kmovd %k1, %eax
- orl %eax, %ecx
- jz L(aligned_loop)
+ movq %rdi, %r8
+ testl %ecx, %ecx
+ jnz L(first_vec_x3)
+ andq $-(VEC_SIZE * 2), %rdi
.p2align 4
-L(char_nor_null):
- /* Find a CHAR or a null byte in a loop. */
+L(first_aligned_loop):
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can gurantee
+ they don't store a match. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
+
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
+ vpxord %YMM6, %YMMMATCH, %YMM7
+
+ VPMIN %YMM5, %YMM6, %YMM8
+ VPMIN %YMM8, %YMM7, %YMM7
+
+ VPTESTN %YMM7, %YMM7, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(first_aligned_loop)
+
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
+ VPTESTN %YMM8, %YMM8, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_prep)
+
+ kortestd %k2, %k3
+ jnz L(return_first_aligned_loop)
+
+ .p2align 4,, 6
+L(first_vec_x1_or_x2_or_x3):
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
+ kmovd %k4, %eax
testl %eax, %eax
- jnz L(match)
-L(return_value):
- testl %edx, %edx
- jz L(return_null)
- movl %edx, %eax
- movq %rsi, %rdi
+ jz L(first_vec_x1_or_x2)
bsrl %eax, %eax
-# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
-# endif
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
ret
- .p2align 4
-L(match):
- /* Find a CHAR. Check if there is a null byte. */
- kmovd %k0, %ecx
- testl %ecx, %ecx
- jnz L(find_nul)
+ .p2align 4,, 8
+L(return_first_aligned_loop):
+ VPTESTN %YMM5, %YMM5, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(first_vec_x1_or_x2_or_x3)
- /* Remember the match and keep searching. */
- movl %eax, %edx
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+ /* We can throw away the work done for the first 4x checks here
+ as we have a later match. This is the 'fast' path persay.
+ */
+L(second_aligned_loop_prep):
+L(second_aligned_loop_set_furthest_match):
movq %rdi, %rsi
- jmp L(aligned_loop)
+ kunpck %k2, %k3, %k4
.p2align 4
-L(find_nul):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* If there is no CHAR here, return the remembered one. */
- jz L(return_value)
- bsrl %eax, %eax
+L(second_aligned_loop):
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
+
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
+ vpxord %YMM2, %YMMMATCH, %YMM3
+
+ VPMIN %YMM1, %YMM2, %YMM4
+ VPMIN %YMM3, %YMM4, %YMM3
+
+ VPTESTN %YMM3, %YMM3, %k1
+ subq $(VEC_SIZE * -2), %rdi
+ kortestd %k1, %k2
+ jz L(second_aligned_loop)
+
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
+ VPTESTN %YMM4, %YMM4, %k1
+ ktestd %k1, %k1
+ jz L(second_aligned_loop_set_furthest_match)
+
+ kortestd %k2, %k3
+ /* branch here because there is a significant advantage interms
+ of output dependency chance in using edx. */
+ jnz L(return_new_match)
+L(return_old_match):
+ kmovq %k4, %rax
+ bsrq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(return_new_match):
+ VPTESTN %YMM1, %YMM1, %k0
+ kunpck %k0, %k1, %k0
+ kmov_2x %k0, %maskz_2x
+
+ blsmsk %maskz_2x, %maskz_2x
+ kunpck %k2, %k3, %k3
+ kmov_2x %k3, %maskm_2x
+ and %maskz_2x, %maskm_2x
+ jz L(return_old_match)
+
+ bsr %maskm_2x, %maskm_2x
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+L(cross_page_boundary):
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
+ rax` sets pointer will all page offset bits cleared so
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
+ before page cross (guranteed to be safe to read). Doing this
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
+ a bit of code size. */
+ xorq %rdi, %rax
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
+ VPTESTN %YMM1, %YMM1, %k0
+ kmovd %k0, %ecx
+
+ /* Shift out zero CHAR matches that are before the begining of
+ src (rdi). */
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ movl %edi, %esi
+ andl $(VEC_SIZE - 1), %esi
+ shrl $2, %esi
# endif
- ret
+ shrxl %SHIFT_REG, %ecx, %ecx
- .p2align 4
-L(char_and_nul):
- /* Find both a CHAR and a null byte. */
- addq %rcx, %rdi
- movl %edx, %ecx
-L(char_and_nul_in_first_vec):
- /* Mask out any matching bits after the null byte. */
- movl %ecx, %r8d
- subl $1, %r8d
- xorl %ecx, %r8d
- andl %r8d, %eax
- testl %eax, %eax
- /* Return null pointer if the null byte comes first. */
- jz L(return_null)
+ testl %ecx, %ecx
+ jz L(page_cross_continue)
+
+ /* Found zero CHAR so need to test for search CHAR. */
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
+ kmovd %k1, %eax
+ /* Shift out search CHAR matches that are before the begining of
+ src (rdi). */
+ shrxl %SHIFT_REG, %eax, %eax
+
+ /* Check if any search CHAR match in range. */
+ blsmskl %ecx, %ecx
+ andl %ecx, %eax
+ jz L(ret3)
bsrl %eax, %eax
# ifdef USE_AS_WCSRCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq -VEC_SIZE(%rdi, %rax), %rax
+ addq %rdi, %rax
# endif
+L(ret3):
ret
- .p2align 4
-L(return_null):
- xorl %eax, %eax
- ret
-
-END (STRRCHR)
+END(STRRCHR)
#endif
--
GitLab

View File

@ -0,0 +1,73 @@
From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 27 Apr 2022 15:13:02 -0500
Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interace in fast-jitter.h
Content-type: text/plain; charset=UTF-8
'get_fast_jitter' is meant to be used purely for performance
purposes. In all cases it's used it should be acceptable to get no
randomness (see default case). An example use case is in setting
jitter for retries between threads at a lock. There is a
performance benefit to having jitter, but only if the jitter can
be generated very quickly and ultimately there is no serious issue
if no jitter is generated.
The implementation generally uses 'HP_TIMING_NOW' iff it is
inlined (avoid any potential syscall paths).
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
1 file changed, 42 insertions(+)
create mode 100644 sysdeps/generic/fast-jitter.h
diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
new file mode 100644
index 00000000..4dd53e34
--- /dev/null
+++ b/sysdeps/generic/fast-jitter.h
@@ -0,0 +1,42 @@
+/* Fallback for fast jitter just return 0.
+ Copyright (C) 2019-2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#ifndef _FAST_JITTER_H
+# define _FAST_JITTER_H
+
+# include <stdint.h>
+# include <hp-timing.h>
+
+/* Baseline just return 0. We could create jitter using a clock or
+ 'random_bits' but that may imply a syscall and the goal of
+ 'get_fast_jitter' is minimal overhead "randomness" when such
+ randomness helps performance. Adding high overhead the function
+ defeats the purpose. */
+static inline uint32_t
+get_fast_jitter (void)
+{
+# if HP_TIMING_INLINE
+ hp_timing_t jitter;
+ HP_TIMING_NOW (jitter);
+ return (uint32_t) jitter;
+# else
+ return 0;
+# endif
+}
+
+#endif
--
GitLab

View File

@ -0,0 +1,226 @@
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
From: Wangyang Guo <wangyang.guo@intel.com>
Date: Fri, 6 May 2022 01:50:10 +0000
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
Content-type: text/plain; charset=UTF-8
When mutiple threads waiting for lock at the same time, once lock owner
releases the lock, waiters will see lock available and all try to lock,
which may cause an expensive CAS storm.
Binary exponential backoff with random jitter is introduced. As try-lock
attempt increases, there is more likely that a larger number threads
compete for adaptive mutex lock, so increase wait time in exponential.
A random jitter is also added to avoid synchronous try-lock from other
threads.
v2: Remove read-check before try-lock for performance.
v3:
1. Restore read-check since it works well in some platform.
2. Make backoff arch dependent, and enable it for x86_64.
3. Limit max backoff to reduce latency in large critical section.
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
v5: Commit log updated for regression in large critical section.
Result of pthread-mutex-locks bench
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
First Row: thread number
First Col: critical section length
Values: backoff vs upstream, time based, low is better
non-critical-length: 1
1 2 4 8 16 32 64 112 140
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
non-critical-length: 32
1 2 4 8 16 32 64 112 140
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
non-critical-length: 128
1 2 4 8 16 32 64 112 140
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
There is regression in large critical section. But adaptive mutex is
aimed for "quick" locks. Small critical section is more common when
users choose to use adaptive pthread_mutex.
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
pthreadP.h
(had been moved)
nptl/pthread_mutex_lock.c
(max_adaptive_count renamed)
---
nptl/pthreadP.h | 1 +
nptl/pthread_mutex_lock.c | 16 +++++++--
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
4 files changed, 89 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
index 7ddc166c..1550e3b6 100644
--- a/nptl/pthreadP.h
+++ b/nptl/pthreadP.h
@@ -33,6 +33,7 @@
#include <kernel-features.h>
#include <errno.h>
#include <internal-signals.h>
+#include <pthread_mutex_backoff.h>
/* Atomic operations on TLS memory. */
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index d96a9933..c7770fc9 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
int cnt = 0;
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
mutex->__data.__spins * 2 + 10);
+ int spin_count, exp_backoff = 1;
+ unsigned int jitter = get_jitter ();
do
{
- if (cnt++ >= max_cnt)
+ /* In each loop, spin count is exponential backoff plus
+ random jitter, random range is [0, exp_backoff-1]. */
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
+ cnt += spin_count;
+ if (cnt >= max_cnt)
{
+ /* If cnt exceeds max spin count, just go to wait
+ queue. */
LLL_MUTEX_LOCK (mutex);
break;
}
- atomic_spin_nop ();
+ do
+ atomic_spin_nop ();
+ while (--spin_count > 0);
+ /* Prepare for next loop. */
+ exp_backoff = get_next_backoff (exp_backoff);
}
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..5b26c22a
--- /dev/null
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,35 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+static inline unsigned int
+get_jitter (void)
+{
+ /* Arch dependent random jitter, return 0 disables random. */
+ return 0;
+}
+
+static inline int
+get_next_backoff (int backoff)
+{
+ /* Next backoff, return 1 disables mutex backoff. */
+ return 1;
+}
+
+#endif
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
new file mode 100644
index 00000000..ec74c3d9
--- /dev/null
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
@@ -0,0 +1,39 @@
+/* Pthread mutex backoff configuration.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
+#define _PTHREAD_MUTEX_BACKOFF_H 1
+
+#include <fast-jitter.h>
+
+static inline unsigned int
+get_jitter (void)
+{
+ return get_fast_jitter ();
+}
+
+#define MAX_BACKOFF 16
+
+static inline int
+get_next_backoff (int backoff)
+{
+ /* Binary expontial backoff. Limiting max backoff
+ can reduce latency in large critical section. */
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
+}
+
+#endif
--
GitLab

View File

@ -0,0 +1,55 @@
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 08:18:15 -0600
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
#28896]
Content-type: text/plain; charset=UTF-8
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
not checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
1 file changed, 2 insertions(+), 6 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/strcmp-avx2.S
(split into two patches due to upstream bug differences)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 28cc98b6..e267c6cb 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -345,10 +345,10 @@ L(one_or_less):
movq %LOCALE_REG, %rdx
# endif
jb L(ret_zero)
-# ifdef USE_AS_WCSCMP
/* 'nbe' covers the case where length is negative (large
unsigned). */
- jnbe __wcscmp_avx2
+ jnbe OVERFLOW_STRCMP
+# ifdef USE_AS_WCSCMP
movl (%rdi), %edx
xorl %eax, %eax
cmpl (%rsi), %edx
@@ -357,10 +357,6 @@ L(one_or_less):
negl %eax
orl $1, %eax
# else
- /* 'nbe' covers the case where length is negative (large
- unsigned). */
-
- jnbe __strcmp_avx2
movzbl (%rdi), %eax
movzbl (%rsi), %ecx
TOLOWER_gpr (%rax, %eax)
--
GitLab

View File

@ -0,0 +1,60 @@
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
From: Stefan Liebler <stli@linux.ibm.com>
Date: Mon, 28 Jun 2021 13:01:07 +0200
Subject: s390x: Update math: redirect roundeven function
After recent commit
447954a206837b5f153869cfeeeab44631c3fac9
"math: redirect roundeven function", building on
s390x fails with:
Error: symbol `__roundevenl' is already defined
Similar to aarch64/riscv fix, this patch redirects target
specific functions for s390x:
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
"Update math: redirect roundeven function"
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
index 40b07e054b..0773adfed0 100644
--- a/sysdeps/s390/fpu/s_roundeven.c
+++ b/sysdeps/s390/fpu/s_roundeven.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-double.h>
@@ -31,7 +32,6 @@ __roundeven (double x)
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
return y;
}
-hidden_def (__roundeven)
libm_alias_double (__roundeven, roundeven)
#else
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
index d2fbf3d2b6..289785bc4a 100644
--- a/sysdeps/s390/fpu/s_roundevenf.c
+++ b/sysdeps/s390/fpu/s_roundevenf.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <libm-alias-float.h>
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
index 29ab7a8616..94b6459ab4 100644
--- a/sysdeps/s390/fpu/s_roundevenl.c
+++ b/sysdeps/s390/fpu/s_roundevenl.c
@@ -18,6 +18,7 @@
<https://www.gnu.org/licenses/>. */
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
+# define NO_MATH_REDIRECT
# include <math.h>
# include <math_private.h>
# include <libm-alias-ldouble.h>

View File

@ -0,0 +1,74 @@
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 26 Feb 2021 05:36:59 -0800
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
Content-type: text/plain; charset=UTF-8
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
by VZEROUPPER inside a transactionally executing RTM region.
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
---
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
sysdeps/x86/cpu-tunables.c | 2 ++
...cpu-features-preferred_feature_index_1.def | 1 +
3 files changed, 21 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 91042505..3610ee5c 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|= bit_arch_Prefer_No_VZEROUPPER;
else
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ {
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
+
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
+ transactionally executing RTM region. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
+ |= bit_arch_Prefer_No_VZEROUPPER;
+
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
+ AVX2 strcmp is faster than EVEX strcmp. */
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
+ |= bit_arch_Prefer_AVX2_STRCMP;
+ }
}
/* This spells out "AuthenticAMD". */
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 3173b2b9..73adbaba 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
Fast_Copy_Backward,
disable, 18);
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
}
break;
case 19:
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
index 17a5cc42..4ca70b40 100644
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
BIT (Prefer_FSRM)
BIT (Prefer_No_AVX512)
BIT (MathVec_Prefer_No_AVX512)
+BIT (Prefer_AVX2_STRCMP)
--
GitLab

View File

@ -0,0 +1,26 @@
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 23 Jun 2021 13:29:41 -0700
Subject: Update math: redirect roundeven function
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
and riscv.
Conflicts:
sysdeps/aarch64/*
(not needed)
sysdeps/riscv/*
(not supported)
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
index 6701970f4a..90eecf496b 100644
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,242 @@
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 06:46:08 -0800
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
5 files changed, 104 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 46783cd1..4563fc56 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memmove-evex-unaligned-erms \
memrchr-evex \
rawmemchr-evex \
stpcpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 082e4da3..6bd3abfc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
__memmove_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (SSSE3),
__memmove_chk_ssse3_back)
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX),
__memmove_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memmove,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memmove_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
__memcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__memcpy_chk_ssse3_back)
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memcpy,
CPU_FEATURE_USABLE (AVX),
__memcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __memcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
__memcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
__mempcpy_chk_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (SSSE3),
__mempcpy_chk_ssse3_back)
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
__mempcpy_avx_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, mempcpy,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __mempcpy_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
__mempcpy_ssse3_back)
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index 5e5f0299..6f8bce5f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx_unaligned_erms);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx_unaligned_erms);
- return OPTIMIZE (avx_unaligned);
+ return OPTIMIZE (avx_unaligned);
+ }
}
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
new file mode 100644
index 00000000..0cbce8f9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
@@ -0,0 +1,33 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 ymm16
+# define VEC1 ymm17
+# define VEC2 ymm18
+# define VEC3 ymm19
+# define VEC4 ymm20
+# define VEC5 ymm21
+# define VEC6 ymm22
+# define VEC7 ymm23
+# define VEC8 ymm24
+# define VEC9 ymm25
+# define VEC10 ymm26
+# define VEC11 ymm27
+# define VEC12 ymm28
+# define VEC13 ymm29
+# define VEC14 ymm30
+# define VEC15 ymm31
+# define VEC(i) VEC##i
+# define VMOVNT vmovntdq
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define SECTION(p) p##.evex
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
+
+# include "memmove-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 274aa1c7..08e21692 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -48,6 +48,14 @@
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -277,20 +285,20 @@ L(less_vec):
#if VEC_SIZE > 32
L(between_32_63):
/* From 32 to 63. No branch when size == 32. */
- vmovdqu (%rsi), %ymm0
- vmovdqu -32(%rsi,%rdx), %ymm1
- vmovdqu %ymm0, (%rdi)
- vmovdqu %ymm1, -32(%rdi,%rdx)
+ VMOVU (%rsi), %YMM0
+ VMOVU -32(%rsi,%rdx), %YMM1
+ VMOVU %YMM0, (%rdi)
+ VMOVU %YMM1, -32(%rdi,%rdx)
VZEROUPPER
ret
#endif
#if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu (%rsi), %xmm0
- vmovdqu -16(%rsi,%rdx), %xmm1
- vmovdqu %xmm0, (%rdi)
- vmovdqu %xmm1, -16(%rdi,%rdx)
+ VMOVU (%rsi), %XMM0
+ VMOVU -16(%rsi,%rdx), %XMM1
+ VMOVU %XMM0, (%rdi)
+ VMOVU %XMM1, -16(%rdi,%rdx)
ret
#endif
L(between_8_15):
--
GitLab

View File

@ -0,0 +1,254 @@
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:15:03 -0800
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/Makefile | 1 +
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
6 files changed, 90 insertions(+), 14 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 4563fc56..1cc0a10e 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memchr-evex \
memmove-evex-unaligned-erms \
memrchr-evex \
+ memset-evex-unaligned-erms \
rawmemchr-evex \
stpcpy-evex \
stpncpy-evex \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 6bd3abfc..7cf83485 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX2),
__memset_chk_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
__memset_chk_avx512_unaligned_erms)
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX2),
__memset_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, memset,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
+ __memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
__memset_avx512_unaligned_erms)
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX2),
__wmemset_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, wmemset,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_avx512_unaligned))
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
+ CPU_FEATURE_USABLE (AVX512VL),
+ __wmemset_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
CPU_FEATURE_USABLE (AVX512F),
__wmemset_chk_avx512_unaligned))
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 708bd72e..6f31f4dc 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
+ attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx2_unaligned_erms);
- else
- return OPTIMIZE (avx2_unaligned);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (evex_unaligned_erms);
+
+ return OPTIMIZE (evex_unaligned);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx2_unaligned_erms);
+
+ return OPTIMIZE (avx2_unaligned);
+ }
}
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index eb242210..9290c4bf 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -20,6 +20,7 @@
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
static inline void *
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx512_unaligned);
- else
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ return OPTIMIZE (evex_unaligned);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
return OPTIMIZE (avx2_unaligned);
}
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
new file mode 100644
index 00000000..ae0a4d6e
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -0,0 +1,24 @@
+#if IS_IN (libc)
+# define VEC_SIZE 32
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 ymm16
+# define VEC(i) VEC##i
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+# define VZEROUPPER
+
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastb d, %VEC0
+
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+ movq r, %rax; \
+ vpbroadcastd d, %VEC0
+
+# define SECTION(p) p##.evex
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
+
+# include "memset-vec-unaligned-erms.S"
+#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 9a0fd818..71e91a8f 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -34,6 +34,14 @@
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
#endif
+#ifndef XMM0
+# define XMM0 xmm0
+#endif
+
+#ifndef YMM0
+# define YMM0 ymm0
+#endif
+
#ifndef VZEROUPPER
# if VEC_SIZE > 16
# define VZEROUPPER vzeroupper
@@ -67,7 +75,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
- pxor %xmm0, %xmm0
+ pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
weak_alias (__bzero, bzero)
@@ -223,7 +231,7 @@ L(less_vec):
cmpb $16, %dl
jae L(between_16_31)
# endif
- MOVQ %xmm0, %rcx
+ MOVQ %XMM0, %rcx
cmpb $8, %dl
jae L(between_8_15)
cmpb $4, %dl
@@ -238,16 +246,16 @@ L(less_vec):
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- vmovdqu %ymm0, -32(%rdi,%rdx)
- vmovdqu %ymm0, (%rdi)
+ VMOVU %YMM0, -32(%rdi,%rdx)
+ VMOVU %YMM0, (%rdi)
VZEROUPPER
ret
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- vmovdqu %xmm0, -16(%rdi,%rdx)
- vmovdqu %xmm0, (%rdi)
+ VMOVU %XMM0, -16(%rdi,%rdx)
+ VMOVU %XMM0, (%rdi)
VZEROUPPER
ret
# endif
--
GitLab

View File

@ -0,0 +1,561 @@
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 5 Mar 2021 07:20:28 -0800
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
Content-type: text/plain; charset=UTF-8
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
exit.
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
5 files changed, 467 insertions(+), 4 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 1cc0a10e..9d79b138 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
memset-avx2-unaligned-erms \
memset-avx512-unaligned-erms \
memchr-evex \
+ memcmp-evex-movbe \
memmove-evex-unaligned-erms \
memrchr-evex \
memset-evex-unaligned-erms \
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsncmp-evex \
wcsnlen-evex \
wcsrchr-evex \
- wmemchr-evex
+ wmemchr-evex \
+ wmemcmp-evex-movbe
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 7cf83485..c8da910e 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, memcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
__memcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
(CPU_FEATURE_USABLE (AVX2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (MOVBE)),
+ __wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
__wmemcmp_sse4_1)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 6c1f3153..3ca1f0a6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
static inline void *
IFUNC_SELECTOR (void)
{
const struct cpu_features* cpu_features = __get_cpu_features ();
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- return OPTIMIZE (avx2_movbe);
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex_movbe);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2_movbe);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
return OPTIMIZE (sse4_1);
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
new file mode 100644
index 00000000..9c093972
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -0,0 +1,440 @@
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if IS_IN (libc)
+
+/* memcmp/wmemcmp is implemented as:
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
+ to avoid branches.
+ 2. Use overlapping compare to avoid branch.
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
+ bytes for wmemcmp.
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ area.
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
+# include <sysdep.h>
+
+# ifndef MEMCMP
+# define MEMCMP __memcmp_evex_movbe
+# endif
+
+# define VMOVU vmovdqu64
+
+# ifdef USE_AS_WMEMCMP
+# define VPCMPEQ vpcmpeqd
+# else
+# define VPCMPEQ vpcmpeqb
+# endif
+
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM1 ymm17
+# define YMM2 ymm18
+# define YMM3 ymm19
+# define YMM4 ymm20
+# define YMM5 ymm21
+# define YMM6 ymm22
+
+# define VEC_SIZE 32
+# ifdef USE_AS_WMEMCMP
+# define VEC_MASK 0xff
+# define XMM_MASK 0xf
+# else
+# define VEC_MASK 0xffffffff
+# define XMM_MASK 0xffff
+# endif
+
+/* Warning!
+ wmemcmp has to use SIGNED comparison for elements.
+ memcmp has to use UNSIGNED comparison for elemnts.
+*/
+
+ .section .text.evex,"ax",@progbits
+ENTRY (MEMCMP)
+# ifdef USE_AS_WMEMCMP
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec)
+
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_vec)
+
+ /* More than 2 * VEC. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
+ cmpq $(VEC_SIZE * 4), %rdx
+ jb L(last_4x_vec)
+
+ /* From 4 * VEC to 8 * VEC, inclusively. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+
+ kandd %k1, %k2, %k5
+ kandd %k3, %k4, %k6
+ kandd %k5, %k6, %k6
+
+ kmovd %k6, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k1, %k2, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+L(last_vec):
+ /* Use overlapping loads to avoid branches. */
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(first_vec):
+ /* A byte or int32 is different within 16 or 32 bytes. */
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (%rdi, %rcx, 4), %edx
+ cmpl (%rsi, %rcx, 4), %edx
+L(wmemcmp_return):
+ setl %al
+ negl %eax
+ orl $1, %eax
+# else
+ movzbl (%rdi, %rcx), %eax
+ movzbl (%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+# ifdef USE_AS_WMEMCMP
+ .p2align 4
+L(4):
+ xorl %eax, %eax
+ movl (%rdi), %edx
+ cmpl (%rsi), %edx
+ jne L(wmemcmp_return)
+ ret
+# else
+ .p2align 4
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches. */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ je L(exit)
+ sbbl %eax, %eax
+ orl $1, %eax
+ ret
+
+ .p2align 4
+L(exit):
+ ret
+
+ .p2align 4
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax
+ bswap %ecx
+ movb -1(%rdi, %rdx), %al
+ movb -1(%rsi, %rdx), %cl
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
+ ret
+
+ .p2align 4
+L(1):
+ movzbl (%rdi), %eax
+ movzbl (%rsi), %ecx
+ subl %ecx, %eax
+ ret
+# endif
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
+ ret
+
+ .p2align 4
+L(less_vec):
+# ifdef USE_AS_WMEMCMP
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
+ cmpb $4, %dl
+ je L(4)
+ jb L(zero)
+# else
+ cmpb $1, %dl
+ je L(1)
+ jb L(zero)
+ cmpb $4, %dl
+ jb L(between_2_3)
+ cmpb $8, %dl
+ jb L(between_4_7)
+# endif
+ cmpb $16, %dl
+ jae L(between_16_31)
+ /* It is between 8 and 15 bytes. */
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ /* Use overlapping loads to avoid branches. */
+ leaq -8(%rdi, %rdx), %rdi
+ leaq -8(%rsi, %rdx), %rsi
+ vmovq (%rdi), %XMM1
+ vmovq (%rsi), %XMM2
+ VPCMPEQ %XMM1, %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -16(%rdi, %rdx), %rdi
+ leaq -16(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %XMM2
+ VPCMPEQ (%rdi), %XMM2, %k2
+ kmovw %k2, %eax
+ subl $XMM_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(more_8x_vec):
+ /* More than 8 * VEC. Check the first VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Align the first memory area for aligned loads in the loop.
+ Compute how much the first memory area is misaligned. */
+ movq %rdi, %rcx
+ andl $(VEC_SIZE - 1), %ecx
+ /* Get the negative of offset for alignment. */
+ subq $VEC_SIZE, %rcx
+ /* Adjust the second memory area. */
+ subq %rcx, %rsi
+ /* Adjust the first memory area which should be aligned now. */
+ subq %rcx, %rdi
+ /* Adjust length. */
+ addq %rcx, %rdx
+
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ VMOVU (%rsi), %YMM1
+ VPCMPEQ (%rdi), %YMM1, %k1
+
+ VMOVU VEC_SIZE(%rsi), %YMM2
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ kandd %k2, %k1, %k5
+
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ kandd %k3, %k5, %k5
+
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ kandd %k4, %k5, %k5
+
+ kmovd %k5, %eax
+ cmpl $VEC_MASK, %eax
+ jne L(4x_vec_end)
+
+ addq $(VEC_SIZE * 4), %rdi
+ addq $(VEC_SIZE * 4), %rsi
+
+ subq $(VEC_SIZE * 4), %rdx
+ cmpq $(VEC_SIZE * 4), %rdx
+ jae L(loop_4x_vec)
+
+ /* Less than 4 * VEC. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(last_vec)
+ cmpq $(VEC_SIZE * 2), %rdx
+ jbe L(last_2x_vec)
+
+L(last_4x_vec):
+ /* From 2 * VEC to 4 * VEC. */
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ /* Use overlapping loads to avoid branches. */
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+
+ addq $VEC_SIZE, %rdi
+ addq $VEC_SIZE, %rsi
+ VMOVU (%rsi), %YMM2
+ VPCMPEQ (%rdi), %YMM2, %k2
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ ret
+
+ .p2align 4
+L(4x_vec_end):
+ kmovd %k1, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec)
+ kmovd %k2, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x1)
+ kmovd %k3, %eax
+ subl $VEC_MASK, %eax
+ jnz L(first_vec_x2)
+ kmovd %k4, %eax
+ subl $VEC_MASK, %eax
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x1):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %ecx
+# ifdef USE_AS_WMEMCMP
+ xorl %eax, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
+ jmp L(wmemcmp_return)
+# else
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
+ sub %edx, %eax
+# endif
+ ret
+END (MEMCMP)
+#endif
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
new file mode 100644
index 00000000..4726d74a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
@@ -0,0 +1,4 @@
+#define MEMCMP __wmemcmp_evex_movbe
+#define USE_AS_WMEMCMP 1
+
+#include "memcmp-evex-movbe.S"
--
GitLab

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,735 @@
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 23 Feb 2021 06:33:10 -0800
Subject: [PATCH] x86: Add string/memory function tests in RTM region
Content-type: text/plain; charset=UTF-8
At function exit, AVX optimized string/memory functions have VZEROUPPER
which triggers RTM abort. When such functions are called inside a
transactionally executing RTM region, RTM abort causes severe performance
degradation. Add tests to verify that string/memory functions won't
cause RTM abort in RTM region.
---
sysdeps/x86/Makefile | 23 +++++++++++
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
12 files changed, 618 insertions(+)
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
create mode 100644 sysdeps/x86/tst-memset-rtm.c
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
create mode 100644 sysdeps/x86/tst-string-rtm.h
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 59e928e9..5be71ada 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -17,6 +17,29 @@ endif
ifeq ($(subdir),string)
sysdep_routines += cacheinfo
+
+tests += \
+ tst-memchr-rtm \
+ tst-memcmp-rtm \
+ tst-memmove-rtm \
+ tst-memrchr-rtm \
+ tst-memset-rtm \
+ tst-strchr-rtm \
+ tst-strcpy-rtm \
+ tst-strlen-rtm \
+ tst-strncmp-rtm \
+ tst-strrchr-rtm
+
+CFLAGS-tst-memchr-rtm.c += -mrtm
+CFLAGS-tst-memcmp-rtm.c += -mrtm
+CFLAGS-tst-memmove-rtm.c += -mrtm
+CFLAGS-tst-memrchr-rtm.c += -mrtm
+CFLAGS-tst-memset-rtm.c += -mrtm
+CFLAGS-tst-strchr-rtm.c += -mrtm
+CFLAGS-tst-strcpy-rtm.c += -mrtm
+CFLAGS-tst-strlen-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strrchr-rtm.c += -mrtm
endif
ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
new file mode 100644
index 00000000..e4749401
--- /dev/null
+++ b/sysdeps/x86/tst-memchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
new file mode 100644
index 00000000..e4c8a623
--- /dev/null
+++ b/sysdeps/x86/tst-memcmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for memcmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ memset (string2, 'a', STRING_SIZE);
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memcmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
new file mode 100644
index 00000000..4bf97ef1
--- /dev/null
+++ b/sysdeps/x86/tst-memmove-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for memmove inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (memmove (string2, string1, STRING_SIZE) == string2
+ && memcmp (string2, string1, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memmove", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
new file mode 100644
index 00000000..a57a5a8e
--- /dev/null
+++ b/sysdeps/x86/tst-memrchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for memrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = memrchr (string1, 'c', STRING_SIZE);
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memrchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
new file mode 100644
index 00000000..bf343a4d
--- /dev/null
+++ b/sysdeps/x86/tst-memset-rtm.c
@@ -0,0 +1,45 @@
+/* Test case for memset inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return EXIT_SUCCESS;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ memset (string1, 'a', STRING_SIZE);
+ return 0;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("memset", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
new file mode 100644
index 00000000..a82e29c0
--- /dev/null
+++ b/sysdeps/x86/tst-strchr-rtm.c
@@ -0,0 +1,54 @@
+/* Test case for strchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[100] = 'c';
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strchr (string1, 'c');
+ if (p == &string1[100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strchr", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
new file mode 100644
index 00000000..2b2a583f
--- /dev/null
+++ b/sysdeps/x86/tst-strcpy-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strcpy inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strcpy (string2, string1) == string2
+ && strcmp (string2, string1) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strcpy", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
new file mode 100644
index 00000000..d2470afa
--- /dev/null
+++ b/sysdeps/x86/tst-string-rtm.h
@@ -0,0 +1,72 @@
+/* Test string function in a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <string.h>
+#include <x86intrin.h>
+#include <sys/platform/x86.h>
+#include <support/check.h>
+#include <support/test-driver.h>
+
+static int
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
+ int (*function) (void))
+{
+ if (!CPU_FEATURE_USABLE (RTM))
+ return EXIT_UNSUPPORTED;
+
+ int status = prepare ();
+ if (status != EXIT_SUCCESS)
+ return status;
+
+ unsigned int i;
+ unsigned int naborts = 0;
+ unsigned int failed = 0;
+ for (i = 0; i < loop; i++)
+ {
+ failed |= function ();
+ if (_xbegin() == _XBEGIN_STARTED)
+ {
+ failed |= function ();
+ _xend();
+ }
+ else
+ {
+ failed |= function ();
+ ++naborts;
+ }
+ }
+
+ if (failed)
+ FAIL_EXIT1 ("%s() failed", name);
+
+ if (naborts)
+ {
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
+ TSX. */
+ double rate = 100 * ((double) naborts) / ((double) loop);
+ if (rate > 5)
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
+ rate, naborts, loop);
+ }
+
+ return EXIT_SUCCESS;
+}
+
+static int do_test (void);
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
new file mode 100644
index 00000000..0dcf14db
--- /dev/null
+++ b/sysdeps/x86/tst-strlen-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strlen inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = '\0';
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ size_t len = strlen (string1);
+ if (len == STRING_SIZE - 100)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strlen", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
new file mode 100644
index 00000000..236ad951
--- /dev/null
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -0,0 +1,52 @@
+/* Test case for strncmp inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+char string2[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ memset (string2, 'a', STRING_SIZE - 1);
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strncmp", LOOP, prepare, function);
+}
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
new file mode 100644
index 00000000..e32bfaf5
--- /dev/null
+++ b/sysdeps/x86/tst-strrchr-rtm.c
@@ -0,0 +1,53 @@
+/* Test case for strrchr inside a transactionally executing RTM region.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <tst-string-rtm.h>
+
+#define LOOP 3000
+#define STRING_SIZE 1024
+char string1[STRING_SIZE];
+
+__attribute__ ((noinline, noclone))
+static int
+prepare (void)
+{
+ memset (string1, 'a', STRING_SIZE - 1);
+ string1[STRING_SIZE - 100] = 'c';
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return EXIT_SUCCESS;
+ else
+ return EXIT_FAILURE;
+}
+
+__attribute__ ((noinline, noclone))
+static int
+function (void)
+{
+ char *p = strrchr (string1, 'c');
+ if (p == &string1[STRING_SIZE - 100])
+ return 0;
+ else
+ return 1;
+}
+
+static int
+do_test (void)
+{
+ return do_test_1 ("strrchr", LOOP, prepare, function);
+}
--
GitLab

View File

@ -0,0 +1,148 @@
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:44:18 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
4 files changed, 31 insertions(+), 24 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index c1efeec0..d969a156 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)),
__memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)),
__memset_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512VL),
__wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__wmemset_avx512_unaligned))
#ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 6f3375cc..19795938 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
index bdc94c6c..98c5d406 100644
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_unaligned);
-
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
- return OPTIMIZE (evex_unaligned);
+ {
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ return OPTIMIZE (avx512_unaligned);
+
+ return OPTIMIZE (evex_unaligned);
+ }
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
return OPTIMIZE (avx2_unaligned_rtm);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 0783979c..22e7b187 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,22 +1,22 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define YMM0 ymm16
+# define VEC0 zmm16
+# define VEC(i) VEC##i
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastb %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastb d, %VEC0
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
movq r, %rax; \
- vpbroadcastd %xmm0, %xmm0; \
- vpbroadcastq %xmm0, %zmm0
+ vpbroadcastd d, %VEC0
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
--
GitLab

View File

@ -0,0 +1,230 @@
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:25:56 -0800
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
tst-size_t-wmemcmp.
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
---
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
6 files changed, 114 insertions(+), 9 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 30f764c3..e3a35b89 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -58,9 +58,12 @@
.section .text.avx,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
# endif
- cmpq $VEC_SIZE, %rdx
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
index 8e164f2c..302900f5 100644
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
@@ -42,13 +42,16 @@
.section .text.sse4.1,"ax",@progbits
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
+ shl $2, %RDX_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
pxor %xmm0, %xmm0
- cmp $79, %rdx
+ cmp $79, %RDX_LP
ja L(79bytesormore)
# ifndef USE_AS_WMEMCMP
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je L(firstbyte)
# endif
add %rdx, %rsi
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
index 6f76c641..69d030fc 100644
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
@@ -33,9 +33,12 @@
atom_text_section
ENTRY (MEMCMP)
# ifdef USE_AS_WMEMCMP
- shl $2, %rdx
- test %rdx, %rdx
+ shl $2, %RDX_LP
+ test %RDX_LP, %RDX_LP
jz L(equal)
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
# endif
mov %rdx, %rcx
mov %rdi, %rdx
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 7d528889..ddec7f04 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr
+tests += tst-size_t-memchr tst-size_t-memcmp
endif
ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
new file mode 100644
index 00000000..9bd6fdb4
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
@@ -0,0 +1,76 @@
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_MAIN
+#ifdef WIDE
+# define TEST_NAME "wmemcmp"
+#else
+# define TEST_NAME "memcmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <inttypes.h>
+# include <wchar.h>
+
+# define MEMCMP wmemcmp
+# define CHAR wchar_t
+#else
+# define MEMCMP memcmp
+# define CHAR char
+#endif
+
+IMPL (MEMCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_memcmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ memcpy (buf1, buf2, page_size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_memcmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
new file mode 100644
index 00000000..e8b5ffd0
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
@@ -0,0 +1,20 @@
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memcmp.c"
--
GitLab

View File

@ -0,0 +1,164 @@
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sun, 7 Mar 2021 09:45:23 -0800
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
Content-type: text/plain; charset=UTF-8
Update ifunc-memmove.h to select the function optimized with AVX512
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
AVX512VL since VZEROUPPER isn't needed at function exit.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
3 files changed, 42 insertions(+), 19 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d969a156..fec384f6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memmove_chk,
CPU_FEATURE_USABLE (AVX),
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memmove_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memmove,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memmove_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
__memmove_ssse3_back)
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__memcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__memcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
IFUNC_IMPL_ADD (array, i, memcpy, 1,
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_chk_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
CPU_FEATURE_USABLE (AVX),
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX512F),
__mempcpy_avx512_no_vzeroupper)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, mempcpy,
- CPU_FEATURE_USABLE (AVX512F),
+ CPU_FEATURE_USABLE (AVX512VL),
__mempcpy_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, mempcpy,
CPU_FEATURE_USABLE (AVX),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
index fa09b9fb..014e95c7 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx512_no_vzeroupper);
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE (avx512_unaligned_erms);
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
- return OPTIMIZE (avx512_unaligned_erms);
+ return OPTIMIZE (avx512_unaligned);
+ }
- return OPTIMIZE (avx512_unaligned);
+ return OPTIMIZE (avx512_no_vzeroupper);
}
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
index aac1515c..848848ab 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
@@ -1,11 +1,32 @@
#if IS_IN (libc)
# define VEC_SIZE 64
-# define VEC(i) zmm##i
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define YMM0 ymm16
+# define YMM1 ymm17
+# define VEC0 zmm16
+# define VEC1 zmm17
+# define VEC2 zmm18
+# define VEC3 zmm19
+# define VEC4 zmm20
+# define VEC5 zmm21
+# define VEC6 zmm22
+# define VEC7 zmm23
+# define VEC8 zmm24
+# define VEC9 zmm25
+# define VEC10 zmm26
+# define VEC11 zmm27
+# define VEC12 zmm28
+# define VEC13 zmm29
+# define VEC14 zmm30
+# define VEC15 zmm31
+# define VEC(i) VEC##i
# define VMOVNT vmovntdq
# define VMOVU vmovdqu64
# define VMOVA vmovdqa64
+# define VZEROUPPER
-# define SECTION(p) p##.avx512
+# define SECTION(p) p##.evex512
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
# include "memmove-vec-unaligned-erms.S"
--
GitLab

View File

@ -0,0 +1,71 @@
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
From: Sunil K Pandey <skpgkp2@gmail.com>
Date: Thu, 1 Apr 2021 15:47:04 -0700
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
Content-type: text/plain; charset=UTF-8
Fix some indentations of ifdef in file strlen-evex.S which are off by 1
and confusing to read.
---
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
1 file changed, 8 insertions(+), 8 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index cd022509..05838190 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -276,10 +276,10 @@ L(last_2x_vec):
.p2align 4
L(first_vec_x0_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
.p2align 4
L(first_vec_x2_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
.p2align 4
L(first_vec_x3_check):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %eax
-# endif
+# endif
/* Check the end of data. */
cmpq %rax, %rsi
jbe L(max)
--
GitLab

View File

@ -0,0 +1,51 @@
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 07:07:21 -0700
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
Content-type: text/plain; charset=UTF-8
Since __strlen_evex and __strnlen_evex added by
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Fri Mar 5 06:24:52 2021 -0800
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
use sarx:
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
ifunc-avx2.h already requires BMI2 for EVEX implementation.
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
1 file changed, 4 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index fec384f6..cbfc1a5d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_evex)
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_evex)
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
--
GitLab

View File

@ -0,0 +1,584 @@
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:01:58 -0400
Subject: [PATCH] x86: Optimize memchr-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-avx2.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
asaving a few instructions the in loop return loop. test-memchr,
test-rawmemchr, and test-wmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
1 file changed, 247 insertions(+), 178 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index cf893e77..b377f22e 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -26,8 +26,22 @@
# ifdef USE_AS_WMEMCHR
# define VPCMPEQ vpcmpeqd
+# define VPBROADCAST vpbroadcastd
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
+# define VPBROADCAST vpbroadcastb
+# define CHAR_SIZE 1
+# endif
+
+# ifdef USE_AS_RAWMEMCHR
+# define ERAW_PTR_REG ecx
+# define RRAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define ERAW_PTR_REG edi
+# define RRAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
# endif
# ifndef VZEROUPPER
@@ -39,6 +53,7 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
test %RDX_LP, %RDX_LP
jz L(null)
# endif
- movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
- vmovd %esi, %xmm0
# ifdef USE_AS_WMEMCHR
shl $2, %RDX_LP
- vpbroadcastd %xmm0, %ymm0
# else
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- vpbroadcastb %xmm0, %ymm0
# endif
+ /* Broadcast CHAR to YMMMATCH. */
+ vmovd %esi, %xmm0
+ VPBROADCAST %xmm0, %ymm0
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
+ VPCMPEQ (%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $VEC_SIZE, %rdx
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax), %rax
+ cmovle %rcx, %rax
+ VZEROUPPER_RETURN
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
+L(null):
+ xorl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
-
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is necessary
+ for computer return address if byte is found or adjusting length
+ if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
+ rdi for rawmemchr. */
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Calculate length until end of page (length checked for a
+ match). */
+ leaq 1(%ALGN_PTR_REG), %rsi
+ subq %RRAW_PTR_REG, %rsi
+# endif
/* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
+ sarxl %ERAW_PTR_REG, %eax, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
# endif
- addq %rdi, %rax
- addq %rcx, %rax
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+ addq %RRAW_PTR_REG, %rax
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ incq %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 2 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 4
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+# ifndef USE_AS_RAWMEMCHR
+L(cross_page_continue):
+ /* Align data to VEC_SIZE - 1. */
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ orq $(VEC_SIZE - 1), %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# else
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
# ifndef USE_AS_RAWMEMCHR
+ /* Check if at last VEC_SIZE * 4 length. */
subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ jbe L(last_4x_vec_or_less_cmpeq)
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
+ length. */
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
addq %rcx, %rdx
+# else
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
-
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
vpor %ymm1, %ymm2, %ymm5
vpor %ymm3, %ymm4, %ymm6
vpor %ymm5, %ymm6, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ vpmovmskb %ymm5, %ecx
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
- ja L(loop_4x_vec)
+ testl %ecx, %ecx
+ jnz L(loop_4x_vec_end)
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $(VEC_SIZE * 4), %rdx
+ ja L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+ .p2align 4
+L(last_4x_vec_or_less):
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(first_vec_x1_check)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+ /* If remaining length > VEC_SIZE * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jg L(last_4x_vec)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+L(last_2x_vec):
+ /* If remaining length < VEC_SIZE. */
+ addl $VEC_SIZE, %edx
+ jle L(zero_end)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- xorl %eax, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ addq $(VEC_SIZE + 1), %rdi
+ addq %rdi, %rax
+L(zero_end):
VZEROUPPER_RETURN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMPEQ (%rdi), %ymm0, %ymm1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
vpmovmskb %ymm1, %eax
testl %eax, %eax
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- VZEROUPPER_RETURN
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0_check):
- tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ vpmovmskb %ymm3, %eax
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 2 - 1), %rdi
+# else
+ subq $-(VEC_SIZE * 2 + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
L(first_vec_x1_check):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+ /* Adjust length. */
+ subl $-(VEC_SIZE * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ incq %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+ .p2align 4
+L(set_zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 4 - 1), %rdi
+# else
+ incq %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
+# ifdef USE_AS_RAWMEMCHR
+ subq $(VEC_SIZE * 3 - 1), %rdi
+# else
+ subq $-(VEC_SIZE + 1), %rdi
+# endif
addq %rdi, %rax
VZEROUPPER_RETURN
+# ifndef USE_AS_RAWMEMCHR
.p2align 4
-L(zero):
- xorl %eax, %eax
- jmp L(return_vzeroupper)
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(VEC_SIZE * 2), %edx
+ jle L(last_2x_vec)
.p2align 4
-L(null):
- xorl %eax, %eax
- ret
-# endif
+L(last_4x_vec):
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- .p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- VZEROUPPER_RETURN
+ /* Create mask for possible matches within remaining length. */
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
- .p2align 4
-L(first_vec_x2):
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
+ remaining length was found to be > VEC_SIZE * 2. */
+ subl $VEC_SIZE, %edx
+ jbe L(zero_end2)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Shift remaining length mask for last VEC. */
+ shrq $32, %rcx
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ addq $(VEC_SIZE * 3 + 1), %rdi
addq %rdi, %rax
+L(zero_end2):
VZEROUPPER_RETURN
.p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- vpmovmskb %ymm2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
+ subq $-(VEC_SIZE * 2 + 1), %rdi
addq %rdi, %rax
VZEROUPPER_RETURN
+# endif
END (MEMCHR)
#endif
--
GitLab

View File

@ -0,0 +1,388 @@
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 9 Jun 2021 16:25:32 -0400
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on n * sizeof(wchar_t) which was not guranteed by the standard.
The new overflow tests added in the previous commit now
pass (As well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
2 files changed, 98 insertions(+), 37 deletions(-)
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
index cb320257..24f9a0c5 100644
--- a/sysdeps/x86_64/memchr.S
+++ b/sysdeps/x86_64/memchr.S
@@ -21,9 +21,11 @@
#ifdef USE_AS_WMEMCHR
# define MEMCHR wmemchr
# define PCMPEQ pcmpeqd
+# define CHAR_PER_VEC 4
#else
# define MEMCHR memchr
# define PCMPEQ pcmpeqb
+# define CHAR_PER_VEC 16
#endif
/* fast SSE2 version with using pmaxub and 64 byte loop */
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
movd %esi, %xmm1
mov %edi, %ecx
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+#endif
#ifdef USE_AS_WMEMCHR
test %RDX_LP, %RDX_LP
jz L(return_null)
- shl $2, %RDX_LP
#else
-# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
-# endif
punpcklbw %xmm1, %xmm1
test %RDX_LP, %RDX_LP
jz L(return_null)
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
test %eax, %eax
jnz L(matches_1)
- sub $16, %rdx
+ sub $CHAR_PER_VEC, %rdx
jbe L(return_null)
add $16, %rdi
and $15, %ecx
and $-16, %rdi
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
jmp L(loop_prolog)
@@ -77,16 +81,21 @@ L(crosscache):
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
-/* Check if there is a match. */
+ /* Check if there is a match. */
pmovmskb %xmm0, %eax
-/* Remove the leading bytes. */
+ /* Remove the leading bytes. */
sar %cl, %eax
test %eax, %eax
je L(unaligned_no_match)
-/* Check which byte is a match. */
+ /* Check which byte is a match. */
bsf %eax, %eax
-
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
add %rcx, %rax
@@ -94,15 +103,18 @@ L(crosscache):
.p2align 4
L(unaligned_no_match):
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
possible addition overflow. */
neg %rcx
add $16, %rcx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
sub %rcx, %rdx
jbe L(return_null)
add $16, %rdi
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
.p2align 4
@@ -135,7 +147,7 @@ L(loop_prolog):
test $0x3f, %rdi
jz L(align64_loop)
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
@@ -167,11 +179,14 @@ L(loop_prolog):
mov %rdi, %rcx
and $-64, %rdi
and $63, %ecx
+#ifdef USE_AS_WMEMCHR
+ shr $2, %ecx
+#endif
add %rcx, %rdx
.p2align 4
L(align64_loop):
- sub $64, %rdx
+ sub $(CHAR_PER_VEC * 4), %rdx
jbe L(exit_loop)
movdqa (%rdi), %xmm0
movdqa 16(%rdi), %xmm2
@@ -218,7 +233,7 @@ L(align64_loop):
.p2align 4
L(exit_loop):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
jle L(exit_loop_32)
movdqa (%rdi), %xmm0
@@ -238,7 +253,7 @@ L(exit_loop):
pmovmskb %xmm3, %eax
test %eax, %eax
jnz L(matches32_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jle L(return_null)
PCMPEQ 48(%rdi), %xmm1
@@ -250,13 +265,13 @@ L(exit_loop):
.p2align 4
L(exit_loop_32):
- add $32, %edx
+ add $(CHAR_PER_VEC * 2), %edx
movdqa (%rdi), %xmm0
PCMPEQ %xmm1, %xmm0
pmovmskb %xmm0, %eax
test %eax, %eax
jnz L(matches_1)
- sub $16, %edx
+ sub $CHAR_PER_VEC, %edx
jbe L(return_null)
PCMPEQ 16(%rdi), %xmm1
@@ -293,7 +308,13 @@ L(matches32):
.p2align 4
L(matches_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
add %rdi, %rax
ret
@@ -301,7 +322,13 @@ L(matches_1):
.p2align 4
L(matches16_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 16(%rdi, %rax), %rax
ret
@@ -309,7 +336,13 @@ L(matches16_1):
.p2align 4
L(matches32_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 32(%rdi, %rax), %rax
ret
@@ -317,7 +350,13 @@ L(matches32_1):
.p2align 4
L(matches48_1):
bsf %eax, %eax
+#ifdef USE_AS_WMEMCHR
+ mov %eax, %esi
+ shr $2, %esi
+ sub %rsi, %rdx
+#else
sub %rax, %rdx
+#endif
jbe L(return_null)
lea 48(%rdi, %rax), %rax
ret
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
index b377f22e..16027abb 100644
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
@@ -54,21 +54,19 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
- test %RDX_LP, %RDX_LP
- jz L(null)
-# endif
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
# ifdef __ILP32__
- /* Clear the upper 32 bits. */
- movl %edx, %edx
+ /* Clear upper bits. */
+ and %RDX_LP, %RDX_LP
+# else
+ test %RDX_LP, %RDX_LP
# endif
+ jz L(null)
# endif
/* Broadcast CHAR to YMMMATCH. */
vmovd %esi, %xmm0
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
vpmovmskb %ymm1, %eax
# ifndef USE_AS_RAWMEMCHR
/* If length < CHAR_PER_VEC handle special. */
- cmpq $VEC_SIZE, %rdx
+ cmpq $CHAR_PER_VEC, %rdx
jbe L(first_vec_x0)
# endif
testl %eax, %eax
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
L(first_vec_x0):
/* Check if first match was before length. */
tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
xorl %ecx, %ecx
cmpl %eax, %edx
leaq (%rdi, %rax), %rax
@@ -110,12 +112,12 @@ L(null):
# endif
.p2align 4
L(cross_page_boundary):
- /* Save pointer before aligning as its original value is necessary
- for computer return address if byte is found or adjusting length
- if it is not and this is memchr. */
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
movq %rdi, %rcx
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
- rdi for rawmemchr. */
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
+ and rdi for rawmemchr. */
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
@@ -124,6 +126,10 @@ L(cross_page_boundary):
match). */
leaq 1(%ALGN_PTR_REG), %rsi
subq %RRAW_PTR_REG, %rsi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %esi
+# endif
# endif
/* Remove the leading bytes. */
sarxl %ERAW_PTR_REG, %eax, %eax
@@ -181,6 +187,10 @@ L(cross_page_continue):
orq $(VEC_SIZE - 1), %rdi
/* esi is for adjusting length to see if near the end. */
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
# else
orq $(VEC_SIZE - 1), %rdi
L(cross_page_continue):
@@ -213,7 +223,7 @@ L(cross_page_continue):
# ifndef USE_AS_RAWMEMCHR
/* Check if at last VEC_SIZE * 4 length. */
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
length. */
@@ -221,6 +231,10 @@ L(cross_page_continue):
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
addq %rcx, %rdx
# else
/* Align data to VEC_SIZE * 4 - 1 for loop. */
@@ -250,15 +264,19 @@ L(loop_4x_vec):
subq $-(VEC_SIZE * 4), %rdi
- subq $(VEC_SIZE * 4), %rdx
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
- /* Fall through into less than 4 remaining vectors of length case.
- */
+ /* Fall through into less than 4 remaining vectors of length
+ case. */
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
.p2align 4
L(last_4x_vec_or_less):
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
/* Check if first VEC contained match. */
testl %eax, %eax
jnz L(first_vec_x1_check)
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
vpmovmskb %ymm1, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %edx
+# endif
subq $-(VEC_SIZE * 4), %rdi
/* Check first VEC regardless. */
testl %eax, %eax
--
GitLab

View File

@ -0,0 +1,767 @@
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:07 -0400
Subject: [PATCH] x86: Optimize strlen-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strlen-avx2.S. The optimizations are
mostly small things but they add up to roughly 10-30% performance
improvement for strlen. The results for strnlen are bit more
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
2 files changed, 334 insertions(+), 214 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index cbfc1a5d..f1a6460a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
IFUNC_IMPL_ADD (array, i, strlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strlen_avx2)
IFUNC_IMPL_ADD (array, i, strlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strlen,
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
IFUNC_IMPL (i, name, strnlen,
IFUNC_IMPL_ADD (array, i, strnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strnlen_avx2)
IFUNC_IMPL_ADD (array, i, strnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strnlen,
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
IFUNC_IMPL (i, name, wcslen,
IFUNC_IMPL_ADD (array, i, wcslen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcslen_avx2)
IFUNC_IMPL_ADD (array, i, wcslen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcslen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcslen,
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
IFUNC_IMPL (i, name, wcsnlen,
IFUNC_IMPL_ADD (array, i, wcsnlen,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcsnlen_avx2)
IFUNC_IMPL_ADD (array, i, wcsnlen,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcsnlen_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcsnlen,
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index 82826e10..be8a5db5 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -27,9 +27,11 @@
# ifdef USE_AS_WCSLEN
# define VPCMPEQ vpcmpeqd
# define VPMINU vpminud
+# define CHAR_SIZE 4
# else
# define VPCMPEQ vpcmpeqb
# define VPMINU vpminub
+# define CHAR_SIZE 1
# endif
# ifndef VZEROUPPER
@@ -41,349 +43,459 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
+ mov %RSI_LP, %R8_LP
# ifdef USE_AS_WCSLEN
shl $2, %RSI_LP
# elif defined __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
- mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
+ movl %edi, %eax
movq %rdi, %rdx
vpxor %xmm0, %xmm0, %xmm0
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
-
+ VPCMPEQ (%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < VEC_SIZE handle special. */
+ cmpq $VEC_SIZE, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* If empty continue to aligned_more. Otherwise return bit
+ position of first match. */
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
- testl %eax, %eax
- jz L(aligned_more)
+L(first_vec_x1):
tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 4 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ incl %edi
+ addl %edi, %eax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ shrl $2, %eax
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
+ VZEROUPPER_RETURN
.p2align 4
-L(aligned_more):
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 3 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE * 2 + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 2 + 1), %edi
+ addl %edi, %eax
+# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ subl $(VEC_SIZE + 1), %ecx
+ addl %ecx, %eax
+# else
+ subl %edx, %edi
+ addl $(VEC_SIZE * 3 + 1), %edi
+ addl %edi, %eax
# endif
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
+ VZEROUPPER_RETURN
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
+ code on the x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+# ifdef USE_AS_STRNLEN
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
+ it simplies the logic in last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
+ subq %rdx, %rcx
+# endif
+ /* Load first VEC regardless. */
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ incq %rdi
+ movl %edi, %ecx
+ orq $(VEC_SIZE * 4 - 1), %rdi
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* Readjust length. */
addq %rcx, %rsi
+# else
+ incq %rdi
+ orq $(VEC_SIZE * 4 - 1), %rdi
# endif
-
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm1
- vmovdqa VEC_SIZE(%rdi), %ymm2
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
- VPMINU %ymm1, %ymm2, %ymm5
- VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
-
- VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
-
-L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there where no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ vmovdqa 1(%rdi), %ymm1
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
+
+ VPMINU %ymm2, %ymm4, %ymm5
+ VPCMPEQ %ymm5, %ymm0, %ymm5
+ vpmovmskb %ymm5, %ecx
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm1, %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ subq %rdx, %rdi
testl %eax, %eax
+ jnz L(last_vec_return_x0)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
-
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ VPCMPEQ %ymm2, %ymm0, %ymm2
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
-
- jnz L(first_vec_x3_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
+ jnz L(last_vec_return_x1)
+
+ /* Combine last 2 VEC. */
+ VPCMPEQ %ymm3, %ymm0, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used if
+ the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2 - 1), %rdi
+ addq %rdi, %rax
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
- VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ subq $-(VEC_SIZE * 4), %rdi
+L(last_4x_vec_or_less_cmpeq):
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
+L(last_4x_vec_or_less):
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(VEC_SIZE * 2), %esi
+ jnz L(last_4x_vec)
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ /* length may have been negative or positive by an offset of
+ VEC_SIZE * 4 depending on where this was called from. This fixes
+ that. */
+ andl $(VEC_SIZE * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- VZEROUPPER_RETURN
+ jnz L(last_vec_x1_check)
- .p2align 4
-L(first_vec_x0_check):
+ subl $VEC_SIZE, %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
+# endif
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_return_x0):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
+ subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_return_x1):
tzcntl %eax, %eax
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
+ subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
+# ifdef USE_AS_STRNLEN
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x1_check):
+
tzcntl %eax, %eax
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
L(max):
movq %r8, %rax
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
+
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+ /* Normalize length. */
+ andl $(VEC_SIZE * 4 - 1), %esi
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x3)
+
+ subl $(VEC_SIZE * 3), %esi
+ jb L(max)
+
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 3 + 1), %eax
+ addq %rdi, %rax
# ifdef USE_AS_WCSLEN
shrq $2, %rax
# endif
VZEROUPPER_RETURN
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
.p2align 4
-L(first_vec_x0):
+L(last_vec_x1):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
+ subq %rdx, %rdi
+ incl %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x1):
+L(last_vec_x2):
+ /* essentially duplicates of first_vec_x1 but use 64 bit
+ instructions. */
tzcntl %eax, %eax
- addq $VEC_SIZE, %rax
+ subq %rdx, %rdi
+ addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
+L(last_vec_x3):
tzcntl %eax, %eax
- addq $(VEC_SIZE * 2), %rax
+ subl $(VEC_SIZE * 2), %esi
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
+ addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
+# ifdef USE_AS_WCSLEN
shrq $2, %rax
-# endif
+# endif
+ VZEROUPPER_RETURN
+L(max_end):
+ movq %r8, %rax
VZEROUPPER_RETURN
+# endif
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(4x_vec_end):
- VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+L(cross_page_boundary):
+ /* Align data to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod rdx. */
+ sarxl %edx, %eax, %eax
+# ifdef USE_AS_STRNLEN
testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
+ jnz L(cross_page_less_vec)
+ leaq 1(%rdi), %rcx
+ subq %rdx, %rcx
+ /* Check length. */
+ cmpq %rsi, %rcx
+ jb L(cross_page_continue)
+ movq %r8, %rax
+# else
testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMPEQ %ymm4, %ymm0, %ymm4
- vpmovmskb %ymm4, %eax
-L(first_vec_x3):
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
# endif
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+
+# ifdef USE_AS_STRNLEN
+ .p2align 4
+L(cross_page_less_vec):
+ tzcntl %eax, %eax
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
+# ifdef USE_AS_WCSLEN
+ shrl $2, %eax
+# endif
VZEROUPPER_RETURN
+# endif
END (STRLEN)
#endif
--
GitLab

View File

@ -0,0 +1,701 @@
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 3 May 2021 03:03:19 -0400
Subject: [PATCH] x86: Optimize memchr-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memchr-evex.S. The optimizations include
replacing some branches with cmovcc, avoiding some branches entirely
in the less_4x_vec case, making the page cross logic less strict,
saving some ALU in the alignment process, and most importantly
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
test-wmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
1 file changed, 322 insertions(+), 225 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 6dd5d67b..81d5cd64 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -26,14 +26,28 @@
# ifdef USE_AS_WMEMCHR
# define VPBROADCAST vpbroadcastd
-# define VPCMP vpcmpd
-# define SHIFT_REG r8d
+# define VPMINU vpminud
+# define VPCMP vpcmpd
+# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
-# define VPCMP vpcmpb
-# define SHIFT_REG ecx
+# define VPMINU vpminub
+# define VPCMP vpcmpb
+# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
# endif
+# ifdef USE_AS_RAWMEMCHR
+# define RAW_PTR_REG rcx
+# define ALGN_PTR_REG rdi
+# else
+# define RAW_PTR_REG rdi
+# define ALGN_PTR_REG rcx
+# endif
+
+# define XMMZERO xmm23
+# define YMMZERO ymm23
# define XMMMATCH xmm16
# define YMMMATCH ymm16
# define YMM1 ymm17
@@ -44,6 +58,8 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+# define PAGE_SIZE 4096
.section .text.evex,"ax",@progbits
ENTRY (MEMCHR)
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
/* Check for zero length. */
test %RDX_LP, %RDX_LP
jz L(zero)
-# endif
- movl %edi, %ecx
-# ifdef USE_AS_WMEMCHR
- shl $2, %RDX_LP
-# else
+
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
/* Broadcast CHAR to YMMMATCH. */
VPBROADCAST %esi, %YMMMATCH
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
-
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
# ifndef USE_AS_RAWMEMCHR
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rdx
- jbe L(zero)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
- jnz L(first_vec_x0)
+ addq %rdi, %rax
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ ret
# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
- addq %rcx, %rdx
-
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
- jmp L(more_4x_vec)
+L(zero):
+ xorl %eax, %eax
+ ret
+ .p2align 5
+L(first_vec_x0):
+ /* Check if first match was before length. */
+ tzcntl %eax, %eax
+ xorl %ecx, %ecx
+ cmpl %eax, %edx
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ cmovle %rcx, %rax
+ ret
+# else
+ /* NB: first_vec_x0 is 17 bytes which will leave
+ cross_page_boundary (which is relatively cold) close enough
+ to ideal alignment. So only realign L(cross_page_boundary) if
+ rawmemchr. */
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
+# endif
+L(cross_page_boundary):
+ /* Save pointer before aligning as its original value is
+ necessary for computer return address if byte is found or
+ adjusting length if it is not and this is memchr. */
+ movq %rdi, %rcx
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
+ for rawmemchr. */
+ andq $-VEC_SIZE, %ALGN_PTR_REG
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
+ kmovd %k0, %r8d
# ifdef USE_AS_WMEMCHR
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %eax
+# endif
+# ifndef USE_AS_RAWMEMCHR
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
+ subl %eax, %esi
# endif
- andq $-VEC_SIZE, %rdi
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
- tzcntl %eax, %eax
# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+ andl $(CHAR_PER_VEC - 1), %eax
# endif
+ /* Remove the leading bytes. */
+ sarxl %eax, %r8d, %eax
# ifndef USE_AS_RAWMEMCHR
/* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
+ cmpq %rsi, %rdx
+ jbe L(first_vec_x0)
+# endif
+ testl %eax, %eax
+ jz L(cross_page_continue)
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
+# else
+ addq %RAW_PTR_REG, %rax
# endif
- addq %rdi, %rax
- addq %rcx, %rax
ret
.p2align 4
-L(aligned_more):
-# ifndef USE_AS_RAWMEMCHR
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
- overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Check the end of data. */
- subq %rcx, %rdx
- jbe L(zero)
-# endif
+ .p2align 4
+L(first_vec_x2):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- addq $VEC_SIZE, %rdi
+ .p2align 4
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Align data to VEC_SIZE. */
+L(cross_page_continue):
+ xorl %ecx, %ecx
+ subl %edi, %ecx
+ andq $-VEC_SIZE, %rdi
+ /* esi is for adjusting length to see if near the end. */
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
+# ifdef USE_AS_WMEMCHR
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %esi
+# endif
+# else
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+# endif
+ /* Load first VEC regardless. */
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+# ifndef USE_AS_RAWMEMCHR
+ /* Adjust length. If near end handle specially. */
+ subq %rsi, %rdx
+ jbe L(last_4x_vec_or_less)
+# endif
testl %eax, %eax
jnz L(first_vec_x1)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x2)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+
# ifndef USE_AS_RAWMEMCHR
- subq $(VEC_SIZE * 4), %rdx
- jbe L(last_4x_vec_or_less)
-# endif
+ /* Check if at last CHAR_PER_VEC * 4 length. */
+ subq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_4x_vec_or_less_cmpeq)
+ addq $VEC_SIZE, %rdi
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
+ */
+# ifdef USE_AS_WMEMCHR
+ movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
-
-# ifndef USE_AS_RAWMEMCHR
- /* Adjust length. */
+ andl $(VEC_SIZE * 4 - 1), %ecx
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
addq %rcx, %rdx
+# else
+ addq %rdi, %rdx
+ andq $-(4 * VEC_SIZE), %rdi
+ subq %rdi, %rdx
+# endif
+# else
+ addq $VEC_SIZE, %rdi
+ andq $-(4 * VEC_SIZE), %rdi
# endif
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
- kord %k1, %k2, %k5
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
-
- kord %k3, %k4, %k6
- kortestd %k5, %k6
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
-
+ /* It would be possible to save some instructions using 4x VPCMP
+ but bottleneck on port 5 makes it not woth it. */
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
+ /* xor will set bytes match esi to zero. */
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
- jmp L(loop_4x_vec)
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k2, %k3
+ jz L(loop_4x_vec)
# else
- subq $(VEC_SIZE * 4), %rdx
+ kortestd %k2, %k3
+ jnz L(loop_4x_vec_end)
+
+ subq $-(VEC_SIZE * 4), %rdi
+
+ subq $(CHAR_PER_VEC * 4), %rdx
ja L(loop_4x_vec)
+ /* Fall through into less than 4 remaining vectors of length case.
+ */
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ addq $(VEC_SIZE * 3), %rdi
+ .p2align 4
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %edx
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ /* Check if first VEC contained match. */
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(first_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* If remaining length > CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jg L(last_4x_vec)
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
+L(last_2x_vec):
+ /* If remaining length < CHAR_PER_VEC. */
+ addl $CHAR_PER_VEC, %edx
+ jle L(zero_end)
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ /* Check VEC2 and compare any match with remaining length. */
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end):
+ ret
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ .p2align 4
+L(first_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Adjust length. */
+ subl $-(CHAR_PER_VEC * 4), %edx
+ /* Check if match within remaining length. */
+ cmpl %eax, %edx
+ jbe L(set_zero_end)
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+L(set_zero_end):
xorl %eax, %eax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %edx
- VPCMP $0, (%rdi), %YMMMATCH, %k1
+L(loop_4x_vec_end):
+# endif
+ /* rawmemchr will fall through into this if match was found in
+ loop. */
+
+ /* k1 has not of matches with VEC1. */
kmovd %k1, %eax
- testl %eax, %eax
+# ifdef USE_AS_WMEMCHR
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
+# else
+ incl %eax
+# endif
+ jnz L(last_vec_x1_return)
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %edx
- jle L(zero)
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2_return)
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
- kmovd %k1, %eax
+ kmovd %k2, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- xorl %eax, %eax
- ret
+ jnz L(last_vec_x3_return)
- .p2align 4
-L(first_vec_x0_check):
+ kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $VEC_SIZE, %rax
+# ifdef USE_AS_RAWMEMCHR
+# ifdef USE_AS_WMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+# else
addq %rdi, %rax
- ret
-
- .p2align 4
-L(first_vec_x2_check):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# endif
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
+# else
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
# endif
- /* Check the end of data. */
- cmpq %rax, %rdx
- jbe L(zero)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
ret
.p2align 4
-L(zero):
- xorl %eax, %eax
- ret
-# endif
-
- .p2align 4
-L(first_vec_x0):
+L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+# ifdef USE_AS_RAWMEMCHR
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
+ /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
# endif
ret
+
+# ifndef USE_AS_RAWMEMCHR
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check first VEC regardless. */
+ testl %eax, %eax
+ jnz L(first_vec_x1_check)
+
+ /* If remaining length <= CHAR_PER_VEC * 2. */
+ addl $(CHAR_PER_VEC * 2), %edx
+ jle L(last_2x_vec)
+
.p2align 4
-L(first_vec_x1):
+L(last_4x_vec):
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(last_vec_x2)
+
+
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Create mask for possible matches within remaining length. */
+# ifdef USE_AS_WMEMCHR
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
+ bzhil %edx, %ecx, %ecx
+# else
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+# endif
+ /* Test matches in data against length match. */
+ andl %ecx, %eax
+ jnz L(last_vec_x3)
+
+ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
+ remaining length was found to be > CHAR_PER_VEC * 2. */
+ subl $CHAR_PER_VEC, %edx
+ jbe L(zero_end2)
+
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ kmovd %k0, %eax
+ /* Shift remaining length mask for last VEC. */
+# ifdef USE_AS_WMEMCHR
+ shrl $CHAR_PER_VEC, %ecx
+# else
+ shrq $CHAR_PER_VEC, %rcx
+# endif
+ andl %ecx, %eax
+ jz L(zero_end2)
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+L(zero_end2):
ret
- .p2align 4
-L(first_vec_x2):
+L(last_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
-L(4x_vec_end):
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- testl %eax, %eax
-L(first_vec_x3):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
-# endif
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
ret
+# endif
END (MEMCHR)
#endif
--
GitLab

View File

@ -0,0 +1,30 @@
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
From: Alice Xu <alice.d.xu@gmail.com>
Date: Fri, 7 May 2021 19:03:21 -0700
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
Content-type: text/plain; charset=UTF-8
An unknown vector operation occurred in commit 2a76821c308. Fixed it
by using "ymm{k1}{z}" but not "ymm {k1} {z}".
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index 81d5cd64..f3fdad4f 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -271,7 +271,7 @@ L(loop_4x_vec):
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
--
GitLab

View File

@ -0,0 +1,566 @@
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 22 Jun 2021 20:42:10 -0700
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
Content-type: text/plain; charset=UTF-8
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
This also removes the unused symbols, __GI___strlen_sse2 and
__GI___wcsnlen_sse4_1.
---
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
sysdeps/x86_64/strlen.S | 243 +-------------------
4 files changed, 262 insertions(+), 242 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
Conflicts:
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
(Copyright dates, URL)
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
index 7bc57b8d..449c8a7f 100644
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
@@ -20,4 +20,4 @@
# define strlen __strlen_sse2
#endif
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
new file mode 100644
index 00000000..8f660bb9
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -0,0 +1,257 @@
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifdef AS_WCSLEN
+# define PMINU pminud
+# define PCMPEQ pcmpeqd
+# define SHIFT_RETURN shrq $2, %rax
+#else
+# define PMINU pminub
+# define PCMPEQ pcmpeqb
+# define SHIFT_RETURN
+#endif
+
+/* Long lived register in strlen(s), strnlen(s, n) are:
+
+ %xmm3 - zero
+ %rdi - s
+ %r10 (s+n) & (~(64-1))
+ %r11 s+n
+*/
+
+
+.text
+ENTRY(strlen)
+
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
+#define FIND_ZERO \
+ PCMPEQ (%rax), %xmm0; \
+ PCMPEQ 16(%rax), %xmm1; \
+ PCMPEQ 32(%rax), %xmm2; \
+ PCMPEQ 48(%rax), %xmm3; \
+ pmovmskb %xmm0, %esi; \
+ pmovmskb %xmm1, %edx; \
+ pmovmskb %xmm2, %r8d; \
+ pmovmskb %xmm3, %ecx; \
+ salq $16, %rdx; \
+ salq $16, %rcx; \
+ orq %rsi, %rdx; \
+ orq %r8, %rcx; \
+ salq $32, %rcx; \
+ orq %rcx, %rdx;
+
+#ifdef AS_STRNLEN
+/* Do not read anything when n==0. */
+ test %RSI_LP, %RSI_LP
+ jne L(n_nonzero)
+ xor %rax, %rax
+ ret
+L(n_nonzero):
+# ifdef AS_WCSLEN
+ shl $2, %RSI_LP
+# endif
+
+/* Initialize long lived registers. */
+
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
+#endif
+
+ pxor %xmm0, %xmm0
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+ movq %rdi, %rax
+ movq %rdi, %rcx
+ andq $4095, %rcx
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
+ cmpq $4047, %rcx
+/* We cannot unify this branching as it would be ~6 cycles slower. */
+ ja L(cross_page)
+
+#ifdef AS_STRNLEN
+/* Test if end is among first 64 bytes. */
+# define STRNLEN_PROLOG \
+ mov %r11, %rsi; \
+ subq %rax, %rsi; \
+ andq $-64, %rax; \
+ testq $-64, %rsi; \
+ je L(strnlen_ret)
+#else
+# define STRNLEN_PROLOG andq $-64, %rax;
+#endif
+
+/* Ignore bits in mask that come before start of string. */
+#define PROLOG(lab) \
+ movq %rdi, %rcx; \
+ xorq %rax, %rcx; \
+ STRNLEN_PROLOG; \
+ sarq %cl, %rdx; \
+ test %rdx, %rdx; \
+ je L(lab); \
+ bsfq %rdx, %rax; \
+ SHIFT_RETURN; \
+ ret
+
+#ifdef AS_STRNLEN
+ andq $-16, %rax
+ FIND_ZERO
+#else
+ /* Test first 16 bytes unaligned. */
+ movdqu (%rax), %xmm4
+ PCMPEQ %xmm0, %xmm4
+ pmovmskb %xmm4, %edx
+ test %edx, %edx
+ je L(next48_bytes)
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
+ SHIFT_RETURN
+ ret
+
+L(next48_bytes):
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
+ andq $-16, %rax
+ PCMPEQ 16(%rax), %xmm1
+ PCMPEQ 32(%rax), %xmm2
+ PCMPEQ 48(%rax), %xmm3
+ pmovmskb %xmm1, %edx
+ pmovmskb %xmm2, %r8d
+ pmovmskb %xmm3, %ecx
+ salq $16, %rdx
+ salq $16, %rcx
+ orq %r8, %rcx
+ salq $32, %rcx
+ orq %rcx, %rdx
+#endif
+
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
+ zero them. */
+ PROLOG(loop)
+
+ .p2align 4
+L(cross_page):
+ andq $-64, %rax
+ FIND_ZERO
+ PROLOG(loop_init)
+
+#ifdef AS_STRNLEN
+/* We must do this check to correctly handle strnlen (s, -1). */
+L(strnlen_ret):
+ bts %rsi, %rdx
+ sarq %cl, %rdx
+ test %rdx, %rdx
+ je L(loop_init)
+ bsfq %rdx, %rax
+ SHIFT_RETURN
+ ret
+#endif
+ .p2align 4
+L(loop_init):
+ pxor %xmm1, %xmm1
+ pxor %xmm2, %xmm2
+ pxor %xmm3, %xmm3
+#ifdef AS_STRNLEN
+ .p2align 4
+L(loop):
+
+ addq $64, %rax
+ cmpq %rax, %r10
+ je L(exit_end)
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit)
+ jmp L(loop)
+
+ .p2align 4
+L(exit_end):
+ cmp %rax, %r11
+ je L(first) /* Do not read when end is at page boundary. */
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+L(first):
+ bts %r11, %rdx
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+ .p2align 4
+L(exit):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#else
+
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
+ .p2align 4
+L(loop):
+
+ movdqa 64(%rax), %xmm0
+ PMINU 80(%rax), %xmm0
+ PMINU 96(%rax), %xmm0
+ PMINU 112(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit64)
+
+ subq $-128, %rax
+
+ movdqa (%rax), %xmm0
+ PMINU 16(%rax), %xmm0
+ PMINU 32(%rax), %xmm0
+ PMINU 48(%rax), %xmm0
+ PCMPEQ %xmm3, %xmm0
+ pmovmskb %xmm0, %edx
+ testl %edx, %edx
+ jne L(exit0)
+ jmp L(loop)
+
+ .p2align 4
+L(exit64):
+ addq $64, %rax
+L(exit0):
+ pxor %xmm0, %xmm0
+ FIND_ZERO
+
+ bsfq %rdx, %rdx
+ addq %rdx, %rax
+ subq %rdi, %rax
+ SHIFT_RETURN
+ ret
+
+#endif
+
+END(strlen)
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
index a8cab0cb..5fa51fe0 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
@@ -2,4 +2,4 @@
#define AS_STRNLEN
#define strlen __wcsnlen_sse4_1
-#include "../strlen.S"
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index f845f3d4..ad047d84 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -1,5 +1,5 @@
-/* SSE2 version of strlen/wcslen.
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
+/* SSE2 version of strlen.
+ Copyright (C) 2021 Free Software Foundation, Inc.
This file is part of the GNU C Library.
The GNU C Library is free software; you can redistribute it and/or
@@ -16,243 +16,6 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
-#include <sysdep.h>
+#include "multiarch/strlen-vec.S"
-#ifdef AS_WCSLEN
-# define PMINU pminud
-# define PCMPEQ pcmpeqd
-# define SHIFT_RETURN shrq $2, %rax
-#else
-# define PMINU pminub
-# define PCMPEQ pcmpeqb
-# define SHIFT_RETURN
-#endif
-
-/* Long lived register in strlen(s), strnlen(s, n) are:
-
- %xmm3 - zero
- %rdi - s
- %r10 (s+n) & (~(64-1))
- %r11 s+n
-*/
-
-
-.text
-ENTRY(strlen)
-
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
-#define FIND_ZERO \
- PCMPEQ (%rax), %xmm0; \
- PCMPEQ 16(%rax), %xmm1; \
- PCMPEQ 32(%rax), %xmm2; \
- PCMPEQ 48(%rax), %xmm3; \
- pmovmskb %xmm0, %esi; \
- pmovmskb %xmm1, %edx; \
- pmovmskb %xmm2, %r8d; \
- pmovmskb %xmm3, %ecx; \
- salq $16, %rdx; \
- salq $16, %rcx; \
- orq %rsi, %rdx; \
- orq %r8, %rcx; \
- salq $32, %rcx; \
- orq %rcx, %rdx;
-
-#ifdef AS_STRNLEN
-/* Do not read anything when n==0. */
- test %RSI_LP, %RSI_LP
- jne L(n_nonzero)
- xor %rax, %rax
- ret
-L(n_nonzero):
-# ifdef AS_WCSLEN
- shl $2, %RSI_LP
-# endif
-
-/* Initialize long lived registers. */
-
- add %RDI_LP, %RSI_LP
- mov %RSI_LP, %R10_LP
- and $-64, %R10_LP
- mov %RSI_LP, %R11_LP
-#endif
-
- pxor %xmm0, %xmm0
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
- movq %rdi, %rax
- movq %rdi, %rcx
- andq $4095, %rcx
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
- cmpq $4047, %rcx
-/* We cannot unify this branching as it would be ~6 cycles slower. */
- ja L(cross_page)
-
-#ifdef AS_STRNLEN
-/* Test if end is among first 64 bytes. */
-# define STRNLEN_PROLOG \
- mov %r11, %rsi; \
- subq %rax, %rsi; \
- andq $-64, %rax; \
- testq $-64, %rsi; \
- je L(strnlen_ret)
-#else
-# define STRNLEN_PROLOG andq $-64, %rax;
-#endif
-
-/* Ignore bits in mask that come before start of string. */
-#define PROLOG(lab) \
- movq %rdi, %rcx; \
- xorq %rax, %rcx; \
- STRNLEN_PROLOG; \
- sarq %cl, %rdx; \
- test %rdx, %rdx; \
- je L(lab); \
- bsfq %rdx, %rax; \
- SHIFT_RETURN; \
- ret
-
-#ifdef AS_STRNLEN
- andq $-16, %rax
- FIND_ZERO
-#else
- /* Test first 16 bytes unaligned. */
- movdqu (%rax), %xmm4
- PCMPEQ %xmm0, %xmm4
- pmovmskb %xmm4, %edx
- test %edx, %edx
- je L(next48_bytes)
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
- SHIFT_RETURN
- ret
-
-L(next48_bytes):
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
- andq $-16, %rax
- PCMPEQ 16(%rax), %xmm1
- PCMPEQ 32(%rax), %xmm2
- PCMPEQ 48(%rax), %xmm3
- pmovmskb %xmm1, %edx
- pmovmskb %xmm2, %r8d
- pmovmskb %xmm3, %ecx
- salq $16, %rdx
- salq $16, %rcx
- orq %r8, %rcx
- salq $32, %rcx
- orq %rcx, %rdx
-#endif
-
- /* When no zero byte is found xmm1-3 are zero so we do not have to
- zero them. */
- PROLOG(loop)
-
- .p2align 4
-L(cross_page):
- andq $-64, %rax
- FIND_ZERO
- PROLOG(loop_init)
-
-#ifdef AS_STRNLEN
-/* We must do this check to correctly handle strnlen (s, -1). */
-L(strnlen_ret):
- bts %rsi, %rdx
- sarq %cl, %rdx
- test %rdx, %rdx
- je L(loop_init)
- bsfq %rdx, %rax
- SHIFT_RETURN
- ret
-#endif
- .p2align 4
-L(loop_init):
- pxor %xmm1, %xmm1
- pxor %xmm2, %xmm2
- pxor %xmm3, %xmm3
-#ifdef AS_STRNLEN
- .p2align 4
-L(loop):
-
- addq $64, %rax
- cmpq %rax, %r10
- je L(exit_end)
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit)
- jmp L(loop)
-
- .p2align 4
-L(exit_end):
- cmp %rax, %r11
- je L(first) /* Do not read when end is at page boundary. */
- pxor %xmm0, %xmm0
- FIND_ZERO
-
-L(first):
- bts %r11, %rdx
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
- .p2align 4
-L(exit):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#else
-
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
- .p2align 4
-L(loop):
-
- movdqa 64(%rax), %xmm0
- PMINU 80(%rax), %xmm0
- PMINU 96(%rax), %xmm0
- PMINU 112(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit64)
-
- subq $-128, %rax
-
- movdqa (%rax), %xmm0
- PMINU 16(%rax), %xmm0
- PMINU 32(%rax), %xmm0
- PMINU 48(%rax), %xmm0
- PCMPEQ %xmm3, %xmm0
- pmovmskb %xmm0, %edx
- testl %edx, %edx
- jne L(exit0)
- jmp L(loop)
-
- .p2align 4
-L(exit64):
- addq $64, %rax
-L(exit0):
- pxor %xmm0, %xmm0
- FIND_ZERO
-
- bsfq %rdx, %rdx
- addq %rdx, %rax
- subq %rdi, %rax
- SHIFT_RETURN
- ret
-
-#endif
-
-END(strlen)
libc_hidden_builtin_def (strlen)
--
GitLab

View File

@ -0,0 +1,181 @@
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:19:34 -0400
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
Content-type: text/plain; charset=UTF-8
No bug. This comment adds the ifunc / build infrastructure
necessary for wcslen to prefer the sse4.1 implementation
in strlen-vec.S. test-wcslen.c is passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 4 +-
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
6 files changed, 63 insertions(+), 36 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 491c7698..65fde4eb 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcscpy-ssse3 wcscpy-c \
wcschr-sse2 wcschr-avx2 \
wcsrchr-sse2 wcsrchr-avx2 \
- wcsnlen-sse4_1 wcsnlen-c \
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
wcschr-avx2-rtm \
wcscmp-avx2-rtm \
wcslen-avx2-rtm \
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index f1a6460a..580913ca 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
+ CPU_FEATURE_USABLE (SSE4_1),
+ __wcsnlen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
new file mode 100644
index 00000000..39e33473
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
@@ -0,0 +1,52 @@
+/* Common definition for ifunc selections for wcslen and wcsnlen
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ return OPTIMIZE (evex);
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
+ return OPTIMIZE (sse4_1);
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
new file mode 100644
index 00000000..7e62621a
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
@@ -0,0 +1,4 @@
+#define AS_WCSLEN
+#define strlen __wcslen_sse4_1
+
+#include "strlen-vec.S"
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
index 6d06e47c..3b04b75b 100644
--- a/sysdeps/x86_64/multiarch/wcslen.c
+++ b/sysdeps/x86_64/multiarch/wcslen.c
@@ -24,7 +24,7 @@
# undef __wcslen
# define SYMBOL_NAME wcslen
-# include "ifunc-avx2.h"
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
weak_alias (__wcslen, wcslen);
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
index 20b731ae..06736410 100644
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
@@ -24,39 +24,7 @@
# undef __wcsnlen
# define SYMBOL_NAME wcsnlen
-# include <init-arch.h>
-
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
-
-static inline void *
-IFUNC_SELECTOR (void)
-{
- const struct cpu_features* cpu_features = __get_cpu_features ();
-
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
- {
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
- return OPTIMIZE (evex);
-
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
- return OPTIMIZE (avx2_rtm);
-
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
- return OPTIMIZE (avx2);
- }
-
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
- return OPTIMIZE (sse4_1);
-
- return OPTIMIZE (sse2);
-}
+# include "ifunc-wcslen.h"
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
weak_alias (__wcsnlen, wcsnlen);
--
GitLab

View File

@ -0,0 +1,396 @@
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:27:25 -0800
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
Likewise.
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
tst-size_t-wmemchr.
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
---
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
sysdeps/x86_64/x32/Makefile | 2 +-
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
6 files changed, 122 insertions(+), 42 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
index 3cd11233..568eebd3 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
index 0240bfa3..0bd5ee99 100644
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
@@ -45,28 +45,33 @@
.section .text.ssse3,"ax",@progbits
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
ENTRY (MEMPCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMPCPY_CHK)
ENTRY (MEMPCPY)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY)
#endif
#if !defined USE_AS_BCOPY
ENTRY (MEMCPY_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMCPY_CHK)
#endif
ENTRY (MEMCPY)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
#ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
+#endif
+
+#ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
#endif
#ifdef USE_AS_MEMMOVE
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
index effc3ac2..6ca2bbc9 100644
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
@@ -24,27 +24,31 @@
.section .text.avx512,"ax",@progbits
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_avx512_no_vzeroupper)
ENTRY (__mempcpy_avx512_no_vzeroupper)
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (__mempcpy_avx512_no_vzeroupper)
ENTRY (__memmove_chk_avx512_no_vzeroupper)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_avx512_no_vzeroupper)
ENTRY (__memmove_avx512_no_vzeroupper)
- mov %rdi, %rax
+ mov %RDI_LP, %RAX_LP
# ifdef USE_AS_MEMPCPY
- add %rdx, %rax
+ add %RDX_LP, %RAX_LP
# endif
L(start):
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
lea (%rsi, %rdx), %rcx
lea (%rdi, %rdx), %r9
cmp $512, %rdx
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c952576c..274aa1c7 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -95,20 +95,20 @@
.section SECTION(.text),"ax",@progbits
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
#endif
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start)
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
#endif
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
movq %rdi, %rax
L(start):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(more_2x_vec)
#if !defined USE_MULTIARCH || !IS_IN (libc)
L(last_2x_vec):
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
# if VEC_SIZE == 16
ENTRY (__mempcpy_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__mempcpy_chk_erms)
/* Only used to measure performance of REP MOVSB. */
ENTRY (__mempcpy_erms)
- movq %rdi, %rax
+ mov %RDI_LP, %RAX_LP
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
- addq %rdx, %rax
+ add %RDX_LP, %RAX_LP
jmp L(start_movsb)
END (__mempcpy_erms)
ENTRY (__memmove_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memmove_chk_erms)
ENTRY (__memmove_erms)
movq %rdi, %rax
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jz 2f
L(start_movsb):
- movq %rdx, %rcx
- cmpq %rsi, %rdi
+ mov %RDX_LP, %RCX_LP
+ cmp %RSI_LP, %RDI_LP
jb 1f
/* Source == destination is less common. */
je 2f
- leaq (%rsi,%rcx), %rdx
- cmpq %rdx, %rdi
+ lea (%rsi,%rcx), %RDX_LP
+ cmp %RDX_LP, %RDI_LP
jb L(movsb_backward)
1:
rep movsb
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
# endif
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
- movq %rdi, %rax
- addq %rdx, %rax
+ mov %RDI_LP, %RAX_LP
+ add %RDX_LP, %RAX_LP
jmp L(start_erms)
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
# ifdef SHARED
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
# endif
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
movq %rdi, %rax
L(start_erms):
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(movsb_more_2x_vec)
L(last_2x_vec):
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -236,7 +244,7 @@ L(movsb):
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
1:
- movq %rdx, %rcx
+ mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
ret
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index ddec7f04..2fe1e5ac 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
new file mode 100644
index 00000000..66b71e17
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
@@ -0,0 +1,58 @@
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "memcpy"
+#include "test-size_t.h"
+
+IMPL (memcpy, 1)
+
+typedef void *(*proto_t) (void *, const void *, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memcpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_memcpy (dest, src);
+ int res = memcmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

View File

@ -0,0 +1,497 @@
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 01:56:29 -0400
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
#27974]
Content-type: text/plain; charset=UTF-8
This commit fixes the bug mentioned in the previous commit.
The previous implementations of wmemchr in these files relied
on maxlen * sizeof(wchar_t) which was not guranteed by the standard.
The new overflow tests added in the previous commit now
pass (As well as all the other tests).
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
2 files changed, 107 insertions(+), 38 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index be8a5db5..37688966 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -44,21 +44,21 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section SECTION(.text),"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check zero length. */
+# ifdef __ILP32__
+ /* Clear upper bits. */
+ and %RSI_LP, %RSI_LP
+# else
test %RSI_LP, %RSI_LP
+# endif
jz L(zero)
/* Store max len in R8_LP before adjusting if using WCSLEN. */
mov %RSI_LP, %R8_LP
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
- /* Clear the upper 32 bits. */
- movl %esi, %esi
-# endif
# endif
movl %edi, %eax
movq %rdi, %rdx
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
/* Check the first VEC_SIZE bytes. */
VPCMPEQ (%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
# ifdef USE_AS_STRNLEN
/* If length < VEC_SIZE handle special. */
- cmpq $VEC_SIZE, %rsi
+ cmpq $CHAR_PER_VEC, %rsi
jbe L(first_vec_x0)
# endif
/* If empty continue to aligned_more. Otherwise return bit
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
jz L(aligned_more)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -97,9 +98,14 @@ L(zero):
L(first_vec_x0):
/* Set bit for max len so that tzcnt will return min of max len
and position of first match. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
btsq %rsi, %rax
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -113,14 +119,19 @@ L(first_vec_x1):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 4 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
incl %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -133,14 +144,19 @@ L(first_vec_x2):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 3 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -153,14 +169,19 @@ L(first_vec_x3):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE * 2 + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 2 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -173,14 +194,19 @@ L(first_vec_x4):
# ifdef USE_AS_STRNLEN
/* Use ecx which was computed earlier to compute correct value.
*/
+# ifdef USE_AS_WCSLEN
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
+# else
subl $(VEC_SIZE + 1), %ecx
addl %ecx, %eax
+# endif
# else
subl %edx, %edi
addl $(VEC_SIZE * 3 + 1), %edi
addl %edi, %eax
# endif
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrl $2, %eax
# endif
VZEROUPPER_RETURN
@@ -195,10 +221,14 @@ L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
# ifdef USE_AS_STRNLEN
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
- it simplies the logic in last_4x_vec_or_less. */
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
+ because it simplies the logic in last_4x_vec_or_less. */
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
# endif
/* Load first VEC regardless. */
VPCMPEQ 1(%rdi), %ymm0, %ymm1
@@ -207,34 +237,38 @@ L(cross_page_continue):
subq %rcx, %rsi
jb L(last_4x_vec_or_less)
# endif
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x2)
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x3)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x4)
/* Align data to VEC_SIZE * 4 - 1. */
# ifdef USE_AS_STRNLEN
/* Before adjusting length check if at last VEC_SIZE * 4. */
- cmpq $(VEC_SIZE * 4 - 1), %rsi
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
jbe L(last_4x_vec_or_less_load)
incq %rdi
movl %edi, %ecx
orq $(VEC_SIZE * 4 - 1), %rdi
andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
/* Readjust length. */
addq %rcx, %rsi
# else
@@ -246,13 +280,13 @@ L(cross_page_continue):
L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
/* Break if at end of length. */
- subq $(VEC_SIZE * 4), %rsi
+ subq $(CHAR_PER_VEC * 4), %rsi
jb L(last_4x_vec_or_less_cmpeq)
# endif
- /* Save some code size by microfusing VPMINU with the load. Since
- the matches in ymm2/ymm4 can only be returned if there where no
- matches in ymm1/ymm3 respectively there is no issue with overlap.
- */
+ /* Save some code size by microfusing VPMINU with the load.
+ Since the matches in ymm2/ymm4 can only be returned if there
+ where no matches in ymm1/ymm3 respectively there is no issue
+ with overlap. */
vmovdqa 1(%rdi), %ymm1
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
@@ -260,7 +294,7 @@ L(loop_4x_vec):
VPMINU %ymm2, %ymm4, %ymm5
VPCMPEQ %ymm5, %ymm0, %ymm5
- vpmovmskb %ymm5, %ecx
+ vpmovmskb %ymm5, %ecx
subq $-(VEC_SIZE * 4), %rdi
testl %ecx, %ecx
@@ -268,27 +302,28 @@ L(loop_4x_vec):
VPCMPEQ %ymm1, %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
subq %rdx, %rdi
testl %eax, %eax
jnz L(last_vec_return_x0)
VPCMPEQ %ymm2, %ymm0, %ymm2
- vpmovmskb %ymm2, %eax
+ vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(last_vec_return_x1)
/* Combine last 2 VEC. */
VPCMPEQ %ymm3, %ymm0, %ymm3
- vpmovmskb %ymm3, %eax
- /* rcx has combined result from all 4 VEC. It will only be used if
- the first 3 other VEC all did not contain a match. */
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
salq $32, %rcx
orq %rcx, %rax
tzcntq %rax, %rax
subq $(VEC_SIZE * 2 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -297,15 +332,19 @@ L(loop_4x_vec):
# ifdef USE_AS_STRNLEN
.p2align 4
L(last_4x_vec_or_less_load):
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
+ */
subq $-(VEC_SIZE * 4), %rdi
L(last_4x_vec_or_less_cmpeq):
VPCMPEQ 1(%rdi), %ymm0, %ymm1
L(last_4x_vec_or_less):
-
- vpmovmskb %ymm1, %eax
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
- VEC_SIZE * 4. */
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
+ vpmovmskb %ymm1, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
+ by VEC_SIZE * 4. */
testl $(VEC_SIZE * 2), %esi
jnz L(last_4x_vec)
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
jb L(max)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
subq $(VEC_SIZE * 4 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
subq $(VEC_SIZE * 3 - 1), %rdi
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -381,14 +424,14 @@ L(last_4x_vec):
jnz L(last_vec_x1)
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x2)
/* Normalize length. */
andl $(VEC_SIZE * 4 - 1), %esi
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(last_vec_x3)
@@ -396,7 +439,7 @@ L(last_4x_vec):
jb L(max)
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
tzcntl %eax, %eax
/* Check the end of data. */
cmpl %eax, %esi
@@ -405,6 +448,7 @@ L(last_4x_vec):
addl $(VEC_SIZE * 3 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -419,6 +463,7 @@ L(last_vec_x1):
incl %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -432,6 +477,7 @@ L(last_vec_x2):
addl $(VEC_SIZE + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -447,6 +493,7 @@ L(last_vec_x3):
addl $(VEC_SIZE * 2 + 1), %eax
addq %rdi, %rax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
shrq $2, %rax
# endif
VZEROUPPER_RETURN
@@ -455,13 +502,13 @@ L(max_end):
VZEROUPPER_RETURN
# endif
- /* Cold case for crossing page with first load. */
+ /* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
/* Align data to VEC_SIZE - 1. */
orq $(VEC_SIZE - 1), %rdi
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
- vpmovmskb %ymm1, %eax
+ vpmovmskb %ymm1, %eax
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
so no need to manually mod rdx. */
sarxl %edx, %eax, %eax
@@ -470,6 +517,10 @@ L(cross_page_boundary):
jnz L(cross_page_less_vec)
leaq 1(%rdi), %rcx
subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get wchar_t count. */
+ shrl $2, %ecx
+# endif
/* Check length. */
cmpq %rsi, %rcx
jb L(cross_page_continue)
@@ -479,6 +530,7 @@ L(cross_page_boundary):
jz L(cross_page_continue)
tzcntl %eax, %eax
# ifdef USE_AS_WCSLEN
+ /* NB: Divide length by 4 to get wchar_t count. */
shrl $2, %eax
# endif
# endif
@@ -489,6 +541,10 @@ L(return_vzeroupper):
.p2align 4
L(cross_page_less_vec):
tzcntl %eax, %eax
+# ifdef USE_AS_WCSLEN
+ /* NB: Multiply length by 4 to get byte count. */
+ sall $2, %esi
+# endif
cmpq %rax, %rsi
cmovb %esi, %eax
# ifdef USE_AS_WCSLEN
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 8f660bb9..439e486a 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -65,12 +65,25 @@ ENTRY(strlen)
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shl $2, %RSI_LP
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
+ overflow the only way this program doesn't have undefined behavior
+ is if there is a null terminator in valid memory so wcslen will
+ suffice. */
+ mov %RSI_LP, %R10_LP
+ sar $62, %R10_LP
+ test %R10_LP, %R10_LP
+ jnz __wcslen_sse4_1
+ sal $2, %RSI_LP
# endif
+
/* Initialize long lived registers. */
add %RDI_LP, %RSI_LP
+# ifdef AS_WCSLEN
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
+ jbe __wcslen_sse4_1
+# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
GitLab

View File

@ -0,0 +1,745 @@
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 19:36:06 -0400
Subject: [PATCH] x86: Optimize strlen-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strlen-evex.S. The
optimizations are mostly small things but they add up to roughly
10-30% performance improvement for strlen. The results for strnlen are
bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
test-wcsnlen are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
1 file changed, 317 insertions(+), 264 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
index 05838190..4bf6874b 100644
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
@@ -29,11 +29,13 @@
# ifdef USE_AS_WCSLEN
# define VPCMP vpcmpd
# define VPMINU vpminud
-# define SHIFT_REG r9d
+# define SHIFT_REG ecx
+# define CHAR_SIZE 4
# else
# define VPCMP vpcmpb
# define VPMINU vpminub
-# define SHIFT_REG ecx
+# define SHIFT_REG edx
+# define CHAR_SIZE 1
# endif
# define XMMZERO xmm16
@@ -46,132 +48,165 @@
# define YMM6 ymm22
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
- /* Check for zero length. */
+ /* Check zero length. */
test %RSI_LP, %RSI_LP
jz L(zero)
-# ifdef USE_AS_WCSLEN
- shl $2, %RSI_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %esi, %esi
# endif
mov %RSI_LP, %R8_LP
# endif
- movl %edi, %ecx
- movq %rdi, %rdx
+ movl %edi, %eax
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
-
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. */
+ andl $(PAGE_SIZE - 1), %eax
/* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
null byte. */
VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
-
# ifdef USE_AS_STRNLEN
- jnz L(first_vec_x0_check)
- /* Adjust length and check the end of data. */
- subq $VEC_SIZE, %rsi
- jbe L(max)
-# else
- jnz L(first_vec_x0)
+ /* If length < CHAR_PER_VEC handle special. */
+ cmpq $CHAR_PER_VEC, %rsi
+ jbe L(first_vec_x0)
# endif
-
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
+ testl %eax, %eax
+ jz L(aligned_more)
+ tzcntl %eax, %eax
+ ret
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
- addq %rcx, %rsi
+L(zero):
+ xorl %eax, %eax
+ ret
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ .p2align 4
+L(first_vec_x0):
+ /* Set bit for max len so that tzcnt will return min of max len
+ and position of first match. */
+ btsq %rsi, %rax
+ tzcntl %eax, %eax
+ ret
# endif
- jmp L(more_4x_vec)
.p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
-# ifdef USE_AS_WCSLEN
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
- bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+L(first_vec_x1):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
+# ifdef USE_AS_STRNLEN
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
# endif
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
+ ret
- /* Remove the leading bytes. */
- sarxl %SHIFT_REG, %eax, %eax
- testl %eax, %eax
- jz L(aligned_more)
+ .p2align 4
+L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
-# endif
- addq %rdi, %rax
- addq %rcx, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
# endif
ret
.p2align 4
-L(aligned_more):
+L(first_vec_x3):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
- to void possible addition overflow. */
- negq %rcx
- addq $VEC_SIZE, %rcx
-
- /* Check the end of data. */
- subq %rcx, %rsi
- jbe L(max)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
# endif
+ ret
- addq $VEC_SIZE, %rdi
-
+ .p2align 4
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ /* Safe to use 32 bit instructions as these are only called for
+ size = [1, 159]. */
# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
+ /* Use ecx which was computed earlier to compute correct value.
+ */
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
+# else
+ subl %edx, %edi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %edi
+# endif
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
# endif
+ ret
-L(more_4x_vec):
+ .p2align 5
+L(aligned_more):
+ movq %rdi, %rdx
+ /* Align data to VEC_SIZE. */
+ andq $-(VEC_SIZE), %rdi
+L(cross_page_continue):
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
since data is only aligned to VEC_SIZE. */
- VPCMP $0, (%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
+# ifdef USE_AS_STRNLEN
+ /* + CHAR_SIZE because it simplies the logic in
+ last_4x_vec_or_less. */
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
+ subq %rdx, %rcx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
+# endif
+ /* Load first VEC regardless. */
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+# ifdef USE_AS_STRNLEN
+ /* Adjust length. If near end handle specially. */
+ subq %rcx, %rsi
+ jb L(last_4x_vec_or_less)
+# endif
kmovd %k0, %eax
testl %eax, %eax
jnz L(first_vec_x1)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
+ test %eax, %eax
jnz L(first_vec_x2)
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
@@ -179,258 +214,276 @@ L(more_4x_vec):
testl %eax, %eax
jnz L(first_vec_x3)
- addq $(VEC_SIZE * 4), %rdi
-
-# ifdef USE_AS_STRNLEN
- subq $(VEC_SIZE * 4), %rsi
- jbe L(last_4x_vec_or_less)
-# endif
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ addq $VEC_SIZE, %rdi
# ifdef USE_AS_STRNLEN
- /* Adjust length. */
+ /* Check if at last VEC_SIZE * 4 length. */
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
+ jbe L(last_4x_vec_or_less_load)
+ movl %edi, %ecx
+ andl $(VEC_SIZE * 4 - 1), %ecx
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarl $2, %ecx
+# endif
+ /* Readjust length. */
addq %rcx, %rsi
# endif
+ /* Align data to VEC_SIZE * 4. */
+ andq $-(VEC_SIZE * 4), %rdi
+ /* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VMOVA (%rdi), %YMM1
- VMOVA VEC_SIZE(%rdi), %YMM2
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
-
- VPMINU %YMM1, %YMM2, %YMM5
- VPMINU %YMM3, %YMM4, %YMM6
+ /* Load first VEC regardless. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+# ifdef USE_AS_STRNLEN
+ /* Break if at end of length. */
+ subq $(CHAR_PER_VEC * 4), %rsi
+ jb L(last_4x_vec_or_less_cmpeq)
+# endif
+ /* Save some code size by microfusing VPMINU with the load. Since
+ the matches in ymm2/ymm4 can only be returned if there where no
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
+ */
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
+
+ VPCMP $0, %YMM2, %YMMZERO, %k0
+ VPCMP $0, %YMM4, %YMMZERO, %k1
+ subq $-(VEC_SIZE * 4), %rdi
+ kortestd %k0, %k1
+ jz L(loop_4x_vec)
+
+ /* Check if end was in first half. */
+ kmovd %k0, %eax
+ subq %rdx, %rdi
+# ifdef USE_AS_WCSLEN
+ shrq $2, %rdi
+# endif
+ testl %eax, %eax
+ jz L(second_vec_return)
- VPMINU %YMM5, %YMM6, %YMM5
- VPCMP $0, %YMM5, %YMMZERO, %k0
- ktestd %k0, %k0
- jnz L(4x_vec_end)
+ VPCMP $0, %YMM1, %YMMZERO, %k2
+ kmovd %k2, %edx
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
+# ifdef USE_AS_WCSLEN
+ sall $CHAR_PER_VEC, %eax
+ orl %edx, %eax
+ tzcntl %eax, %eax
+# else
+ salq $CHAR_PER_VEC, %rax
+ orq %rdx, %rax
+ tzcntq %rax, %rax
+# endif
+ addq %rdi, %rax
+ ret
- addq $(VEC_SIZE * 4), %rdi
-# ifndef USE_AS_STRNLEN
- jmp L(loop_4x_vec)
-# else
- subq $(VEC_SIZE * 4), %rsi
- ja L(loop_4x_vec)
+# ifdef USE_AS_STRNLEN
+L(last_4x_vec_or_less_load):
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+L(last_4x_vec_or_less_cmpeq):
+ VPCMP $0, %YMM1, %YMMZERO, %k0
+ addq $(VEC_SIZE * 3), %rdi
L(last_4x_vec_or_less):
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
- addl $(VEC_SIZE * 2), %esi
- jle L(last_2x_vec)
-
- VPCMP $0, (%rdi), %YMMZERO, %k0
kmovd %k0, %eax
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
+ VEC_SIZE * 4. */
+ testl $(CHAR_PER_VEC * 2), %esi
+ jnz L(last_4x_vec)
+
+ /* length may have been negative or positive by an offset of
+ CHAR_PER_VEC * 4 depending on where this was called from. This
+ fixes that. */
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(last_vec_x1_check)
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
+ /* Check the end of data. */
+ subl $CHAR_PER_VEC, %esi
+ jb L(max)
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x2_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x3_check)
+ subq %rdx, %rdi
+# ifdef USE_AS_WCSLEN
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
+# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ ret
+L(max):
movq %r8, %rax
+ ret
+# endif
+
+ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
+ in the 4x VEC loop can use 2 byte encoding. */
+ .p2align 4
+L(second_vec_return):
+ VPCMP $0, %YMM3, %YMMZERO, %k0
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
+# ifdef USE_AS_WCSLEN
+ kunpckbw %k0, %k1, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
+# else
+ kunpckdq %k0, %k1, %k0
+ kmovq %k0, %rax
+ tzcntq %rax, %rax
+# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
+ ret
+
+
+# ifdef USE_AS_STRNLEN
+L(last_vec_x1_check):
+ tzcntl %eax, %eax
+ /* Check the end of data. */
+ cmpl %eax, %esi
+ jb L(max)
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
-L(last_2x_vec):
- addl $(VEC_SIZE * 2), %esi
+L(last_4x_vec):
+ /* Test first 2x VEC normally. */
+ testl %eax, %eax
+ jnz L(last_vec_x1)
- VPCMP $0, (%rdi), %YMMZERO, %k0
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x0_check)
- subl $VEC_SIZE, %esi
- jle L(max)
+ jnz L(last_vec_x2)
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
+ /* Normalize length. */
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x1_check)
- movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- ret
+ jnz L(last_vec_x3)
- .p2align 4
-L(first_vec_x0_check):
+ /* Check the end of data. */
+ subl $(CHAR_PER_VEC * 3), %esi
+ jb L(max)
+
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq %rdi, %rax
- subq %rdx, %rax
+ cmpl %eax, %esi
+ jb L(max_end)
+
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x1_check):
+L(last_vec_x1):
tzcntl %eax, %eax
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x2_check):
+L(last_vec_x2):
tzcntl %eax, %eax
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- /* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
ret
.p2align 4
-L(first_vec_x3_check):
+L(last_vec_x3):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
+ subl $(CHAR_PER_VEC * 2), %esi
/* Check the end of data. */
- cmpq %rax, %rsi
- jbe L(max)
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
+ cmpl %eax, %esi
+ jb L(max_end)
+ subq %rdx, %rdi
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
+ sarq $2, %rdi
# endif
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
ret
-
- .p2align 4
-L(max):
+L(max_end):
movq %r8, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
ret
# endif
+ /* Cold case for crossing page with first load. */
.p2align 4
-L(first_vec_x0):
- tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq %rdi, %rax
- subq %rdx, %rax
+L(cross_page_boundary):
+ movq %rdi, %rdx
+ /* Align data to VEC_SIZE. */
+ andq $-VEC_SIZE, %rdi
+ VPCMP $0, (%rdi), %YMMZERO, %k0
+ kmovd %k0, %eax
+ /* Remove the leading bytes. */
# ifdef USE_AS_WCSLEN
- shrq $2, %rax
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
+ bytes. */
+ movl %edx, %ecx
+ shrl $2, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
# endif
- ret
-
- .p2align 4
-L(first_vec_x1):
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
+ sarxl %SHIFT_REG, %eax, %eax
+ testl %eax, %eax
+# ifndef USE_AS_STRNLEN
+ jz L(cross_page_continue)
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
ret
-
- .p2align 4
-L(first_vec_x2):
- tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
+# else
+ jnz L(cross_page_less_vec)
+# ifndef USE_AS_WCSLEN
+ movl %edx, %ecx
+ andl $(CHAR_PER_VEC - 1), %ecx
+# endif
+ movl $CHAR_PER_VEC, %eax
+ subl %ecx, %eax
+ /* Check the end of data. */
+ cmpq %rax, %rsi
+ ja L(cross_page_continue)
+ movl %esi, %eax
ret
-
- .p2align 4
-L(4x_vec_end):
- VPCMP $0, %YMM1, %YMMZERO, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
- VPCMP $0, %YMM2, %YMMZERO, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
- VPCMP $0, %YMM3, %YMMZERO, %k2
- kmovd %k2, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
- VPCMP $0, %YMM4, %YMMZERO, %k3
- kmovd %k3, %eax
-L(first_vec_x3):
+L(cross_page_less_vec):
tzcntl %eax, %eax
-# ifdef USE_AS_WCSLEN
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- sall $2, %eax
-# endif
- addq $(VEC_SIZE * 3), %rax
- addq %rdi, %rax
- subq %rdx, %rax
-# ifdef USE_AS_WCSLEN
- shrq $2, %rax
-# endif
+ /* Select min of length and position of first null. */
+ cmpq %rax, %rsi
+ cmovb %esi, %eax
ret
+# endif
END (STRLEN)
#endif
--
GitLab

View File

@ -0,0 +1,158 @@
From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Wed, 30 Jun 2021 10:47:06 -0700
Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
Content-type: text/plain; charset=UTF-8
From
https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
* Intel TSX will be disabled by default.
* The processor will force abort all Restricted Transactional Memory (RTM)
transactions by default.
* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
which is set to indicate to updated software that the loaded microcode is
forcing RTM abort.
* On processors that enumerate support for RTM, the CPUID enumeration bits
for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
be set by default after microcode update.
* Workloads that were benefited from Intel TSX might experience a change
in performance.
* System software may use a new bit in Model-Specific Register (MSR) 0x10F
TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
Elision (HLE) and RTM bits to indicate to software that Intel TSX is
disabled.
1. Add RTM_ALWAYS_ABORT to CPUID features.
2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the
string/tst-memchr-rtm etc. testcases on the affected processors, which
always fail after a microcde update.
3. Check RTM feature, instead of usability, against /proc/cpuinfo.
This fixes BZ #28033.
---
manual/platform.texi | 3 +++
sysdeps/x86/cpu-features.c | 5 ++++-
sysdeps/x86/sys/platform/x86.h | 6 +++---
sysdeps/x86/tst-cpu-features-supports.c | 2 +-
sysdeps/x86/tst-get-cpu-features.c | 2 ++
5 files changed, 13 insertions(+), 5 deletions(-)
Conflicts:
sysdeps/x86/bits/platform/x86.h
(doesn't exist)
sysdeps/x86/bits/platform/x86.h
(account for lack of upstream renames)
diff --git a/manual/platform.texi b/manual/platform.texi
index 8fec2933..b7e8aef7 100644
--- a/manual/platform.texi
+++ b/manual/platform.texi
@@ -510,6 +510,9 @@ capability.
@item
@code{RTM} -- RTM instruction extensions.
+@item
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
+
@item
@code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 3610ee5c..4889f062 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
CPU_FEATURE_SET_USABLE (cpu_features, HLE);
CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
- CPU_FEATURE_SET_USABLE (cpu_features, RTM);
CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
CPU_FEATURE_SET_USABLE (cpu_features, ADX);
CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
@@ -779,6 +779,9 @@ no_cpuid:
GLRO(dl_platform) = "i586";
#endif
+ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM);
+
#if CET_ENABLED
# if HAVE_TUNABLES
TUNABLE_GET (x86_ibt, tunable_val_t *,
diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
index e5cc7c68..7a434926 100644
--- a/sysdeps/x86/sys/platform/x86.h
+++ b/sysdeps/x86/sys/platform/x86.h
@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
#define bit_cpu_INDEX_7_EDX_9 (1u << 9)
#define bit_cpu_MD_CLEAR (1u << 10)
-#define bit_cpu_INDEX_7_EDX_11 (1u << 11)
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
#define bit_cpu_INDEX_7_EDX_12 (1u << 12)
#define bit_cpu_INDEX_7_EDX_13 (1u << 13)
#define bit_cpu_SERIALIZE (1u << 14)
@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7
#define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7
-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
#define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
#define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7
@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
#define reg_AVX512_VP2INTERSECT edx
#define reg_INDEX_7_EDX_9 edx
#define reg_MD_CLEAR edx
-#define reg_INDEX_7_EDX_11 edx
+#define reg_RTM_ALWAYS_ABORT edx
#define reg_INDEX_7_EDX_12 edx
#define reg_INDEX_7_EDX_13 edx
#define reg_SERIALIZE edx
diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
index 287cf01f..8100a319 100644
--- a/sysdeps/x86/tst-cpu-features-supports.c
+++ b/sysdeps/x86/tst-cpu-features-supports.c
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
fails += CHECK_SUPPORTS (rdpid, RDPID);
fails += CHECK_SUPPORTS (rdrnd, RDRAND);
fails += CHECK_SUPPORTS (rdseed, RDSEED);
- fails += CHECK_SUPPORTS (rtm, RTM);
+ fails += CHECK_CPU_SUPPORTS (rtm, RTM);
fails += CHECK_SUPPORTS (serialize, SERIALIZE);
fails += CHECK_SUPPORTS (sha, SHA);
fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
index 2763deb6..0717e5d8 100644
--- a/sysdeps/x86/tst-get-cpu-features.c
+++ b/sysdeps/x86/tst-get-cpu-features.c
@@ -183,6 +183,7 @@ do_test (void)
CHECK_CPU_FEATURE (UINTR);
CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
CHECK_CPU_FEATURE (MD_CLEAR);
+ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
CHECK_CPU_FEATURE (SERIALIZE);
CHECK_CPU_FEATURE (HYBRID);
CHECK_CPU_FEATURE (TSXLDTRK);
@@ -344,6 +345,7 @@ do_test (void)
CHECK_CPU_FEATURE_USABLE (FSRM);
CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
+ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
CHECK_CPU_FEATURE_USABLE (SERIALIZE);
CHECK_CPU_FEATURE_USABLE (HYBRID);
CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
--
GitLab

View File

@ -0,0 +1,51 @@
From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 8 Jul 2021 16:13:19 -0400
Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
#28064]
Content-type: text/plain; charset=UTF-8
The following commit
commit 6f573a27b6c8b4236445810a44660612323f5a73
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed Jun 23 01:19:34 2021 -0400
x86-64: Add wcslen optimize for sse4.1
Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
implementation list and adding wcslen-sse4.1 to the ifunc
implementation list.
Testing:
test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
well as all other tests in wcsmbs and string.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 580913ca..695cdba6 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wcslen_evex)
- IFUNC_IMPL_ADD (array, i, wcsnlen,
+ IFUNC_IMPL_ADD (array, i, wcslen,
CPU_FEATURE_USABLE (SSE4_1),
- __wcsnlen_sse4_1)
+ __wcslen_sse4_1)
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
--
GitLab

View File

@ -0,0 +1,135 @@
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 08:18:15 -0600
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
#28896]
Content-type: text/plain; charset=UTF-8
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
not checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/Makefile | 2 +-
sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++-
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 +
sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 +
sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +-
sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +-
7 files changed, 22 insertions(+), 5 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/strcmp-avx2.S
(split into two patches due to upstream bug differences)
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 5be71ada..2d814915 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
CFLAGS-tst-strchr-rtm.c += -mrtm
CFLAGS-tst-strcpy-rtm.c += -mrtm
CFLAGS-tst-strlen-rtm.c += -mrtm
-CFLAGS-tst-strncmp-rtm.c += -mrtm
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strrchr-rtm.c += -mrtm
endif
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 236ad951..4d0004b5 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -16,6 +16,7 @@
License along with the GNU C Library; if not, see
<https://www.gnu.org/licenses/>. */
+#include <stdint.h>
#include <tst-string-rtm.h>
#define LOOP 3000
@@ -45,8 +46,22 @@ function (void)
return 1;
}
+__attribute__ ((noinline, noclone))
+static int
+function_overflow (void)
+{
+ if (strncmp (string1, string2, SIZE_MAX) == 0)
+ return 0;
+ else
+ return 1;
+}
+
static int
do_test (void)
{
- return do_test_1 ("strncmp", LOOP, prepare, function);
+ int status = do_test_1 ("strncmp", LOOP, prepare, function);
+ if (status != EXIT_SUCCESS)
+ return status;
+ status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
+ return status;
}
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 5d1c9d90..433ae047 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -95,7 +95,7 @@ ENTRY (STRCMP)
length to bound a valid memory region. In these cases just use
'wcscmp'. */
shrq $56, %rcx
- jnz __wcscmp_avx2
+ jnz OVERFLOW_STRCMP
# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
index 37d1224b..68bad365 100644
--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
@@ -1,3 +1,4 @@
#define STRCMP __strncmp_avx2_rtm
#define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2_rtm
#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
index 1678bcc2..f138e9f1 100644
--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
@@ -1,3 +1,4 @@
#define STRCMP __strncmp_avx2
#define USE_AS_STRNCMP 1
+#define OVERFLOW_STRCMP __strcmp_avx2
#include "strcmp-avx2.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
index 4e88c70c..f467582c 100644
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
@@ -1,5 +1,5 @@
#define STRCMP __wcsncmp_avx2_rtm
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP __wcscmp_avx2_rtm
#include "strcmp-avx2-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
index 4fa1de4d..e9ede522 100644
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
@@ -1,5 +1,5 @@
#define STRCMP __wcsncmp_avx2
#define USE_AS_STRNCMP 1
#define USE_AS_WCSCMP 1
-
+#define OVERFLOW_STRCMP __wcscmp_avx2
#include "strcmp-avx2.S"
--
GitLab

View File

@ -0,0 +1,51 @@
From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 9 May 2020 12:04:23 -0700
Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
#25966]
Content-type: text/plain; charset=UTF-8
Since __x86_shared_non_temporal_threshold is defined as
long int __x86_shared_non_temporal_threshold;
and long int is 4 bytes for x32, use RDX_LP to compare against
__x86_shared_non_temporal_threshold in assembly code.
---
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 71f5954d..673b73aa 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -245,7 +245,7 @@ L(return):
#endif
L(movsb):
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
jae L(more_8x_vec)
cmpq %rsi, %rdi
jb 1f
@@ -397,7 +397,7 @@ L(more_8x_vec):
addq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_forward)
#endif
L(loop_4x_vec_forward):
@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
subq %r8, %rdx
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
/* Check non-temporal store threshold. */
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
ja L(large_backward)
#endif
L(loop_4x_vec_backward):
--
GitLab

View File

@ -0,0 +1,44 @@
From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 11 Jun 2020 12:41:18 -0700
Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
Content-type: text/plain; charset=UTF-8
Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
%xmmN, instead of %ymmN, with vpxor to clear a vector register.
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++--
sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
2 files changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 433ae047..70d8499b 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -105,8 +105,8 @@ ENTRY (STRCMP)
# endif
movl %edi, %eax
xorl %edx, %edx
- /* Make %ymm7 all zeros in this function. */
- vpxor %ymm7, %ymm7, %ymm7
+ /* Make %xmm7 (%ymm7) all zeros in this function. */
+ vpxor %xmm7, %xmm7, %xmm7
orl %esi, %eax
andl $(PAGE_SIZE - 1), %eax
cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 9f22a15e..c949410b 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
movl %edi, %ecx
/* Broadcast CHAR to YMM4. */
VPBROADCAST %xmm4, %ymm4
- vpxor %ymm0, %ymm0, %ymm0
+ vpxor %xmm0, %xmm0, %xmm0
/* Check if we may cross page boundary with one vector load. */
andl $(2 * VEC_SIZE - 1), %ecx
--
GitLab

View File

@ -0,0 +1,359 @@
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Wed, 3 Feb 2021 00:38:59 -0500
Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. Just seemed the performance could be improved a bit. Observed
and expected behavior are unchanged. Optimized body of main
loop. Updated page cross logic and optimized accordingly. Made a few
minor instruction selection modifications. No regressions in test
suite. Both test-strchrnul and test-strchr passed.
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
sysdeps/x86_64/multiarch/strchr.c | 4 +-
2 files changed, 114 insertions(+), 115 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/strchr.c
(account for missing upstream macros)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index da7d2620..919d256c 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -27,10 +27,12 @@
# ifdef USE_AS_WCSCHR
# define VPBROADCAST vpbroadcastd
# define VPCMPEQ vpcmpeqd
+# define VPMINU vpminud
# define CHAR_REG esi
# else
# define VPBROADCAST vpbroadcastb
# define VPCMPEQ vpcmpeqb
+# define VPMINU vpminub
# define CHAR_REG sil
# endif
@@ -43,71 +45,54 @@
# endif
# define VEC_SIZE 32
+# define PAGE_SIZE 4096
.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
movl %edi, %ecx
- /* Broadcast CHAR to YMM0. */
+# ifndef USE_AS_STRCHRNUL
+ xorl %edx, %edx
+# endif
+
+ /* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
vpxor %xmm9, %xmm9, %xmm9
VPBROADCAST %xmm0, %ymm0
- /* Check if we may cross page boundary with one vector load. */
- andl $(2 * VEC_SIZE - 1), %ecx
- cmpl $VEC_SIZE, %ecx
- ja L(cros_page_boundary)
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
- null byte. */
- vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
+ /* Check if we cross page boundary with one vector load. */
+ andl $(PAGE_SIZE - 1), %ecx
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
+ ja L(cross_page_boundary)
- /* Align data for aligned loads in the loop. */
- addq $VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
-
- jmp L(more_4x_vec)
-
- .p2align 4
-L(cros_page_boundary):
- andl $(VEC_SIZE - 1), %ecx
- andq $-VEC_SIZE, %rdi
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
+ null byte. */
vmovdqu (%rdi), %ymm8
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
- /* Remove the leading bytes. */
- sarl %cl, %eax
testl %eax, %eax
- jz L(aligned_more)
- /* Found CHAR or the null byte. */
+ jz L(more_vecs)
tzcntl %eax, %eax
- addq %rcx, %rax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
+L(more_vecs):
+ /* Align data for aligned loads in the loop. */
+ andq $-VEC_SIZE, %rdi
L(aligned_more):
- addq $VEC_SIZE, %rdi
-L(more_4x_vec):
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa (%rdi), %ymm8
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa VEC_SIZE(%rdi), %ymm8
+ addq $VEC_SIZE, %rdi
VPCMPEQ %ymm8, %ymm0, %ymm1
VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
@@ -137,61 +122,24 @@ L(more_4x_vec):
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x3)
-
- addq $(VEC_SIZE * 4), %rdi
-
- /* Align data to 4 * VEC_SIZE. */
- movq %rdi, %rcx
- andl $(4 * VEC_SIZE - 1), %ecx
- andq $-(4 * VEC_SIZE), %rdi
-
- .p2align 4
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqa (%rdi), %ymm5
- vmovdqa VEC_SIZE(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
-
- VPCMPEQ %ymm5, %ymm0, %ymm1
- VPCMPEQ %ymm6, %ymm0, %ymm2
- VPCMPEQ %ymm7, %ymm0, %ymm3
- VPCMPEQ %ymm8, %ymm0, %ymm4
-
- VPCMPEQ %ymm5, %ymm9, %ymm5
- VPCMPEQ %ymm6, %ymm9, %ymm6
- VPCMPEQ %ymm7, %ymm9, %ymm7
- VPCMPEQ %ymm8, %ymm9, %ymm8
-
- vpor %ymm1, %ymm5, %ymm1
- vpor %ymm2, %ymm6, %ymm2
- vpor %ymm3, %ymm7, %ymm3
- vpor %ymm4, %ymm8, %ymm4
-
- vpor %ymm1, %ymm2, %ymm5
- vpor %ymm3, %ymm4, %ymm6
-
- vpor %ymm5, %ymm6, %ymm5
-
- vpmovmskb %ymm5, %eax
- testl %eax, %eax
- jnz L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
+ jz L(prep_loop_4x)
- jmp L(loop_4x_vec)
+ tzcntl %eax, %eax
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
.p2align 4
L(first_vec_x0):
- /* Found CHAR or the null byte. */
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER_RETURN
@@ -199,13 +147,9 @@ L(first_vec_x0):
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $VEC_SIZE, %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
leaq VEC_SIZE(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER_RETURN
@@ -213,42 +157,97 @@ L(first_vec_x1):
.p2align 4
L(first_vec_x2):
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 2), %rax
- addq %rdi, %rax
-# else
- xorl %edx, %edx
+ /* Found CHAR or the null byte. */
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER_RETURN
+L(prep_loop_4x):
+ /* Align data to 4 * VEC_SIZE. */
+ andq $-(VEC_SIZE * 4), %rdi
+
.p2align 4
-L(4x_vec_end):
+L(loop_4x_vec):
+ /* Compare 4 * VEC at a time forward. */
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+
+ /* Leaves only CHARS matching esi as 0. */
+ vpxor %ymm5, %ymm0, %ymm1
+ vpxor %ymm6, %ymm0, %ymm2
+ vpxor %ymm7, %ymm0, %ymm3
+ vpxor %ymm8, %ymm0, %ymm4
+
+ VPMINU %ymm1, %ymm5, %ymm1
+ VPMINU %ymm2, %ymm6, %ymm2
+ VPMINU %ymm3, %ymm7, %ymm3
+ VPMINU %ymm4, %ymm8, %ymm4
+
+ VPMINU %ymm1, %ymm2, %ymm5
+ VPMINU %ymm3, %ymm4, %ymm6
+
+ VPMINU %ymm5, %ymm6, %ymm5
+
+ VPCMPEQ %ymm5, %ymm9, %ymm5
+ vpmovmskb %ymm5, %eax
+
+ addq $(VEC_SIZE * 4), %rdi
+ testl %eax, %eax
+ jz L(loop_4x_vec)
+
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
jnz L(first_vec_x0)
+
+ VPCMPEQ %ymm2, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ VPCMPEQ %ymm4, %ymm9, %ymm4
+ vpmovmskb %ymm3, %ecx
vpmovmskb %ymm4, %eax
+ salq $32, %rax
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
+ cmovne %rdx, %rax
+# endif
+ VZEROUPPER
+ ret
+
+ /* Cold case for crossing page with first load. */
+ .p2align 4
+L(cross_page_boundary):
+ andq $-VEC_SIZE, %rdi
+ andl $(VEC_SIZE - 1), %ecx
+
+ vmovdqa (%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* Remove the leading bits. */
+ sarxl %ecx, %eax, %eax
testl %eax, %eax
-L(first_vec_x3):
+ jz L(aligned_more)
tzcntl %eax, %eax
-# ifdef USE_AS_STRCHRNUL
- addq $(VEC_SIZE * 3), %rax
+ addq %rcx, %rdi
addq %rdi, %rax
-# else
- xorl %edx, %edx
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
- cmp (%rax), %CHAR_REG
+# ifndef USE_AS_STRCHRNUL
+ cmp (%rax), %CHAR_REG
cmovne %rdx, %rax
# endif
VZEROUPPER_RETURN
END (STRCHR)
-#endif
+# endif
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
index 7e582f02..5225bd4f 100644
--- a/sysdeps/x86_64/multiarch/strchr.c
+++ b/sysdeps/x86_64/multiarch/strchr.c
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
const struct cpu_features* cpu_features = __get_cpu_features ();
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
--
GitLab

View File

@ -0,0 +1,67 @@
From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 25 Jan 2020 14:19:40 -0800
Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
Content-type: text/plain; charset=UTF-8
When copying with "rep movsb", if the distance between source and
destination is N*4GB + [1..63] with N >= 0, performance may be very
slow. This patch updates memmove-vec-unaligned-erms.S for AVX and
AVX512 versions with the distance in RCX:
cmpl $63, %ecx
// Don't use "rep movsb" if ECX <= 63
jbe L(Don't use rep movsb")
Use "rep movsb"
Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
performance impact is within noise range as "rep movsb" is only used for
data size >= 4KB.
---
.../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++
1 file changed, 21 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 673b73aa..c475fed4 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -64,6 +64,13 @@
# endif
#endif
+/* Avoid short distance rep movsb only with non-SSE vector. */
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
+#else
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
+#endif
+
#ifndef PREFETCH
# define PREFETCH(addr) prefetcht0 addr
#endif
@@ -255,7 +262,21 @@ L(movsb):
cmpq %r9, %rdi
/* Avoid slow backward REP MOVSB. */
jb L(more_8x_vec_backward)
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rdi, %rcx
+ subq %rsi, %rcx
+ jmp 2f
+# endif
1:
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
+ movq %rsi, %rcx
+ subq %rdi, %rcx
+2:
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
+ is N*4GB + [1..63] with N >= 0. */
+ cmpl $63, %ecx
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
+# endif
mov %RDX_LP, %RCX_LP
rep movsb
L(nop):
--
GitLab

View File

@ -0,0 +1,449 @@
From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Sat, 3 Apr 2021 04:12:15 -0400
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the affects of false memory aliasing
when destination and source have a close 4k alignment and 2) In most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE =16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
.../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++----
1 file changed, 265 insertions(+), 73 deletions(-)
Conflicts:
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
(different number of sections)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c475fed4..3e2dd6bc 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,7 +32,16 @@
overlapping addresses.
6. If size >= __x86_shared_non_temporal_threshold and there is no
overlap between destination and source, use non-temporal store
- instead of aligned store. */
+ instead of aligned store copying from either 2 or 4 pages at
+ once.
+ 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
+ and source and destination do not page alias, copy from 2 pages
+ at once using non-temporal stores. Page aliasing in this case is
+ considered true if destination's page alignment - sources' page
+ alignment is less than 8 * VEC_SIZE.
+ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
+ and destination do page alias copy from 4 pages at once using
+ non-temporal stores. */
#include <sysdep.h>
@@ -64,6 +73,34 @@
# endif
#endif
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Byte per page for large_memcpy inner loop. */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for memcpy_large_4x. */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
/* Avoid short distance rep movsb only with non-SSE vector. */
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -103,6 +140,28 @@
# error Unsupported PREFETCH_SIZE!
#endif
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+ VMOVU (offset)base, vec0; \
+ VMOVU ((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+ VMOVNT vec0, (offset)base; \
+ VMOVNT vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+ VMOVU (offset)base, vec0; \
+ VMOVU ((offset) + VEC_SIZE)base, vec1; \
+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+ VMOVNT vec0, (offset)base; \
+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \
+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -390,6 +449,15 @@ L(last_4x_vec):
VZEROUPPER_RETURN
L(more_8x_vec):
+ /* Check if non-temporal move candidate. */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+ /* Check non-temporal store threshold. */
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+ ja L(large_memcpy_2x)
+#endif
+ /* Entry if rdx is greater than non-temporal threshold but there
+ is overlap. */
+L(more_8x_vec_check):
cmpq %rsi, %rdi
ja L(more_8x_vec_backward)
/* Source == destination is less common. */
@@ -416,24 +484,21 @@ L(more_8x_vec):
subq %r8, %rdi
/* Adjust length. */
addq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
- ja L(large_forward)
-#endif
+
+ .p2align 4
L(loop_4x_vec_forward):
/* Copy 4 * VEC a time forward. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $(VEC_SIZE * 4), %rsi
- subq $(VEC_SIZE * 4), %rdx
+ subq $-(VEC_SIZE * 4), %rsi
+ addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%rdi)
VMOVA %VEC(1), VEC_SIZE(%rdi)
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $(VEC_SIZE * 4), %rdi
+ subq $-(VEC_SIZE * 4), %rdi
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_forward)
/* Store the last 4 * VEC. */
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
subq %r8, %r9
/* Adjust length. */
subq %r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
- /* Check non-temporal store threshold. */
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
- ja L(large_backward)
-#endif
+
+ .p2align 4
L(loop_4x_vec_backward):
/* Copy 4 * VEC a time backward. */
VMOVU (%rcx), %VEC(0)
VMOVU -VEC_SIZE(%rcx), %VEC(1)
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $(VEC_SIZE * 4), %rcx
- subq $(VEC_SIZE * 4), %rdx
+ addq $-(VEC_SIZE * 4), %rcx
+ addq $-(VEC_SIZE * 4), %rdx
VMOVA %VEC(0), (%r9)
VMOVA %VEC(1), -VEC_SIZE(%r9)
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $(VEC_SIZE * 4), %r9
+ addq $-(VEC_SIZE * 4), %r9
cmpq $(VEC_SIZE * 4), %rdx
ja L(loop_4x_vec_backward)
/* Store the first 4 * VEC. */
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
VZEROUPPER_RETURN
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+ .p2align 4
+L(large_memcpy_2x):
+ /* Compute absolute value of difference between source and
+ destination. */
+ movq %rdi, %r9
+ subq %rsi, %r9
+ movq %r9, %r8
+ leaq -1(%r9), %rcx
+ sarq $63, %r8
+ xorq %r8, %r9
+ subq %r8, %r9
/* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rdi, %rdx), %r10
- cmpq %r10, %rsi
- jb L(loop_4x_vec_forward)
-L(loop_large_forward):
+ destination and source since destination may be in cache when
+ source is loaded. */
+ cmpq %r9, %rdx
+ ja L(more_8x_vec_check)
+
+ /* Cache align destination. First store the first 64 bytes then
+ adjust alignments. */
+ VMOVU (%rsi), %VEC(8)
+#if VEC_SIZE < 64
+ VMOVU VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+ VMOVU %VEC(8), (%rdi)
+#if VEC_SIZE < 64
+ VMOVU %VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+ /* Adjust source, destination, and size. */
+ movq %rdi, %r8
+ andq $63, %r8
+ /* Get the negative of offset for alignment. */
+ subq $64, %r8
+ /* Adjust source. */
+ subq %r8, %rsi
+ /* Adjust destination which should be aligned now. */
+ subq %r8, %rdi
+ /* Adjust length. */
+ addq %r8, %rdx
+
+ /* Test if source and destination addresses will alias. If they do
+ the larger pipeline in large_memcpy_4x alleviated the
+ performance drop. */
+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
+ jz L(large_memcpy_4x)
+
+ movq %rdx, %r10
+ shrq $LOG_4X_MEMCPY_THRESH, %r10
+ cmp __x86_shared_non_temporal_threshold(%rip), %r10
+ jae L(large_memcpy_4x)
+
+ /* edx will store remainder size for copying tail. */
+ andl $(PAGE_SIZE * 2 - 1), %edx
+ /* r10 stores outer loop counter. */
+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+ /* Copy 4x VEC at a time from 2 pages. */
+ .p2align 4
+L(loop_large_memcpy_2x_outer):
+ /* ecx stores inner loop counter. */
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+ /* Load vectors from rsi. */
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ subq $-LARGE_LOAD_SIZE, %rsi
+ /* Non-temporal store vectors to rdi. */
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ subq $-LARGE_LOAD_SIZE, %rdi
+ decl %ecx
+ jnz L(loop_large_memcpy_2x_inner)
+ addq $PAGE_SIZE, %rdi
+ addq $PAGE_SIZE, %rsi
+ decq %r10
+ jne L(loop_large_memcpy_2x_outer)
+ sfence
+
+ /* Check if only last 4 loads are needed. */
+ cmpl $(VEC_SIZE * 4), %edx
+ jbe L(large_memcpy_2x_end)
+
+ /* Handle the last 2 * PAGE_SIZE bytes. */
+L(loop_large_memcpy_2x_tail):
/* Copy 4 * VEC a time forward with non-temporal stores. */
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
- addq $PREFETCHED_LOAD_SIZE, %rsi
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%rdi)
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
- addq $PREFETCHED_LOAD_SIZE, %rdi
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_forward)
- sfence
+ subq $-(VEC_SIZE * 4), %rsi
+ addl $-(VEC_SIZE * 4), %edx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(VEC_SIZE * 4), %edx
+ ja L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
/* Store the last 4 * VEC. */
- VMOVU %VEC(5), (%rcx)
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
- /* Store the first VEC. */
- VMOVU %VEC(4), (%r11)
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
-L(large_backward):
- /* Don't use non-temporal store if there is overlap between
- destination and source since destination may be in cache
- when source is loaded. */
- leaq (%rcx, %rdx), %r10
- cmpq %r10, %r9
- jb L(loop_4x_vec_backward)
-L(loop_large_backward):
- /* Copy 4 * VEC a time backward with non-temporal stores. */
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
- VMOVU (%rcx), %VEC(0)
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
- subq $PREFETCHED_LOAD_SIZE, %rcx
- subq $PREFETCHED_LOAD_SIZE, %rdx
- VMOVNT %VEC(0), (%r9)
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
- subq $PREFETCHED_LOAD_SIZE, %r9
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
- ja L(loop_large_backward)
+ .p2align 4
+L(large_memcpy_4x):
+ movq %rdx, %r10
+ /* edx will store remainder size for copying tail. */
+ andl $(PAGE_SIZE * 4 - 1), %edx
+ /* r10 stores outer loop counter. */
+ shrq $(LOG_PAGE_SIZE + 2), %r10
+ /* Copy 4x VEC at a time from 4 pages. */
+ .p2align 4
+L(loop_large_memcpy_4x_outer):
+ /* ecx stores inner loop counter. */
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+ /* Only one prefetch set per page as doing 4 pages give more time
+ for prefetcher to keep up. */
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+ /* Load vectors from rsi. */
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+ subq $-LARGE_LOAD_SIZE, %rsi
+ /* Non-temporal store vectors to rdi. */
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+ subq $-LARGE_LOAD_SIZE, %rdi
+ decl %ecx
+ jnz L(loop_large_memcpy_4x_inner)
+ addq $(PAGE_SIZE * 3), %rdi
+ addq $(PAGE_SIZE * 3), %rsi
+ decq %r10
+ jne L(loop_large_memcpy_4x_outer)
sfence
- /* Store the first 4 * VEC. */
- VMOVU %VEC(4), (%rdi)
- VMOVU %VEC(5), VEC_SIZE(%rdi)
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
- /* Store the last VEC. */
- VMOVU %VEC(8), (%r11)
+ /* Check if only last 4 loads are needed. */
+ cmpl $(VEC_SIZE * 4), %edx
+ jbe L(large_memcpy_4x_end)
+
+ /* Handle the last 4 * PAGE_SIZE bytes. */
+L(loop_large_memcpy_4x_tail):
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+ VMOVU (%rsi), %VEC(0)
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
+ subq $-(VEC_SIZE * 4), %rsi
+ addl $-(VEC_SIZE * 4), %edx
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpl $(VEC_SIZE * 4), %edx
+ ja L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+ /* Store the last 4 * VEC. */
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
--
GitLab

View File

@ -0,0 +1,151 @@
From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:29:58 -0800
Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memrchr for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
---
sysdeps/x86_64/memrchr.S | 4 +-
sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +-
sysdeps/x86_64/x32/Makefile | 3 +-
sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
4 files changed, 63 insertions(+), 5 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index b8e3fa1d..dc82f8f7 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -24,13 +24,13 @@
ENTRY (__memrchr)
movd %esi, %xmm1
- sub $16, %rdx
+ sub $16, %RDX_LP
jbe L(length_less16)
punpcklbw %xmm1, %xmm1
punpcklbw %xmm1, %xmm1
- add %rdx, %rdi
+ add %RDX_LP, %RDI_LP
pshufd $0, %xmm1, %xmm1
movdqu (%rdi), %xmm0
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
index b41a58bc..ce488dd9 100644
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
vmovd %esi, %xmm0
vpbroadcastb %xmm0, %ymm0
- subq $VEC_SIZE, %rdx
+ sub $VEC_SIZE, %RDX_LP
jbe L(last_vec_or_less)
- addq %rdx, %rdi
+ add %RDX_LP, %RDI_LP
/* Check the last VEC_SIZE bytes. */
vpcmpeqb (%rdi), %ymm0, %ymm1
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 2fe1e5ac..e99dbd7c 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
endif
ifeq ($(subdir),string)
-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
+ tst-size_t-memrchr
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
new file mode 100644
index 00000000..c83699c0
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
@@ -0,0 +1,57 @@
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "memrchr"
+#include "test-size_t.h"
+
+IMPL (memchr, 1)
+
+typedef void * (*proto_t) (const void *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memrchr (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t src = { { page_size }, buf2 };
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ c.fn = impl->fn;
+ void * res = do_memrchr (src, c);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %p != NULL",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

View File

@ -0,0 +1,92 @@
From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 19 Apr 2021 10:45:07 -0700
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
Content-type: text/plain; charset=UTF-8
Since strchr-avx2.S updated by
commit 1f745ecc2109890886b161d4791e1406fdfc29b8
Author: noah <goldstein.w.n@gmail.com>
Date: Wed Feb 3 00:38:59 2021 -0500
x86-64: Refactor and improve performance of strchr-avx2.S
uses sarx:
c4 e2 72 f7 c0 sarx %ecx,%eax,%eax
for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
ifunc-avx2.h.
---
sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++--
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
2 files changed, 11 insertions(+), 5 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
index e0f30e61..ef72b73f 100644
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
const struct cpu_features* cpu_features = __get_cpu_features ();
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
return OPTIMIZE (evex);
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 695cdba6..85b8863a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchr.c. */
IFUNC_IMPL (i, name, strchr,
IFUNC_IMPL_ADD (array, i, strchr,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strchr_avx2)
IFUNC_IMPL_ADD (array, i, strchr,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strchr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strchr,
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/strchrnul.c. */
IFUNC_IMPL (i, name, strchrnul,
IFUNC_IMPL_ADD (array, i, strchrnul,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__strchrnul_avx2)
IFUNC_IMPL_ADD (array, i, strchrnul,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__strchrnul_avx2_rtm)
IFUNC_IMPL_ADD (array, i, strchrnul,
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/x86_64/multiarch/wcschr.c. */
IFUNC_IMPL (i, name, wcschr,
IFUNC_IMPL_ADD (array, i, wcschr,
- CPU_FEATURE_USABLE (AVX2),
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)),
__wcschr_avx2)
IFUNC_IMPL_ADD (array, i, wcschr,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (RTM)),
__wcschr_avx2_rtm)
IFUNC_IMPL_ADD (array, i, wcschr,
--
GitLab

View File

@ -0,0 +1,265 @@
From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 19 Apr 2021 17:48:10 -0400
Subject: [PATCH] x86: Optimize less_vec evex and avx512
memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
No bug. This commit adds optimized cased for less_vec memset case that
uses the avx512vl/avx512bw mask store avoiding the excessive
branches. test-memset and test-wmemset are passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++-----
sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++-
.../multiarch/memset-avx512-unaligned-erms.S | 2 +-
.../multiarch/memset-evex-unaligned-erms.S | 2 +-
.../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++----
5 files changed, 74 insertions(+), 27 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 85b8863a..d59d65f8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_chk_avx2_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_chk_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_chk_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, __memset_chk,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_chk_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, __memset_chk,
CPU_FEATURE_USABLE (AVX512F),
@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx2_unaligned_erms_rtm)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_evex_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_avx512_unaligned_erms)
IFUNC_IMPL_ADD (array, i, memset,
(CPU_FEATURE_USABLE (AVX512VL)
- && CPU_FEATURE_USABLE (AVX512BW)),
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__memset_avx512_unaligned)
IFUNC_IMPL_ADD (array, i, memset,
CPU_FEATURE_USABLE (AVX512F),
@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (RTM)),
__wmemset_avx2_unaligned_rtm)
IFUNC_IMPL_ADD (array, i, wmemset,
- CPU_FEATURE_USABLE (AVX512VL),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__wmemset_evex_unaligned)
IFUNC_IMPL_ADD (array, i, wmemset,
- CPU_FEATURE_USABLE (AVX512VL),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__wmemset_avx512_unaligned))
#ifdef SHARED
@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
CPU_FEATURE_USABLE (AVX2),
__wmemset_chk_avx2_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- CPU_FEATURE_USABLE (AVX512VL),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__wmemset_chk_evex_unaligned)
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
- CPU_FEATURE_USABLE (AVX512F),
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
__wmemset_chk_avx512_unaligned))
#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
index 19795938..100e3707 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (avx512_unaligned_erms);
@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
{
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
return OPTIMIZE (evex_unaligned_erms);
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 22e7b187..8ad842fc 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,6 @@
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
-
+# define USE_LESS_VEC_MASK_STORE 1
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index ae0a4d6e..640f0929 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,6 @@
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
-
+# define USE_LESS_VEC_MASK_STORE 1
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index bae5cba4..f877ac9d 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,6 +63,8 @@
# endif
#endif
+#define PAGE_SIZE 4096
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -213,11 +215,38 @@ L(loop):
cmpq %rcx, %rdx
jne L(loop)
VZEROUPPER_SHORT_RETURN
+
+ .p2align 4
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
+# ifdef USE_LESS_VEC_MASK_STORE
+ /* Clear high bits from edi. Only keeping bits relevant to page
+ cross check. Note that we are using rax which is set in
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
+ */
+ andl $(PAGE_SIZE - 1), %edi
+ /* Check if VEC_SIZE store cross page. Mask stores suffer serious
+ performance degradation when it has to fault supress. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ ja L(cross_page)
+# if VEC_SIZE > 32
+ movq $-1, %rcx
+ bzhiq %rdx, %rcx, %rcx
+ kmovq %rcx, %k1
+# else
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k1
+# endif
+ vmovdqu8 %VEC(0), (%rax) {%k1}
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(cross_page):
+# endif
# if VEC_SIZE > 32
cmpb $32, %dl
jae L(between_32_63)
@@ -234,36 +263,36 @@ L(less_vec):
cmpb $1, %dl
ja L(between_2_3)
jb 1f
- movb %cl, (%rdi)
+ movb %cl, (%rax)
1:
VZEROUPPER_RETURN
# if VEC_SIZE > 32
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, -32(%rdi,%rdx)
- VMOVU %YMM0, (%rdi)
+ VMOVU %YMM0, -32(%rax,%rdx)
+ VMOVU %YMM0, (%rax)
VZEROUPPER_RETURN
# endif
# if VEC_SIZE > 16
/* From 16 to 31. No branch when size == 16. */
L(between_16_31):
- VMOVU %XMM0, -16(%rdi,%rdx)
- VMOVU %XMM0, (%rdi)
+ VMOVU %XMM0, -16(%rax,%rdx)
+ VMOVU %XMM0, (%rax)
VZEROUPPER_RETURN
# endif
/* From 8 to 15. No branch when size == 8. */
L(between_8_15):
- movq %rcx, -8(%rdi,%rdx)
- movq %rcx, (%rdi)
+ movq %rcx, -8(%rax,%rdx)
+ movq %rcx, (%rax)
VZEROUPPER_RETURN
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %ecx, -4(%rdi,%rdx)
- movl %ecx, (%rdi)
+ movl %ecx, -4(%rax,%rdx)
+ movl %ecx, (%rax)
VZEROUPPER_RETURN
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %cx, -2(%rdi,%rdx)
- movw %cx, (%rdi)
+ movw %cx, -2(%rax,%rdx)
+ movw %cx, (%rax)
VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
GitLab

View File

@ -0,0 +1,396 @@
From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 23 Apr 2021 15:56:24 -0400
Subject: [PATCH] x86: Optimize strchr-avx2.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strchr-avx2.S. The optimizations are all
small things such as save an ALU in the alignment process, saving a
few instructions in the loop return, saving some bytes in the main
loop, and increasing the ILP in the return cases. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
1 file changed, 170 insertions(+), 120 deletions(-)
Conflics:
sysdeps/x86_64/multiarch/strchr-avx2.S
(rearranged to account for branch changes)
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
index 919d256c..5884726b 100644
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
@@ -49,133 +49,144 @@
.section SECTION(.text),"ax",@progbits
ENTRY (STRCHR)
- movl %edi, %ecx
-# ifndef USE_AS_STRCHRNUL
- xorl %edx, %edx
-# endif
-
/* Broadcast CHAR to YMM0. */
vmovd %esi, %xmm0
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ VPBROADCAST %xmm0, %ymm0
vpxor %xmm9, %xmm9, %xmm9
- VPBROADCAST %xmm0, %ymm0
/* Check if we cross page boundary with one vector load. */
- andl $(PAGE_SIZE - 1), %ecx
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
- ja L(cross_page_boundary)
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null byte. */
vmovdqu (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jz L(more_vecs)
+ jz L(aligned_more)
tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
+# endif
addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
+ alignment % 32 was either 16 or 0. As well this makes the
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
+ easier. */
+ .p2align 5
+L(first_vec_x4):
+ tzcntl %eax, %eax
+ addq $(VEC_SIZE * 3 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
-L(return_vzeroupper):
- ZERO_UPPER_VEC_REGISTERS_RETURN
-
- .p2align 4
-L(more_vecs):
- /* Align data for aligned loads in the loop. */
- andq $-VEC_SIZE, %rdi
-L(aligned_more):
-
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- vmovdqa VEC_SIZE(%rdi), %ymm8
- addq $VEC_SIZE, %rdi
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- vmovdqa VEC_SIZE(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
-
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
-
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
- vpor %ymm1, %ymm2, %ymm1
- vpmovmskb %ymm1, %eax
- testl %eax, %eax
- jz L(prep_loop_4x)
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- tzcntl %eax, %eax
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+L(zero):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
# endif
- VZEROUPPER
- ret
+
.p2align 4
-L(first_vec_x0):
+L(first_vec_x1):
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
- addq %rdi, %rax
+ incq %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x1):
+L(first_vec_x2):
tzcntl %eax, %eax
- leaq VEC_SIZE(%rdi, %rax), %rax
+ addq $(VEC_SIZE + 1), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
+L(first_vec_x3):
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ addq $(VEC_SIZE * 2 + 1), %rdi
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero)
# endif
+ addq %rdi, %rax
VZEROUPPER_RETURN
-L(prep_loop_4x):
- /* Align data to 4 * VEC_SIZE. */
- andq $-(VEC_SIZE * 4), %rdi
+ .p2align 4
+L(aligned_more):
+ /* Align data to VEC_SIZE - 1. This is the same number of
+ instructions as using andq -VEC_SIZE but saves 4 bytes of code
+ on x4 check. */
+ orq $(VEC_SIZE - 1), %rdi
+L(cross_page_continue):
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
+ since data is only aligned to VEC_SIZE. */
+ vmovdqa 1(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x2)
+
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
+ vpor %ymm1, %ymm2, %ymm1
+ vpmovmskb %ymm1, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x4)
+ /* Align data to VEC_SIZE * 4 - 1. */
+ addq $(VEC_SIZE * 4 + 1), %rdi
+ andq $-(VEC_SIZE * 4), %rdi
.p2align 4
L(loop_4x_vec):
/* Compare 4 * VEC at a time forward. */
- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
+ vmovdqa (%rdi), %ymm5
+ vmovdqa (VEC_SIZE)(%rdi), %ymm6
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
/* Leaves only CHARS matching esi as 0. */
vpxor %ymm5, %ymm0, %ymm1
@@ -191,63 +202,102 @@ L(loop_4x_vec):
VPMINU %ymm1, %ymm2, %ymm5
VPMINU %ymm3, %ymm4, %ymm6
- VPMINU %ymm5, %ymm6, %ymm5
+ VPMINU %ymm5, %ymm6, %ymm6
- VPCMPEQ %ymm5, %ymm9, %ymm5
- vpmovmskb %ymm5, %eax
+ VPCMPEQ %ymm6, %ymm9, %ymm6
+ vpmovmskb %ymm6, %ecx
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
+ jz L(loop_4x_vec)
- addq $(VEC_SIZE * 4), %rdi
- testl %eax, %eax
- jz L(loop_4x_vec)
- VPCMPEQ %ymm1, %ymm9, %ymm1
+ VPCMPEQ %ymm1, %ymm9, %ymm1
vpmovmskb %ymm1, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(last_vec_x0)
+
- VPCMPEQ %ymm2, %ymm9, %ymm2
+ VPCMPEQ %ymm5, %ymm9, %ymm2
vpmovmskb %ymm2, %eax
testl %eax, %eax
- jnz L(first_vec_x1)
+ jnz L(last_vec_x1)
+
+ VPCMPEQ %ymm3, %ymm9, %ymm3
+ vpmovmskb %ymm3, %eax
+ /* rcx has combined result from all 4 VEC. It will only be used
+ if the first 3 other VEC all did not contain a match. */
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ subq $(VEC_SIZE * 2), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
+
+ .p2align 4
+L(last_vec_x0):
+ tzcntl %eax, %eax
+ addq $-(VEC_SIZE * 4), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
- VPCMPEQ %ymm3, %ymm9, %ymm3
- VPCMPEQ %ymm4, %ymm9, %ymm4
- vpmovmskb %ymm3, %ecx
- vpmovmskb %ymm4, %eax
- salq $32, %rax
- orq %rcx, %rax
- tzcntq %rax, %rax
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+L(zero_end):
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
# endif
- VZEROUPPER
- ret
+
+ .p2align 4
+L(last_vec_x1):
+ tzcntl %eax, %eax
+ subq $(VEC_SIZE * 3), %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Found CHAR or the null byte. */
+ cmp (%rdi, %rax), %CHAR_REG
+ jne L(zero_end)
+# endif
+ addq %rdi, %rax
+ VZEROUPPER_RETURN
+
/* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
- andq $-VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
-
- vmovdqa (%rdi), %ymm8
- VPCMPEQ %ymm8, %ymm0, %ymm1
- VPCMPEQ %ymm8, %ymm9, %ymm2
+ movq %rdi, %rdx
+ /* Align rdi to VEC_SIZE - 1. */
+ orq $(VEC_SIZE - 1), %rdi
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
+ VPCMPEQ %ymm8, %ymm0, %ymm1
+ VPCMPEQ %ymm8, %ymm9, %ymm2
vpor %ymm1, %ymm2, %ymm1
vpmovmskb %ymm1, %eax
- /* Remove the leading bits. */
- sarxl %ecx, %eax, %eax
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
+ so no need to manually mod edx. */
+ sarxl %edx, %eax, %eax
testl %eax, %eax
- jz L(aligned_more)
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq %rcx, %rdi
- addq %rdi, %rax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ xorl %ecx, %ecx
+ /* Found CHAR or the null byte. */
+ cmp (%rdx, %rax), %CHAR_REG
+ leaq (%rdx, %rax), %rax
+ cmovne %rcx, %rax
+# else
+ addq %rdx, %rax
# endif
- VZEROUPPER_RETURN
+L(return_vzeroupper):
+ ZERO_UPPER_VEC_REGISTERS_RETURN
END (STRCHR)
# endif
--
GitLab

View File

@ -0,0 +1,532 @@
From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 23 Apr 2021 15:56:25 -0400
Subject: [PATCH] x86: Optimize strchr-evex.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes strchr-evex.S. The optimizations are
mostly small things such as save an ALU in the alignment process,
saving a few instructions in the loop return. The one significant
change is saving 2 instructions in the 4x loop. test-strchr,
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
1 file changed, 218 insertions(+), 174 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
index ddc86a70..7f9d4ee4 100644
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
@@ -32,13 +32,15 @@
# define VPCMP vpcmpd
# define VPMINU vpminud
# define CHAR_REG esi
-# define SHIFT_REG r8d
+# define SHIFT_REG ecx
+# define CHAR_SIZE 4
# else
# define VPBROADCAST vpbroadcastb
# define VPCMP vpcmpb
# define VPMINU vpminub
# define CHAR_REG sil
-# define SHIFT_REG ecx
+# define SHIFT_REG edx
+# define CHAR_SIZE 1
# endif
# define XMMZERO xmm16
@@ -56,23 +58,20 @@
# define VEC_SIZE 32
# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
.section .text.evex,"ax",@progbits
ENTRY (STRCHR)
- movl %edi, %ecx
-# ifndef USE_AS_STRCHRNUL
- xorl %edx, %edx
-# endif
-
/* Broadcast CHAR to YMM0. */
- VPBROADCAST %esi, %YMM0
-
+ VPBROADCAST %esi, %YMM0
+ movl %edi, %eax
+ andl $(PAGE_SIZE - 1), %eax
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
- /* Check if we cross page boundary with one vector load. */
- andl $(PAGE_SIZE - 1), %ecx
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
- ja L(cross_page_boundary)
+ /* Check if we cross page boundary with one vector load.
+ Otherwise it is safe to use an unaligned load. */
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ ja L(cross_page_boundary)
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
null bytes. */
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
VPCMP $0, %YMMZERO, %YMM2, %k0
- ktestd %k0, %k0
- jz L(more_vecs)
kmovd %k0, %eax
+ testl %eax, %eax
+ jz L(aligned_more)
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
+ */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
# else
addq %rdi, %rax
# endif
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (%rax), %CHAR_REG
+ jne L(zero)
# endif
ret
- .p2align 4
-L(more_vecs):
- /* Align data for aligned loads in the loop. */
- andq $-VEC_SIZE, %rdi
-L(aligned_more):
-
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
- since data is only aligned to VEC_SIZE. */
- VMOVA VEC_SIZE(%rdi), %YMM1
- addq $VEC_SIZE, %rdi
-
- /* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x0)
-
- VMOVA VEC_SIZE(%rdi), %YMM1
- /* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x1)
-
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
- /* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(first_vec_x2)
-
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
- /* Leaves only CHARS matching esi as 0. */
- vpxorq %YMM1, %YMM0, %YMM2
- VPMINU %YMM2, %YMM1, %YMM2
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM2, %k0
- ktestd %k0, %k0
- jz L(prep_loop_4x)
-
- kmovd %k0, %eax
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
+ alignment % 32 was either 16 or 0. As well this makes the
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
+ easier. */
+ .p2align 5
+L(first_vec_x3):
tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
/* Found CHAR or the null byte. */
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
-# else
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero)
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
-# endif
+L(zero):
+ xorl %eax, %eax
ret
+# endif
.p2align 4
-L(first_vec_x0):
+L(first_vec_x4):
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if first match was CHAR (k0) or null (k1). */
+ kmovd %k0, %eax
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+ kmovd %k1, %ecx
+ /* bzhil will not be 0 if first match was null. */
+ bzhil %eax, %ecx, %ecx
+ jne L(zero)
# else
- addq %rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Combine CHAR and null matches. */
+ kord %k0, %k1, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
L(first_vec_x1):
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
-# else
- leaq VEC_SIZE(%rdi, %rax), %rax
-# endif
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Found CHAR or the null byte. */
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero)
+
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
.p2align 4
L(first_vec_x2):
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if first match was CHAR (k0) or null (k1). */
+ kmovd %k0, %eax
tzcntl %eax, %eax
- /* Found CHAR or the null byte. */
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
+ kmovd %k1, %ecx
+ /* bzhil will not be 0 if first match was null. */
+ bzhil %eax, %ecx, %ecx
+ jne L(zero)
# else
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Combine CHAR and null matches. */
+ kord %k0, %k1, %k0
+ kmovd %k0, %eax
+ tzcntl %eax, %eax
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
ret
-L(prep_loop_4x):
- /* Align data to 4 * VEC_SIZE. */
+ .p2align 4
+L(aligned_more):
+ /* Align data to VEC_SIZE. */
+ andq $-VEC_SIZE, %rdi
+L(cross_page_continue):
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
+ data is only aligned to VEC_SIZE. Use two alternating methods
+ for checking VEC to balance latency and port contention. */
+
+ /* This method has higher latency but has better port
+ distribution. */
+ VMOVA (VEC_SIZE)(%rdi), %YMM1
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x1)
+
+ /* This method has higher latency but has better port
+ distribution. */
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
+ /* Each bit in K0 represents a CHAR in YMM1. */
+ VPCMP $0, %YMM1, %YMM0, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMM1, %YMMZERO, %k1
+ kortestd %k0, %k1
+ jnz L(first_vec_x2)
+
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
+ /* Leaves only CHARS matching esi as 0. */
+ vpxorq %YMM1, %YMM0, %YMM2
+ VPMINU %YMM2, %YMM1, %YMM2
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(first_vec_x3)
+
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
+ /* Each bit in K0 represents a CHAR in YMM1. */
+ VPCMP $0, %YMM1, %YMM0, %k0
+ /* Each bit in K1 represents a CHAR in YMM1. */
+ VPCMP $0, %YMM1, %YMMZERO, %k1
+ kortestd %k0, %k1
+ jnz L(first_vec_x4)
+
+ /* Align data to VEC_SIZE * 4 for the loop. */
+ addq $VEC_SIZE, %rdi
andq $-(VEC_SIZE * 4), %rdi
.p2align 4
L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
+ /* Check 4x VEC at a time. No penalty to imm32 offset with evex
+ encoding. */
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
- /* Leaves only CHARS matching esi as 0. */
+ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
+ zero. */
vpxorq %YMM1, %YMM0, %YMM5
- vpxorq %YMM2, %YMM0, %YMM6
+ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
+ k register. Its possible to save either 1 or 2 instructions
+ using cmp no equals method for either YMM1 or YMM1 and YMM3
+ respectively but bottleneck on p5 makes it not worth it. */
+ VPCMP $4, %YMM0, %YMM2, %k2
vpxorq %YMM3, %YMM0, %YMM7
- vpxorq %YMM4, %YMM0, %YMM8
-
- VPMINU %YMM5, %YMM1, %YMM5
- VPMINU %YMM6, %YMM2, %YMM6
- VPMINU %YMM7, %YMM3, %YMM7
- VPMINU %YMM8, %YMM4, %YMM8
-
- VPMINU %YMM5, %YMM6, %YMM1
- VPMINU %YMM7, %YMM8, %YMM2
-
- VPMINU %YMM1, %YMM2, %YMM1
-
- /* Each bit in K0 represents a CHAR or a null byte. */
- VPCMP $0, %YMMZERO, %YMM1, %k0
-
- addq $(VEC_SIZE * 4), %rdi
-
- ktestd %k0, %k0
+ VPCMP $4, %YMM0, %YMM4, %k4
+
+ /* Use min to select all zeros from either xor or end of string).
+ */
+ VPMINU %YMM1, %YMM5, %YMM1
+ VPMINU %YMM3, %YMM7, %YMM3
+
+ /* Use min + zeromask to select for zeros. Since k2 and k4 will
+ have 0 as positions that matched with CHAR which will set
+ zero in the corresponding destination bytes in YMM2 / YMM4.
+ */
+ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
+ VPMINU %YMM3, %YMM4, %YMM4
+ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
+
+ VPCMP $0, %YMMZERO, %YMM4, %k1
+ kmovd %k1, %ecx
+ subq $-(VEC_SIZE * 4), %rdi
+ testl %ecx, %ecx
jz L(loop_4x_vec)
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
- VPCMP $0, %YMMZERO, %YMM5, %k0
+ VPCMP $0, %YMMZERO, %YMM1, %k0
kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x0)
+ jnz L(last_vec_x1)
- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
- VPCMP $0, %YMMZERO, %YMM6, %k1
- kmovd %k1, %eax
+ VPCMP $0, %YMMZERO, %YMM2, %k0
+ kmovd %k0, %eax
testl %eax, %eax
- jnz L(first_vec_x1)
-
- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
- VPCMP $0, %YMMZERO, %YMM7, %k2
- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
- VPCMP $0, %YMMZERO, %YMM8, %k3
+ jnz L(last_vec_x2)
+ VPCMP $0, %YMMZERO, %YMM3, %k0
+ kmovd %k0, %eax
+ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
# ifdef USE_AS_WCSCHR
- /* NB: Each bit in K2/K3 represents 4-byte element. */
- kshiftlw $8, %k3, %k1
+ sall $8, %ecx
+ orl %ecx, %eax
+ tzcntl %eax, %eax
# else
- kshiftlq $32, %k3, %k1
+ salq $32, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
# endif
+# ifndef USE_AS_STRCHRNUL
+ /* Check if match was CHAR or null. */
+ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero_end)
+# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
+ ret
- /* Each bit in K1 represents a NULL or a mismatch. */
- korq %k1, %k2, %k1
- kmovq %k1, %rax
+# ifndef USE_AS_STRCHRNUL
+L(zero_end):
+ xorl %eax, %eax
+ ret
+# endif
- tzcntq %rax, %rax
-# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
-# else
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
+ .p2align 4
+L(last_vec_x1):
+ tzcntl %eax, %eax
+# ifndef USE_AS_STRCHRNUL
+ /* Check if match was null. */
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero_end)
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
+ ret
+
+ .p2align 4
+L(last_vec_x2):
+ tzcntl %eax, %eax
# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ /* Check if match was null. */
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero_end)
# endif
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
+ bytes. */
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
ret
/* Cold case for crossing page with first load. */
.p2align 4
L(cross_page_boundary):
+ movq %rdi, %rdx
+ /* Align rdi. */
andq $-VEC_SIZE, %rdi
- andl $(VEC_SIZE - 1), %ecx
-
VMOVA (%rdi), %YMM1
-
/* Leaves only CHARS matching esi as 0. */
vpxorq %YMM1, %YMM0, %YMM2
VPMINU %YMM2, %YMM1, %YMM2
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
VPCMP $0, %YMMZERO, %YMM2, %k0
kmovd %k0, %eax
- testl %eax, %eax
-
+ /* Remove the leading bits. */
# ifdef USE_AS_WCSCHR
+ movl %edx, %SHIFT_REG
/* NB: Divide shift count by 4 since each bit in K1 represent 4
bytes. */
- movl %ecx, %SHIFT_REG
- sarl $2, %SHIFT_REG
+ sarl $2, %SHIFT_REG
+ andl $(CHAR_PER_VEC - 1), %SHIFT_REG
# endif
-
- /* Remove the leading bits. */
sarxl %SHIFT_REG, %eax, %eax
+ /* If eax is zero continue. */
testl %eax, %eax
-
- jz L(aligned_more)
+ jz L(cross_page_continue)
tzcntl %eax, %eax
- addq %rcx, %rdi
+# ifndef USE_AS_STRCHRNUL
+ /* Check to see if match was CHAR or null. */
+ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
+ jne L(zero_end)
+# endif
# ifdef USE_AS_WCSCHR
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
- leaq (%rdi, %rax, 4), %rax
+ /* NB: Multiply wchar_t count by 4 to get the number of
+ bytes. */
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
# else
- addq %rdi, %rax
-# endif
-# ifndef USE_AS_STRCHRNUL
- cmp (%rax), %CHAR_REG
- cmovne %rdx, %rax
+ addq %rdx, %rax
# endif
ret
--
GitLab

View File

@ -0,0 +1,536 @@
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 4 May 2021 19:02:40 -0400
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
Content-type: text/plain; charset=UTF-8
No bug.
This commit adds a new implementation for EVEX memchr that is not safe
for RTM because it uses vzeroupper. The benefit is that by using
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
faster than the RTM safe version which cannot use vpcmpeq because
there is no EVEX encoding for the instruction. All parts of the
implementation aside from the 4x loop are the same for the two
versions and the optimization is only relevant for large sizes.
Tigerlake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
Icelake:
size , algn , Pos , Cur T , New T , Win , Dif
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/Makefile | 7 +-
sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++
sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 +
sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++----
sysdeps/x86_64/multiarch/memchr.c | 2 +-
sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 +
sysdeps/x86_64/multiarch/rawmemchr.c | 2 +-
sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 +
sysdeps/x86_64/multiarch/wmemchr.c | 2 +-
10 files changed, 217 insertions(+), 41 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 65fde4eb..26be4095 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
strncmp-evex \
strncpy-evex \
strnlen-evex \
- strrchr-evex
+ strrchr-evex \
+ memchr-evex-rtm \
+ rawmemchr-evex-rtm
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
wcsnlen-evex \
wcsrchr-evex \
wmemchr-evex \
- wmemcmp-evex-movbe
+ wmemcmp-evex-movbe \
+ wmemchr-evex-rtm
endif
ifeq ($(subdir),debug)
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
new file mode 100644
index 00000000..fc391edb
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
@@ -0,0 +1,55 @@
+/* Common definition for ifunc selection optimized with EVEX.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
+
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (evex_rtm);
+
+ return OPTIMIZE (evex);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ return OPTIMIZE (avx2_rtm);
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ return OPTIMIZE (avx2);
+ }
+
+ return OPTIMIZE (sse2);
+}
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index d59d65f8..ac097e8d 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__memchr_evex)
+ IFUNC_IMPL_ADD (array, i, memchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __memchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__rawmemchr_evex)
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __rawmemchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/strlen.c. */
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
&& CPU_FEATURE_USABLE (AVX512BW)
&& CPU_FEATURE_USABLE (BMI2)),
__wmemchr_evex)
+ IFUNC_IMPL_ADD (array, i, wmemchr,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __wmemchr_evex_rtm)
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
new file mode 100644
index 00000000..19871882
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
@@ -0,0 +1,8 @@
+#ifndef MEMCHR
+# define MEMCHR __memchr_evex_rtm
+#endif
+
+#define USE_IN_RTM 1
+#define SECTION(p) p##.evex.rtm
+
+#include "memchr-evex.S"
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
index f3fdad4f..4d0ed6d1 100644
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
@@ -38,10 +38,32 @@
# define CHAR_SIZE 1
# endif
+ /* In the 4x loop the RTM and non-RTM versions have data pointer
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
+ This is represented by BASE_OFFSET. As well because the RTM
+ version uses vpcmp which stores a bit per element compared where
+ the non-RTM version uses vpcmpeq which stores a bit per byte
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
+ version. */
+# ifdef USE_IN_RTM
+# define VZEROUPPER
+# define BASE_OFFSET (VEC_SIZE * 4)
+# define RET_SCALE CHAR_SIZE
+# else
+# define VZEROUPPER vzeroupper
+# define BASE_OFFSET 0
+# define RET_SCALE 1
+# endif
+
+ /* In the return from 4x loop memchr and rawmemchr versions have
+ data pointers off by VEC_SIZE * 4 with memchr version being
+ VEC_SIZE * 4 greater. */
# ifdef USE_AS_RAWMEMCHR
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
# define RAW_PTR_REG rcx
# define ALGN_PTR_REG rdi
# else
+# define RET_OFFSET BASE_OFFSET
# define RAW_PTR_REG rdi
# define ALGN_PTR_REG rcx
# endif
@@ -57,11 +79,15 @@
# define YMM5 ymm21
# define YMM6 ymm22
+# ifndef SECTION
+# define SECTION(p) p##.evex
+# endif
+
# define VEC_SIZE 32
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
# define PAGE_SIZE 4096
- .section .text.evex,"ax",@progbits
+ .section SECTION(.text),"ax",@progbits
ENTRY (MEMCHR)
# ifndef USE_AS_RAWMEMCHR
/* Check for zero length. */
@@ -237,14 +263,15 @@ L(cross_page_continue):
/* Check if at last CHAR_PER_VEC * 4 length. */
subq $(CHAR_PER_VEC * 4), %rdx
jbe L(last_4x_vec_or_less_cmpeq)
- addq $VEC_SIZE, %rdi
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
/* Align data to VEC_SIZE * 4 for the loop and readjust length.
*/
# ifdef USE_AS_WMEMCHR
movl %edi, %ecx
andq $-(4 * VEC_SIZE), %rdi
- andl $(VEC_SIZE * 4 - 1), %ecx
+ subl %edi, %ecx
/* NB: Divide bytes by 4 to get the wchar_t count. */
sarl $2, %ecx
addq %rcx, %rdx
@@ -254,15 +281,28 @@ L(cross_page_continue):
subq %rdi, %rdx
# endif
# else
- addq $VEC_SIZE, %rdi
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
andq $-(4 * VEC_SIZE), %rdi
# endif
-
+# ifdef USE_IN_RTM
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
+# else
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
+ encodable with EVEX registers (ymm16-ymm31). */
+ vmovdqa64 %YMMMATCH, %ymm0
+# endif
/* Compare 4 * VEC at a time forward. */
.p2align 4
L(loop_4x_vec):
+ /* Two versions of the loop. One that does not require
+ vzeroupper by not using ymm0-ymm15 and another does that require
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
+ is used at all is because there is no EVEX encoding vpcmpeq and
+ with vpcmpeq this loop can be performed more efficiently. The
+ non-vzeroupper version is safe for RTM while the vzeroupper
+ version should be prefered if RTM are not supported. */
+# ifdef USE_IN_RTM
/* It would be possible to save some instructions using 4x VPCMP
but bottleneck on port 5 makes it not woth it. */
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
@@ -273,12 +313,55 @@ L(loop_4x_vec):
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
VPCMP $0, %YMM3, %YMMZERO, %k2
+# else
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
+ seperately with EVEX vpcmp. */
+# ifdef USE_AS_WMEMCHR
+ /* vptern can only accept masks for epi32/epi64 so can only save
+ instruction using not equals mask on vptern with wmemchr. */
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
+# else
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
+# endif
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
+ */
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
+# ifdef USE_AS_WMEMCHR
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
+ combines result from VEC0 with zero mask. */
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
+ vpmovmskb %ymm4, %ecx
+# else
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
+ vpmovmskb %ymm4, %ecx
+ kmovd %k1, %eax
+# endif
+# endif
+
# ifdef USE_AS_RAWMEMCHR
subq $-(VEC_SIZE * 4), %rdi
+# endif
+# ifdef USE_IN_RTM
kortestd %k2, %k3
+# else
+# ifdef USE_AS_WMEMCHR
+ /* ecx contains not of matches. All 1s means no matches. incl will
+ overflow and set zeroflag if that is the case. */
+ incl %ecx
+# else
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
+ to ecx is not an issue because if eax is non-zero it will be
+ used for returning the match. If it is zero the add does
+ nothing. */
+ addq %rax, %rcx
+# endif
+# endif
+# ifdef USE_AS_RAWMEMCHR
jz L(loop_4x_vec)
# else
- kortestd %k2, %k3
jnz L(loop_4x_vec_end)
subq $-(VEC_SIZE * 4), %rdi
@@ -288,10 +371,11 @@ L(loop_4x_vec):
/* Fall through into less than 4 remaining vectors of length case.
*/
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
kmovd %k0, %eax
- addq $(VEC_SIZE * 3), %rdi
- .p2align 4
+ VZEROUPPER
+
L(last_4x_vec_or_less):
/* Check if first VEC contained match. */
testl %eax, %eax
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
/* rawmemchr will fall through into this if match was found in
loop. */
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
/* k1 has not of matches with VEC1. */
kmovd %k1, %eax
-# ifdef USE_AS_WMEMCHR
+# ifdef USE_AS_WMEMCHR
subl $((1 << CHAR_PER_VEC) - 1), %eax
-# else
+# else
incl %eax
+# endif
+# else
+ /* eax already has matches for VEC1. */
+ testl %eax, %eax
# endif
jnz L(last_vec_x1_return)
+# ifdef USE_IN_RTM
VPCMP $0, %YMM2, %YMMZERO, %k0
kmovd %k0, %eax
+# else
+ vpmovmskb %ymm2, %eax
+# endif
testl %eax, %eax
jnz L(last_vec_x2_return)
+# ifdef USE_IN_RTM
kmovd %k2, %eax
testl %eax, %eax
jnz L(last_vec_x3_return)
kmovd %k3, %eax
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
# else
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
+ vpmovmskb %ymm3, %eax
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
+ salq $VEC_SIZE, %rcx
+ orq %rcx, %rax
+ tzcntq %rax, %rax
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
+ VZEROUPPER
# endif
ret
.p2align 4
L(last_vec_x1_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
-# ifdef USE_AS_WMEMCHR
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (%rdi, %rax, CHAR_SIZE), %rax
-# else
- addq %rdi, %rax
-# endif
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
# else
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
+ addq %rdi, %rax
# endif
+ VZEROUPPER
ret
.p2align 4
L(last_vec_x2_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
-# else
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
+ VZEROUPPER
ret
+# ifdef USE_IN_RTM
.p2align 4
L(last_vec_x3_return):
tzcntl %eax, %eax
-# ifdef USE_AS_RAWMEMCHR
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
-# else
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
-# endif
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
ret
-
+# endif
# ifndef USE_AS_RAWMEMCHR
L(last_4x_vec_or_less_cmpeq):
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
index 016f5784..f28aea77 100644
--- a/sysdeps/x86_64/multiarch/memchr.c
+++ b/sysdeps/x86_64/multiarch/memchr.c
@@ -24,7 +24,7 @@
# undef memchr
# define SYMBOL_NAME memchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
strong_alias (memchr, __memchr)
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
new file mode 100644
index 00000000..deda1ca3
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __rawmemchr_evex_rtm
+#define USE_AS_RAWMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
index 8a0bc313..1f764f35 100644
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
@@ -26,7 +26,7 @@
# undef __rawmemchr
# define SYMBOL_NAME rawmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
IFUNC_SELECTOR ());
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
new file mode 100644
index 00000000..a346cd35
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
@@ -0,0 +1,3 @@
+#define MEMCHR __wmemchr_evex_rtm
+#define USE_AS_WMEMCHR 1
+#include "memchr-evex-rtm.S"
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
index 6d833702..f9c91915 100644
--- a/sysdeps/x86_64/multiarch/wmemchr.c
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
@@ -26,7 +26,7 @@
# undef __wmemchr
# define SYMBOL_NAME wmemchr
-# include "ifunc-avx2.h"
+# include "ifunc-evex.h"
libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
weak_alias (__wmemchr, wmemchr)
--
GitLab

View File

@ -0,0 +1,873 @@
From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:56:52 -0400
Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memcmp-avx2.S. The optimizations include
adding a new vec compare path for small sizes, reorganizing the entry
control flow, and removing some unnecissary ALU instructions from the
main loop. test-memcmp and test-wmemcmp are both passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 +
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
3 files changed, 402 insertions(+), 281 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index ac097e8d..8be0d78a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memcmp,
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__memcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, memcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__memcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, wmemcmp,
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_avx2_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)
&& CPU_FEATURE_USABLE (RTM)),
__wmemcmp_avx2_movbe_rtm)
IFUNC_IMPL_ADD (array, i, wmemcmp,
(CPU_FEATURE_USABLE (AVX512VL)
&& CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)
&& CPU_FEATURE_USABLE (MOVBE)),
__wmemcmp_evex_movbe)
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
index 8043c635..690dffe8 100644
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
{
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
index 9d5c9c72..16fc673e 100644
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
@@ -19,17 +19,23 @@
#if IS_IN (libc)
/* memcmp/wmemcmp is implemented as:
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
- to avoid branches.
- 2. Use overlapping compare to avoid branch.
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
- bytes for wmemcmp.
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ 1. Use ymm vector compares when possible. The only case where
+ vector compares is not possible for when size < VEC_SIZE
+ and loading from either s1 or s2 would cause a page cross.
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
+ with movbe and bswap to avoid branches.
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
+ size >= 8 bytes for wmemcmp.
+ 4. Optimistically compare up to first 4 * VEC_SIZE one at a
+ to check for early mismatches. Only do this if its guranteed the
+ work is not wasted.
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
area.
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
+ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
+ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+
# include <sysdep.h>
@@ -38,8 +44,10 @@
# endif
# ifdef USE_AS_WMEMCMP
+# define CHAR_SIZE 4
# define VPCMPEQ vpcmpeqd
# else
+# define CHAR_SIZE 1
# define VPCMPEQ vpcmpeqb
# endif
@@ -52,7 +60,7 @@
# endif
# define VEC_SIZE 32
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
+# define PAGE_SIZE 4096
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
jb L(less_vec)
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+ vmovdqu (%rsi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+ vpmovmskb %ymm1, %eax
+ /* NB: eax must be destination register if going to
+ L(return_vec_[0,2]). For L(return_vec_3 destination register
+ must be ecx. */
+ incl %eax
+ jnz L(return_vec_0)
cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_vec)
-
- VPCMPEQ %ymm0, %ymm0, %ymm0
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
-
- /* From 4 * VEC to 8 * VEC, inclusively. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
+ jbe L(last_1x_vec)
+ /* Check second VEC no matter what. */
vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ /* If all 4 VEC where equal eax will be all 1s so incl will
+ overflow and set zero flag. */
+ incl %eax
+ jnz L(return_vec_1)
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ /* Less than 4 * VEC. */
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_2x_vec)
+ /* Check third and fourth VEC no matter what. */
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+ vpmovmskb %ymm3, %eax
+ incl %eax
+ jnz L(return_vec_2)
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+ vpmovmskb %ymm4, %ecx
+ incl %ecx
+ jnz L(return_vec_3)
- vpand %ymm1, %ymm2, %ymm5
- vpand %ymm3, %ymm4, %ymm6
- vpand %ymm5, %ymm6, %ymm5
+ /* Go to 4x VEC loop. */
+ cmpq $(VEC_SIZE * 8), %rdx
+ ja L(more_8x_vec)
- vptest %ymm0, %ymm5
- jnc L(4x_vec_end)
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+ branches. */
+ /* Load first two VEC from s2 before adjusting addresses. */
+ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
+ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
+ /* Wait to load from s1 until addressed adjust due to
+ unlamination of microfusion with complex address mode. */
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vptest %ymm0, %ymm5
- jnc L(4x_vec_end)
- xorl %eax, %eax
+ /* Reduce VEC0 - VEC4. */
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm7
+ vpmovmskb %ymm7, %ecx
+ incl %ecx
+ jnz L(return_vec_0_1_2_3)
+ /* NB: eax must be zero to reach here. */
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(return_vec_0):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
L(return_vzeroupper):
ZERO_UPPER_VEC_REGISTERS_RETURN
.p2align 4
-L(last_2x_vec):
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+L(return_vec_1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl VEC_SIZE(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl VEC_SIZE(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
+ movzbl VEC_SIZE(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(return_vec_2):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ VZEROUPPER_RETURN
+
+ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */
+ .p2align 5
+L(8x_return_vec_0_1_2_3):
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
+ addq %rdi, %rsi
+L(return_vec_0_1_2_3):
+ vpmovmskb %ymm1, %eax
+ incl %eax
+ jnz L(return_vec_0)
-L(last_vec):
- /* Use overlapping loads to avoid branches. */
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+ incl %eax
+ jnz L(return_vec_1)
+
+ vpmovmskb %ymm3, %eax
+ incl %eax
+ jnz L(return_vec_2)
+L(return_vec_3):
+ tzcntl %ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
+# endif
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(more_8x_vec):
+ /* Set end of s1 in rdx. */
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
+ /* rsi stores s2 - s1. This allows loop to only update one
+ pointer. */
+ subq %rdi, %rsi
+ /* Align s1 pointer. */
+ andq $-VEC_SIZE, %rdi
+ /* Adjust because first 4x vec where check already. */
+ subq $-(VEC_SIZE * 4), %rdi
+ .p2align 4
+L(loop_4x_vec):
+ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
+ */
+ vmovdqu (%rsi, %rdi), %ymm1
+ VPCMPEQ (%rdi), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
+
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm7
+ vpmovmskb %ymm7, %ecx
+ incl %ecx
+ jnz L(8x_return_vec_0_1_2_3)
+ subq $-(VEC_SIZE * 4), %rdi
+ /* Check if s1 pointer at end. */
+ cmpq %rdx, %rdi
+ jb L(loop_4x_vec)
+
+ subq %rdx, %rdi
+ /* rdi has 4 * VEC_SIZE - remaining length. */
+ cmpl $(VEC_SIZE * 3), %edi
+ jae L(8x_last_1x_vec)
+ /* Load regardless of branch. */
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
+ cmpl $(VEC_SIZE * 2), %edi
+ jae L(8x_last_2x_vec)
+
+ /* Check last 4 VEC. */
+ vmovdqu (%rsi, %rdx), %ymm1
+ VPCMPEQ (%rdx), %ymm1, %ymm1
+
+ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
+ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
+
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+
+ vpand %ymm1, %ymm2, %ymm5
+ vpand %ymm3, %ymm4, %ymm6
+ vpand %ymm5, %ymm6, %ymm7
+ vpmovmskb %ymm7, %ecx
+ /* Restore s1 pointer to rdi. */
+ movq %rdx, %rdi
+ incl %ecx
+ jnz L(8x_return_vec_0_1_2_3)
+ /* NB: eax must be zero to reach here. */
+ VZEROUPPER_RETURN
+
+ /* Only entry is from L(more_8x_vec). */
+ .p2align 4
+L(8x_last_2x_vec):
+ /* Check second to last VEC. rdx store end pointer of s1 and
+ ymm3 has already been loaded with second to last VEC from s2.
+ */
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
+ vpmovmskb %ymm3, %eax
+ incl %eax
+ jnz L(8x_return_vec_2)
+ /* Check last VEC. */
+ .p2align 4
+L(8x_last_1x_vec):
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
+ vpmovmskb %ymm4, %eax
+ incl %eax
+ jnz L(8x_return_vec_3)
VZEROUPPER_RETURN
.p2align 4
-L(first_vec):
- /* A byte or int32 is different within 16 or 32 bytes. */
- tzcntl %eax, %ecx
+L(last_2x_vec):
+ /* Check second to last VEC. */
+ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
+ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
+ vpmovmskb %ymm1, %eax
+ incl %eax
+ jnz L(return_vec_1_end)
+ /* Check last VEC. */
+L(last_1x_vec):
+ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
+ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
+ vpmovmskb %ymm1, %eax
+ incl %eax
+ jnz L(return_vec_0_end)
+ VZEROUPPER_RETURN
+
+ .p2align 4
+L(8x_return_vec_2):
+ subq $VEC_SIZE, %rdx
+L(8x_return_vec_3):
+ tzcntl %eax, %eax
+ addq %rdx, %rax
# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi, %rcx), %edx
- cmpl (%rsi, %rcx), %edx
-L(wmemcmp_return):
- setl %al
- negl %eax
- orl $1, %eax
+ movl (VEC_SIZE * 3)(%rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
- sub %edx, %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 3)(%rax), %eax
+ subl %ecx, %eax
# endif
VZEROUPPER_RETURN
-# ifdef USE_AS_WMEMCMP
.p2align 4
-L(4):
- xorl %eax, %eax
- movl (%rdi), %edx
- cmpl (%rsi), %edx
- jne L(wmemcmp_return)
- ret
+L(return_vec_1_end):
+ tzcntl %eax, %eax
+ addl %edx, %eax
+# ifdef USE_AS_WMEMCMP
+ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
# else
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ VZEROUPPER_RETURN
+
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches. */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- je L(exit)
- sbbl %eax, %eax
- orl $1, %eax
- ret
+L(return_vec_0_end):
+ tzcntl %eax, %eax
+ addl %edx, %eax
+# ifdef USE_AS_WMEMCMP
+ movl -VEC_SIZE(%rdi, %rax), %ecx
+ xorl %edx, %edx
+ cmpl -VEC_SIZE(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ VZEROUPPER_RETURN
.p2align 4
-L(exit):
- ret
+L(less_vec):
+ /* Check if one or less CHAR. This is necessary for size = 0 but
+ is also faster for size = CHAR_SIZE. */
+ cmpl $CHAR_SIZE, %edx
+ jbe L(one_or_less)
+
+ /* Check if loading one VEC from either s1 or s2 could cause a
+ page cross. This can have false positives but is by far the
+ fastest method. */
+ movl %edi, %eax
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(page_cross_less_vec)
+
+ /* No page cross possible. */
+ vmovdqu (%rsi), %ymm2
+ VPCMPEQ (%rdi), %ymm2, %ymm2
+ vpmovmskb %ymm2, %eax
+ incl %eax
+ /* Result will be zero if s1 and s2 match. Otherwise first set
+ bit will be first mismatch. */
+ bzhil %edx, %eax, %edx
+ jnz L(return_vec_0)
+ xorl %eax, %eax
+ VZEROUPPER_RETURN
.p2align 4
-L(between_2_3):
+L(page_cross_less_vec):
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+ bytes. */
+ cmpl $16, %edx
+ jae L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+ cmpl $8, %edx
+ jae L(between_8_15)
+ cmpl $4, %edx
+ jae L(between_4_7)
+
/* Load as big endian to avoid branches. */
movzwl (%rdi), %eax
movzwl (%rsi), %ecx
@@ -208,223 +439,106 @@ L(between_2_3):
shll $8, %ecx
bswap %eax
bswap %ecx
- movb -1(%rdi, %rdx), %al
- movb -1(%rsi, %rdx), %cl
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
/* Subtraction is okay because the upper 8 bits are zero. */
subl %ecx, %eax
+ /* No ymm register was touched. */
ret
.p2align 4
-L(1):
- movzbl (%rdi), %eax
+L(one_or_less):
+ jb L(zero)
movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
subl %ecx, %eax
- ret
-# endif
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
+ /* No ymm register was touched. */
ret
.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
- cmpb $4, %dl
- je L(4)
- jb L(zero)
-# else
- cmpb $1, %dl
- je L(1)
- jb L(zero)
- cmpb $4, %dl
- jb L(between_2_3)
- cmpb $8, %dl
- jb L(between_4_7)
+L(between_8_15):
# endif
- cmpb $16, %dl
- jae L(between_16_31)
- /* It is between 8 and 15 bytes. */
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
+ subl $0xffff, %eax
+ jnz L(return_vec_0)
/* Use overlapping loads to avoid branches. */
leaq -8(%rdi, %rdx), %rdi
leaq -8(%rsi, %rdx), %rsi
vmovq (%rdi), %xmm1
vmovq (%rsi), %xmm2
- VPCMPEQ %xmm1, %xmm2, %xmm2
+ VPCMPEQ %xmm1, %xmm2, %xmm2
vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
+ subl $0xffff, %eax
+ jnz L(return_vec_0)
+ /* No ymm register was touched. */
+ ret
+
+ .p2align 4
+L(zero):
+ xorl %eax, %eax
ret
.p2align 4
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
+ subl $0xffff, %eax
+ jnz L(return_vec_0)
/* Use overlapping loads to avoid branches. */
+
+ vmovdqu -16(%rsi, %rdx), %xmm2
leaq -16(%rdi, %rdx), %rdi
leaq -16(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %xmm2
- VPCMPEQ (%rdi), %xmm2, %xmm2
+ VPCMPEQ (%rdi), %xmm2, %xmm2
vpmovmskb %xmm2, %eax
- subl $0xffff, %eax
- jnz L(first_vec)
+ subl $0xffff, %eax
+ jnz L(return_vec_0)
+ /* No ymm register was touched. */
ret
- .p2align 4
-L(more_8x_vec):
- /* More than 8 * VEC. Check the first VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Align the first memory area for aligned loads in the loop.
- Compute how much the first memory area is misaligned. */
- movq %rdi, %rcx
- andl $(VEC_SIZE - 1), %ecx
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %rcx
- /* Adjust the second memory area. */
- subq %rcx, %rsi
- /* Adjust the first memory area which should be aligned now. */
- subq %rcx, %rdi
- /* Adjust length. */
- addq %rcx, %rdx
-
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- vmovdqu (%rsi), %ymm1
- VPCMPEQ (%rdi), %ymm1, %ymm1
-
- vmovdqu VEC_SIZE(%rsi), %ymm2
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
- vpand %ymm2, %ymm1, %ymm5
-
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
- vpand %ymm3, %ymm5, %ymm5
-
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
- vpand %ymm4, %ymm5, %ymm5
-
- vptest %ymm0, %ymm5
- jnc L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rsi
-
- subq $(VEC_SIZE * 4), %rdx
- cmpq $(VEC_SIZE * 4), %rdx
- jae L(loop_4x_vec)
-
- /* Less than 4 * VEC. */
- cmpq $VEC_SIZE, %rdx
- jbe L(last_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_2x_vec)
-
-L(last_4x_vec):
- /* From 2 * VEC to 4 * VEC. */
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- vmovdqu (%rsi), %ymm2
- VPCMPEQ (%rdi), %ymm2, %ymm2
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- VZEROUPPER_RETURN
-
- .p2align 4
-L(4x_vec_end):
- vpmovmskb %ymm1, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- vpmovmskb %ymm2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x1)
- vpmovmskb %ymm3, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x2)
- vpmovmskb %ymm4, %eax
- subl $VEC_MASK, %eax
- tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER_RETURN
-
.p2align 4
-L(first_vec_x1):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rcx), %edx
- cmpl VEC_SIZE(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
+L(one_or_less):
+ jb L(zero)
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(zero)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+ /* No ymm register was touched. */
+ ret
# else
- movzbl VEC_SIZE(%rdi, %rcx), %eax
- movzbl VEC_SIZE(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
- VZEROUPPER_RETURN
.p2align 4
-L(first_vec_x2):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- sub %edx, %eax
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches.
+ */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ jz L(zero_4_7)
+ sbbl %eax, %eax
+ orl $1, %eax
+L(zero_4_7):
+ /* No ymm register was touched. */
+ ret
# endif
- VZEROUPPER_RETURN
+
END (MEMCMP)
#endif
--
GitLab

View File

@ -0,0 +1,851 @@
From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 17 May 2021 13:57:24 -0400
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
Content-type: text/plain; charset=UTF-8
No bug. This commit optimizes memcmp-evex.S. The optimizations include
adding a new vec compare path for small sizes, reorganizing the entry
control flow, removing some unnecissary ALU instructions from the main
loop, and most importantly replacing the heavy use of vpcmp + kand
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
1 file changed, 408 insertions(+), 302 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 9c093972..654dc7ac 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -19,17 +19,22 @@
#if IS_IN (libc)
/* memcmp/wmemcmp is implemented as:
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
- to avoid branches.
- 2. Use overlapping compare to avoid branch.
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
- bytes for wmemcmp.
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
+ 1. Use ymm vector compares when possible. The only case where
+ vector compares is not possible for when size < CHAR_PER_VEC
+ and loading from either s1 or s2 would cause a page cross.
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
+ with movbe and bswap to avoid branches.
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
+ size >= 8 bytes for wmemcmp.
+ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a
+ to check for early mismatches. Only do this if its guranteed the
+ work is not wasted.
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
area.
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
+ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
+ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
# include <sysdep.h>
@@ -40,11 +45,21 @@
# define VMOVU vmovdqu64
# ifdef USE_AS_WMEMCMP
-# define VPCMPEQ vpcmpeqd
+# define CHAR_SIZE 4
+# define VPCMP vpcmpd
# else
-# define VPCMPEQ vpcmpeqb
+# define CHAR_SIZE 1
+# define VPCMP vpcmpub
# endif
+# define VEC_SIZE 32
+# define PAGE_SIZE 4096
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
+
+# define XMM0 xmm16
+# define XMM1 xmm17
+# define XMM2 xmm18
+# define YMM0 ymm16
# define XMM1 xmm17
# define XMM2 xmm18
# define YMM1 ymm17
@@ -54,15 +69,6 @@
# define YMM5 ymm21
# define YMM6 ymm22
-# define VEC_SIZE 32
-# ifdef USE_AS_WMEMCMP
-# define VEC_MASK 0xff
-# define XMM_MASK 0xf
-# else
-# define VEC_MASK 0xffffffff
-# define XMM_MASK 0xffff
-# endif
-
/* Warning!
wmemcmp has to use SIGNED comparison for elements.
memcmp has to use UNSIGNED comparison for elemnts.
@@ -70,145 +76,370 @@
.section .text.evex,"ax",@progbits
ENTRY (MEMCMP)
-# ifdef USE_AS_WMEMCMP
- shl $2, %RDX_LP
-# elif defined __ILP32__
+# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
# endif
- cmp $VEC_SIZE, %RDX_LP
+ cmp $CHAR_PER_VEC, %RDX_LP
jb L(less_vec)
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k1
+ VMOVU (%rsi), %YMM1
+ /* Use compare not equals to directly check for mismatch. */
+ VPCMP $4, (%rdi), %YMM1, %k1
kmovd %k1, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_vec)
-
- /* More than 2 * VEC. */
- cmpq $(VEC_SIZE * 8), %rdx
- ja L(more_8x_vec)
- cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
+ /* NB: eax must be destination register if going to
+ L(return_vec_[0,2]). For L(return_vec_3 destination register
+ must be ecx. */
+ testl %eax, %eax
+ jnz L(return_vec_0)
- /* From 4 * VEC to 8 * VEC, inclusively. */
- VMOVU (%rsi), %YMM1
- VPCMPEQ (%rdi), %YMM1, %k1
+ cmpq $(CHAR_PER_VEC * 2), %rdx
+ jbe L(last_1x_vec)
+ /* Check second VEC no matter what. */
VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
+ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_1)
+
+ /* Less than 4 * VEC. */
+ cmpq $(CHAR_PER_VEC * 4), %rdx
+ jbe L(last_2x_vec)
+ /* Check third and fourth VEC no matter what. */
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
+ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
+ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ kmovd %k1, %ecx
+ testl %ecx, %ecx
+ jnz L(return_vec_3)
- kandd %k1, %k2, %k5
- kandd %k3, %k4, %k6
- kandd %k5, %k6, %k6
+ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
+ compare with zero to get a mask is needed. */
+ vpxorq %XMM0, %XMM0, %XMM0
- kmovd %k6, %eax
- cmpl $VEC_MASK, %eax
- jne L(4x_vec_end)
+ /* Go to 4x VEC loop. */
+ cmpq $(CHAR_PER_VEC * 8), %rdx
+ ja L(more_8x_vec)
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
- VMOVU (%rsi), %YMM1
- VPCMPEQ (%rdi), %YMM1, %k1
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
+ branches. */
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
- kandd %k1, %k2, %k5
+ /* Load first two VEC from s2 before adjusting addresses. */
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
+
+ /* Wait to load from s1 until addressed adjust due to
+ unlamination of microfusion with complex address mode. */
+
+ /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
+ will have some 1s. */
+ vpxorq (%rdi), %YMM1, %YMM1
+ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
- kandd %k3, %k5, %k5
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ /* Or together YMM1, YMM2, and YMM3 into YMM3. */
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
- kandd %k4, %k5, %k5
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
+ oring with YMM3. Result is stored in YMM4. */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
+ VPCMP $4, %YMM4, %YMM0, %k1
+ kmovd %k1, %ecx
+ testl %ecx, %ecx
+ jnz L(return_vec_0_1_2_3)
+ /* NB: eax must be zero to reach here. */
+ ret
- kmovd %k5, %eax
- cmpl $VEC_MASK, %eax
- jne L(4x_vec_end)
- xorl %eax, %eax
+ /* NB: aligning 32 here allows for the rest of the jump targets
+ to be tuned for 32 byte alignment. Most important this ensures
+ the L(more_8x_vec) loop is 32 byte aligned. */
+ .p2align 5
+L(less_vec):
+ /* Check if one or less CHAR. This is necessary for size = 0 but
+ is also faster for size = CHAR_SIZE. */
+ cmpl $1, %edx
+ jbe L(one_or_less)
+
+ /* Check if loading one VEC from either s1 or s2 could cause a
+ page cross. This can have false positives but is by far the
+ fastest method. */
+ movl %edi, %eax
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(page_cross_less_vec)
+
+ /* No page cross possible. */
+ VMOVU (%rsi), %YMM2
+ VPCMP $4, (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ /* Create mask in ecx for potentially in bound matches. */
+ bzhil %edx, %eax, %eax
+ jnz L(return_vec_0)
ret
.p2align 4
-L(last_2x_vec):
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+L(return_vec_0):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
-L(last_vec):
- /* Use overlapping loads to avoid branches. */
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+ /* NB: No p2align necessary. Alignment % 16 is naturally 1
+ which is good enough for a target not in a loop. */
+L(return_vec_1):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
+ movzbl VEC_SIZE(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
- .p2align 4
-L(first_vec):
- /* A byte or int32 is different within 16 or 32 bytes. */
- tzcntl %eax, %ecx
+ /* NB: No p2align necessary. Alignment % 16 is naturally 2
+ which is good enough for a target not in a loop. */
+L(return_vec_2):
+ tzcntl %eax, %eax
# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (%rdi, %rcx, 4), %edx
- cmpl (%rsi, %rcx, 4), %edx
-L(wmemcmp_return):
- setl %al
- negl %eax
- orl $1, %eax
+ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
# else
- movzbl (%rdi, %rcx), %eax
- movzbl (%rsi, %rcx), %edx
- sub %edx, %eax
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
# endif
ret
+ .p2align 4
+L(8x_return_vec_0_1_2_3):
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
+ addq %rdi, %rsi
+L(return_vec_0_1_2_3):
+ VPCMP $4, %YMM1, %YMM0, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
+
+ VPCMP $4, %YMM2, %YMM0, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_1)
+
+ VPCMP $4, %YMM3, %YMM0, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_2)
+L(return_vec_3):
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
+# endif
+ ret
+
.p2align 4
-L(4):
- xorl %eax, %eax
- movl (%rdi), %edx
- cmpl (%rsi), %edx
- jne L(wmemcmp_return)
+L(more_8x_vec):
+ /* Set end of s1 in rdx. */
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
+ /* rsi stores s2 - s1. This allows loop to only update one
+ pointer. */
+ subq %rdi, %rsi
+ /* Align s1 pointer. */
+ andq $-VEC_SIZE, %rdi
+ /* Adjust because first 4x vec where check already. */
+ subq $-(VEC_SIZE * 4), %rdi
+ .p2align 4
+L(loop_4x_vec):
+ VMOVU (%rsi, %rdi), %YMM1
+ vpxorq (%rdi), %YMM1, %YMM1
+
+ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
+ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
+
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
+ VPCMP $4, %YMM4, %YMM0, %k1
+ kmovd %k1, %ecx
+ testl %ecx, %ecx
+ jnz L(8x_return_vec_0_1_2_3)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rdx, %rdi
+ jb L(loop_4x_vec)
+
+ subq %rdx, %rdi
+ /* rdi has 4 * VEC_SIZE - remaining length. */
+ cmpl $(VEC_SIZE * 3), %edi
+ jae L(8x_last_1x_vec)
+ /* Load regardless of branch. */
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
+ cmpl $(VEC_SIZE * 2), %edi
+ jae L(8x_last_2x_vec)
+
+ VMOVU (%rsi, %rdx), %YMM1
+ vpxorq (%rdx), %YMM1, %YMM1
+
+ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
+
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
+
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
+ VPCMP $4, %YMM4, %YMM0, %k1
+ kmovd %k1, %ecx
+ /* Restore s1 pointer to rdi. */
+ movq %rdx, %rdi
+ testl %ecx, %ecx
+ jnz L(8x_return_vec_0_1_2_3)
+ /* NB: eax must be zero to reach here. */
+ ret
+
+ /* Only entry is from L(more_8x_vec). */
+ .p2align 4
+L(8x_last_2x_vec):
+ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(8x_return_vec_2)
+ /* Naturally aligned to 16 bytes. */
+L(8x_last_1x_vec):
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
+ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(8x_return_vec_3)
+ ret
+
+ .p2align 4
+L(last_2x_vec):
+ /* Check second to last VEC. */
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
+ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_1_end)
+
+ /* Check last VEC. */
+ .p2align 4
+L(last_1x_vec):
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0_end)
ret
+
+ .p2align 4
+L(8x_return_vec_2):
+ subq $VEC_SIZE, %rdx
+L(8x_return_vec_3):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
+ movl (VEC_SIZE * 3)(%rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
# else
+ addq %rdx, %rax
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 3)(%rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
.p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches. */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- je L(exit)
- sbbl %eax, %eax
- orl $1, %eax
+L(return_vec_0_end):
+ tzcntl %eax, %eax
+ addl %edx, %eax
+# ifdef USE_AS_WMEMCMP
+ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
.p2align 4
-L(exit):
+L(return_vec_1_end):
+ tzcntl %eax, %eax
+ addl %edx, %eax
+# ifdef USE_AS_WMEMCMP
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
ret
+
.p2align 4
+L(page_cross_less_vec):
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
+ bytes. */
+ cmpl $(16 / CHAR_SIZE), %edx
+ jae L(between_16_31)
+# ifndef USE_AS_WMEMCMP
+ cmpl $8, %edx
+ jae L(between_8_15)
+ cmpl $4, %edx
+ jae L(between_4_7)
L(between_2_3):
/* Load as big endian to avoid branches. */
movzwl (%rdi), %eax
@@ -217,224 +448,99 @@ L(between_2_3):
shll $8, %ecx
bswap %eax
bswap %ecx
- movb -1(%rdi, %rdx), %al
- movb -1(%rsi, %rdx), %cl
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
/* Subtraction is okay because the upper 8 bits are zero. */
subl %ecx, %eax
ret
-
.p2align 4
-L(1):
- movzbl (%rdi), %eax
+L(one_or_less):
+ jb L(zero)
movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
subl %ecx, %eax
ret
-# endif
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
- ret
.p2align 4
-L(less_vec):
-# ifdef USE_AS_WMEMCMP
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
- cmpb $4, %dl
- je L(4)
- jb L(zero)
-# else
- cmpb $1, %dl
- je L(1)
- jb L(zero)
- cmpb $4, %dl
- jb L(between_2_3)
- cmpb $8, %dl
- jb L(between_4_7)
+L(between_8_15):
# endif
- cmpb $16, %dl
- jae L(between_16_31)
- /* It is between 8 and 15 bytes. */
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
vmovq (%rdi), %XMM1
vmovq (%rsi), %XMM2
- VPCMPEQ %XMM1, %XMM2, %k2
- kmovw %k2, %eax
- subl $XMM_MASK, %eax
- jnz L(first_vec)
+ VPCMP $4, %XMM1, %XMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
/* Use overlapping loads to avoid branches. */
- leaq -8(%rdi, %rdx), %rdi
- leaq -8(%rsi, %rdx), %rsi
+ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
+ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
vmovq (%rdi), %XMM1
vmovq (%rsi), %XMM2
- VPCMPEQ %XMM1, %XMM2, %k2
- kmovw %k2, %eax
- subl $XMM_MASK, %eax
- jnz L(first_vec)
+ VPCMP $4, %XMM1, %XMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
ret
.p2align 4
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
- VMOVU (%rsi), %XMM2
- VPCMPEQ (%rdi), %XMM2, %k2
- kmovw %k2, %eax
- subl $XMM_MASK, %eax
- jnz L(first_vec)
-
- /* Use overlapping loads to avoid branches. */
- leaq -16(%rdi, %rdx), %rdi
- leaq -16(%rsi, %rdx), %rsi
- VMOVU (%rsi), %XMM2
- VPCMPEQ (%rdi), %XMM2, %k2
- kmovw %k2, %eax
- subl $XMM_MASK, %eax
- jnz L(first_vec)
+L(zero):
+ xorl %eax, %eax
ret
.p2align 4
-L(more_8x_vec):
- /* More than 8 * VEC. Check the first VEC. */
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- /* Align the first memory area for aligned loads in the loop.
- Compute how much the first memory area is misaligned. */
- movq %rdi, %rcx
- andl $(VEC_SIZE - 1), %ecx
- /* Get the negative of offset for alignment. */
- subq $VEC_SIZE, %rcx
- /* Adjust the second memory area. */
- subq %rcx, %rsi
- /* Adjust the first memory area which should be aligned now. */
- subq %rcx, %rdi
- /* Adjust length. */
- addq %rcx, %rdx
-
-L(loop_4x_vec):
- /* Compare 4 * VEC at a time forward. */
- VMOVU (%rsi), %YMM1
- VPCMPEQ (%rdi), %YMM1, %k1
-
- VMOVU VEC_SIZE(%rsi), %YMM2
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
- kandd %k2, %k1, %k5
-
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
- kandd %k3, %k5, %k5
-
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
- kandd %k4, %k5, %k5
-
- kmovd %k5, %eax
- cmpl $VEC_MASK, %eax
- jne L(4x_vec_end)
-
- addq $(VEC_SIZE * 4), %rdi
- addq $(VEC_SIZE * 4), %rsi
-
- subq $(VEC_SIZE * 4), %rdx
- cmpq $(VEC_SIZE * 4), %rdx
- jae L(loop_4x_vec)
-
- /* Less than 4 * VEC. */
- cmpq $VEC_SIZE, %rdx
- jbe L(last_vec)
- cmpq $(VEC_SIZE * 2), %rdx
- jbe L(last_2x_vec)
-
-L(last_4x_vec):
- /* From 2 * VEC to 4 * VEC. */
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
-
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
+L(between_16_31):
+ /* From 16 to 31 bytes. No branch when size == 16. */
+ VMOVU (%rsi), %XMM2
+ VPCMP $4, (%rdi), %XMM2, %k1
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
/* Use overlapping loads to avoid branches. */
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- addq $VEC_SIZE, %rdi
- addq $VEC_SIZE, %rsi
- VMOVU (%rsi), %YMM2
- VPCMPEQ (%rdi), %YMM2, %k2
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- ret
-
- .p2align 4
-L(4x_vec_end):
+ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
+ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
+ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
+ VPCMP $4, (%rdi), %XMM2, %k1
kmovd %k1, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec)
- kmovd %k2, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x1)
- kmovd %k3, %eax
- subl $VEC_MASK, %eax
- jnz L(first_vec_x2)
- kmovd %k4, %eax
- subl $VEC_MASK, %eax
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
+ testl %eax, %eax
+ jnz L(return_vec_0)
ret
- .p2align 4
-L(first_vec_x1):
- tzcntl %eax, %ecx
# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl VEC_SIZE(%rdi, %rcx, 4), %edx
- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl VEC_SIZE(%rdi, %rcx), %eax
- movzbl VEC_SIZE(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
+ .p2align 4
+L(one_or_less):
+ jb L(zero)
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(zero)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
ret
+# else
.p2align 4
-L(first_vec_x2):
- tzcntl %eax, %ecx
-# ifdef USE_AS_WMEMCMP
- xorl %eax, %eax
- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
- jmp L(wmemcmp_return)
-# else
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
- sub %edx, %eax
-# endif
+L(between_4_7):
+ /* Load as big endian with overlapping movbe to avoid branches.
+ */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ jz L(zero_4_7)
+ sbbl %eax, %eax
+ orl $1, %eax
+L(zero_4_7):
ret
+# endif
+
END (MEMCMP)
#endif
--
GitLab

View File

@ -0,0 +1,104 @@
From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 20 May 2021 13:13:51 -0400
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
No bug. This commit makes a few small improvements to
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
instead of 128. Either alignment will perform equally well in a loop
and 128 just increases the odds of having to do an extra iteration
which can be significant overhead for small values. 2) Align some
targets and the loop. 3) Remove an ALU from the alignment process. 4)
Reorder the last 4x VEC so that they are stored after the loop. 5)
Move the condition for leq 8x VEC to before the alignment
process. test-memset and test-wmemset are both passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
.../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++--------
1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f877ac9d..909c33f6 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
VMOVU %VEC(0), (%rdi)
VZEROUPPER_RETURN
+ .p2align 4
L(stosb_more_2x_vec):
cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
ja L(stosb)
+#else
+ .p2align 4
#endif
L(more_2x_vec):
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_start)
+ /* Stores to first 2x VEC before cmp as any path forward will
+ require it. */
VMOVU %VEC(0), (%rdi)
VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
+ cmpq $(VEC_SIZE * 4), %rdx
+ ja L(loop_start)
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
L(return):
#if VEC_SIZE > 16
ZERO_UPPER_VEC_REGISTERS_RETURN
@@ -192,28 +197,29 @@ L(return):
#endif
L(loop_start):
- leaq (VEC_SIZE * 4)(%rdi), %rcx
- VMOVU %VEC(0), (%rdi)
- andq $-(VEC_SIZE * 4), %rcx
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
- addq %rdi, %rdx
- andq $-(VEC_SIZE * 4), %rdx
- cmpq %rdx, %rcx
- je L(return)
+ cmpq $(VEC_SIZE * 8), %rdx
+ jbe L(loop_end)
+ andq $-(VEC_SIZE * 2), %rdi
+ subq $-(VEC_SIZE * 4), %rdi
+ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
+ .p2align 4
L(loop):
- VMOVA %VEC(0), (%rcx)
- VMOVA %VEC(0), VEC_SIZE(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
- addq $(VEC_SIZE * 4), %rcx
- cmpq %rcx, %rdx
- jne L(loop)
+ VMOVA %VEC(0), (%rdi)
+ VMOVA %VEC(0), VEC_SIZE(%rdi)
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
+ subq $-(VEC_SIZE * 4), %rdi
+ cmpq %rcx, %rdi
+ jb L(loop)
+L(loop_end):
+ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
+ rdx as length is also unchanged. */
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
VZEROUPPER_SHORT_RETURN
.p2align 4
--
GitLab

View File

@ -0,0 +1,84 @@
From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 23 May 2021 19:43:24 -0400
Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
This patch changes the condition for copy 4x VEC so that if length is
exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
8x VEC case.
Results For Skylake memcpy-avx2-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 0 , 9.137 , 6.873 , New , 75.22
128 , 7 , 0 , 12.933 , 7.732 , New , 59.79
128 , 0 , 7 , 11.852 , 6.76 , New , 57.04
128 , 7 , 7 , 12.587 , 6.808 , New , 54.09
Results For Icelake memcpy-evex-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 0 , 9.963 , 5.416 , New , 54.36
128 , 7 , 0 , 16.467 , 8.061 , New , 48.95
128 , 0 , 7 , 14.388 , 7.644 , New , 53.13
128 , 7 , 7 , 14.546 , 7.642 , New , 52.54
Results For Tigerlake memcpy-evex-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 0 , 8.979 , 4.95 , New , 55.13
128 , 7 , 0 , 14.245 , 7.122 , New , 50.0
128 , 0 , 7 , 12.668 , 6.675 , New , 52.69
128 , 7 , 7 , 13.042 , 6.802 , New , 52.15
Results For Skylake memmove-avx2-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 32 , 6.181 , 5.691 , New , 92.07
128 , 32 , 0 , 6.165 , 5.752 , New , 93.3
128 , 0 , 7 , 13.923 , 9.37 , New , 67.3
128 , 7 , 0 , 12.049 , 10.182 , New , 84.5
Results For Icelake memmove-evex-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 32 , 5.479 , 4.889 , New , 89.23
128 , 32 , 0 , 5.127 , 4.911 , New , 95.79
128 , 0 , 7 , 18.885 , 13.547 , New , 71.73
128 , 7 , 0 , 15.565 , 14.436 , New , 92.75
Results For Tigerlake memmove-evex-erms
size, al1 , al2 , Cur T , New T , Win , New / Cur
128 , 0 , 32 , 5.275 , 4.815 , New , 91.28
128 , 32 , 0 , 5.376 , 4.565 , New , 84.91
128 , 0 , 7 , 19.426 , 14.273 , New , 73.47
128 , 7 , 0 , 15.924 , 14.951 , New , 93.89
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index 3e2dd6bc..572cef04 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -417,8 +417,8 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 8), %rdx
ja L(more_8x_vec)
cmpq $(VEC_SIZE * 4), %rdx
- jb L(last_4x_vec)
- /* Copy from 4 * VEC to 8 * VEC, inclusively. */
+ jbe L(last_4x_vec)
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
@@ -437,7 +437,7 @@ L(more_2x_vec):
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
VZEROUPPER_RETURN
L(last_4x_vec):
- /* Copy from 2 * VEC to 4 * VEC. */
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
VMOVU (%rsi), %VEC(0)
VMOVU VEC_SIZE(%rsi), %VEC(1)
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
--
GitLab

View File

@ -0,0 +1,55 @@
From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Wed, 23 Jun 2021 19:19:34 -0400
Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
Content-type: text/plain; charset=UTF-8
No bug. The way wcsnlen will check if near the end of maxlen
is the following macro:
mov %r11, %rsi; \
subq %rax, %rsi; \
andq $-64, %rax; \
testq $-64, %rsi; \
je L(strnlen_ret)
Which words independently of s + maxlen overflowing. So the
second overflow check is unnecissary for correctness and
just extra overhead in the common no overflow case.
test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
all passing
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
1 file changed, 7 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
index 439e486a..b7657282 100644
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
@@ -71,19 +71,12 @@ L(n_nonzero):
suffice. */
mov %RSI_LP, %R10_LP
sar $62, %R10_LP
- test %R10_LP, %R10_LP
jnz __wcslen_sse4_1
sal $2, %RSI_LP
# endif
-
/* Initialize long lived registers. */
-
add %RDI_LP, %RSI_LP
-# ifdef AS_WCSLEN
-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
- jbe __wcslen_sse4_1
-# endif
mov %RSI_LP, %R10_LP
and $-64, %R10_LP
mov %RSI_LP, %R11_LP
--
GitLab

View File

@ -0,0 +1,290 @@
From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:32:24 -0800
Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes memset/wmemset for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
RDX_LP for length. Clear the upper 32 bits of RDX register.
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
---
.../multiarch/memset-avx512-no-vzeroupper.S | 6 +-
.../multiarch/memset-vec-unaligned-erms.S | 34 +++++----
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++
5 files changed, 121 insertions(+), 16 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
index 689cc119..99e25519 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
@@ -29,12 +29,16 @@
.section .text.avx512,"ax",@progbits
#if defined PIC
ENTRY (MEMSET_CHK)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (MEMSET_CHK)
#endif
ENTRY (MEMSET)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
vpxor %xmm0, %xmm0, %xmm0
vmovd %esi, %xmm1
lea (%rdi, %rdx), %rsi
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 270a1d49..9a0fd818 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -65,8 +65,8 @@
.section SECTION(.text),"ax",@progbits
#if VEC_SIZE == 16 && IS_IN (libc)
ENTRY (__bzero)
- movq %rdi, %rax /* Set return value. */
- movq %rsi, %rdx /* Set n. */
+ mov %RDI_LP, %RAX_LP /* Set return value. */
+ mov %RSI_LP, %RDX_LP /* Set n. */
pxor %xmm0, %xmm0
jmp L(entry_from_bzero)
END (__bzero)
@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
# endif
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
- shlq $2, %rdx
+ shl $2, %RDX_LP
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
jmp L(entry_from_bzero)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
# if VEC_SIZE == 16
ENTRY (__memset_chk_erms)
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END (__memset_chk_erms)
/* Only used to measure performance of REP STOSB. */
ENTRY (__memset_erms)
/* Skip zero length. */
- testq %rdx, %rdx
+ test %RDX_LP, %RDX_LP
jnz L(stosb)
movq %rdi, %rax
ret
@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
L(stosb):
/* Issue vzeroupper before rep stosb. */
VZEROUPPER
- movq %rdx, %rcx
+ mov %RDX_LP, %RCX_LP
movzbl %sil, %eax
- movq %rdi, %rdx
+ mov %RDI_LP, %RDX_LP
rep stosb
- movq %rdx, %rax
+ mov %RDX_LP, %RAX_LP
ret
# if VEC_SIZE == 16
END (__memset_erms)
@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
- cmpq %rdx, %rcx
+ cmp %RDX_LP, %RCX_LP
jb HIDDEN_JUMPTARGET (__chk_fail)
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- cmpq $VEC_SIZE, %rdx
+# ifdef __ILP32__
+ /* Clear the upper 32 bits. */
+ mov %edx, %edx
+# endif
+ cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
- cmpq $(VEC_SIZE * 2), %rdx
+ cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index e99dbd7c..98bd9ae9 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -7,9 +7,9 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
- tst-size_t-memrchr
+ tst-size_t-memrchr tst-size_t-memset
endif
ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
new file mode 100644
index 00000000..2c367af6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
@@ -0,0 +1,73 @@
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef WIDE
+# define TEST_NAME "wmemset"
+#else
+# define TEST_NAME "memset"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define MEMSET wmemset
+# define CHAR wchar_t
+#else
+# define MEMSET memset
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (MEMSET, 1)
+
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_memset (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ CHAR ch = 0x23;
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
+ parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ c.fn = impl->fn;
+ CHAR *p = (CHAR *) do_memset (src, c);
+ size_t i;
+ for (i = 0; i < src.len; i++)
+ if (p[i] != ch)
+ {
+ error (0, 0, "Wrong result in function %s", impl->name);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
new file mode 100644
index 00000000..955eb488
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
@@ -0,0 +1,20 @@
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-memset.c"
--
GitLab

View File

@ -0,0 +1,43 @@
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> 2021-05-23 21:43:10
Committer: H.J. Lu <hjl.tools@gmail.com> 2021-06-27 10:56:57
Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
Branches: master, remotes/origin/master and many more (41)
Follows: glibc-2.33.9000
Precedes: glibc-2.34
math: redirect roundeven function
This patch redirect roundeven function for futhermore changes.
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
Conflicts:
*
(rewritten for older branch)
diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
index 7bbbb2dc..8728d0f2 100644
--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-double.h>
@@ -67,5 +68,6 @@ __roundeven (double x)
INSERT_WORDS64 (x, ix);
return x;
}
-hidden_def (__roundeven)
+#ifndef __roundeven
libm_alias_double (__roundeven, roundeven)
+#endif
--
GitLab

View File

@ -0,0 +1,118 @@
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Date: Mon, 24 May 2021 09:43:10 +0800
Subject: [PATCH] math: redirect roundeven function
Content-type: text/plain; charset=UTF-8
This patch redirect roundeven function for futhermore changes.
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
include/math.h | 3 ++-
sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++-
sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++
sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 +
sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 +
6 files changed, 11 insertions(+), 2 deletions(-)
Conflicts:
include/math.h
(missing MATH_REDIRECT macros)
diff --git a/include/math.h b/include/math.h
index e21d34b8..1f9f9a54 100644
--- a/include/math.h
+++ b/include/math.h
@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
libm_hidden_proto (__issignalingf)
libm_hidden_proto (__exp)
libm_hidden_proto (__expf)
-libm_hidden_proto (__roundeven)
# ifndef __NO_LONG_DOUBLE_MATH
libm_hidden_proto (__fpclassifyl)
@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
# if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
# ifndef NO_MATH_REDIRECT
+float (roundevenf) (float) asm ("__roundevenf");
+double (roundeven) (double) asm ("__roundeven");
/* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a
single instruction. Use an asm to avoid use of PLTs if it doesn't. */
float (sqrtf) (float) asm ("__ieee754_sqrtf");
diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
index 1438e81d..61962184 100644
--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-double.h>
@@ -101,5 +102,6 @@ __roundeven (double x)
INSERT_WORDS (x, hx, lx);
return x;
}
-hidden_def (__roundeven)
+#ifndef __roundeven
libm_alias_double (__roundeven, roundeven)
+#endif
diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
index 5a9b3f39..e0faf727 100644
--- a/sysdeps/ieee754/float128/s_roundevenf128.c
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
@@ -1,2 +1,3 @@
+#define NO_MATH_REDIRECT
#include <float128_private.h>
#include "../ldbl-128/s_roundevenl.c"
diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
index 90f991d5..a661875e 100644
--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-float.h>
@@ -67,4 +68,6 @@ __roundevenf (float x)
SET_FLOAT_WORD (x, ix);
return x;
}
+#ifndef __roundevenf
libm_alias_float (__roundeven, roundeven)
+#endif
diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
index 5fc59af4..b9375b6c 100644
--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-ldouble.h>
diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
index be2e4fa4..65031ab7 100644
--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
@@ -17,6 +17,7 @@
License along with the GNU C Library; if not, see
<http://www.gnu.org/licenses/>. */
+#define NO_MATH_REDIRECT
#include <math.h>
#include <math_private.h>
#include <libm-alias-ldouble.h>
--
GitLab

View File

@ -0,0 +1,242 @@
From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Date: Mon, 24 May 2021 09:43:11 +0800
Subject: [PATCH] x86_64: roundeven with sse4.1 support
Content-type: text/plain; charset=UTF-8
This patch adds support for the sse4.1 hardware floating point
roundeven.
Here is some benchmark results on my systems:
=AMD Ryzen 9 3900X 12-Core Processor=
* benchmark result before this commit
| | roundeven | roundevenf |
|------------|--------------|--------------|
| duration | 3.75587e+09 | 3.75114e+09 |
| iterations | 3.93053e+08 | 4.35402e+08 |
| max | 52.592 | 58.71 |
| min | 7.98 | 7.22 |
| mean | 9.55563 | 8.61535 |
* benchmark result after this commit
| | roundeven | roundevenf |
|------------|---------------|--------------|
| duration | 3.73815e+09 | 3.73738e+09 |
| iterations | 5.82692e+08 | 5.91498e+08 |
| max | 56.468 | 51.642 |
| min | 6.27 | 6.156 |
| mean | 6.41532 | 6.3185 |
=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
* benchmark result before this commit
| | roundeven | roundevenf |
|------------|--------------|--------------|
| duration | 2.18208e+09 | 2.18258e+09 |
| iterations | 2.39932e+08 | 2.46924e+08 |
| max | 96.378 | 98.035 |
| min | 6.776 | 5.94 |
| mean | 9.09456 | 8.83907 |
* benchmark result after this commit
| | roundeven | roundevenf |
|------------|--------------|--------------|
| duration | 2.17415e+09 | 2.17005e+09 |
| iterations | 3.56193e+08 | 4.09824e+08 |
| max | 51.693 | 97.192 |
| min | 5.926 | 5.093 |
| mean | 6.10385 | 5.29507 |
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/fpu/multiarch/Makefile | 5 +--
sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++
.../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++
.../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++
sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++
7 files changed, 118 insertions(+), 2 deletions(-)
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index 9f387248..6ddd1c01 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,11 +1,12 @@
ifeq ($(subdir),math)
libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
- s_trunc-c s_truncf-c
+ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
s_floorf-sse4_1 s_nearbyint-sse4_1 \
- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
+ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
+ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
s_trunc-sse4_1 s_truncf-sse4_1
libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
new file mode 100644
index 00000000..c7be43cb
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
@@ -0,0 +1,2 @@
+#define __roundeven __roundeven_c
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
new file mode 100644
index 00000000..6ae8f6b1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY(__roundeven_sse41)
+ roundsd $8, %xmm0, %xmm0
+ ret
+END(__roundeven_sse41)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
new file mode 100644
index 00000000..d92eda65
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
@@ -0,0 +1,31 @@
+/* Multiple versions of __roundeven.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-double.h>
+
+#define roundeven __redirect_roundeven
+#define __roundeven __redirect___roundeven
+#include <math.h>
+#undef roundeven
+#undef __roundeven
+
+#define SYMBOL_NAME roundeven
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
+libm_alias_double (__roundeven, roundeven)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
new file mode 100644
index 00000000..72a6e7d1
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
@@ -0,0 +1,3 @@
+#undef __roundevenf
+#define __roundevenf __roundevenf_c
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
new file mode 100644
index 00000000..a76e1080
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
@@ -0,0 +1,24 @@
+/* Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+ .section .text.sse4.1,"ax",@progbits
+ENTRY(__roundevenf_sse41)
+ roundss $8, %xmm0, %xmm0
+ ret
+END(__roundevenf_sse41)
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
new file mode 100644
index 00000000..2ee196e6
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
@@ -0,0 +1,31 @@
+/* Multiple versions of __roundevenf.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <libm-alias-float.h>
+
+#define roundevenf __redirect_roundevenf
+#define __roundevenf __redirect___roundevenf
+#include <math.h>
+#undef roundevenf
+#undef __roundevenf
+
+#define SYMBOL_NAME roundevenf
+#include "ifunc-sse4_1.h"
+
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
+libm_alias_float (__roundeven, roundeven)
--
GitLab

View File

@ -0,0 +1,41 @@
From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 9 Jan 2022 16:02:28 -0600
Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
Content-type: text/plain; charset=UTF-8
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
__wcscmp_evex. For x86_64 this covers the entire address range so any
length larger could not possibly be used to bound `s1` or `s2`.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
1 file changed, 10 insertions(+)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 459eeed0..d5aa6daa 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -97,6 +97,16 @@ ENTRY (STRCMP)
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
+# ifndef __ILP32__
+ movq %rdx, %rcx
+ /* Check if length could overflow when multiplied by
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
+ overflow cases as well as redirect cases where its impossible to
+ length to bound a valid memory region. In these cases just use
+ 'wcscmp'. */
+ shrq $56, %rcx
+ jnz __wcscmp_evex
+# endif
/* Convert units: from wide to byte char. */
shl $2, %RDX_LP
# endif
--
GitLab

View File

@ -0,0 +1,268 @@
From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 20 Aug 2021 06:42:24 -0700
Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
#28252]
Content-type: text/plain; charset=UTF-8
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
by replacing
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
and
vmovups .L_2il0floatpacket.13(%rip), %zmmX
with
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
This fixes BZ #28252.
---
.../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
.../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
.../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
.../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
10 files changed, 11 insertions(+), 64 deletions(-)
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
index 24e3b363..07dfed85 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
vmovaps %zmm0, %zmm8
/* Check for large arguments path */
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
/*
ARGUMENT RANGE REDUCTION:
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
jmp .LBL_2_7
#endif
END (_ZGVeN8v_cos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.16:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.16,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
index ae8af8d8..ddb60e5b 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
/* preserve mantissa, set input exponent to 2^(-10) */
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
vpsrlq $32, %zmm4, %zmm6
/* reciprocal approximation good to at least 11 bits */
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
jmp .LBL_2_7
#endif
END (_ZGVeN8v_log_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.12:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.12,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
index 2d4b14fd..529c454a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
andq $-64, %rsp
subq $1280, %rsp
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
vmovups __dAbsMask(%rax), %zmm7
vmovups __dInvPI(%rax), %zmm2
vmovups __dRShifter(%rax), %zmm1
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
jmp .LBL_2_7
#endif
END (_ZGVeN8v_sin_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.14:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.14,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
index 2df626c0..e501a53a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
/* SinPoly = SinR*SinPoly */
vfmadd213pd %zmm5, %zmm5, %zmm4
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
/* Update Cos result's sign */
vxorpd %zmm2, %zmm1, %zmm1
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
ENTRY (_ZGVeN8vvv_sincos_skx)
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
END (_ZGVeN8vvv_sincos_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.15:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.15,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
index 6ea1137b..377af394 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
X = X - Y*PI1 - Y*PI2 - Y*PI3
*/
vmovaps %zmm0, %zmm6
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
vmovups __sRShifter(%rax), %zmm3
vmovups __sPI1_FMA(%rax), %zmm5
vmovups __sA9_FMA(%rax), %zmm9
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
jmp .LBL_2_7
#endif
END (_ZGVeN16v_cosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
index 89ba0df2..46f33d46 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
vmovaps %zmm0, %zmm7
/* compare against threshold */
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
vmovups __sInvLn2(%rax), %zmm4
vmovups __sShifter(%rax), %zmm1
vmovups __sLn2hi(%rax), %zmm6
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
#endif
END (_ZGVeN16v_expf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
index 4cf0a96f..9e254956 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
andq $-64, %rsp
subq $1280, %rsp
movq __svml_slog_data@GOTPCREL(%rip), %rax
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
vmovups _iBrkValue(%rax), %zmm4
vmovups _sPoly_7(%rax), %zmm8
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
#endif
END (_ZGVeN16v_logf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.7:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.7,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
index bdcd50af..e8331ba1 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
vpsrlq $32, %zmm3, %zmm2
vpmovqd %zmm2, %ymm11
vcvtps2pd %ymm14, %zmm13
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovaps %zmm14, %zmm26
vpandd _ABSMASK(%rax), %zmm1, %zmm8
vpcmpd $1, _INF(%rax), %zmm8, %k2
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
vpmovqd %zmm11, %ymm5
vpxord %zmm10, %zmm10, %zmm10
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
vpxord %zmm11, %zmm11, %zmm11
vcvtdq2pd %ymm7, %zmm7
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
jmp .LBL_2_7
#endif
END (_ZGVeN16vv_powf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.23:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.23,@object
-.L_2il0floatpacket.24:
- .long 0xffffffff,0xffffffff
- .type .L_2il0floatpacket.24,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
index 5fa4bc41..1f46f334 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
/* Result sign calculations */
vpternlogd $150, %zmm0, %zmm14, %zmm1
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
/* Add correction term 0.5 for cos() part */
vaddps %zmm8, %zmm5, %zmm15
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
ENTRY (_ZGVeN16vvv_sincosf_skx)
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
END (_ZGVeN16vvv_sincosf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.13:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.13,@object
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
index 141f747e..1fc9308a 100644
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
/* Check for large and special values */
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
vmovups __sAbsMask(%rax), %zmm5
vmovups __sInvPI(%rax), %zmm1
vmovups __sRShifter(%rax), %zmm2
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
jmp .LBL_2_7
#endif
END (_ZGVeN16v_sinf_skx)
-
- .section .rodata, "a"
-.L_2il0floatpacket.11:
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
- .type .L_2il0floatpacket.11,@object
--
GitLab

View File

@ -0,0 +1,48 @@
From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 21 Sep 2021 18:31:49 -0500
Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
specified
Content-type: text/plain; charset=UTF-8
No bug.
This change adds a new macro ENTRY_P2ALIGN which takes a second
argument, log2 of the desired function alignment.
The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
doesn't affect any existing functionality.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86/sysdep.h | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 01bac0f6..a70bb3a2 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -78,15 +78,18 @@ enum cf_protection_level
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
/* Define an entry point visible from C. */
-#define ENTRY(name) \
+#define ENTRY_P2ALIGN(name, alignment) \
.globl C_SYMBOL_NAME(name); \
.type C_SYMBOL_NAME(name),@function; \
- .align ALIGNARG(4); \
+ .align ALIGNARG(alignment); \
C_LABEL(name) \
cfi_startproc; \
_CET_ENDBR; \
CALL_MCOUNT
+/* Common entry 16 byte aligns. */
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
+
#undef END
#define END(name) \
cfi_endproc; \
--
GitLab

View File

@ -0,0 +1,658 @@
From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 21 Sep 2021 18:45:03 -0500
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
size
Content-type: text/plain; charset=UTF-8
No bug.
The frontend optimizations are to:
1. Reorganize logically connected basic blocks so they are either in
the same cache line or adjacent cache lines.
2. Avoid cases when basic blocks unnecissarily cross cache lines.
3. Try and 32 byte align any basic blocks possible without sacrificing
code size. Smaller / Less hot basic blocks are used for this.
Overall code size shrunk by 168 bytes. This should make up for any
extra costs due to aligning to 64 bytes.
In general performance before deviated a great deal dependending on
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
essentially make it so that the current implementation is at least
equal to the best alignment of the original for any arguments.
The only additional optimization is in the page cross case. Branch on
equals case was removed from the size == [4, 7] case. As well the [4,
7] and [2, 3] case where swapped as [4, 7] is likely a more hot
argument size.
test-memcmp and test-wmemcmp are both passing.
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
1 file changed, 242 insertions(+), 192 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 654dc7ac..2761b54f 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -34,7 +34,24 @@
area.
7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
+
+When possible the implementation tries to optimize for frontend in the
+following ways:
+Throughput:
+ 1. All code sections that fit are able to run optimally out of the
+ LSD.
+ 2. All code sections that fit are able to run optimally out of the
+ DSB
+ 3. Basic blocks are contained in minimum number of fetch blocks
+ necessary.
+
+Latency:
+ 1. Logically connected basic blocks are put in the same
+ cache-line.
+ 2. Logically connected basic blocks that do not fit in the same
+ cache-line are put in adjacent lines. This can get beneficial
+ L2 spatial prefetching and L1 next-line prefetching. */
# include <sysdep.h>
@@ -47,9 +64,11 @@
# ifdef USE_AS_WMEMCMP
# define CHAR_SIZE 4
# define VPCMP vpcmpd
+# define VPTEST vptestmd
# else
# define CHAR_SIZE 1
# define VPCMP vpcmpub
+# define VPTEST vptestmb
# endif
# define VEC_SIZE 32
@@ -75,7 +94,9 @@
*/
.section .text.evex,"ax",@progbits
-ENTRY (MEMCMP)
+/* Cache align memcmp entry. This allows for much more thorough
+ frontend optimization. */
+ENTRY_P2ALIGN (MEMCMP, 6)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
movl %edx, %edx
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
VPCMP $4, (%rdi), %YMM1, %k1
kmovd %k1, %eax
/* NB: eax must be destination register if going to
- L(return_vec_[0,2]). For L(return_vec_3 destination register
+ L(return_vec_[0,2]). For L(return_vec_3) destination register
must be ecx. */
testl %eax, %eax
jnz L(return_vec_0)
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
testl %ecx, %ecx
jnz L(return_vec_3)
- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
- compare with zero to get a mask is needed. */
- vpxorq %XMM0, %XMM0, %XMM0
-
/* Go to 4x VEC loop. */
cmpq $(CHAR_PER_VEC * 8), %rdx
ja L(more_8x_vec)
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
- oring with YMM3. Result is stored in YMM4. */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
- VPCMP $4, %YMM4, %YMM0, %k1
+ oring with YMM1. Result is stored in YMM4. */
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+
+ /* Or together YMM2, YMM3, and YMM4 into YMM4. */
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+
+ /* Test YMM4 against itself. Store any CHAR mismatches in k1.
+ */
+ VPTEST %YMM4, %YMM4, %k1
+ /* k1 must go to ecx for L(return_vec_0_1_2_3). */
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
- /* NB: aligning 32 here allows for the rest of the jump targets
- to be tuned for 32 byte alignment. Most important this ensures
- the L(more_8x_vec) loop is 32 byte aligned. */
- .p2align 5
-L(less_vec):
- /* Check if one or less CHAR. This is necessary for size = 0 but
- is also faster for size = CHAR_SIZE. */
- cmpl $1, %edx
- jbe L(one_or_less)
+ .p2align 4
+L(8x_end_return_vec_0_1_2_3):
+ movq %rdx, %rdi
+L(8x_return_vec_0_1_2_3):
+ addq %rdi, %rsi
+L(return_vec_0_1_2_3):
+ VPTEST %YMM1, %YMM1, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
+ VPTEST %YMM2, %YMM2, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_1)
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Create mask in ecx for potentially in bound matches. */
- bzhil %edx, %eax, %eax
- jnz L(return_vec_0)
+ VPTEST %YMM3, %YMM3, %k0
+ kmovd %k0, %eax
+ testl %eax, %eax
+ jnz L(return_vec_2)
+L(return_vec_3):
+ /* bsf saves 1 byte from tzcnt. This keep L(return_vec_3) in one
+ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
+ line. */
+ bsfl %ecx, %ecx
+# ifdef USE_AS_WMEMCMP
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
+ subl %ecx, %eax
+# endif
ret
.p2align 4
@@ -209,10 +240,11 @@ L(return_vec_0):
# endif
ret
- /* NB: No p2align necessary. Alignment % 16 is naturally 1
- which is good enough for a target not in a loop. */
+ .p2align 4
L(return_vec_1):
- tzcntl %eax, %eax
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
+ fetch block. */
+ bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -226,10 +258,11 @@ L(return_vec_1):
# endif
ret
- /* NB: No p2align necessary. Alignment % 16 is naturally 2
- which is good enough for a target not in a loop. */
+ .p2align 4,, 10
L(return_vec_2):
- tzcntl %eax, %eax
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
+ fetch block. */
+ bsfl %eax, %eax
# ifdef USE_AS_WMEMCMP
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
@@ -243,40 +276,6 @@ L(return_vec_2):
# endif
ret
- .p2align 4
-L(8x_return_vec_0_1_2_3):
- /* Returning from L(more_8x_vec) requires restoring rsi. */
- addq %rdi, %rsi
-L(return_vec_0_1_2_3):
- VPCMP $4, %YMM1, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_0)
-
- VPCMP $4, %YMM2, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_1)
-
- VPCMP $4, %YMM3, %YMM0, %k0
- kmovd %k0, %eax
- testl %eax, %eax
- jnz L(return_vec_2)
-L(return_vec_3):
- tzcntl %ecx, %ecx
-# ifdef USE_AS_WMEMCMP
- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
- xorl %edx, %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
- subl %ecx, %eax
-# endif
- ret
-
.p2align 4
L(more_8x_vec):
/* Set end of s1 in rdx. */
@@ -288,21 +287,19 @@ L(more_8x_vec):
andq $-VEC_SIZE, %rdi
/* Adjust because first 4x vec where check already. */
subq $-(VEC_SIZE * 4), %rdi
+
.p2align 4
L(loop_4x_vec):
VMOVU (%rsi, %rdi), %YMM1
vpxorq (%rdi), %YMM1, %YMM1
-
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
-
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
- VPCMP $4, %YMM4, %YMM0, %k1
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(8x_return_vec_0_1_2_3)
@@ -319,28 +316,25 @@ L(loop_4x_vec):
cmpl $(VEC_SIZE * 2), %edi
jae L(8x_last_2x_vec)
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
+
VMOVU (%rsi, %rdx), %YMM1
vpxorq (%rdx), %YMM1, %YMM1
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
-
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
-
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
- VPCMP $4, %YMM4, %YMM0, %k1
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
+ VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
- /* Restore s1 pointer to rdi. */
- movq %rdx, %rdi
testl %ecx, %ecx
- jnz L(8x_return_vec_0_1_2_3)
+ jnz L(8x_end_return_vec_0_1_2_3)
/* NB: eax must be zero to reach here. */
ret
/* Only entry is from L(more_8x_vec). */
- .p2align 4
+ .p2align 4,, 10
L(8x_last_2x_vec):
VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
kmovd %k1, %eax
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
jnz L(8x_return_vec_3)
ret
- .p2align 4
+ /* Not ideally aligned (at offset +9 bytes in fetch block) but
+ not aligning keeps it in the same cache line as
+ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
+ size. */
+ .p2align 4,, 4
+L(8x_return_vec_2):
+ subq $VEC_SIZE, %rdx
+L(8x_return_vec_3):
+ bsfl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
+ movl (VEC_SIZE * 3)(%rax), %ecx
+ xorl %edx, %edx
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ addq %rdx, %rax
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ movzbl (VEC_SIZE * 3)(%rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+ .p2align 4,, 10
L(last_2x_vec):
/* Check second to last VEC. */
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
@@ -374,26 +392,49 @@ L(last_1x_vec):
jnz L(return_vec_0_end)
ret
- .p2align 4
-L(8x_return_vec_2):
- subq $VEC_SIZE, %rdx
-L(8x_return_vec_3):
- tzcntl %eax, %eax
+ .p2align 4,, 10
+L(return_vec_1_end):
+ /* Use bsf to save code size. This is necessary to have
+ L(one_or_less) fit in aligning bytes between. */
+ bsfl %eax, %eax
+ addl %edx, %eax
# ifdef USE_AS_WMEMCMP
- leaq (%rdx, %rax, CHAR_SIZE), %rax
- movl (VEC_SIZE * 3)(%rax), %ecx
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
setg %dl
leal -1(%rdx, %rdx), %eax
# else
- addq %rdx, %rax
- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
- movzbl (VEC_SIZE * 3)(%rax), %eax
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
+ /* NB: L(one_or_less) fits in alignment padding between
+ L(return_vec_1_end) and L(return_vec_0_end). */
+# ifdef USE_AS_WMEMCMP
+L(one_or_less):
+ jb L(zero)
+ movl (%rdi), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi), %ecx
+ je L(zero)
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+ ret
+# else
+L(one_or_less):
+ jb L(zero)
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
+ subl %ecx, %eax
+ ret
+# endif
+L(zero):
+ xorl %eax, %eax
+ ret
+
.p2align 4
L(return_vec_0_end):
tzcntl %eax, %eax
@@ -412,23 +453,56 @@ L(return_vec_0_end):
ret
.p2align 4
-L(return_vec_1_end):
+L(less_vec):
+ /* Check if one or less CHAR. This is necessary for size == 0
+ but is also faster for size == CHAR_SIZE. */
+ cmpl $1, %edx
+ jbe L(one_or_less)
+
+ /* Check if loading one VEC from either s1 or s2 could cause a
+ page cross. This can have false positives but is by far the
+ fastest method. */
+ movl %edi, %eax
+ orl %esi, %eax
+ andl $(PAGE_SIZE - 1), %eax
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
+ jg L(page_cross_less_vec)
+
+ /* No page cross possible. */
+ VMOVU (%rsi), %YMM2
+ VPCMP $4, (%rdi), %YMM2, %k1
+ kmovd %k1, %eax
+ /* Check if any matches where in bounds. Intentionally not
+ storing result in eax to limit dependency chain if it goes to
+ L(return_vec_0_lv). */
+ bzhil %edx, %eax, %edx
+ jnz L(return_vec_0_lv)
+ xorl %eax, %eax
+ ret
+
+ /* Essentially duplicate of L(return_vec_0). Ends up not costing
+ any code as shrinks L(less_vec) by allowing 2-byte encoding of
+ the jump and ends up fitting in aligning bytes. As well fits on
+ same cache line as L(less_vec) so also saves a line from having
+ to be fetched on cold calls to memcmp. */
+ .p2align 4,, 4
+L(return_vec_0_lv):
tzcntl %eax, %eax
- addl %edx, %eax
# ifdef USE_AS_WMEMCMP
- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
xorl %edx, %edx
- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
setg %dl
leal -1(%rdx, %rdx), %eax
# else
- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
subl %ecx, %eax
# endif
ret
-
.p2align 4
L(page_cross_less_vec):
/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
cmpl $8, %edx
jae L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
-L(between_2_3):
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
- ret
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
+ jb L(between_2_3)
+
+ /* Load as big endian with overlapping movbe to avoid branches.
+ */
+ movbe (%rdi), %eax
+ movbe (%rsi), %ecx
+ shlq $32, %rax
+ shlq $32, %rcx
+ movbe -4(%rdi, %rdx), %edi
+ movbe -4(%rsi, %rdx), %esi
+ orq %rdi, %rax
+ orq %rsi, %rcx
+ subq %rcx, %rax
+ /* edx is guranteed to be positive int32 in range [4, 7]. */
+ cmovne %edx, %eax
+ /* ecx is -1 if rcx > rax. Otherwise 0. */
+ sbbl %ecx, %ecx
+ /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
+ rax then eax and ecx are zero. If rax < rax then ecx is -1 so
+ eax doesn't matter. */
+ orl %ecx, %eax
ret
- .p2align 4
+ .p2align 4,, 8
L(between_8_15):
# endif
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
- vmovq (%rdi), %XMM1
- vmovq (%rsi), %XMM2
- VPCMP $4, %XMM1, %XMM2, %k1
+ vmovq (%rdi), %xmm1
+ vmovq (%rsi), %xmm2
+ VPCMP $4, %xmm1, %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
+ jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
- vmovq (%rdi), %XMM1
- vmovq (%rsi), %XMM2
- VPCMP $4, %XMM1, %XMM2, %k1
+ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
+ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
+ VPCMP $4, %xmm1, %xmm2, %k1
+ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
- ret
-
- .p2align 4
-L(zero):
- xorl %eax, %eax
+ jnz L(return_vec_0_end)
ret
- .p2align 4
+ .p2align 4,, 8
L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
- VMOVU (%rsi), %XMM2
- VPCMP $4, (%rdi), %XMM2, %k1
+
+ /* Use movups to save code size. */
+ movups (%rsi), %xmm2
+ VPCMP $4, (%rdi), %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
-
+ jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
-
- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
- VPCMP $4, (%rdi), %XMM2, %k1
+ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
+ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
kmovd %k1, %eax
testl %eax, %eax
- jnz L(return_vec_0)
- ret
-
-# ifdef USE_AS_WMEMCMP
- .p2align 4
-L(one_or_less):
- jb L(zero)
- movl (%rdi), %ecx
- xorl %edx, %edx
- cmpl (%rsi), %ecx
- je L(zero)
- setg %dl
- leal -1(%rdx, %rdx), %eax
+ jnz L(return_vec_0_end)
ret
-# else
- .p2align 4
-L(between_4_7):
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- jz L(zero_4_7)
- sbbl %eax, %eax
- orl $1, %eax
-L(zero_4_7):
+# ifndef USE_AS_WMEMCMP
+L(between_2_3):
+ /* Load as big endian to avoid branches. */
+ movzwl (%rdi), %eax
+ movzwl (%rsi), %ecx
+ shll $8, %eax
+ shll $8, %ecx
+ bswap %eax
+ bswap %ecx
+ movzbl -1(%rdi, %rdx), %edi
+ movzbl -1(%rsi, %rdx), %esi
+ orl %edi, %eax
+ orl %esi, %ecx
+ /* Subtraction is okay because the upper 8 bits are zero. */
+ subl %ecx, %eax
ret
# endif
-
END (MEMCMP)
#endif
--
GitLab

View File

@ -0,0 +1,510 @@
From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 20 Sep 2021 16:20:15 -0500
Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
No bug.
Optimization are
1. change control flow for L(more_2x_vec) to fall through to loop and
jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
size and saves jumps for length > 4x VEC_SIZE.
2. For EVEX/AVX512 move L(less_vec) closer to entry.
3. Avoid complex address mode for length > 2x VEC_SIZE
4. Slightly better aligning code for the loop from the perspective of
code size and uops.
5. Align targets so they make full use of their fetch block and if
possible cache line.
6. Try and reduce total number of icache lines that will need to be
pulled in for a given length.
7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
jumping to the stosb target in the sse2 code section will almost
certainly be to a new page. The new version does increase code size
marginally by duplicating the target but should get better iTLB
behavior as a result.
test-memset, test-wmemset, and test-bzero are all passing.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memset.S | 10 +-
.../multiarch/memset-avx2-unaligned-erms.S | 10 +-
.../multiarch/memset-avx512-unaligned-erms.S | 11 +-
.../multiarch/memset-evex-unaligned-erms.S | 11 +-
.../multiarch/memset-vec-unaligned-erms.S | 285 ++++++++++++------
5 files changed, 232 insertions(+), 95 deletions(-)
Conflicts:
sysdeps/x86_64/memset.S
(GNU URL)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index b3426795..8672b030 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -18,13 +18,15 @@
<http://www.gnu.org/licenses/>. */
#include <sysdep.h>
+#define USE_WITH_SSE2 1
#define VEC_SIZE 16
+#define MOV_SIZE 3
+#define RET_SIZE 1
+
#define VEC(i) xmm##i
-/* Don't use movups and movaps since it will get larger nop paddings for
- alignment. */
-#define VMOVU movdqu
-#define VMOVA movdqa
+#define VMOVU movups
+#define VMOVA movaps
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index ae0860f3..1af668af 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -1,8 +1,14 @@
#if IS_IN (libc)
+# define USE_WITH_AVX2 1
+
# define VEC_SIZE 32
+# define MOV_SIZE 4
+# define RET_SIZE 4
+
# define VEC(i) ymm##i
-# define VMOVU vmovdqu
-# define VMOVA vmovdqa
+
+# define VMOVU vmovdqu
+# define VMOVA vmovdqa
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 8ad842fc..f14d6f84 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -1,11 +1,18 @@
#if IS_IN (libc)
+# define USE_WITH_AVX512 1
+
# define VEC_SIZE 64
+# define MOV_SIZE 6
+# define RET_SIZE 1
+
# define XMM0 xmm16
# define YMM0 ymm16
# define VEC0 zmm16
# define VEC(i) VEC##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 640f0929..64b09e77 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -1,11 +1,18 @@
#if IS_IN (libc)
+# define USE_WITH_EVEX 1
+
# define VEC_SIZE 32
+# define MOV_SIZE 6
+# define RET_SIZE 1
+
# define XMM0 xmm16
# define YMM0 ymm16
# define VEC0 ymm16
# define VEC(i) VEC##i
-# define VMOVU vmovdqu64
-# define VMOVA vmovdqa64
+
+# define VMOVU vmovdqu64
+# define VMOVA vmovdqa64
+
# define VZEROUPPER
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 909c33f6..f08b7323 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -63,8 +63,27 @@
# endif
#endif
+#if VEC_SIZE == 64
+# define LOOP_4X_OFFSET (VEC_SIZE * 4)
+#else
+# define LOOP_4X_OFFSET (0)
+#endif
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+# define END_REG rcx
+# define LOOP_REG rdi
+#else
+# define END_REG rdi
+# define LOOP_REG rdx
+#endif
+
#define PAGE_SIZE 4096
+/* Macro to calculate size of small memset block for aligning
+ purposes. */
+#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
+
+
#ifndef SECTION
# error SECTION is not defined!
#endif
@@ -74,6 +93,7 @@
ENTRY (__bzero)
mov %RDI_LP, %RAX_LP /* Set return value. */
mov %RSI_LP, %RDX_LP /* Set n. */
+ xorl %esi, %esi
pxor %XMM0, %XMM0
jmp L(entry_from_bzero)
END (__bzero)
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
jb L(less_vec)
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
- VMOVU %VEC(0), (%rdi)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
+ */
+ VMOVU %VEC(0), (%rax)
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
VZEROUPPER_RETURN
-
- .p2align 4
-L(stosb_more_2x_vec):
- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
- ja L(stosb)
-#else
- .p2align 4
#endif
-L(more_2x_vec):
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rdi)
- VMOVU %VEC(0), VEC_SIZE(%rdi)
- cmpq $(VEC_SIZE * 4), %rdx
- ja L(loop_start)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
-L(return):
-#if VEC_SIZE > 16
- ZERO_UPPER_VEC_REGISTERS_RETURN
+
+ .p2align 4,, 10
+L(last_2x_vec):
+#ifdef USE_LESS_VEC_MASK_STORE
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
#else
- ret
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
#endif
+ VZEROUPPER_RETURN
-L(loop_start):
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
- cmpq $(VEC_SIZE * 8), %rdx
- jbe L(loop_end)
- andq $-(VEC_SIZE * 2), %rdi
- subq $-(VEC_SIZE * 4), %rdi
- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
- .p2align 4
-L(loop):
- VMOVA %VEC(0), (%rdi)
- VMOVA %VEC(0), VEC_SIZE(%rdi)
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
- subq $-(VEC_SIZE * 4), %rdi
- cmpq %rcx, %rdi
- jb L(loop)
-L(loop_end):
- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
- rdx as length is also unchanged. */
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
- VZEROUPPER_SHORT_RETURN
-
- .p2align 4
+ /* If have AVX512 mask instructions put L(less_vec) close to
+ entry as it doesn't take much space and is likely a hot target.
+ */
+#ifdef USE_LESS_VEC_MASK_STORE
+ .p2align 4,, 10
L(less_vec):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
# endif
-# ifdef USE_LESS_VEC_MASK_STORE
/* Clear high bits from edi. Only keeping bits relevant to page
cross check. Note that we are using rax which is set in
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
- */
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
andl $(PAGE_SIZE - 1), %edi
- /* Check if VEC_SIZE store cross page. Mask stores suffer serious
- performance degradation when it has to fault supress. */
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
+ serious performance degradation when it has to fault supress.
+ */
cmpl $(PAGE_SIZE - VEC_SIZE), %edi
+ /* This is generally considered a cold target. */
ja L(cross_page)
# if VEC_SIZE > 32
movq $-1, %rcx
@@ -247,58 +235,185 @@ L(less_vec):
bzhil %edx, %ecx, %ecx
kmovd %ecx, %k1
# endif
- vmovdqu8 %VEC(0), (%rax) {%k1}
+ vmovdqu8 %VEC(0), (%rax){%k1}
VZEROUPPER_RETURN
+# if defined USE_MULTIARCH && IS_IN (libc)
+ /* Include L(stosb_local) here if including L(less_vec) between
+ L(stosb_more_2x_vec) and ENTRY. This is to cache align the
+ L(stosb_more_2x_vec) target. */
+ .p2align 4,, 10
+L(stosb_local):
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ VZEROUPPER_RETURN
+# endif
+#endif
+
+#if defined USE_MULTIARCH && IS_IN (libc)
.p2align 4
-L(cross_page):
+L(stosb_more_2x_vec):
+ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
+ ja L(stosb_local)
+#endif
+ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
+ and (4x, 8x] jump to target. */
+L(more_2x_vec):
+
+ /* Two different methods of setting up pointers / compare. The
+ two methods are based on the fact that EVEX/AVX512 mov
+ instructions take more bytes then AVX2/SSE2 mov instructions. As
+ well that EVEX/AVX512 machines also have fast LEA_BID. Both
+ setup and END_REG to avoid complex address mode. For EVEX/AVX512
+ this saves code size and keeps a few targets in one fetch block.
+ For AVX2/SSE2 this helps prevent AGU bottlenecks. */
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
+ LOOP_4X_OFFSET) with LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
+ /* Stores to first 2x VEC before cmp as any path forward will
+ require it. */
+ VMOVU %VEC(0), (%rax)
+ VMOVU %VEC(0), VEC_SIZE(%rax)
+
+
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
+ addq %rdx, %END_REG
+#endif
+
+ cmpq $(VEC_SIZE * 4), %rdx
+ jbe L(last_2x_vec)
+
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
+
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
+ extra offset to addresses in loop. Used for AVX512 to save space
+ as no way to get (VEC_SIZE * 4) in imm8. */
+# if LOOP_4X_OFFSET == 0
+ subq $-(VEC_SIZE * 4), %LOOP_REG
# endif
-# if VEC_SIZE > 32
- cmpb $32, %dl
- jae L(between_32_63)
+ /* Avoid imm32 compare here to save code size. */
+ cmpq %rdi, %rcx
+#else
+ addq $-(VEC_SIZE * 4), %END_REG
+ cmpq $(VEC_SIZE * 8), %rdx
+#endif
+ jbe L(last_4x_vec)
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
+ /* Set LOOP_REG (rdx). */
+ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
+#endif
+ /* Align dst for loop. */
+ andq $(VEC_SIZE * -2), %LOOP_REG
+ .p2align 4
+L(loop):
+ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
+ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
+ subq $-(VEC_SIZE * 4), %LOOP_REG
+ cmpq %END_REG, %LOOP_REG
+ jb L(loop)
+ .p2align 4,, MOV_SIZE
+L(last_4x_vec):
+ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
+L(return):
+#if VEC_SIZE > 16
+ ZERO_UPPER_VEC_REGISTERS_RETURN
+#else
+ ret
+#endif
+
+ .p2align 4,, 10
+#ifndef USE_LESS_VEC_MASK_STORE
+# if defined USE_MULTIARCH && IS_IN (libc)
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
+ range for 2-byte jump encoding. */
+L(stosb_local):
+ movzbl %sil, %eax
+ mov %RDX_LP, %RCX_LP
+ mov %RDI_LP, %RDX_LP
+ rep stosb
+ mov %RDX_LP, %RAX_LP
+ VZEROUPPER_RETURN
# endif
-# if VEC_SIZE > 16
- cmpb $16, %dl
+ /* Define L(less_vec) only if not otherwise defined. */
+ .p2align 4
+L(less_vec):
+#endif
+L(cross_page):
+#if VEC_SIZE > 32
+ cmpl $32, %edx
+ jae L(between_32_63)
+#endif
+#if VEC_SIZE > 16
+ cmpl $16, %edx
jae L(between_16_31)
-# endif
- MOVQ %XMM0, %rcx
- cmpb $8, %dl
+#endif
+ MOVQ %XMM0, %rdi
+ cmpl $8, %edx
jae L(between_8_15)
- cmpb $4, %dl
+ cmpl $4, %edx
jae L(between_4_7)
- cmpb $1, %dl
+ cmpl $1, %edx
ja L(between_2_3)
- jb 1f
- movb %cl, (%rax)
-1:
+ jb L(return)
+ movb %sil, (%rax)
VZEROUPPER_RETURN
-# if VEC_SIZE > 32
+
+ /* Align small targets only if not doing so would cross a fetch
+ line. */
+#if VEC_SIZE > 32
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, -32(%rax,%rdx)
VMOVU %YMM0, (%rax)
+ VMOVU %YMM0, -32(%rax, %rdx)
VZEROUPPER_RETURN
-# endif
-# if VEC_SIZE > 16
- /* From 16 to 31. No branch when size == 16. */
+#endif
+
+#if VEC_SIZE >= 32
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
L(between_16_31):
- VMOVU %XMM0, -16(%rax,%rdx)
+ /* From 16 to 31. No branch when size == 16. */
VMOVU %XMM0, (%rax)
+ VMOVU %XMM0, -16(%rax, %rdx)
VZEROUPPER_RETURN
-# endif
- /* From 8 to 15. No branch when size == 8. */
+#endif
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_8_15):
- movq %rcx, -8(%rax,%rdx)
- movq %rcx, (%rax)
+ /* From 8 to 15. No branch when size == 8. */
+ movq %rdi, (%rax)
+ movq %rdi, -8(%rax, %rdx)
VZEROUPPER_RETURN
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %ecx, -4(%rax,%rdx)
- movl %ecx, (%rax)
+ movl %edi, (%rax)
+ movl %edi, -4(%rax, %rdx)
VZEROUPPER_RETURN
+
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %cx, -2(%rax,%rdx)
- movw %cx, (%rax)
+ movw %di, (%rax)
+ movb %dil, -1(%rax, %rdx)
VZEROUPPER_RETURN
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
GitLab

View File

@ -0,0 +1,45 @@
From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sat, 23 Oct 2021 01:26:47 -0400
Subject: [PATCH] x86: Replace sse2 instructions with avx in
memcmp-evex-movbe.S
Content-type: text/plain; charset=UTF-8
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
it could potentially be dangerous to use SSE2 if this function is ever
called without using 'vzeroupper' beforehand. While compilers appear
to use 'vzeroupper' before function calls if AVX2 has been used, using
SSE2 here is more brittle. Since it is not absolutely necessary it
should be avoided.
It costs 2-extra bytes but the extra bytes should only eat into
alignment padding.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 2761b54f..640f6757 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -561,13 +561,13 @@ L(between_16_31):
/* From 16 to 31 bytes. No branch when size == 16. */
/* Use movups to save code size. */
- movups (%rsi), %xmm2
+ vmovdqu (%rsi), %xmm2
VPCMP $4, (%rdi), %xmm2, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_0_lv)
/* Use overlapping loads to avoid branches. */
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
kmovd %k1, %eax
--
GitLab

View File

@ -0,0 +1,695 @@
From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 29 Oct 2021 12:40:20 -0700
Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
Content-type: text/plain; charset=UTF-8
In strcmp-evex.S, to compare 2 32-byte strings, replace
VMOVU (%rdi, %rdx), %YMM0
VMOVU (%rsi, %rdx), %YMM1
/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
VPCMP $4, %YMM0, %YMM1, %k0
VPCMP $0, %YMMZERO, %YMM0, %k1
VPCMP $0, %YMMZERO, %YMM1, %k2
/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
kord %k1, %k2, %k1
/* Each bit in K1 represents a NULL or a mismatch. */
kord %k0, %k1, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jne L(last_vector)
with
VMOVU (%rdi, %rdx), %YMM0
VPTESTM %YMM0, %YMM0, %k2
/* Each bit cleared in K1 represents a mismatch or a null CHAR
in YMM0 and 32 bytes at (%rsi, %rdx). */
VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
kmovd %k1, %ecx
incl %ecx
jne L(last_vector)
It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
and Ice Lake.
Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
1 file changed, 243 insertions(+), 218 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index d5aa6daa..82f12ac8 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -41,6 +41,8 @@
# ifdef USE_AS_WCSCMP
/* Compare packed dwords. */
# define VPCMP vpcmpd
+# define VPMINU vpminud
+# define VPTESTM vptestmd
# define SHIFT_REG32 r8d
# define SHIFT_REG64 r8
/* 1 dword char == 4 bytes. */
@@ -48,6 +50,8 @@
# else
/* Compare packed bytes. */
# define VPCMP vpcmpb
+# define VPMINU vpminub
+# define VPTESTM vptestmb
# define SHIFT_REG32 ecx
# define SHIFT_REG64 rcx
/* 1 byte char == 1 byte. */
@@ -67,6 +71,9 @@
# define YMM5 ymm22
# define YMM6 ymm23
# define YMM7 ymm24
+# define YMM8 ymm25
+# define YMM9 ymm26
+# define YMM10 ymm27
/* Warning!
wcscmp/wcsncmp have to use SIGNED comparison for elements.
@@ -76,7 +83,7 @@
/* The main idea of the string comparison (byte or dword) using 256-bit
EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
latter can be on either packed bytes or dwords depending on
- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
+ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
@@ -123,27 +130,21 @@ ENTRY (STRCMP)
jg L(cross_page)
/* Start comparing 4 vectors. */
VMOVU (%rdi), %YMM0
- VMOVU (%rsi), %YMM1
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
- /* Check for NULL in YMM0. */
- VPCMP $0, %YMMZERO, %YMM0, %k1
- /* Check for NULL in YMM1. */
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (%rsi). */
+ VPCMP $0, (%rsi), %YMM0, %k1{%k2}
- /* Each bit in K1 represents:
- 1. A mismatch in YMM0 and YMM1. Or
- 2. A NULL in YMM0 or YMM1.
- */
- kord %k0, %k1, %k1
-
- ktestd %k1, %k1
- je L(next_3_vectors)
kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ je L(next_3_vectors)
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -172,9 +173,7 @@ L(return):
# endif
ret
- .p2align 4
L(return_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -210,9 +209,7 @@ L(return_vec_size):
# endif
ret
- .p2align 4
L(return_2_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -248,9 +245,7 @@ L(return_2_vec_size):
# endif
ret
- .p2align 4
L(return_3_vec_size):
- kmovd %k1, %ecx
tzcntl %ecx, %edx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -289,43 +284,45 @@ L(return_3_vec_size):
.p2align 4
L(next_3_vectors):
VMOVU VEC_SIZE(%rdi), %YMM0
- VMOVU VEC_SIZE(%rsi), %YMM1
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
+ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_vec_size)
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
-
- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
- VPCMP $4, %YMM2, %YMM4, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM4, %k2
- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
+ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_2_vec_size)
- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
- VPCMP $4, %YMM3, %YMM5, %k0
- VPCMP $0, %YMMZERO, %YMM3, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k2
- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- ktestd %k1, %k1
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
+ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(return_3_vec_size)
L(main_loop_header):
leaq (VEC_SIZE * 4)(%rdi), %rdx
@@ -375,56 +372,51 @@ L(back_to_loop):
VMOVA VEC_SIZE(%rax), %YMM2
VMOVA (VEC_SIZE * 2)(%rax), %YMM4
VMOVA (VEC_SIZE * 3)(%rax), %YMM6
- VMOVU (%rdx), %YMM1
- VMOVU VEC_SIZE(%rdx), %YMM3
- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
-
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- kord %k1, %k2, %k1
- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
- YMM1. */
- kord %k0, %k1, %k4
-
- VPCMP $4, %YMM2, %YMM3, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM3, %k2
- kord %k1, %k2, %k1
- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
- YMM3. */
- kord %k0, %k1, %k5
-
- VPCMP $4, %YMM4, %YMM5, %k0
- VPCMP $0, %YMMZERO, %YMM4, %k1
- VPCMP $0, %YMMZERO, %YMM5, %k2
- kord %k1, %k2, %k1
- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
- YMM5. */
- kord %k0, %k1, %k6
-
- VPCMP $4, %YMM6, %YMM7, %k0
- VPCMP $0, %YMMZERO, %YMM6, %k1
- VPCMP $0, %YMMZERO, %YMM7, %k2
- kord %k1, %k2, %k1
- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
- YMM7. */
- kord %k0, %k1, %k7
-
- kord %k4, %k5, %k0
- kord %k6, %k7, %k1
-
- /* Test each mask (32 bits) individually because for VEC_SIZE
- == 32 is not possible to OR the four masks and keep all bits
- in a 64-bit integer register, differing from SSE2 strcmp
- where ORing is possible. */
- kortestd %k0, %k1
- je L(loop)
- ktestd %k4, %k4
+
+ VPMINU %YMM0, %YMM2, %YMM8
+ VPMINU %YMM4, %YMM6, %YMM9
+
+ /* A zero CHAR in YMM8 means that there is a null CHAR. */
+ VPMINU %YMM8, %YMM9, %YMM8
+
+ /* Each bit set in K1 represents a non-null CHAR in YMM8. */
+ VPTESTM %YMM8, %YMM8, %k1
+
+ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
+ vpxorq (%rdx), %YMM0, %YMM1
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
+ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
+
+ vporq %YMM1, %YMM3, %YMM9
+ vporq %YMM5, %YMM7, %YMM10
+
+ /* A non-zero CHAR in YMM9 represents a mismatch. */
+ vporq %YMM9, %YMM10, %YMM9
+
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
+ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ je L(loop)
+
+ /* Each bit set in K1 represents a non-null CHAR in YMM0. */
+ VPTESTM %YMM0, %YMM0, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM0 and (%rdx). */
+ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_vec)
- kmovd %k4, %edi
- tzcntl %edi, %ecx
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %ecx
@@ -466,9 +458,18 @@ L(test_vec):
cmpq $VEC_SIZE, %r11
jbe L(zero)
# endif
- ktestd %k5, %k5
+ /* Each bit set in K1 represents a non-null CHAR in YMM2. */
+ VPTESTM %YMM2, %YMM2, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM2 and VEC_SIZE(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_2_vec)
- kmovd %k5, %ecx
tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -512,9 +513,18 @@ L(test_2_vec):
cmpq $(VEC_SIZE * 2), %r11
jbe L(zero)
# endif
- ktestd %k6, %k6
+ /* Each bit set in K1 represents a non-null CHAR in YMM4. */
+ VPTESTM %YMM4, %YMM4, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM4 and (VEC_SIZE * 2)(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
je L(test_3_vec)
- kmovd %k6, %ecx
tzcntl %ecx, %edi
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
@@ -558,8 +568,18 @@ L(test_3_vec):
cmpq $(VEC_SIZE * 3), %r11
jbe L(zero)
# endif
- kmovd %k7, %esi
- tzcntl %esi, %ecx
+ /* Each bit set in K1 represents a non-null CHAR in YMM6. */
+ VPTESTM %YMM6, %YMM6, %k1
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
+ in YMM6 and (VEC_SIZE * 3)(%rdx). */
+ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1}
+ kmovd %k0, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
+ tzcntl %ecx, %ecx
# ifdef USE_AS_WCSCMP
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
sall $2, %ecx
@@ -615,39 +635,51 @@ L(loop_cross_page):
VMOVU (%rax, %r10), %YMM2
VMOVU VEC_SIZE(%rax, %r10), %YMM3
- VMOVU (%rdx, %r10), %YMM4
- VMOVU VEC_SIZE(%rdx, %r10), %YMM5
-
- VPCMP $4, %YMM4, %YMM2, %k0
- VPCMP $0, %YMMZERO, %YMM2, %k1
- VPCMP $0, %YMMZERO, %YMM4, %k2
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
- YMM4. */
- kord %k0, %k1, %k1
-
- VPCMP $4, %YMM5, %YMM3, %k3
- VPCMP $0, %YMMZERO, %YMM3, %k4
- VPCMP $0, %YMMZERO, %YMM5, %k5
- kord %k4, %k5, %k4
- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
- YMM5. */
- kord %k3, %k4, %k3
+
+ /* Each bit set in K2 represents a non-null CHAR in YMM2. */
+ VPTESTM %YMM2, %YMM2, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM2 and 32 bytes at (%rdx, %r10). */
+ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2}
+ kmovd %k1, %r9d
+ /* Don't use subl since it is the lower 16/32 bits of RDI
+ below. */
+ notl %r9d
+# ifdef USE_AS_WCSCMP
+ /* Only last 8 bits are valid. */
+ andl $0xff, %r9d
+# endif
+
+ /* Each bit set in K4 represents a non-null CHAR in YMM3. */
+ VPTESTM %YMM3, %YMM3, %k4
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
+ kmovd %k3, %edi
+# ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+ notl %edi
+ andl $0xff, %edi
+# else
+ incl %edi
+# endif
# ifdef USE_AS_WCSCMP
- /* NB: Each bit in K1/K3 represents 4-byte element. */
- kshiftlw $8, %k3, %k2
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
+ sall $8, %edi
/* NB: Divide shift count by 4 since each bit in K1 represent 4
bytes. */
movl %ecx, %SHIFT_REG32
sarl $2, %SHIFT_REG32
+
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
+ orl %r9d, %edi
# else
- kshiftlq $32, %k3, %k2
-# endif
+ salq $32, %rdi
- /* Each bit in K1 represents a NULL or a mismatch. */
- korq %k1, %k2, %k1
- kmovq %k1, %rdi
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
+ orq %r9, %rdi
+# endif
/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
shrxq %SHIFT_REG64, %rdi, %rdi
@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
/* The first VEC_SIZE * 2 bytes match or are ignored. */
VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
-
- VPCMP $4, %YMM0, %YMM2, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM2, %k2
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
- YMM2. */
- kord %k0, %k1, %k1
-
- VPCMP $4, %YMM1, %YMM3, %k3
- VPCMP $0, %YMMZERO, %YMM1, %k4
- VPCMP $0, %YMMZERO, %YMM3, %k5
- kord %k4, %k5, %k4
- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
- YMM3. */
- kord %k3, %k4, %k3
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
+ kmovd %k1, %r9d
+ /* Don't use subl since it is the lower 16/32 bits of RDI
+ below. */
+ notl %r9d
# ifdef USE_AS_WCSCMP
- /* NB: Each bit in K1/K3 represents 4-byte element. */
- kshiftlw $8, %k3, %k2
+ /* Only last 8 bits are valid. */
+ andl $0xff, %r9d
+# endif
+
+ VPTESTM %YMM1, %YMM1, %k4
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
+ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
+ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
+ kmovd %k3, %edi
+# ifdef USE_AS_WCSCMP
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
+ notl %edi
+ andl $0xff, %edi
# else
- kshiftlq $32, %k3, %k2
+ incl %edi
# endif
- /* Each bit in K1 represents a NULL or a mismatch. */
- korq %k1, %k2, %k1
- kmovq %k1, %rdi
+# ifdef USE_AS_WCSCMP
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
+ sall $8, %edi
+
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
+ orl %r9d, %edi
+# else
+ salq $32, %rdi
+
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
+ orq %r9, %rdi
+# endif
xorl %r8d, %r8d
/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
/* R8 has number of bytes skipped. */
movl %ecx, %r8d
# ifdef USE_AS_WCSCMP
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
+ /* NB: Divide shift count by 4 since each bit in RDI represent 4
bytes. */
sarl $2, %ecx
-# endif
+ /* Skip ECX bytes. */
+ shrl %cl, %edi
+# else
/* Skip ECX bytes. */
shrq %cl, %rdi
+# endif
1:
/* Before jumping back to the loop, set ESI to the number of
VEC_SIZE * 4 blocks before page crossing. */
@@ -818,7 +863,7 @@ L(cross_page_loop):
movzbl (%rdi, %rdx), %eax
movzbl (%rsi, %rdx), %ecx
# endif
- /* Check null char. */
+ /* Check null CHAR. */
testl %eax, %eax
jne L(cross_page_loop)
/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
@@ -901,18 +946,17 @@ L(cross_page):
jg L(cross_page_1_vector)
L(loop_1_vector):
VMOVU (%rdi, %rdx), %YMM0
- VMOVU (%rsi, %rdx), %YMM1
-
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
- VPCMP $4, %YMM0, %YMM1, %k0
- VPCMP $0, %YMMZERO, %YMM0, %k1
- VPCMP $0, %YMMZERO, %YMM1, %k2
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
+
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in YMM0 and 32 bytes at (%rsi, %rdx). */
+ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
kmovd %k1, %ecx
- testl %ecx, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xff, %ecx
+# else
+ incl %ecx
+# endif
jne L(last_vector)
addl $VEC_SIZE, %edx
@@ -931,18 +975,17 @@ L(cross_page_1_vector):
cmpl $(PAGE_SIZE - 16), %eax
jg L(cross_page_1_xmm)
VMOVU (%rdi, %rdx), %XMM0
- VMOVU (%rsi, %rdx), %XMM1
-
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- korw %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- korw %k0, %k1, %k1
- kmovw %k1, %ecx
- testl %ecx, %ecx
+
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and 16 bytes at (%rsi, %rdx). */
+ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2}
+ kmovd %k1, %ecx
+# ifdef USE_AS_WCSCMP
+ subl $0xf, %ecx
+# else
+ subl $0xffff, %ecx
+# endif
jne L(last_vector)
addl $16, %edx
@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
vmovq (%rdi, %rdx), %XMM0
vmovq (%rsi, %rdx), %XMM1
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
- kmovd %k1, %ecx
-
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and XMM1. */
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
+ kmovb %k1, %ecx
# ifdef USE_AS_WCSCMP
- /* Only last 2 bits are valid. */
- andl $0x3, %ecx
+ subl $0x3, %ecx
# else
- /* Only last 8 bits are valid. */
- andl $0xff, %ecx
+ subl $0xff, %ecx
# endif
-
- testl %ecx, %ecx
jne L(last_vector)
addl $8, %edx
@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
vmovd (%rdi, %rdx), %XMM0
vmovd (%rsi, %rdx), %XMM1
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
- VPCMP $4, %XMM0, %XMM1, %k0
- VPCMP $0, %XMMZERO, %XMM0, %k1
- VPCMP $0, %XMMZERO, %XMM1, %k2
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
- kord %k1, %k2, %k1
- /* Each bit in K1 represents a NULL or a mismatch. */
- kord %k0, %k1, %k1
+ VPTESTM %YMM0, %YMM0, %k2
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
+ in XMM0 and XMM1. */
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
kmovd %k1, %ecx
-
# ifdef USE_AS_WCSCMP
- /* Only the last bit is valid. */
- andl $0x1, %ecx
+ subl $0x1, %ecx
# else
- /* Only last 4 bits are valid. */
- andl $0xf, %ecx
+ subl $0xf, %ecx
# endif
-
- testl %ecx, %ecx
jne L(last_vector)
addl $4, %edx
--
GitLab

View File

@ -0,0 +1,300 @@
From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:33:52 -0800
Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes the strncmp family for x32. Tested on x86-64 and x32.
On x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
* sysdeps/x86_64/strcmp.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
tst-size_t-strncmp and tst-size_t-wcsncmp.
* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +-
sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-
sysdeps/x86_64/strcmp.S | 6 +-
sysdeps/x86_64/x32/Makefile | 6 +-
sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++
7 files changed, 170 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 327e3d87..156c1949 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -79,15 +79,15 @@
ENTRY (STRCMP)
# ifdef USE_AS_STRNCMP
/* Check for simple cases (0 or 1) in offset. */
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je L(char0)
jb L(zero)
# ifdef USE_AS_WCSCMP
/* Convert units: from wide to byte char. */
- shl $2, %rdx
+ shl $2, %RDX_LP
# endif
/* Register %r11 tracks the maximum offset. */
- movq %rdx, %r11
+ mov %RDX_LP, %R11_LP
# endif
movl %edi, %eax
xorl %edx, %edx
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index d3c07bd2..a1ebea46 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -156,11 +156,11 @@ STRCMP_SSE42:
#endif
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
je LABEL(strcmp_exitz)
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je LABEL(Byte0)
- mov %rdx, %r11
+ mov %RDX_LP, %R11_LP
#endif
mov %esi, %ecx
mov %edi, %eax
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index e16945b9..f47c8ad4 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
* This implementation uses SSE to compare up to 16 bytes at a time.
*/
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
- test %rdx, %rdx
+ test %RDX_LP, %RDX_LP
je LABEL(strcmp_exitz)
- cmp $1, %rdx
+ cmp $1, %RDX_LP
je LABEL(Byte0)
- mov %rdx, %r11
+ mov %RDX_LP, %R11_LP
#endif
mov %esi, %ecx
mov %edi, %eax
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 98bd9ae9..db302839 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -7,9 +7,11 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
- tst-size_t-memrchr tst-size_t-memset
+ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
+ tst-size_t-strncmp
endif
ifeq ($(subdir),wcsmbs)
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
+ tst-size_t-wcsncmp
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
new file mode 100644
index 00000000..86233593
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
@@ -0,0 +1,59 @@
+/* Test strncaecmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "strncasecmp"
+#include "test-size_t.h"
+
+IMPL (strncasecmp, 1)
+
+typedef int (*proto_t) (const char *, const char *, size_t);
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncasecmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ strncpy ((char *) buf1, (const char *) buf2, page_size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_strncasecmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
new file mode 100644
index 00000000..54e6bd83
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
@@ -0,0 +1,78 @@
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef WIDE
+# define TEST_NAME "wcsncmp"
+#else
+# define TEST_NAME "strncmp"
+#endif
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+
+# define STRNCMP wcsncmp
+# define STRNCPY wcsncpy
+# define CHAR wchar_t
+#else
+# define STRNCMP strncmp
+# define STRNCPY strncpy
+# define CHAR char
+#endif
+
+IMPL (STRNCMP, 1)
+
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
+
+
+static int
+__attribute__ ((noinline, noclone))
+do_strncmp (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ size_t size = page_size / sizeof (CHAR);
+ parameter_t dest = { { size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ int res = do_strncmp (dest, src);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
new file mode 100644
index 00000000..4829647c
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
@@ -0,0 +1,20 @@
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-strncmp.c"
--
GitLab

View File

@ -0,0 +1,54 @@
From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001
From: Fangrui Song <maskray@google.com>
Date: Tue, 2 Nov 2021 20:59:52 -0700
Subject: [PATCH] x86-64: Replace movzx with movzbl
Content-type: text/plain; charset=UTF-8
Clang cannot assemble movzx in the AT&T dialect mode.
../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
movzx (%rsi), %ecx
^~~~
Change movzx to movzbl, which follows the AT&T dialect and is used
elsewhere in the file.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
sysdeps/x86_64/strcmp.S | 4 ++--
2 files changed, 4 insertions(+), 4 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
index a1ebea46..d8fdeb3a 100644
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
.p2align 4
// XXX Same as code above
LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
index f47c8ad4..aa6df898 100644
--- a/sysdeps/x86_64/strcmp.S
+++ b/sysdeps/x86_64/strcmp.S
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
.p2align 4
LABEL(Byte0):
- movzx (%rsi), %ecx
- movzx (%rdi), %eax
+ movzbl (%rsi), %ecx
+ movzbl (%rdi), %eax
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
--
GitLab

View File

@ -0,0 +1,56 @@
From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 30 Apr 2021 05:58:59 -0700
Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
Content-type: text/plain; charset=UTF-8
The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
that REP MOVSB became faster after 2112 bytes:
Vector Move REP MOVSB
length=2112, align1=0, align2=0: 24.20 24.40
length=2112, align1=1, align2=0: 26.07 23.13
length=2112, align1=0, align2=1: 27.18 28.13
length=2112, align1=1, align2=1: 26.23 25.16
length=2176, align1=0, align2=0: 23.18 22.52
length=2176, align1=2, align2=0: 25.45 22.52
length=2176, align1=0, align2=2: 27.14 27.82
length=2176, align1=2, align2=2: 22.73 25.56
length=2240, align1=0, align2=0: 24.62 24.25
length=2240, align1=3, align2=0: 29.77 27.15
length=2240, align1=0, align2=3: 35.55 29.93
length=2240, align1=3, align2=3: 34.49 25.15
length=2304, align1=0, align2=0: 34.75 26.64
length=2304, align1=4, align2=0: 32.09 22.63
length=2304, align1=0, align2=4: 28.43 31.24
Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
fast short REP MOVSB (FSRM).
* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
rep_movsb_threshold to 2112 on processors with fast short REP
MOVSB (FSRM).
---
sysdeps/x86/cacheinfo.h | 6 ++++++
1 file changed, 6 insertions(+)
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index f72f634a..cc3941d3 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -430,6 +430,12 @@ init_cacheinfo (void)
rep_movsb_threshold = 2048 * (16 / 16);
minimum_rep_movsb_threshold = 16 * 8;
}
+
+ /* NB: The default REP MOVSB threshold is 2112 on processors with fast
+ short REP MOVSB (FSRM). */
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
+ rep_movsb_threshold = 2112;
+
if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
else
--
GitLab

View File

@ -0,0 +1,136 @@
From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 1 Nov 2021 00:49:52 -0500
Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
dl-cacheinfo.h
Content-type: text/plain; charset=UTF-8
No bug.
This patch doubles the rep_movsb_threshold when using ERMS. Based on
benchmarks the vector copy loop, especially now that it handles 4k
aliasing, is better for these medium ranged.
On Skylake with ERMS:
Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)
4096, 0, 0, 0, 0.975
4096, 0, 0, 1, 0.953
4096, 12, 0, 0, 0.969
4096, 12, 0, 1, 0.872
4096, 44, 0, 0, 0.979
4096, 44, 0, 1, 0.83
4096, 0, 12, 0, 1.006
4096, 0, 12, 1, 0.989
4096, 0, 44, 0, 0.739
4096, 0, 44, 1, 0.942
4096, 12, 12, 0, 1.009
4096, 12, 12, 1, 0.973
4096, 44, 44, 0, 0.791
4096, 44, 44, 1, 0.961
4096, 2048, 0, 0, 0.978
4096, 2048, 0, 1, 0.951
4096, 2060, 0, 0, 0.986
4096, 2060, 0, 1, 0.963
4096, 2048, 12, 0, 0.971
4096, 2048, 12, 1, 0.941
4096, 2060, 12, 0, 0.977
4096, 2060, 12, 1, 0.949
8192, 0, 0, 0, 0.85
8192, 0, 0, 1, 0.845
8192, 13, 0, 0, 0.937
8192, 13, 0, 1, 0.939
8192, 45, 0, 0, 0.932
8192, 45, 0, 1, 0.927
8192, 0, 13, 0, 0.621
8192, 0, 13, 1, 0.62
8192, 0, 45, 0, 0.53
8192, 0, 45, 1, 0.516
8192, 13, 13, 0, 0.664
8192, 13, 13, 1, 0.659
8192, 45, 45, 0, 0.593
8192, 45, 45, 1, 0.575
8192, 2048, 0, 0, 0.854
8192, 2048, 0, 1, 0.834
8192, 2061, 0, 0, 0.863
8192, 2061, 0, 1, 0.857
8192, 2048, 13, 0, 0.63
8192, 2048, 13, 1, 0.629
8192, 2061, 13, 0, 0.627
8192, 2061, 13, 1, 0.62
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/cacheinfo.h | 8 +++++---
sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
2 files changed, 20 insertions(+), 14 deletions(-)
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
index cc3941d3..ac025e08 100644
--- a/sysdeps/x86/cacheinfo.h
+++ b/sysdeps/x86/cacheinfo.h
@@ -411,18 +411,20 @@ init_cacheinfo (void)
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
unsigned int minimum_rep_movsb_threshold;
- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
+ threshold is 2048 * (VEC_SIZE / 16). */
unsigned int rep_movsb_threshold;
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
{
- rep_movsb_threshold = 2048 * (64 / 16);
+ rep_movsb_threshold = 4096 * (64 / 16);
minimum_rep_movsb_threshold = 64 * 8;
}
else if (CPU_FEATURE_PREFERRED_P (cpu_features,
AVX_Fast_Unaligned_Load))
{
- rep_movsb_threshold = 2048 * (32 / 16);
+ rep_movsb_threshold = 4096 * (32 / 16);
minimum_rep_movsb_threshold = 32 * 8;
}
else
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
index 89bf2966..56c6834a 100644
--- a/sysdeps/x86/dl-tunables.list
+++ b/sysdeps/x86/dl-tunables.list
@@ -32,17 +32,21 @@ glibc {
}
x86_rep_movsb_threshold {
type: SIZE_T
- # Since there is overhead to set up REP MOVSB operation, REP MOVSB
- # isn't faster on short data. The memcpy micro benchmark in glibc
- # shows that 2KB is the approximate value above which REP MOVSB
- # becomes faster than SSE2 optimization on processors with Enhanced
- # REP MOVSB. Since larger register size can move more data with a
- # single load and store, the threshold is higher with larger register
- # size. Note: Since the REP MOVSB threshold must be greater than 8
- # times of vector size and the default value is 2048 * (vector size
- # / 16), the default value and the minimum value must be updated at
- # run-time. NB: Don't set the default value since we can't tell if
- # the tunable value is set by user or not [BZ #27069].
+ # Since there is overhead to set up REP MOVSB operation, REP
+ # MOVSB isn't faster on short data. The memcpy micro benchmark
+ # in glibc shows that 2KB is the approximate value above which
+ # REP MOVSB becomes faster than SSE2 optimization on processors
+ # with Enhanced REP MOVSB. Since larger register size can move
+ # more data with a single load and store, the threshold is
+ # higher with larger register size. Micro benchmarks show AVX
+ # REP MOVSB becomes faster apprximately at 8KB. The AVX512
+ # threshold is extrapolated to 16KB. For machines with FSRM the
+ # threshold is universally set at 2112 bytes. Note: Since the
+ # REP MOVSB threshold must be greater than 8 times of vector
+ # size and the default value is 4096 * (vector size / 16), the
+ # default value and the minimum value must be updated at
+ # run-time. NB: Don't set the default value since we can't tell
+ # if the tunable value is set by user or not [BZ #27069].
minval: 1
}
x86_rep_stosb_threshold {
--
GitLab

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,39 @@
From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 11 Nov 2021 06:31:51 -0800
Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
#28537]
Content-type: text/plain; charset=UTF-8
Replace boolean CAS with value CAS to avoid the extra load.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
nptl/pthread_mutex_lock.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index 29cc143e..60ada70d 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
- oldval | FUTEX_WAITERS,
- oldval)
- != 0)
+ int val;
+ if ((val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+ oldval)) != oldval)
{
- oldval = mutex->__data.__lock;
+ oldval = val;
continue;
}
oldval |= FUTEX_WAITERS;
--
GitLab

View File

@ -0,0 +1,39 @@
From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Thu, 11 Nov 2021 06:54:01 -0800
Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
[BZ #28537]
Content-type: text/plain; charset=UTF-8
Replace boolean CAS with value CAS to avoid the extra load.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
nptl/pthread_mutex_timedlock.c | 10 +++++-----
1 file changed, 5 insertions(+), 5 deletions(-)
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
index 888c12fe..c4627ef6 100644
--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
- oldval | FUTEX_WAITERS,
- oldval)
- != 0)
+ int val;
+ if ((val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
+ oldval)) != oldval)
{
- oldval = mutex->__data.__lock;
+ oldval = val;
continue;
}
oldval |= FUTEX_WAITERS;
--
GitLab

View File

@ -0,0 +1,51 @@
From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Tue, 2 Nov 2021 18:33:07 -0700
Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
Content-type: text/plain; charset=UTF-8
CAS instruction is expensive. From the x86 CPU's point of view, getting
a cache line for writing is more expensive than reading. See Appendix
A.2 Spinlock in:
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
The full compare and swap will grab the cache line exclusive and cause
excessive cache line bouncing.
Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
loop if compare may fail to reduce cache line bouncing on contended locks.
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
nptl/pthread_mutex_lock.c | 7 +++++++
1 file changed, 7 insertions(+)
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index 60ada70d..eb4d8baa 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -56,6 +56,11 @@
#define FORCE_ELISION(m, s)
#endif
+#ifndef LLL_MUTEX_READ_LOCK
+# define LLL_MUTEX_READ_LOCK(mutex) \
+ atomic_load_relaxed (&(mutex)->__data.__lock)
+#endif
+
static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
__attribute_noinline__;
@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
break;
}
atomic_spin_nop ();
+ if (LLL_MUTEX_READ_LOCK (mutex) != 0)
+ continue;
}
while (LLL_MUTEX_TRYLOCK (mutex) != 0);
--
GitLab

View File

@ -0,0 +1,71 @@
From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 12 Nov 2021 11:47:42 -0800
Subject: [PATCH] Move assignment out of the CAS condition
Content-type: text/plain; charset=UTF-8
Update
commit 49302b8fdf9103b6fc0a398678668a22fa19574c
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Nov 11 06:54:01 2021 -0800
Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
Replace boolean CAS with value CAS to avoid the extra load.
and
commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Thu Nov 11 06:31:51 2021 -0800
Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
Replace boolean CAS with value CAS to avoid the extra load.
by moving assignment out of the CAS condition.
---
nptl/pthread_mutex_lock.c | 7 +++----
nptl/pthread_mutex_timedlock.c | 7 +++----
2 files changed, 6 insertions(+), 8 deletions(-)
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index eb4d8baa..a633d95e 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- int val;
- if ((val = atomic_compare_and_exchange_val_acq
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
- oldval)) != oldval)
+ int val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+ if (val != oldval)
{
oldval = val;
continue;
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
index c4627ef6..a76c30b7 100644
--- a/nptl/pthread_mutex_timedlock.c
+++ b/nptl/pthread_mutex_timedlock.c
@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
meantime. */
if ((oldval & FUTEX_WAITERS) == 0)
{
- int val;
- if ((val = atomic_compare_and_exchange_val_acq
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
- oldval)) != oldval)
+ int val = atomic_compare_and_exchange_val_acq
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
+ if (val != oldval)
{
oldval = val;
continue;
--
GitLab

View File

@ -0,0 +1,60 @@
From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 3 Dec 2021 15:29:25 -0800
Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
Content-type: text/plain; charset=UTF-8
Must use notl %edi here as lower bits are for CHAR comparisons
potentially out of range thus can be 0 without indicating mismatch.
This fixes BZ #28646.
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
1 file changed, 8 insertions(+), 6 deletions(-)
Conflicts:
string/test-strcmp.c
(new check omitted)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 82f12ac8..6f5c4bf9 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -656,12 +656,13 @@ L(loop_cross_page):
in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif
# ifdef USE_AS_WCSCMP
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
kmovd %k3, %edi
+ /* Must use notl %edi here as lower bits are for CHAR
+ comparisons potentially out of range thus can be 0 without
+ indicating mismatch. */
+ notl %edi
# ifdef USE_AS_WCSCMP
/* Don't use subl since it is the upper 8 bits of EDI below. */
- notl %edi
andl $0xff, %edi
-# else
- incl %edi
# endif
# ifdef USE_AS_WCSCMP
--
GitLab

View File

@ -0,0 +1,35 @@
From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 6 Dec 2021 07:14:12 -0800
Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
and AVX-VNNI
Content-type: text/plain; charset=UTF-8
Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
they won't lower CPU frequency when ZMM load and store instructions are
used.
---
sysdeps/x86/cpu-features.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 956bfb4f..5ff2baa0 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|= bit_arch_Prefer_No_VZEROUPPER;
else
{
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
- |= bit_arch_Prefer_No_AVX512;
+ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
+ when ZMM load and store instructions are used. */
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
+ |= bit_arch_Prefer_No_AVX512;
/* Avoid RTM abort triggered by VZEROUPPER inside a
transactionally executing RTM region. */
--
GitLab

View File

@ -0,0 +1,153 @@
From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:35:18 -0800
Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes strncpy for x32. Tested on x86-64 and x32. On x86-64,
libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
---
.../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +-
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-
sysdeps/x86_64/x32/Makefile | 2 +-
sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++
4 files changed, 64 insertions(+), 6 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
Conflicts:
ChangeLog
(removed)
sysdeps/x86_64/multiarch/strcpy-avx2.S
(skipped, only needed for x32 arch)
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
index 72bf7e85..50aca22d 100644
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
@@ -40,8 +40,8 @@
.text
ENTRY (STRCPY)
# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
- test %r8, %r8
+ mov %RDX_LP, %R8_LP
+ test %R8_LP, %R8_LP
jz L(ExitZero)
# endif
mov %rsi, %rcx
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
index 9858d0c4..0a62814a 100644
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
@@ -31,13 +31,13 @@ ENTRY (STRCPY)
mov %rsi, %rcx
# ifdef USE_AS_STRNCPY
- mov %rdx, %r8
+ mov %RDX_LP, %R8_LP
# endif
mov %rdi, %rdx
# ifdef USE_AS_STRNCPY
- test %r8, %r8
+ test %R8_LP, %R8_LP
jz L(Exit0)
- cmp $8, %r8
+ cmp $8, %R8_LP
jbe L(StrncpyExit8Bytes)
# endif
cmpb $0, (%rcx)
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index db302839..2a9e20a9 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -8,7 +8,7 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
- tst-size_t-strncmp
+ tst-size_t-strncmp tst-size_t-strncpy
endif
ifeq ($(subdir),wcsmbs)
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
new file mode 100644
index 00000000..4dec71e6
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
@@ -0,0 +1,58 @@
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define TEST_NAME "strncpy"
+#include "test-size_t.h"
+
+IMPL (strncpy, 1)
+
+typedef char *(*proto_t) (char *, const char*, size_t);
+
+static void *
+__attribute__ ((noinline, noclone))
+do_strncpy (parameter_t a, parameter_t b)
+{
+ return CALL (&b, a.p, b.p, a.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ parameter_t dest = { { page_size }, buf1 };
+ parameter_t src = { { 0 }, buf2 };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ do_strncpy (dest, src);
+ int res = strncmp (dest.p, src.p, dest.len);
+ if (res)
+ {
+ error (0, 0, "Wrong result in function %s: %i != 0",
+ impl->name, res);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
--
GitLab

View File

@ -0,0 +1,389 @@
From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 24 Dec 2021 18:54:41 -0600
Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
Content-type: text/plain; charset=UTF-8
No bug.
Optimizations are twofold.
1) Replace page cross and 0/1 checks with masked load instructions in
L(less_vec). In applications this reduces branch-misses in the
hot [0, 32] case.
2) Change controlflow so that L(less_vec) case gets the fall through.
Change 2) helps copies in the [0, 32] size range but comes at the cost
of copies in the [33, 64] size range. From profiles of GCC and
Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
appears to the the right tradeoff.
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
1 file changed, 56 insertions(+), 193 deletions(-)
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
index 640f6757..d2899e7c 100644
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
@@ -62,15 +62,18 @@ Latency:
# define VMOVU vmovdqu64
# ifdef USE_AS_WMEMCMP
+# define VMOVU_MASK vmovdqu32
# define CHAR_SIZE 4
# define VPCMP vpcmpd
# define VPTEST vptestmd
# else
+# define VMOVU_MASK vmovdqu8
# define CHAR_SIZE 1
# define VPCMP vpcmpub
# define VPTEST vptestmb
# endif
+
# define VEC_SIZE 32
# define PAGE_SIZE 4096
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
movl %edx, %edx
# endif
cmp $CHAR_PER_VEC, %RDX_LP
- jb L(less_vec)
+ /* Fall through for [0, VEC_SIZE] as its the hottest. */
+ ja L(more_1x_vec)
+
+ /* Create mask for CHAR's we want to compare. This allows us to
+ avoid having to include page cross logic. */
+ movl $-1, %ecx
+ bzhil %edx, %ecx, %ecx
+ kmovd %ecx, %k2
+
+ /* Safe to load full ymm with mask. */
+ VMOVU_MASK (%rsi), %YMM2{%k2}
+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
+ kmovd %k1, %eax
+ testl %eax, %eax
+ jnz L(return_vec_0)
+ ret
+ .p2align 4
+L(return_vec_0):
+ tzcntl %eax, %eax
+# ifdef USE_AS_WMEMCMP
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
+ xorl %edx, %edx
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
+ /* NB: no partial register stall here because xorl zero idiom
+ above. */
+ setg %dl
+ leal -1(%rdx, %rdx), %eax
+# else
+ movzbl (%rsi, %rax), %ecx
+ movzbl (%rdi, %rax), %eax
+ subl %ecx, %eax
+# endif
+ ret
+
+
+ .p2align 4
+L(more_1x_vec):
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
VMOVU (%rsi), %YMM1
/* Use compare not equals to directly check for mismatch. */
- VPCMP $4, (%rdi), %YMM1, %k1
+ VPCMP $4,(%rdi), %YMM1, %k1
kmovd %k1, %eax
/* NB: eax must be destination register if going to
L(return_vec_[0,2]). For L(return_vec_3) destination register
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* Check third and fourth VEC no matter what. */
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(return_vec_2)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
kmovd %k1, %ecx
testl %ecx, %ecx
jnz L(return_vec_3)
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
oring with YMM1. Result is stored in YMM4. */
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
/* Or together YMM2, YMM3, and YMM4 into YMM4. */
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
/* NB: eax must be zero to reach here. */
ret
- .p2align 4
+
+ .p2align 4,, 8
L(8x_end_return_vec_0_1_2_3):
movq %rdx, %rdi
L(8x_return_vec_0_1_2_3):
@@ -222,23 +262,6 @@ L(return_vec_3):
# endif
ret
- .p2align 4
-L(return_vec_0):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax, CHAR_SIZE), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
- /* NB: no partial register stall here because xorl zero idiom
- above. */
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
.p2align 4
L(return_vec_1):
@@ -297,7 +320,7 @@ L(loop_4x_vec):
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
@@ -324,7 +347,7 @@ L(loop_4x_vec):
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
VPTEST %YMM4, %YMM4, %k1
kmovd %k1, %ecx
@@ -336,14 +359,14 @@ L(loop_4x_vec):
/* Only entry is from L(more_8x_vec). */
.p2align 4,, 10
L(8x_last_2x_vec):
- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_2)
/* Naturally aligned to 16 bytes. */
L(8x_last_1x_vec):
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
kmovd %k1, %eax
testl %eax, %eax
jnz L(8x_return_vec_3)
@@ -392,7 +415,9 @@ L(last_1x_vec):
jnz L(return_vec_0_end)
ret
- .p2align 4,, 10
+
+ /* Don't align. Takes 2-fetch blocks either way and aligning
+ will cause code to spill into another cacheline. */
L(return_vec_1_end):
/* Use bsf to save code size. This is necessary to have
L(one_or_less) fit in aligning bytes between. */
@@ -411,31 +436,8 @@ L(return_vec_1_end):
# endif
ret
- /* NB: L(one_or_less) fits in alignment padding between
- L(return_vec_1_end) and L(return_vec_0_end). */
-# ifdef USE_AS_WMEMCMP
-L(one_or_less):
- jb L(zero)
- movl (%rdi), %ecx
- xorl %edx, %edx
- cmpl (%rsi), %ecx
- je L(zero)
- setg %dl
- leal -1(%rdx, %rdx), %eax
- ret
-# else
-L(one_or_less):
- jb L(zero)
- movzbl (%rsi), %ecx
- movzbl (%rdi), %eax
- subl %ecx, %eax
- ret
-# endif
-L(zero):
- xorl %eax, %eax
- ret
-
- .p2align 4
+ /* Don't align. Takes 2-fetch blocks either way and aligning
+ will cause code to spill into another cacheline. */
L(return_vec_0_end):
tzcntl %eax, %eax
addl %edx, %eax
@@ -451,146 +453,7 @@ L(return_vec_0_end):
subl %ecx, %eax
# endif
ret
+ /* 1-byte until next cache line. */
- .p2align 4
-L(less_vec):
- /* Check if one or less CHAR. This is necessary for size == 0
- but is also faster for size == CHAR_SIZE. */
- cmpl $1, %edx
- jbe L(one_or_less)
-
- /* Check if loading one VEC from either s1 or s2 could cause a
- page cross. This can have false positives but is by far the
- fastest method. */
- movl %edi, %eax
- orl %esi, %eax
- andl $(PAGE_SIZE - 1), %eax
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
- jg L(page_cross_less_vec)
-
- /* No page cross possible. */
- VMOVU (%rsi), %YMM2
- VPCMP $4, (%rdi), %YMM2, %k1
- kmovd %k1, %eax
- /* Check if any matches where in bounds. Intentionally not
- storing result in eax to limit dependency chain if it goes to
- L(return_vec_0_lv). */
- bzhil %edx, %eax, %edx
- jnz L(return_vec_0_lv)
- xorl %eax, %eax
- ret
-
- /* Essentially duplicate of L(return_vec_0). Ends up not costing
- any code as shrinks L(less_vec) by allowing 2-byte encoding of
- the jump and ends up fitting in aligning bytes. As well fits on
- same cache line as L(less_vec) so also saves a line from having
- to be fetched on cold calls to memcmp. */
- .p2align 4,, 4
-L(return_vec_0_lv):
- tzcntl %eax, %eax
-# ifdef USE_AS_WMEMCMP
- movl (%rdi, %rax, CHAR_SIZE), %ecx
- xorl %edx, %edx
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
- /* NB: no partial register stall here because xorl zero idiom
- above. */
- setg %dl
- leal -1(%rdx, %rdx), %eax
-# else
- movzbl (%rsi, %rax), %ecx
- movzbl (%rdi, %rax), %eax
- subl %ecx, %eax
-# endif
- ret
-
- .p2align 4
-L(page_cross_less_vec):
- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
- bytes. */
- cmpl $(16 / CHAR_SIZE), %edx
- jae L(between_16_31)
-# ifndef USE_AS_WMEMCMP
- cmpl $8, %edx
- jae L(between_8_15)
- cmpl $4, %edx
- jb L(between_2_3)
-
- /* Load as big endian with overlapping movbe to avoid branches.
- */
- movbe (%rdi), %eax
- movbe (%rsi), %ecx
- shlq $32, %rax
- shlq $32, %rcx
- movbe -4(%rdi, %rdx), %edi
- movbe -4(%rsi, %rdx), %esi
- orq %rdi, %rax
- orq %rsi, %rcx
- subq %rcx, %rax
- /* edx is guranteed to be positive int32 in range [4, 7]. */
- cmovne %edx, %eax
- /* ecx is -1 if rcx > rax. Otherwise 0. */
- sbbl %ecx, %ecx
- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
- rax then eax and ecx are zero. If rax < rax then ecx is -1 so
- eax doesn't matter. */
- orl %ecx, %eax
- ret
-
- .p2align 4,, 8
-L(between_8_15):
-# endif
- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
- vmovq (%rdi), %xmm1
- vmovq (%rsi), %xmm2
- VPCMP $4, %xmm1, %xmm2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_lv)
- /* Use overlapping loads to avoid branches. */
- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
- VPCMP $4, %xmm1, %xmm2, %k1
- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_end)
- ret
-
- .p2align 4,, 8
-L(between_16_31):
- /* From 16 to 31 bytes. No branch when size == 16. */
-
- /* Use movups to save code size. */
- vmovdqu (%rsi), %xmm2
- VPCMP $4, (%rdi), %xmm2, %k1
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_lv)
- /* Use overlapping loads to avoid branches. */
- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
- kmovd %k1, %eax
- testl %eax, %eax
- jnz L(return_vec_0_end)
- ret
-
-# ifndef USE_AS_WMEMCMP
-L(between_2_3):
- /* Load as big endian to avoid branches. */
- movzwl (%rdi), %eax
- movzwl (%rsi), %ecx
- shll $8, %eax
- shll $8, %ecx
- bswap %eax
- bswap %ecx
- movzbl -1(%rdi, %rdx), %edi
- movzbl -1(%rsi, %rdx), %esi
- orl %edi, %eax
- orl %esi, %ecx
- /* Subtraction is okay because the upper 8 bits are zero. */
- subl %ecx, %eax
- ret
-# endif
END (MEMCMP)
#endif
--
GitLab

View File

@ -0,0 +1,43 @@
From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
From: Jangwoong Kim <6812skiii@gmail.com>
Date: Tue, 14 Dec 2021 21:30:51 +0900
Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
Content-type: text/plain; charset=UTF-8
The commit:
"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
if atomic load fails. But, "continue" inside of do-while loop
does not skip the evaluation of escape expression, thus CAS
is not skipped.
Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
LLL_MUTEX_READ_LOCK fails.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
nptl/pthread_mutex_lock.c | 5 ++---
1 file changed, 2 insertions(+), 3 deletions(-)
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
index a633d95e..d96a9933 100644
--- a/nptl/pthread_mutex_lock.c
+++ b/nptl/pthread_mutex_lock.c
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
break;
}
atomic_spin_nop ();
- if (LLL_MUTEX_READ_LOCK (mutex) != 0)
- continue;
}
- while (LLL_MUTEX_TRYLOCK (mutex) != 0);
+ while (LLL_MUTEX_READ_LOCK (mutex) != 0
+ || LLL_MUTEX_TRYLOCK (mutex) != 0);
mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
}
--
GitLab

View File

@ -0,0 +1,146 @@
From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 18 Feb 2022 14:19:15 -0600
Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
Content-type: text/plain; charset=UTF-8
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
not checks around vzeroupper and would trigger spurious
aborts. This commit fixes that.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
AVX2 machines with and without RTM.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/Makefile | 5 ++++-
sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
3 files changed, 48 insertions(+), 10 deletions(-)
create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 2d814915..c2111f49 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -28,7 +28,9 @@ tests += \
tst-strcpy-rtm \
tst-strlen-rtm \
tst-strncmp-rtm \
- tst-strrchr-rtm
+ tst-strrchr-rtm \
+ tst-wcsncmp-rtm \
+# tests
CFLAGS-tst-memchr-rtm.c += -mrtm
CFLAGS-tst-memcmp-rtm.c += -mrtm
@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
CFLAGS-tst-strlen-rtm.c += -mrtm
CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
CFLAGS-tst-strrchr-rtm.c += -mrtm
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
endif
ifneq ($(enable-cet),no)
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 4d0004b5..4e9f094f 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -19,18 +19,32 @@
#include <stdint.h>
#include <tst-string-rtm.h>
+#ifdef WIDE
+# define CHAR wchar_t
+# define MEMSET wmemset
+# define STRNCMP wcsncmp
+# define TEST_NAME wcsncmp
+#else /* !WIDE */
+# define CHAR char
+# define MEMSET memset
+# define STRNCMP strncmp
+# define TEST_NAME strncmp
+#endif /* !WIDE */
+
+
+
#define LOOP 3000
#define STRING_SIZE 1024
-char string1[STRING_SIZE];
-char string2[STRING_SIZE];
+CHAR string1[STRING_SIZE];
+CHAR string2[STRING_SIZE];
__attribute__ ((noinline, noclone))
static int
prepare (void)
{
- memset (string1, 'a', STRING_SIZE - 1);
- memset (string2, 'a', STRING_SIZE - 1);
- if (strncmp (string1, string2, STRING_SIZE) == 0)
+ MEMSET (string1, 'a', STRING_SIZE - 1);
+ MEMSET (string2, 'a', STRING_SIZE - 1);
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
return EXIT_SUCCESS;
else
return EXIT_FAILURE;
@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone))
static int
function (void)
{
- if (strncmp (string1, string2, STRING_SIZE) == 0)
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
return 0;
else
return 1;
@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone))
static int
function_overflow (void)
{
- if (strncmp (string1, string2, SIZE_MAX) == 0)
+ if (STRNCMP (string1, string2, SIZE_MAX) == 0)
return 0;
else
return 1;
@@ -59,9 +73,9 @@ function_overflow (void)
static int
do_test (void)
{
- int status = do_test_1 ("strncmp", LOOP, prepare, function);
+ int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
if (status != EXIT_SUCCESS)
return status;
- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
return status;
}
diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
new file mode 100644
index 00000000..bad3b863
--- /dev/null
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
@@ -0,0 +1,21 @@
+/* Test case for wcsncmp inside a transactionally executing RTM region.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include <wchar.h>
+#include "tst-strncmp-rtm.c"
--
GitLab

View File

@ -0,0 +1,37 @@
From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Fri, 18 Feb 2022 17:00:25 -0600
Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
Content-type: text/plain; charset=UTF-8
Previously TEST_NAME was passing a function pointer. This didn't fail
because of the -Wno-error flag (to allow for overflow sizes passed
to strncmp/wcsncmp)
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index 4e9f094f..aef9866c 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -23,12 +23,12 @@
# define CHAR wchar_t
# define MEMSET wmemset
# define STRNCMP wcsncmp
-# define TEST_NAME wcsncmp
+# define TEST_NAME "wcsncmp"
#else /* !WIDE */
# define CHAR char
# define MEMSET memset
# define STRNCMP strncmp
-# define TEST_NAME strncmp
+# define TEST_NAME "strncmp"
#endif /* !WIDE */
--
GitLab

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,33 @@
From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:09:10 -0800
Subject: [PATCH] x86-64: Fix strcmp-avx2.S
Content-type: text/plain; charset=UTF-8
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jan 10 15:35:38 2022 -0600
x86: Optimize strcmp-avx2.S
---
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 554ffe4c..04675aa4 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
- movl %edx, %rdx
+ movl %edx, %edx
# endif
cmp $1, %RDX_LP
/* Signed comparison intentional. We use this branch to also
--
GitLab

View File

@ -0,0 +1,33 @@
From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Fri, 4 Feb 2022 11:11:08 -0800
Subject: [PATCH] x86-64: Fix strcmp-evex.S
Content-type: text/plain; charset=UTF-8
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon Jan 10 15:35:39 2022 -0600
x86: Optimize strcmp-evex.S
---
sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index 99d8409a..ed56af8e 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
# ifdef USE_AS_STRNCMP
# ifdef __ILP32__
/* Clear the upper 32 bits. */
- movl %edx, %rdx
+ movl %edx, %edx
# endif
cmp $1, %RDX_LP
/* Signed comparison intentional. We use this branch to also
--
GitLab

View File

@ -0,0 +1,459 @@
From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun, 6 Feb 2022 00:54:18 -0600
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8
No bug.
Split vec generation into multiple steps. This allows the
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
case. This saves an expensive lane-cross instruction and removes
the need for 'vzeroupper'.
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
byte broadcast.
Results for memset-avx2 small (geomean of N = 20 benchset runs).
size, New Time, Old Time, New / Old
0, 4.100, 3.831, 0.934
1, 5.074, 4.399, 0.867
2, 4.433, 4.411, 0.995
4, 4.487, 4.415, 0.984
8, 4.454, 4.396, 0.987
16, 4.502, 4.443, 0.987
All relevant string/wcsmbs tests are passing.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86_64/memset.S | 21 ++-
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
5 files changed, 152 insertions(+), 87 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 8672b030..27debd2b 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -28,17 +28,22 @@
#define VMOVU movups
#define VMOVA movaps
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- punpcklbw %xmm0, %xmm0; \
- punpcklwd %xmm0, %xmm0; \
- pshufd $0, %xmm0, %xmm0
+ pxor %xmm1, %xmm1; \
+ pshufb %xmm1, %xmm0; \
+ movq r, %rax
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- movq r, %rax; \
- pshufd $0, %xmm0, %xmm0
+ pshufd $0, %xmm0, %xmm0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
#define SECTION(p) p
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index 1af668af..c0bf2875 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -10,15 +10,18 @@
# define VMOVU vmovdqu
# define VMOVA vmovdqa
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastb %xmm0, %ymm0
+ movq r, %rax;
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- vmovd d, %xmm0; \
- movq r, %rax; \
- vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
+
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
# ifndef SECTION
# define SECTION(p) p##.avx
@@ -30,5 +33,6 @@
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
+# define USE_XMM_LESS_VEC
# include "memset-vec-unaligned-erms.S"
#endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index f14d6f84..5241216a 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex512
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 64b09e77..63700215 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -15,13 +15,19 @@
# define VZEROUPPER
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastb d, %VEC0
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastb d, %VEC0; \
+ movq r, %rax
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
- movq r, %rax; \
- vpbroadcastd d, %VEC0
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
+ vpbroadcastd d, %VEC0; \
+ movq r, %rax
+
+# define MEMSET_VDUP_TO_VEC0_HIGH()
+# define MEMSET_VDUP_TO_VEC0_LOW()
+
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
+# define WMEMSET_VDUP_TO_VEC0_LOW()
# define SECTION(p) p##.evex
# define MEMSET_SYMBOL(p,s) p##_evex_##s
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index f08b7323..a67f9833 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -58,8 +58,10 @@
#ifndef MOVQ
# if VEC_SIZE > 16
# define MOVQ vmovq
+# define MOVD vmovd
# else
# define MOVQ movq
+# define MOVD movd
# endif
#endif
@@ -72,9 +74,17 @@
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
# define END_REG rcx
# define LOOP_REG rdi
+# define LESS_VEC_REG rax
#else
# define END_REG rdi
# define LOOP_REG rdx
+# define LESS_VEC_REG rdi
+#endif
+
+#ifdef USE_XMM_LESS_VEC
+# define XMM_SMALL 1
+#else
+# define XMM_SMALL 0
#endif
#define PAGE_SIZE 4096
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
shl $2, %RDX_LP
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
- jmp L(entry_from_bzero)
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
+ WMEMSET_VDUP_TO_VEC0_LOW()
+ cmpq $VEC_SIZE, %rdx
+ jb L(less_vec_no_vdup)
+ WMEMSET_VDUP_TO_VEC0_HIGH()
+ jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
#endif
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH()
+L(entry_from_wmemset):
cmpq $(VEC_SIZE * 2), %rdx
ja L(more_2x_vec)
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
# endif
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
# ifdef __ILP32__
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
cmp $VEC_SIZE, %RDX_LP
jb L(less_vec)
+ MEMSET_VDUP_TO_VEC0_HIGH ()
cmp $(VEC_SIZE * 2), %RDX_LP
ja L(stosb_more_2x_vec)
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
- */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
VZEROUPPER_RETURN
#endif
- .p2align 4,, 10
+ .p2align 4,, 4
L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
#else
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
@@ -212,6 +228,7 @@ L(last_2x_vec):
#ifdef USE_LESS_VEC_MASK_STORE
.p2align 4,, 10
L(less_vec):
+L(less_vec_no_vdup):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
and (4x, 8x] jump to target. */
L(more_2x_vec):
-
- /* Two different methods of setting up pointers / compare. The
- two methods are based on the fact that EVEX/AVX512 mov
- instructions take more bytes then AVX2/SSE2 mov instructions. As
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
- this saves code size and keeps a few targets in one fetch block.
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
- LOOP_4X_OFFSET) with LEA_BID. */
-
- /* END_REG is rcx for EVEX/AVX512. */
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
-#endif
-
- /* Stores to first 2x VEC before cmp as any path forward will
- require it. */
- VMOVU %VEC(0), (%rax)
- VMOVU %VEC(0), VEC_SIZE(%rax)
+ /* Store next 2x vec regardless. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
+ /* Two different methods of setting up pointers / compare. The two
+ methods are based on the fact that EVEX/AVX512 mov instructions take
+ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
+ bottlenecks. */
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
addq %rdx, %END_REG
@@ -292,6 +299,15 @@ L(more_2x_vec):
cmpq $(VEC_SIZE * 4), %rdx
jbe L(last_2x_vec)
+
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
+ LEA_BID. */
+
+ /* END_REG is rcx for EVEX/AVX512. */
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
+#endif
+
/* Store next 2x vec regardless. */
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
@@ -355,65 +371,93 @@ L(stosb_local):
/* Define L(less_vec) only if not otherwise defined. */
.p2align 4
L(less_vec):
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
+ xmm). This is only does anything for AVX2. */
+ MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_no_vdup):
#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
- jae L(between_32_63)
+ jge L(between_32_63)
#endif
#if VEC_SIZE > 16
cmpl $16, %edx
- jae L(between_16_31)
+ jge L(between_16_31)
+#endif
+#ifndef USE_XMM_LESS_VEC
+ MOVQ %XMM0, %rcx
#endif
- MOVQ %XMM0, %rdi
cmpl $8, %edx
- jae L(between_8_15)
+ jge L(between_8_15)
cmpl $4, %edx
- jae L(between_4_7)
+ jge L(between_4_7)
cmpl $1, %edx
- ja L(between_2_3)
- jb L(return)
- movb %sil, (%rax)
- VZEROUPPER_RETURN
+ jg L(between_2_3)
+ jl L(between_0_0)
+ movb %sil, (%LESS_VEC_REG)
+L(between_0_0):
+ ret
- /* Align small targets only if not doing so would cross a fetch
- line. */
+ /* Align small targets only if not doing so would cross a fetch line.
+ */
#if VEC_SIZE > 32
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
/* From 32 to 63. No branch when size == 32. */
L(between_32_63):
- VMOVU %YMM0, (%rax)
- VMOVU %YMM0, -32(%rax, %rdx)
+ VMOVU %YMM0, (%LESS_VEC_REG)
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
VZEROUPPER_RETURN
#endif
#if VEC_SIZE >= 32
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
L(between_16_31):
/* From 16 to 31. No branch when size == 16. */
- VMOVU %XMM0, (%rax)
- VMOVU %XMM0, -16(%rax, %rdx)
- VZEROUPPER_RETURN
+ VMOVU %XMM0, (%LESS_VEC_REG)
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
+ ret
#endif
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
L(between_8_15):
/* From 8 to 15. No branch when size == 8. */
- movq %rdi, (%rax)
- movq %rdi, -8(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVQ %XMM0, (%rdi)
+ MOVQ %XMM0, -8(%rdi, %rdx)
+#else
+ movq %rcx, (%LESS_VEC_REG)
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
+ */
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
L(between_4_7):
/* From 4 to 7. No branch when size == 4. */
- movl %edi, (%rax)
- movl %edi, -4(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ MOVD %XMM0, (%rdi)
+ MOVD %XMM0, -4(%rdi, %rdx)
+#else
+ movl %ecx, (%LESS_VEC_REG)
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
+#endif
+ ret
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
- movw %di, (%rax)
- movb %dil, -1(%rax, %rdx)
- VZEROUPPER_RETURN
+#ifdef USE_XMM_LESS_VEC
+ movb %sil, (%rdi)
+ movb %sil, 1(%rdi)
+ movb %sil, -1(%rdi, %rdx)
+#else
+ movw %cx, (%LESS_VEC_REG)
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
+#endif
+ ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
GitLab

View File

@ -0,0 +1,40 @@
From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Mon, 7 Feb 2022 00:32:23 -0600
Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
Only)
Content-type: text/plain; charset=UTF-8
commit b62ace2740a106222e124cc86956448fa07abf4d
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun Feb 6 00:54:18 2022 -0600
x86: Improve vec generation in memset-vec-unaligned-erms.S
Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
instruction and memset.S is restricted to only SSE2 instructions.
---
sysdeps/x86_64/memset.S | 7 ++++---
1 file changed, 4 insertions(+), 3 deletions(-)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 27debd2b..4cb4aa71 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -30,9 +30,10 @@
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
- pxor %xmm1, %xmm1; \
- pshufb %xmm1, %xmm0; \
- movq r, %rax
+ movq r, %rax; \
+ punpcklbw %xmm0, %xmm0; \
+ punpcklwd %xmm0, %xmm0; \
+ pshufd $0, %xmm0, %xmm0
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
--
GitLab

View File

@ -0,0 +1,218 @@
From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 21 Jan 2019 11:36:36 -0800
Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
[BZ# 24097]
Content-type: text/plain; charset=UTF-8
On x32, the size_t parameter may be passed in the lower 32 bits of a
64-bit register with the non-zero upper 32 bits. The string/memory
functions written in assembly can only use the lower 32 bits of a
64-bit register as length or must clear the upper 32 bits before using
the full 64-bit register for length.
This pach fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On
x86-64, libc.so is the same with and withou the fix.
[BZ# 24097]
CVE-2019-6488
* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
Clear the upper 32 bits of RSI register.
* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
and tst-size_t-wcsnlen.
* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
---
sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++--
sysdeps/x86_64/strlen.S | 12 ++---
sysdeps/x86_64/x32/Makefile | 4 +-
sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
5 files changed, 106 insertions(+), 11 deletions(-)
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
Conflicts:
ChangeLog
(removed)
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
index fb2418cd..645e0446 100644
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
@@ -42,12 +42,15 @@
ENTRY (STRLEN)
# ifdef USE_AS_STRNLEN
/* Check for zero length. */
- testq %rsi, %rsi
+ test %RSI_LP, %RSI_LP
jz L(zero)
# ifdef USE_AS_WCSLEN
- shl $2, %rsi
+ shl $2, %RSI_LP
+# elif defined __ILP32__
+ /* Clear the upper 32 bits. */
+ movl %esi, %esi
# endif
- movq %rsi, %r8
+ mov %RSI_LP, %R8_LP
# endif
movl %edi, %ecx
movq %rdi, %rdx
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
index 01cb5fa8..f845f3d4 100644
--- a/sysdeps/x86_64/strlen.S
+++ b/sysdeps/x86_64/strlen.S
@@ -59,21 +59,21 @@ ENTRY(strlen)
#ifdef AS_STRNLEN
/* Do not read anything when n==0. */
- test %rsi, %rsi
+ test %RSI_LP, %RSI_LP
jne L(n_nonzero)
xor %rax, %rax
ret
L(n_nonzero):
# ifdef AS_WCSLEN
- shlq $2, %rsi
+ shl $2, %RSI_LP
# endif
/* Initialize long lived registers. */
- add %rdi, %rsi
- mov %rsi, %r10
- and $-64, %r10
- mov %rsi, %r11
+ add %RDI_LP, %RSI_LP
+ mov %RSI_LP, %R10_LP
+ and $-64, %R10_LP
+ mov %RSI_LP, %R11_LP
#endif
pxor %xmm0, %xmm0
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
index 2a9e20a9..1557724b 100644
--- a/sysdeps/x86_64/x32/Makefile
+++ b/sysdeps/x86_64/x32/Makefile
@@ -8,10 +8,10 @@ endif
ifeq ($(subdir),string)
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
- tst-size_t-strncmp tst-size_t-strncpy
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
endif
ifeq ($(subdir),wcsmbs)
tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
- tst-size_t-wcsncmp
+ tst-size_t-wcsncmp tst-size_t-wcsnlen
endif
diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
new file mode 100644
index 00000000..690a4a8a
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
@@ -0,0 +1,72 @@
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifdef WIDE
+# define TEST_NAME "wcsnlen"
+#else
+# define TEST_NAME "strnlen"
+#endif /* WIDE */
+
+#include "test-size_t.h"
+
+#ifdef WIDE
+# include <wchar.h>
+# define STRNLEN wcsnlen
+# define CHAR wchar_t
+#else
+# define STRNLEN strnlen
+# define CHAR char
+#endif /* WIDE */
+
+IMPL (STRNLEN, 1)
+
+typedef size_t (*proto_t) (const CHAR *, size_t);
+
+static size_t
+__attribute__ ((noinline, noclone))
+do_strnlen (parameter_t a, parameter_t b)
+{
+ return CALL (&a, a.p, b.len);
+}
+
+static int
+test_main (void)
+{
+ test_init ();
+
+ size_t size = page_size / sizeof (CHAR);
+ parameter_t src = { { 0 }, buf2 };
+ parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
+
+ int ret = 0;
+ FOR_EACH_IMPL (impl, 0)
+ {
+ src.fn = impl->fn;
+ size_t res = do_strnlen (src, c);
+ if (res != size)
+ {
+ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
+ impl->name, res, size);
+ ret = 1;
+ }
+ }
+
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
new file mode 100644
index 00000000..093b4bbe
--- /dev/null
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
@@ -0,0 +1,20 @@
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
+ Copyright (C) 2019 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#define WIDE 1
+#include "tst-size_t-strnlen.c"
--
GitLab

View File

@ -0,0 +1,753 @@
From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Mon, 7 Feb 2022 05:55:15 -0800
Subject: [PATCH] x86-64: Optimize bzero
Content-type: text/plain; charset=UTF-8
memset with zero as the value to set is by far the majority value (99%+
for Python3 and GCC).
bzero can be slightly more optimized for this case by using a zero-idiom
xor for broadcasting the set value to a register (vector or GPR).
Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
sysdeps/generic/ifunc-init.h | 5 +-
sysdeps/x86_64/memset.S | 8 +
sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++-------
sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++
.../memset-avx2-unaligned-erms-rtm.S | 1 +
.../multiarch/memset-avx2-unaligned-erms.S | 6 +
.../multiarch/memset-avx512-unaligned-erms.S | 3 +
.../multiarch/memset-evex-unaligned-erms.S | 3 +
.../multiarch/memset-sse2-unaligned-erms.S | 1 +
.../multiarch/memset-vec-unaligned-erms.S | 110 +++++++---
11 files changed, 384 insertions(+), 106 deletions(-)
create mode 100644 sysdeps/x86_64/multiarch/bzero.c
Conflicts:
sysdeps/generic/ifunc-init.h
(needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq))
sysdeps/x86_64/multiarch/Makefile
(file ordering)
diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h
index 241e4161..f7a72375 100644
--- a/sysdeps/generic/ifunc-init.h
+++ b/sysdeps/generic/ifunc-init.h
@@ -50,5 +50,8 @@
'__<symbol>_<variant>' as the optimized implementation and
'<symbol>_ifunc_selector' as the IFUNC selector. */
#define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME)
-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name)
+#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name)
+#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name)
+/* Default is to use OPTIMIZE2. */
+#define OPTIMIZE(name) OPTIMIZE2(name)
#define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
index 4cb4aa71..a1353f89 100644
--- a/sysdeps/x86_64/memset.S
+++ b/sysdeps/x86_64/memset.S
@@ -35,6 +35,9 @@
punpcklwd %xmm0, %xmm0; \
pshufd $0, %xmm0, %xmm0
+# define BZERO_ZERO_VEC0() \
+ pxor %xmm0, %xmm0
+
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
movd d, %xmm0; \
pshufd $0, %xmm0, %xmm0; \
@@ -53,6 +56,10 @@
# define MEMSET_SYMBOL(p,s) memset
#endif
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s) __bzero
+#endif
+
#ifndef WMEMSET_SYMBOL
# define WMEMSET_CHK_SYMBOL(p,s) p
# define WMEMSET_SYMBOL(p,s) __wmemset
@@ -63,6 +70,7 @@
libc_hidden_builtin_def (memset)
#if IS_IN (libc)
+weak_alias (__bzero, bzero)
libc_hidden_def (__wmemset)
weak_alias (__wmemset, wmemset)
libc_hidden_weak (wmemset)
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
index 26be4095..37d8d6f0 100644
--- a/sysdeps/x86_64/multiarch/Makefile
+++ b/sysdeps/x86_64/multiarch/Makefile
@@ -1,85 +1,130 @@
ifeq ($(subdir),string)
-sysdep_routines += strncat-c stpncpy-c strncpy-c \
- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \
- strcmp-sse4_2 strcmp-avx2 \
- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
- memrchr-sse2 memrchr-avx2 \
- memcmp-sse2 \
- memcmp-avx2-movbe \
- memcmp-sse4 memcpy-ssse3 \
- memmove-ssse3 \
- memcpy-ssse3-back \
- memmove-ssse3-back \
- memmove-avx512-no-vzeroupper \
- strcasecmp_l-sse2 strcasecmp_l-ssse3 \
- strcasecmp_l-sse4_2 strcasecmp_l-avx \
- strncase_l-sse2 strncase_l-ssse3 \
- strncase_l-sse4_2 strncase_l-avx \
- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
- strrchr-sse2 strrchr-avx2 \
- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
- strcat-avx2 strncat-avx2 \
- strcat-ssse3 strncat-ssse3\
- strcpy-avx2 strncpy-avx2 \
- strcpy-sse2 stpcpy-sse2 \
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
- stpcpy-avx2 stpncpy-avx2 \
- strcat-sse2 \
- strcat-sse2-unaligned strncat-sse2-unaligned \
- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
- strcspn-sse2 strpbrk-sse2 strspn-sse2 \
- strcspn-c strpbrk-c strspn-c varshift \
- memset-avx512-no-vzeroupper \
- memmove-sse2-unaligned-erms \
- memmove-avx-unaligned-erms \
- memmove-avx512-unaligned-erms \
- memset-sse2-unaligned-erms \
- memset-avx2-unaligned-erms \
- memset-avx512-unaligned-erms \
- memchr-avx2-rtm \
- memcmp-avx2-movbe-rtm \
- memmove-avx-unaligned-erms-rtm \
- memrchr-avx2-rtm \
- memset-avx2-unaligned-erms-rtm \
- rawmemchr-avx2-rtm \
- strchr-avx2-rtm \
- strcmp-avx2-rtm \
- strchrnul-avx2-rtm \
- stpcpy-avx2-rtm \
- stpncpy-avx2-rtm \
- strcat-avx2-rtm \
- strcpy-avx2-rtm \
- strlen-avx2-rtm \
- strncat-avx2-rtm \
- strncmp-avx2-rtm \
- strncpy-avx2-rtm \
- strnlen-avx2-rtm \
- strrchr-avx2-rtm \
- memchr-evex \
- memcmp-evex-movbe \
- memmove-evex-unaligned-erms \
- memrchr-evex \
- memset-evex-unaligned-erms \
- rawmemchr-evex \
- stpcpy-evex \
- stpncpy-evex \
- strcat-evex \
- strchr-evex \
- strchrnul-evex \
- strcmp-evex \
- strcpy-evex \
- strlen-evex \
- strncat-evex \
- strncmp-evex \
- strncpy-evex \
- strnlen-evex \
- strrchr-evex \
- memchr-evex-rtm \
- rawmemchr-evex-rtm
+sysdep_routines += \
+ bzero \
+ memchr-avx2 \
+ memchr-avx2-rtm \
+ memchr-evex \
+ memchr-evex-rtm \
+ memchr-sse2 \
+ memcmp-avx2-movbe \
+ memcmp-avx2-movbe-rtm \
+ memcmp-evex-movbe \
+ memcmp-sse2 \
+ memcmp-sse4 \
+ memcmp-ssse3 \
+ memcpy-ssse3 \
+ memcpy-ssse3-back \
+ memmove-avx-unaligned-erms \
+ memmove-avx-unaligned-erms-rtm \
+ memmove-avx512-no-vzeroupper \
+ memmove-avx512-unaligned-erms \
+ memmove-evex-unaligned-erms \
+ memmove-sse2-unaligned-erms \
+ memmove-ssse3 \
+ memmove-ssse3-back \
+ memrchr-avx2 \
+ memrchr-avx2-rtm \
+ memrchr-evex \
+ memrchr-sse2 \
+ memset-avx2-unaligned-erms \
+ memset-avx2-unaligned-erms-rtm \
+ memset-avx512-no-vzeroupper \
+ memset-avx512-unaligned-erms \
+ memset-evex-unaligned-erms \
+ memset-sse2-unaligned-erms \
+ rawmemchr-avx2 \
+ rawmemchr-avx2-rtm \
+ rawmemchr-evex \
+ rawmemchr-evex-rtm \
+ rawmemchr-sse2 \
+ stpcpy-avx2 \
+ stpcpy-avx2-rtm \
+ stpcpy-evex \
+ stpcpy-sse2 \
+ stpcpy-sse2-unaligned \
+ stpcpy-ssse3 \
+ stpncpy-avx2 \
+ stpncpy-avx2-rtm \
+ stpncpy-c \
+ stpncpy-evex \
+ stpncpy-sse2-unaligned \
+ stpncpy-ssse3 \
+ strcasecmp_l-avx \
+ strcasecmp_l-sse2 \
+ strcasecmp_l-sse4_2 \
+ strcasecmp_l-ssse3 \
+ strcat-avx2 \
+ strcat-avx2-rtm \
+ strcat-evex \
+ strcat-sse2 \
+ strcat-sse2-unaligned \
+ strcat-ssse3 \
+ strchr-avx2 \
+ strchr-avx2-rtm \
+ strchr-evex \
+ strchr-sse2 \
+ strchr-sse2-no-bsf \
+ strchrnul-avx2 \
+ strchrnul-avx2-rtm \
+ strchrnul-evex \
+ strchrnul-sse2 \
+ strcmp-avx2 \
+ strcmp-avx2-rtm \
+ strcmp-evex \
+ strcmp-sse2 \
+ strcmp-sse2-unaligned \
+ strcmp-sse4_2 \
+ strcmp-ssse3 \
+ strcpy-avx2 \
+ strcpy-avx2-rtm \
+ strcpy-evex \
+ strcpy-sse2 \
+ strcpy-sse2-unaligned \
+ strcpy-ssse3 \
+ strcspn-c \
+ strcspn-sse2 \
+ strlen-avx2 \
+ strlen-avx2-rtm \
+ strlen-evex \
+ strlen-sse2 \
+ strncase_l-avx \
+ strncase_l-sse2 \
+ strncase_l-sse4_2 \
+ strncase_l-ssse3 \
+ strncat-avx2 \
+ strncat-avx2-rtm \
+ strncat-c \
+ strncat-evex \
+ strncat-sse2-unaligned \
+ strncat-ssse3 \
+ strncmp-avx2 \
+ strncmp-avx2-rtm \
+ strncmp-evex \
+ strncmp-sse2 \
+ strncmp-sse4_2 \
+ strncmp-ssse3 \
+ strncpy-avx2 \
+ strncpy-avx2-rtm \
+ strncpy-c \
+ strncpy-evex \
+ strncpy-sse2-unaligned \
+ strncpy-ssse3 \
+ strnlen-avx2 \
+ strnlen-avx2-rtm \
+ strnlen-evex \
+ strnlen-sse2 \
+ strpbrk-c \
+ strpbrk-sse2 \
+ strrchr-avx2 \
+ strrchr-avx2-rtm \
+ strrchr-evex \
+ strrchr-sse2 \
+ strspn-c \
+ strspn-sse2 \
+ strstr-sse2-unaligned \
+ varshift \
+# sysdep_routines
CFLAGS-varshift.c += -msse4
CFLAGS-strcspn-c.c += -msse4
CFLAGS-strpbrk-c.c += -msse4
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
new file mode 100644
index 00000000..58a14b2c
--- /dev/null
+++ b/sysdeps/x86_64/multiarch/bzero.c
@@ -0,0 +1,106 @@
+/* Multiple versions of bzero.
+ All versions must be listed in ifunc-impl-list.c.
+ Copyright (C) 2022 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+/* Define multiple versions only for the definition in libc. */
+#if IS_IN (libc)
+# define __bzero __redirect___bzero
+# include <string.h>
+# undef __bzero
+
+# define SYMBOL_NAME __bzero
+# include <init-arch.h>
+
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
+ attribute_hidden;
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
+ attribute_hidden;
+
+static inline void *
+IFUNC_SELECTOR (void)
+{
+ const struct cpu_features* cpu_features = __get_cpu_features ();
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE1 (avx512_unaligned_erms);
+
+ return OPTIMIZE1 (avx512_unaligned);
+ }
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE1 (evex_unaligned_erms);
+
+ return OPTIMIZE1 (evex_unaligned);
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE1 (avx2_unaligned_erms_rtm);
+
+ return OPTIMIZE1 (avx2_unaligned_rtm);
+ }
+
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
+ {
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE1 (avx2_unaligned_erms);
+
+ return OPTIMIZE1 (avx2_unaligned);
+ }
+ }
+
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
+ return OPTIMIZE1 (sse2_unaligned_erms);
+
+ return OPTIMIZE1 (sse2_unaligned);
+}
+
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
+
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index 8be0d78a..c963d391 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
__memset_avx512_no_vzeroupper)
)
+ /* Support sysdeps/x86_64/multiarch/bzero.c. */
+ IFUNC_IMPL (i, name, bzero,
+ IFUNC_IMPL_ADD (array, i, bzero, 1,
+ __bzero_sse2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bzero, 1,
+ __bzero_sse2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ CPU_FEATURE_USABLE (AVX2),
+ __bzero_avx2_unaligned)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ CPU_FEATURE_USABLE (AVX2),
+ __bzero_avx2_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __bzero_avx2_unaligned_rtm)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX2)
+ && CPU_FEATURE_USABLE (RTM)),
+ __bzero_avx2_unaligned_erms_rtm)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __bzero_evex_unaligned)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __bzero_evex_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __bzero_avx512_unaligned_erms)
+ IFUNC_IMPL_ADD (array, i, bzero,
+ (CPU_FEATURE_USABLE (AVX512VL)
+ && CPU_FEATURE_USABLE (AVX512BW)
+ && CPU_FEATURE_USABLE (BMI2)),
+ __bzero_avx512_unaligned)
+ )
+
/* Support sysdeps/x86_64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
IFUNC_IMPL_ADD (array, i, rawmemchr,
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
index 8ac3e479..5a5ee6f6 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
@@ -5,6 +5,7 @@
#define SECTION(p) p##.avx.rtm
#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
+#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm
#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
#include "memset-avx2-unaligned-erms.S"
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
index c0bf2875..a093a283 100644
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
@@ -14,6 +14,9 @@
vmovd d, %xmm0; \
movq r, %rax;
+# define BZERO_ZERO_VEC0() \
+ vpxor %xmm0, %xmm0, %xmm0
+
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
@@ -29,6 +32,9 @@
# ifndef MEMSET_SYMBOL
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
+# ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s) p##_avx2_##s
+# endif
# ifndef WMEMSET_SYMBOL
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
# endif
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
index 5241216a..727c9213 100644
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
@@ -19,6 +19,9 @@
vpbroadcastb d, %VEC0; \
movq r, %rax
+# define BZERO_ZERO_VEC0() \
+ vpxorq %XMM0, %XMM0, %XMM0
+
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
index 63700215..5d8fa78f 100644
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
@@ -19,6 +19,9 @@
vpbroadcastb d, %VEC0; \
movq r, %rax
+# define BZERO_ZERO_VEC0() \
+ vpxorq %XMM0, %XMM0, %XMM0
+
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
vpbroadcastd d, %VEC0; \
movq r, %rax
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
index 56b81f5c..8f579ad6 100644
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
@@ -22,6 +22,7 @@
#if IS_IN (libc)
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
+# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
# ifdef SHARED
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index a67f9833..06f5f5d7 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -26,6 +26,10 @@
#include <sysdep.h>
+#ifndef BZERO_SYMBOL
+# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
+#endif
+
#ifndef MEMSET_CHK_SYMBOL
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
#endif
@@ -87,6 +91,18 @@
# define XMM_SMALL 0
#endif
+#ifdef USE_LESS_VEC_MASK_STORE
+# define SET_REG64 rcx
+# define SET_REG32 ecx
+# define SET_REG16 cx
+# define SET_REG8 cl
+#else
+# define SET_REG64 rsi
+# define SET_REG32 esi
+# define SET_REG16 si
+# define SET_REG8 sil
+#endif
+
#define PAGE_SIZE 4096
/* Macro to calculate size of small memset block for aligning
@@ -96,18 +112,6 @@
#ifndef SECTION
# error SECTION is not defined!
-#endif
-
- .section SECTION(.text),"ax",@progbits
-#if VEC_SIZE == 16 && IS_IN (libc)
-ENTRY (__bzero)
- mov %RDI_LP, %RAX_LP /* Set return value. */
- mov %RSI_LP, %RDX_LP /* Set n. */
- xorl %esi, %esi
- pxor %XMM0, %XMM0
- jmp L(entry_from_bzero)
-END (__bzero)
-weak_alias (__bzero, bzero)
#endif
#if IS_IN (libc)
@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
WMEMSET_VDUP_TO_VEC0_LOW()
cmpq $VEC_SIZE, %rdx
- jb L(less_vec_no_vdup)
+ jb L(less_vec_from_wmemset)
WMEMSET_VDUP_TO_VEC0_HIGH()
jmp L(entry_from_wmemset)
END (WMEMSET_SYMBOL (__wmemset, unaligned))
#endif
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
+#if VEC_SIZE > 16
+ BZERO_ZERO_VEC0 ()
+#endif
+ mov %RDI_LP, %RAX_LP
+ mov %RSI_LP, %RDX_LP
+#ifndef USE_LESS_VEC_MASK_STORE
+ xorl %esi, %esi
+#endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec_no_vdup)
+#ifdef USE_LESS_VEC_MASK_STORE
+ xorl %esi, %esi
+#endif
+#if VEC_SIZE <= 16
+ BZERO_ZERO_VEC0 ()
+#endif
+ cmp $(VEC_SIZE * 2), %RDX_LP
+ ja L(more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned))
+
#if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
cmp %RDX_LP, %RCX_LP
@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
/* Clear the upper 32 bits. */
mov %edx, %edx
# endif
-L(entry_from_bzero):
cmpq $VEC_SIZE, %rdx
jb L(less_vec)
MEMSET_VDUP_TO_VEC0_HIGH()
@@ -187,6 +215,31 @@ END (__memset_erms)
END (MEMSET_SYMBOL (__memset, erms))
# endif
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
+# if VEC_SIZE > 16
+ BZERO_ZERO_VEC0 ()
+# endif
+ mov %RDI_LP, %RAX_LP
+ mov %RSI_LP, %RDX_LP
+# ifndef USE_LESS_VEC_MASK_STORE
+ xorl %esi, %esi
+# endif
+ cmp $VEC_SIZE, %RDX_LP
+ jb L(less_vec_no_vdup)
+# ifdef USE_LESS_VEC_MASK_STORE
+ xorl %esi, %esi
+# endif
+# if VEC_SIZE <= 16
+ BZERO_ZERO_VEC0 ()
+# endif
+ cmp $(VEC_SIZE * 2), %RDX_LP
+ ja L(stosb_more_2x_vec)
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
+ VMOVU %VEC(0), (%rdi)
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
+ VZEROUPPER_RETURN
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
+
# if defined SHARED && IS_IN (libc)
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
cmp %RDX_LP, %RCX_LP
@@ -229,6 +282,7 @@ L(last_2x_vec):
.p2align 4,, 10
L(less_vec):
L(less_vec_no_vdup):
+L(less_vec_from_wmemset):
/* Less than 1 VEC. */
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
# error Unsupported VEC_SIZE!
@@ -374,8 +428,11 @@ L(less_vec):
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
xmm). This is only does anything for AVX2. */
MEMSET_VDUP_TO_VEC0_LOW ()
+L(less_vec_from_wmemset):
+#if VEC_SIZE > 16
L(less_vec_no_vdup):
#endif
+#endif
L(cross_page):
#if VEC_SIZE > 32
cmpl $32, %edx
@@ -386,7 +443,10 @@ L(cross_page):
jge L(between_16_31)
#endif
#ifndef USE_XMM_LESS_VEC
- MOVQ %XMM0, %rcx
+ MOVQ %XMM0, %SET_REG64
+#endif
+#if VEC_SIZE <= 16
+L(less_vec_no_vdup):
#endif
cmpl $8, %edx
jge L(between_8_15)
@@ -395,7 +455,7 @@ L(cross_page):
cmpl $1, %edx
jg L(between_2_3)
jl L(between_0_0)
- movb %sil, (%LESS_VEC_REG)
+ movb %SET_REG8, (%LESS_VEC_REG)
L(between_0_0):
ret
@@ -428,8 +488,8 @@ L(between_8_15):
MOVQ %XMM0, (%rdi)
MOVQ %XMM0, -8(%rdi, %rdx)
#else
- movq %rcx, (%LESS_VEC_REG)
- movq %rcx, -8(%LESS_VEC_REG, %rdx)
+ movq %SET_REG64, (%LESS_VEC_REG)
+ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
#endif
ret
@@ -442,8 +502,8 @@ L(between_4_7):
MOVD %XMM0, (%rdi)
MOVD %XMM0, -4(%rdi, %rdx)
#else
- movl %ecx, (%LESS_VEC_REG)
- movl %ecx, -4(%LESS_VEC_REG, %rdx)
+ movl %SET_REG32, (%LESS_VEC_REG)
+ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
#endif
ret
@@ -452,12 +512,12 @@ L(between_4_7):
L(between_2_3):
/* From 2 to 3. No branch when size == 2. */
#ifdef USE_XMM_LESS_VEC
- movb %sil, (%rdi)
- movb %sil, 1(%rdi)
- movb %sil, -1(%rdi, %rdx)
+ movb %SET_REG8, (%rdi)
+ movb %SET_REG8, 1(%rdi)
+ movb %SET_REG8, -1(%rdi, %rdx)
#else
- movw %cx, (%LESS_VEC_REG)
- movb %sil, -1(%LESS_VEC_REG, %rdx)
+ movw %SET_REG16, (%LESS_VEC_REG)
+ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
#endif
ret
END (MEMSET_SYMBOL (__memset, unaligned_erms))
--
GitLab

View File

@ -0,0 +1,33 @@
From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sat, 12 Feb 2022 00:45:00 -0600
Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
Content-type: text/plain; charset=UTF-8
commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
Author: H.J. Lu <hjl.tools@gmail.com>
Date: Mon Feb 7 05:55:15 2022 -0800
x86-64: Optimize bzero
Remove setting the .text section for the code. This commit
adds that back.
---
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
1 file changed, 1 insertion(+)
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
index 06f5f5d7..4fb475c0 100644
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
@@ -114,6 +114,7 @@
# error SECTION is not defined!
#endif
+ .section SECTION(.text), "ax", @progbits
#if IS_IN (libc)
# if defined SHARED
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
--
GitLab

View File

@ -0,0 +1,90 @@
From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Tue, 15 Feb 2022 20:27:21 -0600
Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
Content-type: text/plain; charset=UTF-8
Logic can read before the start of `s1` / `s2` if both `s1` and `s2`
are near the start of a page. To avoid having the result contimated by
these comparisons the `strcmp` variants would mask off these
comparisons. This was missing in the `strncmp` variants causing
the bug. This commit adds the masking to `strncmp` so that out of
range comparisons don't affect the result.
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as
well a full xcheck on x86_64 linux.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
string/test-strncmp.c | 23 +++++++++++++++++++++++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 +
sysdeps/x86_64/multiarch/strcmp-evex.S | 1 +
3 files changed, 25 insertions(+)
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
index 927a6daa..e61fffd9 100644
--- a/string/test-strncmp.c
+++ b/string/test-strncmp.c
@@ -403,6 +403,28 @@ check2 (void)
free (s2);
}
+static void
+check4 (void)
+{
+ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
+ the end of the page. 2) For there to be no mismatch/null byte before the
+ first page cross. 3) For length (`n`) to be large enough for one string to
+ cross the page. And 4) for there to be either mismatch/null bytes before
+ the start of the strings. */
+
+ size_t size = 10;
+ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
+ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
+ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
+ int exp_result;
+
+ STRCPY (s1, L ("tst-tlsmod%"));
+ STRCPY (s2, L ("tst-tls-manydynamic73mod"));
+ exp_result = SIMPLE_STRNCMP (s1, s2, size);
+ FOR_EACH_IMPL (impl, 0)
+ check_result (impl, s1, s2, size, exp_result);
+}
+
static void
check3 (void)
{
@@ -445,6 +467,7 @@ test_main (void)
check1 ();
check2 ();
check3 ();
+ check4 ();
printf ("%23s", "");
FOR_EACH_IMPL (impl, 0)
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 04675aa4..179cc0e3 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -661,6 +661,7 @@ L(ret8):
# ifdef USE_AS_STRNCMP
.p2align 4,, 10
L(return_page_cross_end_check):
+ andl %r10d, %ecx
tzcntl %ecx, %ecx
leal -VEC_SIZE(%rax, %rcx), %ecx
cmpl %ecx, %edx
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
index ed56af8e..0dfa62bd 100644
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
@@ -689,6 +689,7 @@ L(ret8):
# ifdef USE_AS_STRNCMP
.p2align 4,, 10
L(return_page_cross_end_check):
+ andl %r10d, %ecx
tzcntl %ecx, %ecx
leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
# ifdef USE_AS_WCSCMP
--
GitLab

View File

@ -0,0 +1,77 @@
From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Thu, 24 Mar 2022 15:50:33 -0500
Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ
#28896]
Content-type: text/plain; charset=UTF-8
Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
__wcscmp_avx2.
commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
Author: Noah Goldstein <goldstein.w.n@gmail.com>
Date: Sun Jan 9 16:02:21 2022 -0600
x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
can cause spurious aborts.
This change will need to be backported.
All string/memory tests pass.
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
---
sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
2 files changed, 16 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
index aef9866c..ba6543be 100644
--- a/sysdeps/x86/tst-strncmp-rtm.c
+++ b/sysdeps/x86/tst-strncmp-rtm.c
@@ -70,6 +70,16 @@ function_overflow (void)
return 1;
}
+__attribute__ ((noinline, noclone))
+static int
+function_overflow2 (void)
+{
+ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
+ return 0;
+ else
+ return 1;
+}
+
static int
do_test (void)
{
@@ -77,5 +87,10 @@ do_test (void)
if (status != EXIT_SUCCESS)
return status;
status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
+ if (status != EXIT_SUCCESS)
+ return status;
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
+ if (status != EXIT_SUCCESS)
+ return status;
return status;
}
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
index 179cc0e3..782f9472 100644
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
are cases where length is large enough that it can never be a
bound on valid memory so just use wcscmp. */
shrq $56, %rcx
- jnz __wcscmp_avx2
+ jnz OVERFLOW_STRCMP
leaq (, %rdx, 4), %rdx
# endif
--
GitLab

View File

@ -0,0 +1,27 @@
From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001
From: "H.J. Lu" <hjl.tools@gmail.com>
Date: Sat, 5 Feb 2022 11:06:01 -0800
Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ))
Content-type: text/plain; charset=UTF-8
---
sysdeps/x86/sysdep.h | 3 ++-
1 file changed, 2 insertions(+), 1 deletion(-)
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index a70bb3a2..49b0efe2 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -111,7 +111,8 @@ enum cf_protection_level
/* Local label name for asm code. */
#ifndef L
/* ELF-like local names start with `.L'. */
-# define L(name) .L##name
+# define LOCAL_LABEL(name) .L##name
+# define L(name) LOCAL_LABEL(name)
#endif
#define atom_text_section .section ".text.atom", "ax"
--
GitLab

Some files were not shown because too many files have changed in this diff Show More