Compare commits
No commits in common. "c8" and "c8s" have entirely different histories.
@ -1,112 +0,0 @@
|
||||
commit 849274d48fc59bfa6db3c713c8ced8026b20f3b7
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Thu Nov 16 19:55:35 2023 +0100
|
||||
|
||||
elf: Fix force_first handling in dlclose (bug 30981)
|
||||
|
||||
The force_first parameter was ineffective because the dlclose'd
|
||||
object was not necessarily the first in the maps array. Also
|
||||
enable force_first handling unconditionally, regardless of namespace.
|
||||
The initial object in a namespace should be destructed first, too.
|
||||
|
||||
The _dl_sort_maps_dfs function had early returns for relocation
|
||||
dependency processing which broke force_first handling, too, and
|
||||
this is fixed in this change as well.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
|
||||
diff --git a/elf/dl-close.c b/elf/dl-close.c
|
||||
index 66524b6708c59f29..8107c2d5f6ad2bc6 100644
|
||||
--- a/elf/dl-close.c
|
||||
+++ b/elf/dl-close.c
|
||||
@@ -182,6 +182,16 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
}
|
||||
assert (idx == nloaded);
|
||||
|
||||
+ /* Put the dlclose'd map first, so that its destructor runs first.
|
||||
+ The map variable is NULL after a retry. */
|
||||
+ if (map != NULL)
|
||||
+ {
|
||||
+ maps[map->l_idx] = maps[0];
|
||||
+ maps[map->l_idx]->l_idx = map->l_idx;
|
||||
+ maps[0] = map;
|
||||
+ maps[0]->l_idx = 0;
|
||||
+ }
|
||||
+
|
||||
/* Keep track of the lowest index link map we have covered already. */
|
||||
int done_index = -1;
|
||||
while (++done_index < nloaded)
|
||||
@@ -255,9 +265,10 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
}
|
||||
}
|
||||
|
||||
- /* Sort the entries. We can skip looking for the binary itself which is
|
||||
- at the front of the search list for the main namespace. */
|
||||
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
|
||||
+ /* Sort the entries. Unless retrying, the maps[0] object (the
|
||||
+ original argument to dlclose) needs to remain first, so that its
|
||||
+ destructor runs first. */
|
||||
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
|
||||
|
||||
/* Call all termination functions at once. */
|
||||
bool unload_any = false;
|
||||
@@ -768,7 +779,11 @@ _dl_close_worker (struct link_map *map, bool force)
|
||||
/* Recheck if we need to retry, release the lock. */
|
||||
out:
|
||||
if (dl_close_state == rerun)
|
||||
- goto retry;
|
||||
+ {
|
||||
+ /* The map may have been deallocated. */
|
||||
+ map = NULL;
|
||||
+ goto retry;
|
||||
+ }
|
||||
|
||||
dl_close_state = not_pending;
|
||||
}
|
||||
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
|
||||
index aeb79b40b45054c0..c17ac325eca658ef 100644
|
||||
--- a/elf/dl-sort-maps.c
|
||||
+++ b/elf/dl-sort-maps.c
|
||||
@@ -260,13 +260,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
|
||||
The below memcpy is not needed in the do_reldeps case here,
|
||||
since we wrote back to maps[] during DFS traversal. */
|
||||
if (maps_head == maps)
|
||||
- return;
|
||||
+ break;
|
||||
}
|
||||
assert (maps_head == maps);
|
||||
- return;
|
||||
}
|
||||
-
|
||||
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
||||
+ else
|
||||
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
||||
|
||||
/* Skipping the first object at maps[0] is not valid in general,
|
||||
since traversing along object dependency-links may "find" that
|
||||
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
|
||||
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
|
||||
--- a/elf/dso-sort-tests-1.def
|
||||
+++ b/elf/dso-sort-tests-1.def
|
||||
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
|
||||
# relocation(dynamic) dependencies. While this is technically unspecified, the
|
||||
# presumed reasonable practical behavior is for the destructor order to respect
|
||||
# the static DT_NEEDED links (here this means the a->b->c->d order).
|
||||
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
|
||||
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
|
||||
-# beyond spec to do this is the right thing to do.
|
||||
+# The older dynamic_sort=1 algorithm originally did not achieve this,
|
||||
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
|
||||
+# effectively disabling proper force_first handling.
|
||||
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
|
||||
+# handling: the a object is simply moved to the front.
|
||||
# The below expected outputs are what the two algorithms currently produce
|
||||
# respectively, for regression testing purposes.
|
||||
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
|
||||
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
|
||||
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
|
||||
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
|
||||
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
|
||||
|
||||
# Test that even in the presence of dependency loops involving dlopen'ed
|
||||
# object, that object is initialized last (and not unloaded prematurely).
|
@ -1,83 +0,0 @@
|
||||
commit c00b984fcd53f679ca2dafcd1aee2c89836e6e73
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Tue Aug 29 08:28:31 2023 +0200
|
||||
|
||||
nscd: Skip unusable entries in first pass in prune_cache (bug 30800)
|
||||
|
||||
Previously, if an entry was marked unusable for any reason, but had
|
||||
not timed out yet, the assert would trigger.
|
||||
|
||||
One way to get into such state is if a data change is detected during
|
||||
re-validation of an entry. This causes the entry to be marked as not
|
||||
usable. If nscd exits soon after that, then the clock jumps
|
||||
backwards, and nscd is restarted, the cache re-validation run after
|
||||
startup triggers the removed assert.
|
||||
|
||||
The change is more complicated than just the removal of the assert
|
||||
because entries marked as not usable should be garbage-collected in
|
||||
the second pass. To make this happen, it is necessary to update some
|
||||
book-keeping data.
|
||||
|
||||
Reviewed-by: DJ Delorie <dj@redhat.com>
|
||||
|
||||
diff --git a/nscd/cache.c b/nscd/cache.c
|
||||
index efe4214d953edb30..2fd3f78ebb567bbe 100644
|
||||
--- a/nscd/cache.c
|
||||
+++ b/nscd/cache.c
|
||||
@@ -371,8 +371,11 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
serv2str[runp->type], str, dh->timeout);
|
||||
}
|
||||
|
||||
- /* Check whether the entry timed out. */
|
||||
- if (dh->timeout < now)
|
||||
+ /* Check whether the entry timed out. Timed out entries
|
||||
+ will be revalidated. For unusable records, it is still
|
||||
+ necessary to record that the bucket needs to be scanned
|
||||
+ again below. */
|
||||
+ if (dh->timeout < now || !dh->usable)
|
||||
{
|
||||
/* This hash bucket could contain entries which need to
|
||||
be looked at. */
|
||||
@@ -384,7 +387,7 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
/* We only have to look at the data of the first entries
|
||||
since the count information is kept in the data part
|
||||
which is shared. */
|
||||
- if (runp->first)
|
||||
+ if (runp->first && dh->usable)
|
||||
{
|
||||
|
||||
/* At this point there are two choices: we reload the
|
||||
@@ -400,9 +403,6 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
{
|
||||
/* Remove the value. */
|
||||
dh->usable = false;
|
||||
-
|
||||
- /* We definitely have some garbage entries now. */
|
||||
- any = true;
|
||||
}
|
||||
else
|
||||
{
|
||||
@@ -414,18 +414,15 @@ prune_cache (struct database_dyn *table, time_t now, int fd)
|
||||
|
||||
time_t timeout = readdfcts[runp->type] (table, runp, dh);
|
||||
next_timeout = MIN (next_timeout, timeout);
|
||||
-
|
||||
- /* If the entry has been replaced, we might need
|
||||
- cleanup. */
|
||||
- any |= !dh->usable;
|
||||
}
|
||||
}
|
||||
+
|
||||
+ /* If the entry has been replaced, we might need cleanup. */
|
||||
+ any |= !dh->usable;
|
||||
}
|
||||
else
|
||||
- {
|
||||
- assert (dh->usable);
|
||||
- next_timeout = MIN (next_timeout, dh->timeout);
|
||||
- }
|
||||
+ /* Entry has not timed out and is usable. */
|
||||
+ next_timeout = MIN (next_timeout, dh->timeout);
|
||||
|
||||
run = runp->next;
|
||||
}
|
@ -1,72 +0,0 @@
|
||||
commit 2aa0974d2573441bffd596b07bff8698b1f2f18c
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Fri Oct 20 14:29:50 2023 +0200
|
||||
|
||||
elf: ldconfig should skip temporary files created by package managers
|
||||
|
||||
This avoids crashes due to partially written files, after a package
|
||||
update is interrupted.
|
||||
|
||||
Reviewed-by: Adhemerval Zanella <adhemerval.zanella@linaro.org>
|
||||
|
||||
Conflicts:
|
||||
elf/ldconfig.c
|
||||
(missing alloca removal downstream)
|
||||
|
||||
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
|
||||
index 8c66d7e5426d8cc4..51de08f91fbaf093 100644
|
||||
--- a/elf/ldconfig.c
|
||||
+++ b/elf/ldconfig.c
|
||||
@@ -771,6 +771,31 @@ struct dlib_entry
|
||||
struct dlib_entry *next;
|
||||
};
|
||||
|
||||
+/* Skip some temporary DSO files. These files may be partially written
|
||||
+ and lead to ldconfig crashes when examined. */
|
||||
+static bool
|
||||
+skip_dso_based_on_name (const char *name, size_t len)
|
||||
+{
|
||||
+ /* Skip temporary files created by the prelink program. Files with
|
||||
+ names like these are never really DSOs we want to look at. */
|
||||
+ if (len >= sizeof (".#prelink#") - 1)
|
||||
+ {
|
||||
+ if (strcmp (name + len - sizeof (".#prelink#") + 1,
|
||||
+ ".#prelink#") == 0)
|
||||
+ return true;
|
||||
+ if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
+ && memcmp (name + len - sizeof (".#prelink#.XXXXXX")
|
||||
+ + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
|
||||
+ return true;
|
||||
+ }
|
||||
+ /* Skip temporary files created by RPM. */
|
||||
+ if (memchr (name, len, ';') != NULL)
|
||||
+ return true;
|
||||
+ /* Skip temporary files created by dpkg. */
|
||||
+ if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
|
||||
+ return true;
|
||||
+ return false;
|
||||
+}
|
||||
|
||||
static void
|
||||
search_dir (const struct dir_entry *entry)
|
||||
@@ -849,18 +874,8 @@ search_dir (const struct dir_entry *entry)
|
||||
continue;
|
||||
|
||||
size_t len = strlen (direntry->d_name);
|
||||
- /* Skip temporary files created by the prelink program. Files with
|
||||
- names like these are never really DSOs we want to look at. */
|
||||
- if (len >= sizeof (".#prelink#") - 1)
|
||||
- {
|
||||
- if (strcmp (direntry->d_name + len - sizeof (".#prelink#") + 1,
|
||||
- ".#prelink#") == 0)
|
||||
- continue;
|
||||
- if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
- && memcmp (direntry->d_name + len - sizeof (".#prelink#.XXXXXX")
|
||||
- + 1, ".#prelink#.", sizeof (".#prelink#.") - 1) == 0)
|
||||
- continue;
|
||||
- }
|
||||
+ if (skip_dso_based_on_name (direntry->d_name, len))
|
||||
+ continue;
|
||||
len += strlen (entry->path) + 2;
|
||||
if (len > file_name_len)
|
||||
{
|
@ -1,61 +0,0 @@
|
||||
commit cfb5a97a93ea656e3b2263e42142a4032986d9ba
|
||||
Author: Florian Weimer <fweimer@redhat.com>
|
||||
Date: Mon Oct 23 12:53:16 2023 +0200
|
||||
|
||||
ldconfig: Fixes for skipping temporary files.
|
||||
|
||||
Arguments to a memchr call were swapped, causing incorrect skipping
|
||||
of files.
|
||||
|
||||
Files related to dpkg have different names: they actually end in
|
||||
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
|
||||
|
||||
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
|
||||
temporary files created by package managers").
|
||||
|
||||
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
|
||||
index 51de08f91fbaf093..fb19dd68d41c07a4 100644
|
||||
--- a/elf/ldconfig.c
|
||||
+++ b/elf/ldconfig.c
|
||||
@@ -771,6 +771,17 @@ struct dlib_entry
|
||||
struct dlib_entry *next;
|
||||
};
|
||||
|
||||
+/* Return true if the N bytes at NAME end with the characters in
|
||||
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
|
||||
+ Expected to be called with a string literal for SUFFIX. */
|
||||
+static inline bool
|
||||
+endswithn (const char *name, size_t n, const char *suffix)
|
||||
+{
|
||||
+ return (n >= strlen (suffix)
|
||||
+ && memcmp (name + n - strlen (suffix), suffix,
|
||||
+ strlen (suffix)) == 0);
|
||||
+}
|
||||
+
|
||||
/* Skip some temporary DSO files. These files may be partially written
|
||||
and lead to ldconfig crashes when examined. */
|
||||
static bool
|
||||
@@ -780,8 +791,7 @@ skip_dso_based_on_name (const char *name, size_t len)
|
||||
names like these are never really DSOs we want to look at. */
|
||||
if (len >= sizeof (".#prelink#") - 1)
|
||||
{
|
||||
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
|
||||
- ".#prelink#") == 0)
|
||||
+ if (endswithn (name, len, ".#prelink#"))
|
||||
return true;
|
||||
if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
||||
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
|
||||
@@ -789,10 +799,11 @@ skip_dso_based_on_name (const char *name, size_t len)
|
||||
return true;
|
||||
}
|
||||
/* Skip temporary files created by RPM. */
|
||||
- if (memchr (name, len, ';') != NULL)
|
||||
+ if (memchr (name, ';', len) != NULL)
|
||||
return true;
|
||||
/* Skip temporary files created by dpkg. */
|
||||
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
|
||||
+ if (endswithn (name, len, ".dpkg-new")
|
||||
+ || endswithn (name, len, ".dpkg-tmp"))
|
||||
return true;
|
||||
return false;
|
||||
}
|
@ -1,259 +0,0 @@
|
||||
From 97700a34f36721b11a754cf37a1cc40695ece1fd Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:23:59 -0800
|
||||
Subject: [PATCH] x86-64 memchr/wmemchr: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes memchr/wmemchr for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/memchr.S: Use RDX_LP for length. Clear the
|
||||
upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memchr-avx2.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memchr and
|
||||
tst-size_t-wmemchr.
|
||||
* sysdeps/x86_64/x32/test-size_t.h: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memchr.c: Likewise.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wmemchr.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/memchr.S | 10 ++--
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 8 ++-
|
||||
sysdeps/x86_64/x32/Makefile | 8 +++
|
||||
sysdeps/x86_64/x32/test-size_t.h | 35 ++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-memchr.c | 72 +++++++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wmemchr.c | 20 +++++++
|
||||
6 files changed, 148 insertions(+), 5 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/test-size_t.h
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
NEWS
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
||||
index feef5d4f..cb320257 100644
|
||||
--- a/sysdeps/x86_64/memchr.S
|
||||
+++ b/sysdeps/x86_64/memchr.S
|
||||
@@ -34,12 +34,16 @@ ENTRY(MEMCHR)
|
||||
mov %edi, %ecx
|
||||
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
#else
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
punpcklbw %xmm1, %xmm1
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
punpcklbw %xmm1, %xmm1
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index 5f5e7725..c81da19b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -40,16 +40,20 @@
|
||||
ENTRY (MEMCHR)
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check for zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(null)
|
||||
# endif
|
||||
movl %edi, %ecx
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
vmovd %esi, %xmm0
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
vpbroadcastd %xmm0, %ymm0
|
||||
# else
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
vpbroadcastb %xmm0, %ymm0
|
||||
# endif
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index f2ebc24f..7d528889 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -4,3 +4,11 @@ ifeq ($(subdir),math)
|
||||
# 64-bit llround. Add -fno-builtin-lround to silence the compiler.
|
||||
CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
+
|
||||
+ifeq ($(subdir),string)
|
||||
+tests += tst-size_t-memchr
|
||||
+endif
|
||||
+
|
||||
+ifeq ($(subdir),wcsmbs)
|
||||
+tests += tst-size_t-wmemchr
|
||||
+endif
|
||||
diff --git a/sysdeps/x86_64/x32/test-size_t.h b/sysdeps/x86_64/x32/test-size_t.h
|
||||
new file mode 100644
|
||||
index 00000000..78a94086
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/test-size_t.h
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Test string/memory functions with size_t in the lower 32 bits of
|
||||
+ 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_MAIN
|
||||
+#include <string/test-string.h>
|
||||
+
|
||||
+/* On x32, parameter_t may be passed in a 64-bit register with the LEN
|
||||
+ field in the lower 32 bits. When the LEN field of 64-bit register
|
||||
+ is passed to string/memory function as the size_t parameter, only
|
||||
+ the lower 32 bits can be used. */
|
||||
+typedef struct
|
||||
+{
|
||||
+ union
|
||||
+ {
|
||||
+ size_t len;
|
||||
+ void (*fn) (void);
|
||||
+ };
|
||||
+ void *p;
|
||||
+} parameter_t;
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memchr.c b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
new file mode 100644
|
||||
index 00000000..29a3daf1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memchr.c
|
||||
@@ -0,0 +1,72 @@
|
||||
+/* Test memchr with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef WIDE
|
||||
+# define TEST_NAME "memchr"
|
||||
+#else
|
||||
+# define TEST_NAME "wmemchr"
|
||||
+#endif /* WIDE */
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifndef WIDE
|
||||
+# define MEMCHR memchr
|
||||
+# define CHAR char
|
||||
+# define UCHAR unsigned char
|
||||
+#else
|
||||
+# include <wchar.h>
|
||||
+# define MEMCHR wmemchr
|
||||
+# define CHAR wchar_t
|
||||
+# define UCHAR wchar_t
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+IMPL (MEMCHR, 1)
|
||||
+
|
||||
+typedef CHAR * (*proto_t) (const CHAR*, int, size_t);
|
||||
+
|
||||
+static CHAR *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memchr (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ c.fn = impl->fn;
|
||||
+ CHAR *res = do_memchr (src, c);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %p != NULL",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemchr.c b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
new file mode 100644
|
||||
index 00000000..877801d6
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemchr.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wmemchr with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-memchr.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,41 +0,0 @@
|
||||
From ddf0992cf57a93200e0c782e2a94d0733a5a0b87 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 9 Jan 2022 16:02:21 -0600
|
||||
Subject: [PATCH] x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
||||
__wcscmp_avx2. For x86_64 this covers the entire address range so any
|
||||
length larger could not possibly be used to bound `s1` or `s2`.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 10 ++++++++++
|
||||
1 file changed, 10 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 156c1949..8fb8eedc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -83,6 +83,16 @@ ENTRY (STRCMP)
|
||||
je L(char0)
|
||||
jb L(zero)
|
||||
# ifdef USE_AS_WCSCMP
|
||||
+# ifndef __ILP32__
|
||||
+ movq %rdx, %rcx
|
||||
+ /* Check if length could overflow when multiplied by
|
||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
||||
+ overflow cases as well as redirect cases where it's impossible for
|
||||
+ length to bound a valid memory region. In these cases just use
|
||||
+ 'wcscmp'. */
|
||||
+ shrq $56, %rcx
|
||||
+ jnz __wcscmp_avx2
|
||||
+# endif
|
||||
/* Convert units: from wide to byte char. */
|
||||
shl $2, %RDX_LP
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,257 +0,0 @@
|
||||
From 244b415d386487521882debb845a040a4758cb18 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 25 Mar 2022 17:13:33 -0500
|
||||
Subject: [PATCH] x86: Small improvements for wcslen
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Just a few QOL changes.
|
||||
1. Prefer `add` over `lea` as it has more execution units it can run
|
||||
on.
|
||||
2. Don't break macro-fusion between `test` and `jcc`
|
||||
3. Reduce code size by removing gratuitous padding bytes (-90
|
||||
bytes).
|
||||
|
||||
geometric_mean(N=20) of all benchmarks New / Original: 0.959
|
||||
|
||||
All string/memory tests pass.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/wcslen.S | 86 ++++++++++++++++++++---------------------
|
||||
1 file changed, 41 insertions(+), 45 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/wcslen.S b/sysdeps/x86_64/wcslen.S
|
||||
index 9f5f7232..254bb030 100644
|
||||
--- a/sysdeps/x86_64/wcslen.S
|
||||
+++ b/sysdeps/x86_64/wcslen.S
|
||||
@@ -41,82 +41,82 @@ ENTRY (__wcslen)
|
||||
pxor %xmm0, %xmm0
|
||||
|
||||
lea 32(%rdi), %rax
|
||||
- lea 16(%rdi), %rcx
|
||||
+ addq $16, %rdi
|
||||
and $-16, %rax
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
pxor %xmm1, %xmm1
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
pxor %xmm2, %xmm2
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
pxor %xmm3, %xmm3
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm0
|
||||
pmovmskb %xmm0, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm1
|
||||
pmovmskb %xmm1, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd (%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $16, %rax
|
||||
test %edx, %edx
|
||||
- lea 16(%rax), %rax
|
||||
jnz L(exit)
|
||||
|
||||
and $-0x40, %rax
|
||||
@@ -133,104 +133,100 @@ L(aligned_64_loop):
|
||||
pminub %xmm0, %xmm2
|
||||
pcmpeqd %xmm3, %xmm2
|
||||
pmovmskb %xmm2, %edx
|
||||
+ addq $64, %rax
|
||||
test %edx, %edx
|
||||
- lea 64(%rax), %rax
|
||||
jz L(aligned_64_loop)
|
||||
|
||||
pcmpeqd -64(%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $48, %rdi
|
||||
test %edx, %edx
|
||||
- lea 48(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd %xmm1, %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd -32(%rax), %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
jnz L(exit)
|
||||
|
||||
pcmpeqd %xmm6, %xmm3
|
||||
pmovmskb %xmm3, %edx
|
||||
+ addq $-16, %rdi
|
||||
test %edx, %edx
|
||||
- lea -16(%rcx), %rcx
|
||||
- jnz L(exit)
|
||||
-
|
||||
- jmp L(aligned_64_loop)
|
||||
+ jz L(aligned_64_loop)
|
||||
|
||||
.p2align 4
|
||||
L(exit):
|
||||
- sub %rcx, %rax
|
||||
+ sub %rdi, %rax
|
||||
shr $2, %rax
|
||||
test %dl, %dl
|
||||
jz L(exit_high)
|
||||
|
||||
- mov %dl, %cl
|
||||
- and $15, %cl
|
||||
+ andl $15, %edx
|
||||
jz L(exit_1)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ /* No align here. Naturally aligned % 16 == 1. */
|
||||
L(exit_high):
|
||||
- mov %dh, %ch
|
||||
- and $15, %ch
|
||||
+ andl $(15 << 8), %edx
|
||||
jz L(exit_3)
|
||||
add $2, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_1):
|
||||
add $1, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_3):
|
||||
add $3, %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail0):
|
||||
- xor %rax, %rax
|
||||
+ xorl %eax, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail1):
|
||||
- mov $1, %rax
|
||||
+ movl $1, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail2):
|
||||
- mov $2, %rax
|
||||
+ movl $2, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail3):
|
||||
- mov $3, %rax
|
||||
+ movl $3, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail4):
|
||||
- mov $4, %rax
|
||||
+ movl $4, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail5):
|
||||
- mov $5, %rax
|
||||
+ movl $5, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail6):
|
||||
- mov $6, %rax
|
||||
+ movl $6, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 3
|
||||
L(exit_tail7):
|
||||
- mov $7, %rax
|
||||
+ movl $7, %eax
|
||||
ret
|
||||
|
||||
END (__wcslen)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,964 +0,0 @@
|
||||
From 7cbc03d03091d5664060924789afe46d30a5477e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 15 Apr 2022 12:28:00 -0500
|
||||
Subject: [PATCH] x86: Remove memcmp-sse4.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Code didn't actually use any sse4 instructions since `ptest` was
|
||||
removed in:
|
||||
|
||||
commit 2f9062d7171850451e6044ef78d91ff8c017b9c0
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed Nov 10 16:18:56 2021 -0600
|
||||
|
||||
x86: Shrink memcmp-sse4.S code size
|
||||
|
||||
The new memcmp-sse2 implementation is also faster.
|
||||
|
||||
geometric_mean(N=20) of page cross cases SSE2 / SSE4: 0.905
|
||||
|
||||
Note there are two regressions preferring SSE2 for Size = 1 and Size =
|
||||
65.
|
||||
|
||||
Size = 1:
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
1, 1, 1, 0, 1.2
|
||||
1, 1, 1, 1, 1.197
|
||||
1, 1, 1, -1, 1.2
|
||||
|
||||
This is intentional. Size == 1 is significantly less hot based on
|
||||
profiles of GCC11 and Python3 than sizes [4, 8] (which is made
|
||||
hotter).
|
||||
|
||||
Python3 Size = 1 -> 13.64%
|
||||
Python3 Size = [4, 8] -> 60.92%
|
||||
|
||||
GCC11 Size = 1 -> 1.29%
|
||||
GCC11 Size = [4, 8] -> 33.86%
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
4, 4, 4, 0, 0.622
|
||||
4, 4, 4, 1, 0.797
|
||||
4, 4, 4, -1, 0.805
|
||||
5, 5, 5, 0, 0.623
|
||||
5, 5, 5, 1, 0.777
|
||||
5, 5, 5, -1, 0.802
|
||||
6, 6, 6, 0, 0.625
|
||||
6, 6, 6, 1, 0.813
|
||||
6, 6, 6, -1, 0.788
|
||||
7, 7, 7, 0, 0.625
|
||||
7, 7, 7, 1, 0.799
|
||||
7, 7, 7, -1, 0.795
|
||||
8, 8, 8, 0, 0.625
|
||||
8, 8, 8, 1, 0.848
|
||||
8, 8, 8, -1, 0.914
|
||||
9, 9, 9, 0, 0.625
|
||||
|
||||
Size = 65:
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
65, 0, 0, 0, 1.103
|
||||
65, 0, 0, 1, 1.216
|
||||
65, 0, 0, -1, 1.227
|
||||
65, 65, 0, 0, 1.091
|
||||
65, 0, 65, 1, 1.19
|
||||
65, 65, 65, -1, 1.215
|
||||
|
||||
This is because A) the checks in range [65, 96] are now unrolled 2x
|
||||
and B) because smaller values <= 16 are now given a hotter path. By
|
||||
contrast the SSE4 version has a branch for Size = 80. The unrolled
|
||||
version gets better performance for returns which need both
|
||||
comparisons.
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
128, 4, 8, 0, 0.858
|
||||
128, 4, 8, 1, 0.879
|
||||
128, 4, 8, -1, 0.888
|
||||
|
||||
As well, outside of microbenchmark environments, which are not fully
|
||||
predictable, the branch will have a real cost.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 2 -
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 -
|
||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 4 -
|
||||
sysdeps/x86_64/multiarch/memcmp-sse4.S | 804 ---------------------
|
||||
4 files changed, 814 deletions(-)
|
||||
delete mode 100644 sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index bca82e38..b503e4b8 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -11,7 +11,6 @@ sysdep_routines += \
|
||||
memcmp-avx2-movbe-rtm \
|
||||
memcmp-evex-movbe \
|
||||
memcmp-sse2 \
|
||||
- memcmp-sse4 \
|
||||
memcmp-ssse3 \
|
||||
memcpy-ssse3 \
|
||||
memcpy-ssse3-back \
|
||||
@@ -174,7 +173,6 @@ sysdep_routines += \
|
||||
wmemcmp-avx2-movbe-rtm \
|
||||
wmemcmp-c \
|
||||
wmemcmp-evex-movbe \
|
||||
- wmemcmp-sse4 \
|
||||
wmemcmp-ssse3 \
|
||||
# sysdep_routines
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 14314367..450a2917 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -78,8 +78,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_evex_movbe)
|
||||
- IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
- __memcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcmp_ssse3)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, 1, __memcmp_sse2))
|
||||
@@ -824,8 +822,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_evex_movbe)
|
||||
- IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
- __wmemcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
__wmemcmp_ssse3)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, 1, __wmemcmp_sse2))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
index 690dffe8..0bc47a7f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
@@ -21,7 +21,6 @@
|
||||
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe_rtm) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
||||
@@ -47,9 +46,6 @@ IFUNC_SELECTOR (void)
|
||||
return OPTIMIZE (avx2_movbe);
|
||||
}
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
- return OPTIMIZE (sse4_1);
|
||||
-
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, SSSE3))
|
||||
return OPTIMIZE (ssse3);
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
deleted file mode 100644
|
||||
index 50060006..00000000
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
+++ /dev/null
|
||||
@@ -1,804 +0,0 @@
|
||||
-/* memcmp with SSE4.1, wmemcmp with SSE4.1
|
||||
- Copyright (C) 2010-2018 Free Software Foundation, Inc.
|
||||
- Contributed by Intel Corporation.
|
||||
- This file is part of the GNU C Library.
|
||||
-
|
||||
- The GNU C Library is free software; you can redistribute it and/or
|
||||
- modify it under the terms of the GNU Lesser General Public
|
||||
- License as published by the Free Software Foundation; either
|
||||
- version 2.1 of the License, or (at your option) any later version.
|
||||
-
|
||||
- The GNU C Library is distributed in the hope that it will be useful,
|
||||
- but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
- Lesser General Public License for more details.
|
||||
-
|
||||
- You should have received a copy of the GNU Lesser General Public
|
||||
- License along with the GNU C Library; if not, see
|
||||
- <http://www.gnu.org/licenses/>. */
|
||||
-
|
||||
-#if IS_IN (libc)
|
||||
-
|
||||
-# include <sysdep.h>
|
||||
-
|
||||
-# ifndef MEMCMP
|
||||
-# define MEMCMP __memcmp_sse4_1
|
||||
-# endif
|
||||
-
|
||||
-#ifdef USE_AS_WMEMCMP
|
||||
-# define CMPEQ pcmpeqd
|
||||
-# define CHAR_SIZE 4
|
||||
-#else
|
||||
-# define CMPEQ pcmpeqb
|
||||
-# define CHAR_SIZE 1
|
||||
-#endif
|
||||
-
|
||||
-
|
||||
-/* Warning!
|
||||
- wmemcmp has to use SIGNED comparison for elements.
|
||||
- memcmp has to use UNSIGNED comparison for elemnts.
|
||||
-*/
|
||||
-
|
||||
- .section .text.sse4.1,"ax",@progbits
|
||||
-ENTRY (MEMCMP)
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %RDX_LP
|
||||
-# elif defined __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- mov %edx, %edx
|
||||
-# endif
|
||||
- cmp $79, %RDX_LP
|
||||
- ja L(79bytesormore)
|
||||
-
|
||||
- cmp $CHAR_SIZE, %RDX_LP
|
||||
- jbe L(firstbyte)
|
||||
-
|
||||
- /* N in (CHAR_SIZE, 79) bytes. */
|
||||
- cmpl $32, %edx
|
||||
- ja L(more_32_bytes)
|
||||
-
|
||||
- cmpl $16, %edx
|
||||
- jae L(16_to_32_bytes)
|
||||
-
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
- cmpl $8, %edx
|
||||
- jae L(8_to_16_bytes)
|
||||
-
|
||||
- cmpl $4, %edx
|
||||
- jb L(2_to_3_bytes)
|
||||
-
|
||||
- movl (%rdi), %eax
|
||||
- movl (%rsi), %ecx
|
||||
-
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
-
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
-
|
||||
- movl -4(%rdi, %rdx), %edi
|
||||
- movl -4(%rsi, %rdx), %esi
|
||||
-
|
||||
- bswap %edi
|
||||
- bswap %esi
|
||||
-
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- cmovne %edx, %eax
|
||||
- sbbl %ecx, %ecx
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(2_to_3_bytes):
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(8_to_16_bytes):
|
||||
- movq (%rdi), %rax
|
||||
- movq (%rsi), %rcx
|
||||
-
|
||||
- bswap %rax
|
||||
- bswap %rcx
|
||||
-
|
||||
- subq %rcx, %rax
|
||||
- jne L(8_to_16_bytes_done)
|
||||
-
|
||||
- movq -8(%rdi, %rdx), %rax
|
||||
- movq -8(%rsi, %rdx), %rcx
|
||||
-
|
||||
- bswap %rax
|
||||
- bswap %rcx
|
||||
-
|
||||
- subq %rcx, %rax
|
||||
-
|
||||
-L(8_to_16_bytes_done):
|
||||
- cmovne %edx, %eax
|
||||
- sbbl %ecx, %ecx
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-# else
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %ecx
|
||||
- cmpl (%rsi), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- movl 4(%rdi), %ecx
|
||||
- cmpl 4(%rsi), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- movl -4(%rdi, %rdx), %ecx
|
||||
- cmpl -4(%rsi, %rdx), %ecx
|
||||
- jne L(8_to_16_bytes_done)
|
||||
- ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4,, 3
|
||||
-L(ret_zero):
|
||||
- xorl %eax, %eax
|
||||
-L(zero):
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(firstbyte):
|
||||
- jb L(ret_zero)
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %ecx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
-L(8_to_16_bytes_done):
|
||||
- setg %al
|
||||
- leal -1(%rax, %rax), %eax
|
||||
-# else
|
||||
- movzbl (%rdi), %eax
|
||||
- movzbl (%rsi), %ecx
|
||||
- sub %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_begin_48):
|
||||
- addq $16, %rdi
|
||||
- addq $16, %rsi
|
||||
-L(vec_return_begin_32):
|
||||
- bsfl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl 32(%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl 32(%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl 32(%rsi, %rax), %ecx
|
||||
- movzbl 32(%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_begin_16):
|
||||
- addq $16, %rdi
|
||||
- addq $16, %rsi
|
||||
-L(vec_return_begin):
|
||||
- bsfl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rsi, %rax), %ecx
|
||||
- movzbl (%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(vec_return_end_16):
|
||||
- subl $16, %edx
|
||||
-L(vec_return_end):
|
||||
- bsfl %eax, %eax
|
||||
- addl %edx, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl -16(%rdi, %rax), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl -16(%rsi, %rax), %ecx
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl -16(%rsi, %rax), %ecx
|
||||
- movzbl -16(%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(more_32_bytes):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm0
|
||||
- movdqu 16(%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- cmpl $64, %edx
|
||||
- jbe L(32_to_64_bytes)
|
||||
- movdqu 32(%rdi), %xmm0
|
||||
- movdqu 32(%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(32_to_64_bytes):
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(16_to_32_bytes):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
-
|
||||
- .p2align 4
|
||||
-L(79bytesormore):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu (%rsi), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
-
|
||||
- mov %rsi, %rcx
|
||||
- and $-16, %rsi
|
||||
- add $16, %rsi
|
||||
- sub %rsi, %rcx
|
||||
-
|
||||
- sub %rcx, %rdi
|
||||
- add %rcx, %rdx
|
||||
- test $0xf, %rdi
|
||||
- jz L(2aligned)
|
||||
-
|
||||
- cmp $128, %rdx
|
||||
- ja L(128bytesormore)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(less128bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- cmp $96, %rdx
|
||||
- jb L(32_to_64_bytes)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
- subq $64, %rdx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(last_64_bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(128bytesormore):
|
||||
- cmp $256, %rdx
|
||||
- ja L(unaligned_loop)
|
||||
-L(less256bytes):
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
-
|
||||
- movdqu (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqu 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $-128, %rdx
|
||||
- subq $-64, %rsi
|
||||
- subq $-64, %rdi
|
||||
-
|
||||
- cmp $64, %rdx
|
||||
- ja L(less128bytes)
|
||||
-
|
||||
- cmp $32, %rdx
|
||||
- ja L(last_64_bytes)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_loop):
|
||||
-# ifdef DATA_CACHE_SIZE_HALF
|
||||
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
||||
-# else
|
||||
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
||||
-# endif
|
||||
- movq %r8, %r9
|
||||
- addq %r8, %r8
|
||||
- addq %r9, %r8
|
||||
- cmpq %r8, %rdx
|
||||
- ja L(L2_L3_cache_unaligned)
|
||||
- sub $64, %rdx
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loop):
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(64bytesormore_loop)
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(loop_tail):
|
||||
- addq %rdx, %rdi
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- addq %rdx, %rsi
|
||||
- movdqu (%rsi), %xmm4
|
||||
- movdqu 16(%rsi), %xmm5
|
||||
- movdqu 32(%rsi), %xmm6
|
||||
- movdqu 48(%rsi), %xmm7
|
||||
-
|
||||
- CMPEQ %xmm4, %xmm0
|
||||
- CMPEQ %xmm5, %xmm1
|
||||
- CMPEQ %xmm6, %xmm2
|
||||
- CMPEQ %xmm7, %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
- ret
|
||||
-
|
||||
-L(L2_L3_cache_unaligned):
|
||||
- subq $64, %rdx
|
||||
- .p2align 4
|
||||
-L(L2_L3_unaligned_128bytes_loop):
|
||||
- prefetchnta 0x1c0(%rdi)
|
||||
- prefetchnta 0x1c0(%rsi)
|
||||
-
|
||||
- movdqu (%rdi), %xmm0
|
||||
- movdqu 16(%rdi), %xmm1
|
||||
- movdqu 32(%rdi), %xmm2
|
||||
- movdqu 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(L2_L3_unaligned_128bytes_loop)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
-
|
||||
- /* This case is for machines which are sensitive for unaligned
|
||||
- * instructions. */
|
||||
- .p2align 4
|
||||
-L(2aligned):
|
||||
- cmp $128, %rdx
|
||||
- ja L(128bytesormorein2aligned)
|
||||
-L(less128bytesin2aligned):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- cmp $96, %rdx
|
||||
- jb L(32_to_64_bytes)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
- subq $64, %rdx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(aligned_last_64_bytes):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(128bytesormorein2aligned):
|
||||
- cmp $256, %rdx
|
||||
- ja L(aligned_loop)
|
||||
-L(less256bytesin2alinged):
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $64, %rdi
|
||||
- addq $64, %rsi
|
||||
-
|
||||
- movdqa (%rdi), %xmm1
|
||||
- CMPEQ (%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin)
|
||||
-
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_16)
|
||||
-
|
||||
- movdqa 32(%rdi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_32)
|
||||
-
|
||||
- movdqa 48(%rdi), %xmm1
|
||||
- CMPEQ 48(%rsi), %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_begin_48)
|
||||
-
|
||||
- addq $-128, %rdx
|
||||
- subq $-64, %rsi
|
||||
- subq $-64, %rdi
|
||||
-
|
||||
- cmp $64, %rdx
|
||||
- ja L(less128bytesin2aligned)
|
||||
-
|
||||
- cmp $32, %rdx
|
||||
- ja L(aligned_last_64_bytes)
|
||||
-
|
||||
- movdqu -32(%rdi, %rdx), %xmm0
|
||||
- movdqu -32(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end_16)
|
||||
-
|
||||
- movdqu -16(%rdi, %rdx), %xmm0
|
||||
- movdqu -16(%rsi, %rdx), %xmm1
|
||||
- CMPEQ %xmm0, %xmm1
|
||||
- pmovmskb %xmm1, %eax
|
||||
- incw %ax
|
||||
- jnz L(vec_return_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
-# ifdef DATA_CACHE_SIZE_HALF
|
||||
- mov $DATA_CACHE_SIZE_HALF, %R8_LP
|
||||
-# else
|
||||
- mov __x86_data_cache_size_half(%rip), %R8_LP
|
||||
-# endif
|
||||
- movq %r8, %r9
|
||||
- addq %r8, %r8
|
||||
- addq %r9, %r8
|
||||
- cmpq %r8, %rdx
|
||||
- ja L(L2_L3_cache_aligned)
|
||||
-
|
||||
- sub $64, %rdx
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loopin2aligned):
|
||||
- movdqa (%rdi), %xmm0
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- movdqa 32(%rdi), %xmm2
|
||||
- movdqa 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
- add $64, %rsi
|
||||
- add $64, %rdi
|
||||
- sub $64, %rdx
|
||||
- ja L(64bytesormore_loopin2aligned)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
-L(L2_L3_cache_aligned):
|
||||
- subq $64, %rdx
|
||||
- .p2align 4
|
||||
-L(L2_L3_aligned_128bytes_loop):
|
||||
- prefetchnta 0x1c0(%rdi)
|
||||
- prefetchnta 0x1c0(%rsi)
|
||||
- movdqa (%rdi), %xmm0
|
||||
- movdqa 16(%rdi), %xmm1
|
||||
- movdqa 32(%rdi), %xmm2
|
||||
- movdqa 48(%rdi), %xmm3
|
||||
-
|
||||
- CMPEQ (%rsi), %xmm0
|
||||
- CMPEQ 16(%rsi), %xmm1
|
||||
- CMPEQ 32(%rsi), %xmm2
|
||||
- CMPEQ 48(%rsi), %xmm3
|
||||
-
|
||||
- pand %xmm0, %xmm1
|
||||
- pand %xmm2, %xmm3
|
||||
- pand %xmm1, %xmm3
|
||||
-
|
||||
- pmovmskb %xmm3, %eax
|
||||
- incw %ax
|
||||
- jnz L(64bytesormore_loop_end)
|
||||
-
|
||||
- addq $64, %rsi
|
||||
- addq $64, %rdi
|
||||
- subq $64, %rdx
|
||||
- ja L(L2_L3_aligned_128bytes_loop)
|
||||
- jmp L(loop_tail)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(64bytesormore_loop_end):
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- incw %cx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- pmovmskb %xmm1, %ecx
|
||||
- notw %cx
|
||||
- sall $16, %ecx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- pmovmskb %xmm2, %ecx
|
||||
- notw %cx
|
||||
- shlq $32, %rcx
|
||||
- jnz L(loop_end_ret)
|
||||
-
|
||||
- addq $48, %rdi
|
||||
- addq $48, %rsi
|
||||
- movq %rax, %rcx
|
||||
-
|
||||
- .p2align 4,, 6
|
||||
-L(loop_end_ret):
|
||||
- bsfq %rcx, %rcx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rcx), %eax
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rcx), %eax
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rdi, %rcx), %eax
|
||||
- movzbl (%rsi, %rcx), %ecx
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-END (MEMCMP)
|
||||
-#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,263 +0,0 @@
|
||||
From 23102686ec67b856a2d4fd25ddaa1c0b8d175c4f Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 15 Apr 2022 12:28:01 -0500
|
||||
Subject: [PATCH] x86: Cleanup page cross code in memcmp-avx2-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Old code was both inefficient and wasted code size. New code is smaller (-62
|
||||
bytes) and has comparable or better performance in the page cross case.
|
||||
|
||||
geometric_mean(N=20) of page cross cases New / Original: 0.960
|
||||
|
||||
size, align0, align1, ret, New Time/Old Time
|
||||
1, 4095, 0, 0, 1.001
|
||||
1, 4095, 0, 1, 0.999
|
||||
1, 4095, 0, -1, 1.0
|
||||
2, 4094, 0, 0, 1.0
|
||||
2, 4094, 0, 1, 1.0
|
||||
2, 4094, 0, -1, 1.0
|
||||
3, 4093, 0, 0, 1.0
|
||||
3, 4093, 0, 1, 1.0
|
||||
3, 4093, 0, -1, 1.0
|
||||
4, 4092, 0, 0, 0.987
|
||||
4, 4092, 0, 1, 1.0
|
||||
4, 4092, 0, -1, 1.0
|
||||
5, 4091, 0, 0, 0.984
|
||||
5, 4091, 0, 1, 1.002
|
||||
5, 4091, 0, -1, 1.005
|
||||
6, 4090, 0, 0, 0.993
|
||||
6, 4090, 0, 1, 1.001
|
||||
6, 4090, 0, -1, 1.003
|
||||
7, 4089, 0, 0, 0.991
|
||||
7, 4089, 0, 1, 1.0
|
||||
7, 4089, 0, -1, 1.001
|
||||
8, 4088, 0, 0, 0.875
|
||||
8, 4088, 0, 1, 0.881
|
||||
8, 4088, 0, -1, 0.888
|
||||
9, 4087, 0, 0, 0.872
|
||||
9, 4087, 0, 1, 0.879
|
||||
9, 4087, 0, -1, 0.883
|
||||
10, 4086, 0, 0, 0.878
|
||||
10, 4086, 0, 1, 0.886
|
||||
10, 4086, 0, -1, 0.873
|
||||
11, 4085, 0, 0, 0.878
|
||||
11, 4085, 0, 1, 0.881
|
||||
11, 4085, 0, -1, 0.879
|
||||
12, 4084, 0, 0, 0.873
|
||||
12, 4084, 0, 1, 0.889
|
||||
12, 4084, 0, -1, 0.875
|
||||
13, 4083, 0, 0, 0.873
|
||||
13, 4083, 0, 1, 0.863
|
||||
13, 4083, 0, -1, 0.863
|
||||
14, 4082, 0, 0, 0.838
|
||||
14, 4082, 0, 1, 0.869
|
||||
14, 4082, 0, -1, 0.877
|
||||
15, 4081, 0, 0, 0.841
|
||||
15, 4081, 0, 1, 0.869
|
||||
15, 4081, 0, -1, 0.876
|
||||
16, 4080, 0, 0, 0.988
|
||||
16, 4080, 0, 1, 0.99
|
||||
16, 4080, 0, -1, 0.989
|
||||
17, 4079, 0, 0, 0.978
|
||||
17, 4079, 0, 1, 0.981
|
||||
17, 4079, 0, -1, 0.98
|
||||
18, 4078, 0, 0, 0.981
|
||||
18, 4078, 0, 1, 0.98
|
||||
18, 4078, 0, -1, 0.985
|
||||
19, 4077, 0, 0, 0.977
|
||||
19, 4077, 0, 1, 0.979
|
||||
19, 4077, 0, -1, 0.986
|
||||
20, 4076, 0, 0, 0.977
|
||||
20, 4076, 0, 1, 0.986
|
||||
20, 4076, 0, -1, 0.984
|
||||
21, 4075, 0, 0, 0.977
|
||||
21, 4075, 0, 1, 0.983
|
||||
21, 4075, 0, -1, 0.988
|
||||
22, 4074, 0, 0, 0.983
|
||||
22, 4074, 0, 1, 0.994
|
||||
22, 4074, 0, -1, 0.993
|
||||
23, 4073, 0, 0, 0.98
|
||||
23, 4073, 0, 1, 0.992
|
||||
23, 4073, 0, -1, 0.995
|
||||
24, 4072, 0, 0, 0.989
|
||||
24, 4072, 0, 1, 0.989
|
||||
24, 4072, 0, -1, 0.991
|
||||
25, 4071, 0, 0, 0.99
|
||||
25, 4071, 0, 1, 0.999
|
||||
25, 4071, 0, -1, 0.996
|
||||
26, 4070, 0, 0, 0.993
|
||||
26, 4070, 0, 1, 0.995
|
||||
26, 4070, 0, -1, 0.998
|
||||
27, 4069, 0, 0, 0.993
|
||||
27, 4069, 0, 1, 0.999
|
||||
27, 4069, 0, -1, 1.0
|
||||
28, 4068, 0, 0, 0.997
|
||||
28, 4068, 0, 1, 1.0
|
||||
28, 4068, 0, -1, 0.999
|
||||
29, 4067, 0, 0, 0.996
|
||||
29, 4067, 0, 1, 0.999
|
||||
29, 4067, 0, -1, 0.999
|
||||
30, 4066, 0, 0, 0.991
|
||||
30, 4066, 0, 1, 1.001
|
||||
30, 4066, 0, -1, 0.999
|
||||
31, 4065, 0, 0, 0.988
|
||||
31, 4065, 0, 1, 0.998
|
||||
31, 4065, 0, -1, 0.998
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 98 ++++++++++++--------
|
||||
1 file changed, 61 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
index 16fc673e..99258cf5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
@@ -429,22 +429,21 @@ L(page_cross_less_vec):
|
||||
# ifndef USE_AS_WMEMCMP
|
||||
cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
+ /* Fall through for [4, 7]. */
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
+ jb L(between_2_3)
|
||||
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* Fast path for return zero. */
|
||||
+ jnz L(ret_nonzero)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
@@ -457,9 +456,33 @@ L(one_or_less):
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
+ .p2align 4,, 5
|
||||
+L(ret_nonzero):
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 2
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(between_8_15):
|
||||
-# endif
|
||||
+ movbe (%rdi), %rax
|
||||
+ movbe (%rsi), %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ jnz L(ret_nonzero)
|
||||
+ movbe -8(%rdi, %rdx), %rax
|
||||
+ movbe -8(%rsi, %rdx), %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* Fast path for return zero. */
|
||||
+ jnz L(ret_nonzero)
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+# else
|
||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
vmovq (%rdi), %xmm1
|
||||
vmovq (%rsi), %xmm2
|
||||
@@ -475,16 +498,13 @@ L(between_8_15):
|
||||
VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
+ /* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
+# endif
|
||||
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 10
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
vmovdqu (%rsi), %xmm2
|
||||
@@ -501,11 +521,17 @@ L(between_16_31):
|
||||
VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
subl $0xffff, %eax
|
||||
+ /* Fast path for return zero. */
|
||||
jnz L(return_vec_0)
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
+ .p2align 4,, 2
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(one_or_less):
|
||||
jb L(zero)
|
||||
@@ -520,22 +546,20 @@ L(one_or_less):
|
||||
# else
|
||||
|
||||
.p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- jz L(zero_4_7)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
-L(zero_4_7):
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ shrl %eax
|
||||
+ shrl %ecx
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
+ /* Subtraction is okay because the upper bit is zero. */
|
||||
+ subl %ecx, %eax
|
||||
/* No ymm register was touched. */
|
||||
ret
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,876 +0,0 @@
|
||||
From 5307aa9c1800f36a64c183c091c9af392c1fa75c Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:28 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-sse2
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.741
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-sse2.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/wcsrchr-sse2.S | 3 +-
|
||||
sysdeps/x86_64/strrchr.S | 510 +++++++++++++++---------
|
||||
sysdeps/x86_64/wcsrchr.S | 266 +-----------
|
||||
4 files changed, 338 insertions(+), 443 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/wcsrchr.S
|
||||
(copyright header)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-sse2.S b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
index 0ec76fe9..6bb1284b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-sse2.S
|
||||
@@ -17,7 +17,7 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
-# define strrchr __strrchr_sse2
|
||||
+# define STRRCHR __strrchr_sse2
|
||||
|
||||
# undef weak_alias
|
||||
# define weak_alias(strrchr, rindex)
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
index d015e953..f26d53b5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsrchr-sse2.S
|
||||
@@ -17,7 +17,6 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#if IS_IN (libc)
|
||||
-# define wcsrchr __wcsrchr_sse2
|
||||
+# define STRRCHR __wcsrchr_sse2
|
||||
#endif
|
||||
-
|
||||
#include "../wcsrchr.S"
|
||||
diff --git a/sysdeps/x86_64/strrchr.S b/sysdeps/x86_64/strrchr.S
|
||||
index aca98e7e..a58cc220 100644
|
||||
--- a/sysdeps/x86_64/strrchr.S
|
||||
+++ b/sysdeps/x86_64/strrchr.S
|
||||
@@ -19,210 +19,360 @@
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
+#ifndef STRRCHR
|
||||
+# define STRRCHR strrchr
|
||||
+#endif
|
||||
+
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+# define PCMPEQ pcmpeqd
|
||||
+# define CHAR_SIZE 4
|
||||
+# define PMINU pminud
|
||||
+#else
|
||||
+# define PCMPEQ pcmpeqb
|
||||
+# define CHAR_SIZE 1
|
||||
+# define PMINU pminub
|
||||
+#endif
|
||||
+
|
||||
+#define PAGE_SIZE 4096
|
||||
+#define VEC_SIZE 16
|
||||
+
|
||||
.text
|
||||
-ENTRY (strrchr)
|
||||
- movd %esi, %xmm1
|
||||
+ENTRY(STRRCHR)
|
||||
+ movd %esi, %xmm0
|
||||
movq %rdi, %rax
|
||||
- andl $4095, %eax
|
||||
- punpcklbw %xmm1, %xmm1
|
||||
- cmpq $4032, %rax
|
||||
- punpcklwd %xmm1, %xmm1
|
||||
- pshufd $0, %xmm1, %xmm1
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ punpcklbw %xmm0, %xmm0
|
||||
+ punpcklwd %xmm0, %xmm0
|
||||
+#endif
|
||||
+ pshufd $0, %xmm0, %xmm0
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
ja L(cross_page)
|
||||
- movdqu (%rdi), %xmm0
|
||||
+
|
||||
+L(cross_page_continue):
|
||||
+ movups (%rdi), %xmm1
|
||||
pxor %xmm2, %xmm2
|
||||
- movdqa %xmm0, %xmm3
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pcmpeqb %xmm2, %xmm3
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- pmovmskb %xmm3, %edx
|
||||
- testq %rdx, %rdx
|
||||
- je L(next_48_bytes)
|
||||
- leaq -1(%rdx), %rax
|
||||
- xorq %rdx, %rax
|
||||
- andq %rcx, %rax
|
||||
- je L(exit)
|
||||
- bsrq %rax, %rax
|
||||
+ PCMPEQ %xmm1, %xmm2
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(aligned_more)
|
||||
+
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
||||
+ search CHAR is zero we are correct. Either way `andq
|
||||
+ -CHAR_SIZE, %rax` gets the correct result. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+L(ret0):
|
||||
ret
|
||||
|
||||
+ /* Returns for first vec x1/x2 have hard coded backward search
|
||||
+ path for earlier matches. */
|
||||
.p2align 4
|
||||
-L(next_48_bytes):
|
||||
- movdqu 16(%rdi), %xmm4
|
||||
- movdqa %xmm4, %xmm5
|
||||
- movdqu 32(%rdi), %xmm3
|
||||
- pcmpeqb %xmm1, %xmm4
|
||||
- pcmpeqb %xmm2, %xmm5
|
||||
- movdqu 48(%rdi), %xmm0
|
||||
- pmovmskb %xmm5, %edx
|
||||
- movdqa %xmm3, %xmm5
|
||||
- pcmpeqb %xmm1, %xmm3
|
||||
- pcmpeqb %xmm2, %xmm5
|
||||
- pcmpeqb %xmm0, %xmm2
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm3, %r8d
|
||||
- pmovmskb %xmm5, %eax
|
||||
- pmovmskb %xmm2, %esi
|
||||
- salq $32, %r8
|
||||
- salq $32, %rax
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- orq %rdx, %rax
|
||||
- movq %rsi, %rdx
|
||||
- pmovmskb %xmm4, %esi
|
||||
- salq $48, %rdx
|
||||
- salq $16, %rsi
|
||||
- orq %r8, %rsi
|
||||
- orq %rcx, %rsi
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- salq $48, %rcx
|
||||
- orq %rcx, %rsi
|
||||
- orq %rdx, %rax
|
||||
- je L(loop_header2)
|
||||
- leaq -1(%rax), %rcx
|
||||
- xorq %rax, %rcx
|
||||
- andq %rcx, %rsi
|
||||
- je L(exit)
|
||||
- bsrq %rsi, %rsi
|
||||
- leaq (%rdi,%rsi), %rax
|
||||
+L(first_vec_x0_test):
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %r8, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(loop_header2):
|
||||
- testq %rsi, %rsi
|
||||
- movq %rdi, %rcx
|
||||
- je L(no_c_found)
|
||||
-L(loop_header):
|
||||
- addq $64, %rdi
|
||||
- pxor %xmm7, %xmm7
|
||||
- andq $-64, %rdi
|
||||
- jmp L(loop_entry)
|
||||
+L(first_vec_x1):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ pmovmskb %xmm2, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
.p2align 4
|
||||
-L(loop64):
|
||||
- testq %rdx, %rdx
|
||||
- cmovne %rdx, %rsi
|
||||
- cmovne %rdi, %rcx
|
||||
- addq $64, %rdi
|
||||
-L(loop_entry):
|
||||
- movdqa 32(%rdi), %xmm3
|
||||
- pxor %xmm6, %xmm6
|
||||
- movdqa 48(%rdi), %xmm2
|
||||
- movdqa %xmm3, %xmm0
|
||||
- movdqa 16(%rdi), %xmm4
|
||||
- pminub %xmm2, %xmm0
|
||||
- movdqa (%rdi), %xmm5
|
||||
- pminub %xmm4, %xmm0
|
||||
- pminub %xmm5, %xmm0
|
||||
- pcmpeqb %xmm7, %xmm0
|
||||
- pmovmskb %xmm0, %eax
|
||||
- movdqa %xmm5, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pmovmskb %xmm0, %r9d
|
||||
- movdqa %xmm4, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- movdqa %xmm3, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm0, %r10d
|
||||
- movdqa %xmm2, %xmm0
|
||||
- pcmpeqb %xmm1, %xmm0
|
||||
- salq $32, %r10
|
||||
- orq %r10, %rdx
|
||||
- pmovmskb %xmm0, %r8d
|
||||
- orq %r9, %rdx
|
||||
- salq $48, %r8
|
||||
- orq %r8, %rdx
|
||||
+L(first_vec_x1_test):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ pmovmskb %xmm2, %eax
|
||||
testl %eax, %eax
|
||||
- je L(loop64)
|
||||
- pcmpeqb %xmm6, %xmm4
|
||||
- pcmpeqb %xmm6, %xmm3
|
||||
- pcmpeqb %xmm6, %xmm5
|
||||
- pmovmskb %xmm4, %eax
|
||||
- pmovmskb %xmm3, %r10d
|
||||
- pcmpeqb %xmm6, %xmm2
|
||||
- pmovmskb %xmm5, %r9d
|
||||
- salq $32, %r10
|
||||
- salq $16, %rax
|
||||
- pmovmskb %xmm2, %r8d
|
||||
- orq %r10, %rax
|
||||
- orq %r9, %rax
|
||||
- salq $48, %r8
|
||||
- orq %r8, %rax
|
||||
- leaq -1(%rax), %r8
|
||||
- xorq %rax, %r8
|
||||
- andq %r8, %rdx
|
||||
- cmovne %rdi, %rcx
|
||||
- cmovne %rdx, %rsi
|
||||
- bsrq %rsi, %rsi
|
||||
- leaq (%rcx,%rsi), %rax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ PCMPEQ %xmm0, %xmm3
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ leal -1(%rcx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Save original pointer if match was in VEC 0. */
|
||||
+ movq %rdi, %r8
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+
|
||||
+ movaps VEC_SIZE(%rdi), %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+ PCMPEQ %xmm2, %xmm3
|
||||
+ pmovmskb %xmm3, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x1)
|
||||
+
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm3
|
||||
+ pxor %xmm4, %xmm4
|
||||
+ PCMPEQ %xmm3, %xmm4
|
||||
+ pmovmskb %xmm4, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ /* Save pointer again before realigning. */
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
+ .p2align 4
|
||||
+L(first_loop):
|
||||
+ /* Do 2x VEC at a time. */
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||||
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||||
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
||||
+ detecting zero. Note if this is found to be a bottleneck it
|
||||
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ pxor %xmm8, %xmm8
|
||||
+
|
||||
+ PCMPEQ %xmm8, %xmm5
|
||||
+ PCMPEQ %xmm4, %xmm8
|
||||
+ por %xmm5, %xmm8
|
||||
+#else
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ PMINU %xmm4, %xmm5
|
||||
+#endif
|
||||
+
|
||||
+ movaps %xmm4, %xmm9
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm6
|
||||
+ movaps %xmm6, %xmm7
|
||||
+ por %xmm4, %xmm6
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ pxor %xmm8, %xmm8
|
||||
+ PCMPEQ %xmm5, %xmm8
|
||||
+#endif
|
||||
+ pmovmskb %xmm8, %ecx
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* Use `addl` 1) so we can undo it with `subl` and 2) it can
|
||||
+ macro-fuse with `jz`. */
|
||||
+ addl %ecx, %eax
|
||||
+ jz L(first_loop)
|
||||
+
|
||||
+ /* Check if there is zero match. */
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_loop_match)
|
||||
+
|
||||
+ /* Check if there was a match in last iteration. */
|
||||
+ subl %ecx, %eax
|
||||
+ jnz L(new_match)
|
||||
+
|
||||
+L(first_loop_old_match):
|
||||
+ PCMPEQ %xmm0, %xmm2
|
||||
+ PCMPEQ %xmm0, %xmm3
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ addl %eax, %ecx
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ /* NB: We could move this shift to before the branch and save a
|
||||
+ bit of code size / performance on the fall through. The
|
||||
+ branch leads to the null case which generally seems hotter
|
||||
+ than char in first 3x VEC. */
|
||||
+ sall $16, %eax
|
||||
+ orl %ecx, %eax
|
||||
+
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rsi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(new_match):
|
||||
+ pxor %xmm6, %xmm6
|
||||
+ PCMPEQ %xmm9, %xmm6
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+ sall $16, %ecx
|
||||
+ orl %eax, %ecx
|
||||
+
|
||||
+ /* We can't reuse either of the old comparisons since we mask
|
||||
+ of zeros after first zero (instead of using the full
|
||||
+ comparison) we can't guarantee no interference between match
|
||||
+ after end of string and valid match. */
|
||||
+ pmovmskb %xmm4, %eax
|
||||
+ pmovmskb %xmm7, %edx
|
||||
+ sall $16, %edx
|
||||
+ orl %edx, %eax
|
||||
+
|
||||
+ leal -1(%ecx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_loop_old_match)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
+ /* Save minimum state for getting most recent match. We can
|
||||
+ throw out all previous work. */
|
||||
.p2align 4
|
||||
-L(no_c_found):
|
||||
- movl $1, %esi
|
||||
- xorl %ecx, %ecx
|
||||
- jmp L(loop_header)
|
||||
+L(second_loop_match):
|
||||
+ movq %rdi, %rsi
|
||||
+ movaps %xmm4, %xmm2
|
||||
+ movaps %xmm7, %xmm3
|
||||
|
||||
.p2align 4
|
||||
-L(exit):
|
||||
- xorl %eax, %eax
|
||||
+L(second_loop):
|
||||
+ movaps (VEC_SIZE * 2)(%rdi), %xmm4
|
||||
+ movaps (VEC_SIZE * 3)(%rdi), %xmm5
|
||||
+ /* Since SSE2 has no pminud, wcsrchr needs separate logic for
|
||||
+ detecting zero. Note if this is found to be a bottleneck it
|
||||
+ may be worth adding an SSE4.1 wcsrchr implementation. */
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ pxor %xmm8, %xmm8
|
||||
+
|
||||
+ PCMPEQ %xmm8, %xmm5
|
||||
+ PCMPEQ %xmm4, %xmm8
|
||||
+ por %xmm5, %xmm8
|
||||
+#else
|
||||
+ movaps %xmm5, %xmm6
|
||||
+ PMINU %xmm4, %xmm5
|
||||
+#endif
|
||||
+
|
||||
+ movaps %xmm4, %xmm9
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm6
|
||||
+ movaps %xmm6, %xmm7
|
||||
+ por %xmm4, %xmm6
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ pxor %xmm8, %xmm8
|
||||
+ PCMPEQ %xmm5, %xmm8
|
||||
+#endif
|
||||
+
|
||||
+ pmovmskb %xmm8, %ecx
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* Either null term or new occurrence of CHAR. */
|
||||
+ addl %ecx, %eax
|
||||
+ jz L(second_loop)
|
||||
+
|
||||
+ /* No null term so must be a new occurrence of CHAR. */
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_loop_match)
|
||||
+
|
||||
+
|
||||
+ subl %ecx, %eax
|
||||
+ jnz L(second_loop_new_match)
|
||||
+
|
||||
+L(second_loop_old_match):
|
||||
+ pmovmskb %xmm2, %ecx
|
||||
+ pmovmskb %xmm3, %eax
|
||||
+ sall $16, %eax
|
||||
+ orl %ecx, %eax
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rsi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
+L(second_loop_new_match):
|
||||
+ pxor %xmm6, %xmm6
|
||||
+ PCMPEQ %xmm9, %xmm6
|
||||
+ pmovmskb %xmm6, %eax
|
||||
+ sall $16, %ecx
|
||||
+ orl %eax, %ecx
|
||||
+
|
||||
+ /* We can't reuse either of the old comparisons since we mask
|
||||
+ of zeros after first zero (instead of using the full
|
||||
+ comparison) we can't guarantee no interference between match
|
||||
+ after end of string and valid match. */
|
||||
+ pmovmskb %xmm4, %eax
|
||||
+ pmovmskb %xmm7, %edx
|
||||
+ sall $16, %edx
|
||||
+ orl %edx, %eax
|
||||
+
|
||||
+ leal -1(%ecx), %edx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(second_loop_old_match)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 4
|
||||
L(cross_page):
|
||||
- movq %rdi, %rax
|
||||
- pxor %xmm0, %xmm0
|
||||
- andq $-64, %rax
|
||||
- movdqu (%rax), %xmm5
|
||||
- movdqa %xmm5, %xmm6
|
||||
- movdqu 16(%rax), %xmm4
|
||||
- pcmpeqb %xmm1, %xmm5
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- movdqu 32(%rax), %xmm3
|
||||
- pmovmskb %xmm6, %esi
|
||||
- movdqa %xmm4, %xmm6
|
||||
- movdqu 48(%rax), %xmm2
|
||||
- pcmpeqb %xmm1, %xmm4
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- pmovmskb %xmm6, %edx
|
||||
- movdqa %xmm3, %xmm6
|
||||
- pcmpeqb %xmm1, %xmm3
|
||||
- pcmpeqb %xmm0, %xmm6
|
||||
- pcmpeqb %xmm2, %xmm0
|
||||
- salq $16, %rdx
|
||||
- pmovmskb %xmm3, %r9d
|
||||
- pmovmskb %xmm6, %r8d
|
||||
- pmovmskb %xmm0, %ecx
|
||||
- salq $32, %r9
|
||||
- salq $32, %r8
|
||||
- pcmpeqb %xmm1, %xmm2
|
||||
- orq %r8, %rdx
|
||||
- salq $48, %rcx
|
||||
- pmovmskb %xmm5, %r8d
|
||||
- orq %rsi, %rdx
|
||||
- pmovmskb %xmm4, %esi
|
||||
- orq %rcx, %rdx
|
||||
- pmovmskb %xmm2, %ecx
|
||||
- salq $16, %rsi
|
||||
- salq $48, %rcx
|
||||
- orq %r9, %rsi
|
||||
- orq %r8, %rsi
|
||||
- orq %rcx, %rsi
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rsi
|
||||
+ movaps (%rsi), %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ PCMPEQ %xmm1, %xmm2
|
||||
+ pmovmskb %xmm2, %edx
|
||||
movl %edi, %ecx
|
||||
- subl %eax, %ecx
|
||||
- shrq %cl, %rdx
|
||||
- shrq %cl, %rsi
|
||||
- testq %rdx, %rdx
|
||||
- je L(loop_header2)
|
||||
- leaq -1(%rdx), %rax
|
||||
- xorq %rdx, %rax
|
||||
- andq %rax, %rsi
|
||||
- je L(exit)
|
||||
- bsrq %rsi, %rax
|
||||
+ andl $(VEC_SIZE - 1), %ecx
|
||||
+ sarl %cl, %edx
|
||||
+ jz L(cross_page_continue)
|
||||
+ PCMPEQ %xmm0, %xmm1
|
||||
+ pmovmskb %xmm1, %eax
|
||||
+ sarl %cl, %eax
|
||||
+ leal -1(%rdx), %ecx
|
||||
+ xorl %edx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
addq %rdi, %rax
|
||||
+#ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+#endif
|
||||
+L(ret1):
|
||||
ret
|
||||
-END (strrchr)
|
||||
+END(STRRCHR)
|
||||
|
||||
-weak_alias (strrchr, rindex)
|
||||
-libc_hidden_builtin_def (strrchr)
|
||||
+#ifndef USE_AS_WCSRCHR
|
||||
+ weak_alias (STRRCHR, rindex)
|
||||
+ libc_hidden_builtin_def (STRRCHR)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/wcsrchr.S b/sysdeps/x86_64/wcsrchr.S
|
||||
index 2f388537..ae3cfa7d 100644
|
||||
--- a/sysdeps/x86_64/wcsrchr.S
|
||||
+++ b/sysdeps/x86_64/wcsrchr.S
|
||||
@@ -17,266 +17,12 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <sysdep.h>
|
||||
|
||||
- .text
|
||||
-ENTRY (wcsrchr)
|
||||
+#define USE_AS_WCSRCHR 1
|
||||
+#define NO_PMINU 1
|
||||
|
||||
- movd %rsi, %xmm1
|
||||
- mov %rdi, %rcx
|
||||
- punpckldq %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- punpckldq %xmm1, %xmm1
|
||||
- and $63, %rcx
|
||||
- cmp $48, %rcx
|
||||
- ja L(crosscache)
|
||||
+#ifndef STRRCHR
|
||||
+# define STRRCHR wcsrchr
|
||||
+#endif
|
||||
|
||||
- movdqu (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm2
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- add $16, %rdi
|
||||
-
|
||||
- test %rax, %rax
|
||||
- jnz L(unaligned_match1)
|
||||
-
|
||||
- test %rcx, %rcx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- and $-16, %rdi
|
||||
- xor %r8, %r8
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_match1):
|
||||
- test %rcx, %rcx
|
||||
- jnz L(prolog_find_zero_1)
|
||||
-
|
||||
- mov %rax, %r8
|
||||
- mov %rdi, %rsi
|
||||
- and $-16, %rdi
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(crosscache):
|
||||
- and $15, %rcx
|
||||
- and $-16, %rdi
|
||||
- pxor %xmm3, %xmm3
|
||||
- movdqa (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm3
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm3, %rdx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- shr %cl, %rdx
|
||||
- shr %cl, %rax
|
||||
- add $16, %rdi
|
||||
-
|
||||
- test %rax, %rax
|
||||
- jnz L(unaligned_match)
|
||||
-
|
||||
- test %rdx, %rdx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- xor %r8, %r8
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(unaligned_match):
|
||||
- test %rdx, %rdx
|
||||
- jnz L(prolog_find_zero)
|
||||
-
|
||||
- mov %rax, %r8
|
||||
- lea (%rdi, %rcx), %rsi
|
||||
-
|
||||
-/* Loop start on aligned string. */
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
- movdqa (%rdi), %xmm0
|
||||
- pcmpeqd %xmm0, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm0
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm0, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm3
|
||||
- pcmpeqd %xmm3, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm3
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm3, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm4
|
||||
- pcmpeqd %xmm4, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm4
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm4, %rax
|
||||
- or %rax, %rcx
|
||||
- jnz L(matches)
|
||||
-
|
||||
- movdqa (%rdi), %xmm5
|
||||
- pcmpeqd %xmm5, %xmm2
|
||||
- add $16, %rdi
|
||||
- pcmpeqd %xmm1, %xmm5
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- pmovmskb %xmm5, %rax
|
||||
- or %rax, %rcx
|
||||
- jz L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(matches):
|
||||
- test %rax, %rax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- test %r8, %r8
|
||||
- jz L(return_null)
|
||||
- mov %r8, %rax
|
||||
- mov %rsi, %rdi
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match):
|
||||
- pmovmskb %xmm2, %rcx
|
||||
- test %rcx, %rcx
|
||||
- jnz L(find_zero)
|
||||
- mov %rax, %r8
|
||||
- mov %rdi, %rsi
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero):
|
||||
- test $15, %cl
|
||||
- jnz L(find_zero_in_first_wchar)
|
||||
- test %cl, %cl
|
||||
- jnz L(find_zero_in_second_wchar)
|
||||
- test $15, %ch
|
||||
- jnz L(find_zero_in_third_wchar)
|
||||
-
|
||||
- and $1 << 13 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_first_wchar):
|
||||
- test $1, %rax
|
||||
- jz L(return_value)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_second_wchar):
|
||||
- and $1 << 5 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(find_zero_in_third_wchar):
|
||||
- and $1 << 9 - 1, %rax
|
||||
- jz L(return_value)
|
||||
-
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero):
|
||||
- add %rcx, %rdi
|
||||
- mov %rdx, %rcx
|
||||
-L(prolog_find_zero_1):
|
||||
- test $15, %cl
|
||||
- jnz L(prolog_find_zero_in_first_wchar)
|
||||
- test %cl, %cl
|
||||
- jnz L(prolog_find_zero_in_second_wchar)
|
||||
- test $15, %ch
|
||||
- jnz L(prolog_find_zero_in_third_wchar)
|
||||
-
|
||||
- and $1 << 13 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test $15 << 4, %ah
|
||||
- jnz L(match_fourth_wchar)
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_first_wchar):
|
||||
- test $1, %rax
|
||||
- jz L(return_null)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_second_wchar):
|
||||
- and $1 << 5 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(prolog_find_zero_in_third_wchar):
|
||||
- and $1 << 9 - 1, %rax
|
||||
- jz L(return_null)
|
||||
-
|
||||
- test %ah, %ah
|
||||
- jnz L(match_third_wchar)
|
||||
- test $15 << 4, %al
|
||||
- jnz L(match_second_wchar)
|
||||
- lea -16(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_second_wchar):
|
||||
- lea -12(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_third_wchar):
|
||||
- lea -8(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(match_fourth_wchar):
|
||||
- lea -4(%rdi), %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xor %rax, %rax
|
||||
- ret
|
||||
-
|
||||
-END (wcsrchr)
|
||||
+#include "../strrchr.S"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,501 +0,0 @@
|
||||
From df7e295d18ffa34f629578c0017a9881af7620f6 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:29 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-avx2
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.832
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-avx2.S | 426 +++++++++++++++---------
|
||||
1 file changed, 269 insertions(+), 157 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
index c949410b..3d26fad4 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
@@ -27,9 +27,13 @@
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
+# define VPMIN vpminud
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
+# define VPMIN vpminub
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -41,196 +45,304 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
- .section SECTION(.text),"ax",@progbits
|
||||
-ENTRY (STRRCHR)
|
||||
- movd %esi, %xmm4
|
||||
- movl %edi, %ecx
|
||||
+ .section SECTION(.text), "ax", @progbits
|
||||
+ENTRY(STRRCHR)
|
||||
+ movd %esi, %xmm7
|
||||
+ movl %edi, %eax
|
||||
/* Broadcast CHAR to YMM4. */
|
||||
- VPBROADCAST %xmm4, %ymm4
|
||||
+ VPBROADCAST %xmm7, %ymm7
|
||||
vpxor %xmm0, %xmm0, %xmm0
|
||||
|
||||
- /* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ /* Shift here instead of `andl` to save code size (saves a fetch
|
||||
+ block). */
|
||||
+ sall $20, %eax
|
||||
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
|
||||
+ ja L(cross_page)
|
||||
|
||||
+L(page_cross_continue):
|
||||
vmovdqu (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ /* Check end of string match. */
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(aligned_more)
|
||||
+
|
||||
+ /* Only check match with search CHAR if needed. */
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Check if match before first zero. */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+ /* We are off by 3 for wcsrchr if search CHAR is non-zero. If
|
||||
+ search CHAR is zero we are correct. Either way `andq
|
||||
+ -CHAR_SIZE, %rax` gets the correct result. */
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret0):
|
||||
+L(return_vzeroupper):
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+ /* Returns for first vec x1/x2 have hard coded backward search
|
||||
+ path for earlier matches. */
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x1):
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(first_vec_x1_return)
|
||||
+
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x0_test):
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %r8, %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret1):
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x0_x1_test):
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ /* Check ymm2 for search CHAR match. If no match then check ymm1
|
||||
+ before returning. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec)
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_return):
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq 1(%rdi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(return_null)
|
||||
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- xorl %edx, %edx
|
||||
- jmp L(aligned_loop)
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x2):
|
||||
+ VPCMPEQ %ymm3, %ymm7, %ymm6
|
||||
+ vpmovmskb %ymm6, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* If no in-range search CHAR match in ymm3 then need to check
|
||||
+ ymm1/ymm2 for an earlier match (we delay checking search
|
||||
+ CHAR matches until needed). */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE + 1)(%rdi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec):
|
||||
- /* Check if there is a nul CHAR. */
|
||||
+L(aligned_more):
|
||||
+ /* Save original pointer if match was in VEC 0. */
|
||||
+ movq %rdi, %r8
|
||||
+
|
||||
+ /* Align src. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ vmovdqu 1(%rdi), %ymm2
|
||||
+ VPCMPEQ %ymm2, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
testl %ecx, %ecx
|
||||
- jnz L(char_and_nul_in_first_vec)
|
||||
+ jnz L(first_vec_x1)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- movq %rdi, %rsi
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- jmp L(aligned_loop)
|
||||
+ vmovdqu (VEC_SIZE + 1)(%rdi), %ymm3
|
||||
+ VPCMPEQ %ymm3, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
|
||||
+ /* Save pointer again before realigning. */
|
||||
+ movq %rdi, %rsi
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %edx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- shrl %cl, %edx
|
||||
- shrl %cl, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- /* Check if there is a CHAR. */
|
||||
+L(first_aligned_loop):
|
||||
+ /* Do 2x VEC at a time. Any more and the cost of finding the
|
||||
+ match outweighs loop benefit. */
|
||||
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
|
||||
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
|
||||
+
|
||||
+ VPCMPEQ %ymm4, %ymm7, %ymm6
|
||||
+ VPMIN %ymm4, %ymm5, %ymm8
|
||||
+ VPCMPEQ %ymm5, %ymm7, %ymm10
|
||||
+ vpor %ymm6, %ymm10, %ymm5
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm8
|
||||
+ vpor %ymm5, %ymm8, %ymm9
|
||||
+
|
||||
+ vpmovmskb %ymm9, %eax
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
+ /* No zero or search CHAR. */
|
||||
testl %eax, %eax
|
||||
- jnz L(found_char)
|
||||
-
|
||||
- testl %edx, %edx
|
||||
- jnz L(return_null)
|
||||
+ jz L(first_aligned_loop)
|
||||
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(found_char):
|
||||
- testl %edx, %edx
|
||||
- jnz L(char_and_nul)
|
||||
+ /* If no zero CHAR then go to second loop (this allows us to
|
||||
+ throw away all prior work). */
|
||||
+ vpmovmskb %ymm8, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_aligned_loop_prep)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- leaq (%rdi, %rcx), %rsi
|
||||
+ /* Search char could be zero so we need to get the true match.
|
||||
+ */
|
||||
+ vpmovmskb %ymm5, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_aligned_loop_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- add $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_or_x2):
|
||||
+ VPCMPEQ %ymm3, %ymm7, %ymm3
|
||||
+ VPCMPEQ %ymm2, %ymm7, %ymm2
|
||||
vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm2
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm1, %ymm4, %ymm3
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- orl %eax, %ecx
|
||||
- jz L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(char_nor_null):
|
||||
- /* Find a CHAR or a nul CHAR in a loop. */
|
||||
- testl %eax, %eax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- testl %edx, %edx
|
||||
- jz L(return_null)
|
||||
- movl %edx, %eax
|
||||
- movq %rsi, %rdi
|
||||
+ vpmovmskb %ymm2, %edx
|
||||
+ /* Use add for macro-fusion. */
|
||||
+ addq %rax, %rdx
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ /* NB: We could move this shift to before the branch and save a
|
||||
+ bit of code size / performance on the fall through. The
|
||||
+ branch leads to the null case which generally seems hotter
|
||||
+ than char in first 3x VEC. */
|
||||
+ salq $32, %rax
|
||||
+ addq %rdx, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq 1(%rsi, %rax), %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4,, 8
|
||||
+L(first_aligned_loop_return):
|
||||
+ VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
+ vpmovmskb %ymm4, %edx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rdx, %rcx
|
||||
+
|
||||
+ vpmovmskb %ymm10, %eax
|
||||
+ vpmovmskb %ymm6, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ blsmskq %rcx, %rcx
|
||||
+ andq %rcx, %rax
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
+
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq -(VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %eax
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
# endif
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
-L(return_vzeroupper):
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ /* Search char cannot be zero. */
|
||||
.p2align 4
|
||||
-L(match):
|
||||
- /* Find a CHAR. Check if there is a nul CHAR. */
|
||||
- vpmovmskb %ymm2, %ecx
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(find_nul)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
+L(second_aligned_loop_set_furthest_match):
|
||||
+ /* Save VEC and pointer from most recent match. */
|
||||
+L(second_aligned_loop_prep):
|
||||
movq %rdi, %rsi
|
||||
- jmp L(aligned_loop)
|
||||
+ vmovdqu %ymm6, %ymm2
|
||||
+ vmovdqu %ymm10, %ymm3
|
||||
|
||||
.p2align 4
|
||||
-L(find_nul):
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %ecx
|
||||
- andl $0x11111111, %eax
|
||||
-# endif
|
||||
- /* Mask out any matching bits after the nul CHAR. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
+L(second_aligned_loop):
|
||||
+ /* Search 2x at a time. */
|
||||
+ vmovdqa (VEC_SIZE * 0)(%rdi), %ymm4
|
||||
+ vmovdqa (VEC_SIZE * 1)(%rdi), %ymm5
|
||||
+
|
||||
+ VPCMPEQ %ymm4, %ymm7, %ymm6
|
||||
+ VPMIN %ymm4, %ymm5, %ymm1
|
||||
+ VPCMPEQ %ymm5, %ymm7, %ymm10
|
||||
+ vpor %ymm6, %ymm10, %ymm5
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
+ vpor %ymm5, %ymm1, %ymm9
|
||||
+
|
||||
+ vpmovmskb %ymm9, %eax
|
||||
+ addq $(VEC_SIZE * 2), %rdi
|
||||
testl %eax, %eax
|
||||
- /* If there is no CHAR here, return the remembered one. */
|
||||
- jz L(return_value)
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
- VZEROUPPER_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(char_and_nul):
|
||||
- /* Find both a CHAR and a nul CHAR. */
|
||||
- addq %rcx, %rdi
|
||||
- movl %edx, %ecx
|
||||
-L(char_and_nul_in_first_vec):
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* Keep the first bit for each matching CHAR for bsr. */
|
||||
- andl $0x11111111, %ecx
|
||||
- andl $0x11111111, %eax
|
||||
-# endif
|
||||
- /* Mask out any matching bits after the nul CHAR. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
+ jz L(second_aligned_loop)
|
||||
+ vpmovmskb %ymm1, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(second_aligned_loop_set_furthest_match)
|
||||
+ vpmovmskb %ymm5, %eax
|
||||
testl %eax, %eax
|
||||
- /* Return null pointer if the nul CHAR comes first. */
|
||||
- jz L(return_null)
|
||||
- bsrl %eax, %eax
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ jnz L(return_new_match)
|
||||
+
|
||||
+ /* This is the hot path. We know CHAR is in bounds and that
|
||||
+ ymm3/ymm2 have latest match. */
|
||||
+ .p2align 4,, 4
|
||||
+L(return_old_match):
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ vpmovmskb %ymm2, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ /* Search char cannot be zero so safe to just use lea for
|
||||
+ wcsrchr. */
|
||||
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xorl %eax, %eax
|
||||
+ /* Last iteration also potentially has a match. */
|
||||
+ .p2align 4,, 8
|
||||
+L(return_new_match):
|
||||
+ VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
+ vpmovmskb %ymm4, %edx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rdx, %rcx
|
||||
+
|
||||
+ vpmovmskb %ymm10, %eax
|
||||
+ vpmovmskb %ymm6, %edx
|
||||
+ salq $32, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ blsmskq %rcx, %rcx
|
||||
+ andq %rcx, %rax
|
||||
+ jz L(return_old_match)
|
||||
+ bsrq %rax, %rax
|
||||
+ /* Search char cannot be zero so safe to just use lea for
|
||||
+ wcsrchr. */
|
||||
+ leaq (VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
-END (STRRCHR)
|
||||
+ .p2align 4,, 4
|
||||
+L(cross_page):
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rsi
|
||||
+ vmovdqu (%rsi), %ymm1
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ /* Shift out zero CHAR matches that are before the beginning of
|
||||
+ src (rdi). */
|
||||
+ shrxl %edi, %ecx, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(page_cross_continue)
|
||||
+ VPCMPEQ %ymm1, %ymm7, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+
|
||||
+ /* Shift out search CHAR matches that are before the beginning of
|
||||
+ src (rdi). */
|
||||
+ shrxl %edi, %eax, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* Check if any search CHAR match in range. */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret2)
|
||||
+ bsrl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ andq $-CHAR_SIZE, %rax
|
||||
+# endif
|
||||
+L(ret2):
|
||||
+ VZEROUPPER_RETURN
|
||||
+END(STRRCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,558 +0,0 @@
|
||||
From c966099cdc3e0fdf92f63eac09b22fa7e5f5f02d Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 21 Apr 2022 20:52:30 -0500
|
||||
Subject: [PATCH] x86: Optimize {str|wcs}rchr-evex
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The new code unrolls the main loop slightly without adding too much
|
||||
overhead and minimizes the comparisons for the search CHAR.
|
||||
|
||||
Geometric Mean of all benchmarks New / Old: 0.755
|
||||
See email for all results.
|
||||
|
||||
Full xcheck passes on x86_64 with and without multiarch enabled.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strrchr-evex.S | 471 +++++++++++++++---------
|
||||
1 file changed, 290 insertions(+), 181 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
index f920b5a5..f5b6d755 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
||||
@@ -24,242 +24,351 @@
|
||||
# define STRRCHR __strrchr_evex
|
||||
# endif
|
||||
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
+# define SHIFT_REG esi
|
||||
+
|
||||
+# define kunpck kunpckbw
|
||||
+# define kmov_2x kmovd
|
||||
+# define maskz_2x ecx
|
||||
+# define maskm_2x eax
|
||||
+# define CHAR_SIZE 4
|
||||
+# define VPMIN vpminud
|
||||
+# define VPTESTN vptestnmd
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
-# define VPCMP vpcmpd
|
||||
-# define SHIFT_REG r8d
|
||||
+# define VPCMP vpcmpd
|
||||
# else
|
||||
+# define SHIFT_REG edi
|
||||
+
|
||||
+# define kunpck kunpckdq
|
||||
+# define kmov_2x kmovq
|
||||
+# define maskz_2x rcx
|
||||
+# define maskm_2x rax
|
||||
+
|
||||
+# define CHAR_SIZE 1
|
||||
+# define VPMIN vpminub
|
||||
+# define VPTESTN vptestnmb
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
-# define VPCMP vpcmpb
|
||||
-# define SHIFT_REG ecx
|
||||
+# define VPCMP vpcmpb
|
||||
# endif
|
||||
|
||||
# define XMMZERO xmm16
|
||||
# define YMMZERO ymm16
|
||||
# define YMMMATCH ymm17
|
||||
-# define YMM1 ymm18
|
||||
+# define YMMSAVE ymm18
|
||||
+
|
||||
+# define YMM1 ymm19
|
||||
+# define YMM2 ymm20
|
||||
+# define YMM3 ymm21
|
||||
+# define YMM4 ymm22
|
||||
+# define YMM5 ymm23
|
||||
+# define YMM6 ymm24
|
||||
+# define YMM7 ymm25
|
||||
+# define YMM8 ymm26
|
||||
|
||||
-# define VEC_SIZE 32
|
||||
|
||||
- .section .text.evex,"ax",@progbits
|
||||
-ENTRY (STRRCHR)
|
||||
- movl %edi, %ecx
|
||||
+# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
+ .section .text.evex, "ax", @progbits
|
||||
+ENTRY(STRRCHR)
|
||||
+ movl %edi, %eax
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
VPBROADCAST %esi, %YMMMATCH
|
||||
|
||||
- vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
-
|
||||
- /* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(cross_page_boundary)
|
||||
|
||||
+L(page_cross_continue):
|
||||
VMOVU (%rdi), %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ /* k0 has a 1 for each zero CHAR in YMM1. */
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
-
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
testl %ecx, %ecx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- xorl %edx, %edx
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec):
|
||||
- /* Check if there is a null byte. */
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(char_and_nul_in_first_vec)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- movq %rdi, %rsi
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ jz L(aligned_more)
|
||||
+ /* fallthrough: zero CHAR in first VEC. */
|
||||
|
||||
+ /* K1 has a 1 for each search CHAR match in YMM1. */
|
||||
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Build mask up until first zero CHAR (used to mask off
|
||||
+ potential search CHAR matches past the end of the string).
|
||||
+ */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret0)
|
||||
+ /* Get last match (the `andl` removed any out of bounds
|
||||
+ matches). */
|
||||
+ bsrl %eax, %eax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
- bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
+L(ret0):
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
+ /* Returns for first vec x1/x2/x3 have hard coded backward
|
||||
+ search path for earlier matches. */
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x1):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* eax non-zero if search CHAR in range. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(first_vec_x1_return)
|
||||
+
|
||||
+ /* fallthrough: no match in YMM2 then need to check for earlier
|
||||
+ matches (in YMM1). */
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x0_test):
|
||||
VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %edx
|
||||
kmovd %k1, %eax
|
||||
-
|
||||
- shrxl %SHIFT_REG, %edx, %edx
|
||||
- shrxl %SHIFT_REG, %eax, %eax
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- /* Check if there is a CHAR. */
|
||||
testl %eax, %eax
|
||||
- jnz L(found_char)
|
||||
-
|
||||
- testl %edx, %edx
|
||||
- jnz L(return_null)
|
||||
-
|
||||
- jmp L(aligned_loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(found_char):
|
||||
- testl %edx, %edx
|
||||
- jnz L(char_and_nul)
|
||||
-
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
- leaq (%rdi, %rcx), %rsi
|
||||
+ jz L(ret1)
|
||||
+ bsrl %eax, %eax
|
||||
+# ifdef USE_AS_WCSRCHR
|
||||
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %rsi, %rax
|
||||
+# endif
|
||||
+L(ret1):
|
||||
+ ret
|
||||
|
||||
- .p2align 4
|
||||
-L(aligned_loop):
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x1_or_x2):
|
||||
+ VPCMP $0, %YMM3, %YMMMATCH, %k3
|
||||
+ VPCMP $0, %YMM2, %YMMMATCH, %k2
|
||||
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
|
||||
+ matches between either of them. Otherwise check YMM1. */
|
||||
+ kortestd %k2, %k3
|
||||
+ jz L(first_vec_x0_test)
|
||||
+
|
||||
+ /* Guaranteed that YMM2 and YMM3 are within range so merge the
|
||||
+ two bitmasks then get last result. */
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmovq %k3, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %ecx
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x3):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM4, %k1
|
||||
kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- add $VEC_SIZE, %rdi
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x0_x1_test):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Check YMM2 for last match first. If no match try YMM1. */
|
||||
+ testl %eax, %eax
|
||||
+ jz L(first_vec_x0_test)
|
||||
+ .p2align 4,, 4
|
||||
+L(first_vec_x1_return):
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
- kmovd %k0, %ecx
|
||||
+ .p2align 4,, 10
|
||||
+L(first_vec_x2):
|
||||
+ VPCMP $0, %YMMMATCH, %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ /* Check YMM3 for last match first. If no match try YMM2/YMM1.
|
||||
+ */
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(first_vec_x0_x1_test)
|
||||
+ bsrl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Need to keep original pointer in case YMM1 has last match. */
|
||||
+ movq %rdi, %rsi
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ VMOVU VEC_SIZE(%rdi), %YMM2
|
||||
+ VPTESTN %YMM2, %YMM2, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jnz L(char_nor_null)
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x1)
|
||||
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM3
|
||||
+ VPTESTN %YMM3, %YMM3, %k0
|
||||
+ kmovd %k0, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x2)
|
||||
|
||||
- /* Each bit in K0 represents a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
- /* Each bit in K1 represents a CHAR in YMM1. */
|
||||
- VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM4
|
||||
+ VPTESTN %YMM4, %YMM4, %k0
|
||||
kmovd %k0, %ecx
|
||||
- kmovd %k1, %eax
|
||||
- orl %eax, %ecx
|
||||
- jz L(aligned_loop)
|
||||
+ movq %rdi, %r8
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(first_vec_x3)
|
||||
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
.p2align 4
|
||||
-L(char_nor_null):
|
||||
- /* Find a CHAR or a null byte in a loop. */
|
||||
+L(first_aligned_loop):
|
||||
+ /* Preserve YMM1, YMM2, YMM3, and YMM4 until we can guarantee
|
||||
+ they don't store a match. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM5
|
||||
+ VMOVA (VEC_SIZE * 5)(%rdi), %YMM6
|
||||
+
|
||||
+ VPCMP $0, %YMM5, %YMMMATCH, %k2
|
||||
+ vpxord %YMM6, %YMMMATCH, %YMM7
|
||||
+
|
||||
+ VPMIN %YMM5, %YMM6, %YMM8
|
||||
+ VPMIN %YMM8, %YMM7, %YMM7
|
||||
+
|
||||
+ VPTESTN %YMM7, %YMM7, %k1
|
||||
+ subq $(VEC_SIZE * -2), %rdi
|
||||
+ kortestd %k1, %k2
|
||||
+ jz L(first_aligned_loop)
|
||||
+
|
||||
+ VPCMP $0, %YMM6, %YMMMATCH, %k3
|
||||
+ VPTESTN %YMM8, %YMM8, %k1
|
||||
+ ktestd %k1, %k1
|
||||
+ jz L(second_aligned_loop_prep)
|
||||
+
|
||||
+ kortestd %k2, %k3
|
||||
+ jnz L(return_first_aligned_loop)
|
||||
+
|
||||
+ .p2align 4,, 6
|
||||
+L(first_vec_x1_or_x2_or_x3):
|
||||
+ VPCMP $0, %YMM4, %YMMMATCH, %k4
|
||||
+ kmovd %k4, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(match)
|
||||
-L(return_value):
|
||||
- testl %edx, %edx
|
||||
- jz L(return_null)
|
||||
- movl %edx, %eax
|
||||
- movq %rsi, %rdi
|
||||
+ jz L(first_vec_x1_or_x2)
|
||||
bsrl %eax, %eax
|
||||
-# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(match):
|
||||
- /* Find a CHAR. Check if there is a null byte. */
|
||||
- kmovd %k0, %ecx
|
||||
- testl %ecx, %ecx
|
||||
- jnz L(find_nul)
|
||||
+ .p2align 4,, 8
|
||||
+L(return_first_aligned_loop):
|
||||
+ VPTESTN %YMM5, %YMM5, %k0
|
||||
+ kunpck %k0, %k1, %k0
|
||||
+ kmov_2x %k0, %maskz_2x
|
||||
+
|
||||
+ blsmsk %maskz_2x, %maskz_2x
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmov_2x %k3, %maskm_2x
|
||||
+ and %maskz_2x, %maskm_2x
|
||||
+ jz L(first_vec_x1_or_x2_or_x3)
|
||||
|
||||
- /* Remember the match and keep searching. */
|
||||
- movl %eax, %edx
|
||||
+ bsr %maskm_2x, %maskm_2x
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+ /* We can throw away the work done for the first 4x checks here
|
||||
+ as we have a later match. This is the 'fast' path per se.
|
||||
+ */
|
||||
+L(second_aligned_loop_prep):
|
||||
+L(second_aligned_loop_set_furthest_match):
|
||||
movq %rdi, %rsi
|
||||
- jmp L(aligned_loop)
|
||||
+ kunpck %k2, %k3, %k4
|
||||
|
||||
.p2align 4
|
||||
-L(find_nul):
|
||||
- /* Mask out any matching bits after the null byte. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
- testl %eax, %eax
|
||||
- /* If there is no CHAR here, return the remembered one. */
|
||||
- jz L(return_value)
|
||||
- bsrl %eax, %eax
|
||||
+L(second_aligned_loop):
|
||||
+ VMOVU (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+ VMOVU (VEC_SIZE * 5)(%rdi), %YMM2
|
||||
+
|
||||
+ VPCMP $0, %YMM1, %YMMMATCH, %k2
|
||||
+ vpxord %YMM2, %YMMMATCH, %YMM3
|
||||
+
|
||||
+ VPMIN %YMM1, %YMM2, %YMM4
|
||||
+ VPMIN %YMM3, %YMM4, %YMM3
|
||||
+
|
||||
+ VPTESTN %YMM3, %YMM3, %k1
|
||||
+ subq $(VEC_SIZE * -2), %rdi
|
||||
+ kortestd %k1, %k2
|
||||
+ jz L(second_aligned_loop)
|
||||
+
|
||||
+ VPCMP $0, %YMM2, %YMMMATCH, %k3
|
||||
+ VPTESTN %YMM4, %YMM4, %k1
|
||||
+ ktestd %k1, %k1
|
||||
+ jz L(second_aligned_loop_set_furthest_match)
|
||||
+
|
||||
+ kortestd %k2, %k3
|
||||
+ /* branch here because there is a significant advantage in terms
|
||||
+ of output dependency chance in using edx. */
|
||||
+ jnz L(return_new_match)
|
||||
+L(return_old_match):
|
||||
+ kmovq %k4, %rax
|
||||
+ bsrq %rax, %rax
|
||||
+ leaq (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+L(return_new_match):
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
+ kunpck %k0, %k1, %k0
|
||||
+ kmov_2x %k0, %maskz_2x
|
||||
+
|
||||
+ blsmsk %maskz_2x, %maskz_2x
|
||||
+ kunpck %k2, %k3, %k3
|
||||
+ kmov_2x %k3, %maskm_2x
|
||||
+ and %maskz_2x, %maskm_2x
|
||||
+ jz L(return_old_match)
|
||||
+
|
||||
+ bsr %maskm_2x, %maskm_2x
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+L(cross_page_boundary):
|
||||
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
|
||||
+ rax` sets pointer with all page offset bits cleared so
|
||||
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
|
||||
+ before page cross (guaranteed to be safe to read). Doing this
|
||||
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
|
||||
+ a bit of code size. */
|
||||
+ xorq %rdi, %rax
|
||||
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %YMM1
|
||||
+ VPTESTN %YMM1, %YMM1, %k0
|
||||
+ kmovd %k0, %ecx
|
||||
+
|
||||
+ /* Shift out zero CHAR matches that are before the beginning of
|
||||
+ src (rdi). */
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ movl %edi, %esi
|
||||
+ andl $(VEC_SIZE - 1), %esi
|
||||
+ shrl $2, %esi
|
||||
# endif
|
||||
- ret
|
||||
+ shrxl %SHIFT_REG, %ecx, %ecx
|
||||
|
||||
- .p2align 4
|
||||
-L(char_and_nul):
|
||||
- /* Find both a CHAR and a null byte. */
|
||||
- addq %rcx, %rdi
|
||||
- movl %edx, %ecx
|
||||
-L(char_and_nul_in_first_vec):
|
||||
- /* Mask out any matching bits after the null byte. */
|
||||
- movl %ecx, %r8d
|
||||
- subl $1, %r8d
|
||||
- xorl %ecx, %r8d
|
||||
- andl %r8d, %eax
|
||||
- testl %eax, %eax
|
||||
- /* Return null pointer if the null byte comes first. */
|
||||
- jz L(return_null)
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(page_cross_continue)
|
||||
+
|
||||
+ /* Found zero CHAR so need to test for search CHAR. */
|
||||
+ VPCMP $0, %YMMMATCH, %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Shift out search CHAR matches that are before the beginning of
|
||||
+ src (rdi). */
|
||||
+ shrxl %SHIFT_REG, %eax, %eax
|
||||
+
|
||||
+ /* Check if any search CHAR match in range. */
|
||||
+ blsmskl %ecx, %ecx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(ret3)
|
||||
bsrl %eax, %eax
|
||||
# ifdef USE_AS_WCSRCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq -VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- leaq -VEC_SIZE(%rdi, %rax), %rax
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
+L(ret3):
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(return_null):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
-END (STRRCHR)
|
||||
+END(STRRCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,73 +0,0 @@
|
||||
From 911c63a51c690dd1a97dfc587097277029baf00f Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 27 Apr 2022 15:13:02 -0500
|
||||
Subject: [PATCH] sysdeps: Add 'get_fast_jitter' interface in fast-jitter.h
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
'get_fast_jitter' is meant to be used purely for performance
|
||||
purposes. In all cases where it is used it should be acceptable to get no
|
||||
randomness (see default case). An example use case is in setting
|
||||
jitter for retries between threads at a lock. There is a
|
||||
performance benefit to having jitter, but only if the jitter can
|
||||
be generated very quickly and ultimately there is no serious issue
|
||||
if no jitter is generated.
|
||||
|
||||
The implementation generally uses 'HP_TIMING_NOW' iff it is
|
||||
inlined (avoid any potential syscall paths).
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/generic/fast-jitter.h | 42 +++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 42 insertions(+)
|
||||
create mode 100644 sysdeps/generic/fast-jitter.h
|
||||
|
||||
diff --git a/sysdeps/generic/fast-jitter.h b/sysdeps/generic/fast-jitter.h
|
||||
new file mode 100644
|
||||
index 00000000..4dd53e34
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/generic/fast-jitter.h
|
||||
@@ -0,0 +1,42 @@
|
||||
+/* Fallback for fast jitter just return 0.
|
||||
+ Copyright (C) 2019-2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifndef _FAST_JITTER_H
|
||||
+# define _FAST_JITTER_H
|
||||
+
|
||||
+# include <stdint.h>
|
||||
+# include <hp-timing.h>
|
||||
+
|
||||
+/* Baseline just return 0. We could create jitter using a clock or
|
||||
+ 'random_bits' but that may imply a syscall and the goal of
|
||||
+ 'get_fast_jitter' is minimal overhead "randomness" when such
|
||||
+ randomness helps performance. Adding high overhead the function
|
||||
+ defeats the purpose. */
|
||||
+static inline uint32_t
|
||||
+get_fast_jitter (void)
|
||||
+{
|
||||
+# if HP_TIMING_INLINE
|
||||
+ hp_timing_t jitter;
|
||||
+ HP_TIMING_NOW (jitter);
|
||||
+ return (uint32_t) jitter;
|
||||
+# else
|
||||
+ return 0;
|
||||
+# endif
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,226 +0,0 @@
|
||||
From 8162147872491bb5b48e91543b19c49a29ae6b6d Mon Sep 17 00:00:00 2001
|
||||
From: Wangyang Guo <wangyang.guo@intel.com>
|
||||
Date: Fri, 6 May 2022 01:50:10 +0000
|
||||
Subject: [PATCH] nptl: Add backoff mechanism to spinlock loop
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
When multiple threads are waiting for a lock at the same time, once the lock owner
|
||||
releases the lock, waiters will see lock available and all try to lock,
|
||||
which may cause an expensive CAS storm.
|
||||
|
||||
Binary exponential backoff with random jitter is introduced. As try-lock
|
||||
attempts increase, it is more likely that a larger number of threads
|
||||
compete for the adaptive mutex lock, so the wait time is increased exponentially.
|
||||
A random jitter is also added to avoid synchronous try-lock from other
|
||||
threads.
|
||||
|
||||
v2: Remove read-check before try-lock for performance.
|
||||
|
||||
v3:
|
||||
1. Restore read-check since it works well in some platform.
|
||||
2. Make backoff arch dependent, and enable it for x86_64.
|
||||
3. Limit max backoff to reduce latency in large critical section.
|
||||
|
||||
v4: Fix strict-prototypes error in sysdeps/nptl/pthread_mutex_backoff.h
|
||||
|
||||
v5: Commit log updated for regression in large critical section.
|
||||
|
||||
Result of pthread-mutex-locks bench
|
||||
|
||||
Test Platform: Xeon 8280L (2 socket, 112 CPUs in total)
|
||||
First Row: thread number
|
||||
First Col: critical section length
|
||||
Values: backoff vs upstream, time based, low is better
|
||||
|
||||
non-critical-length: 1
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 0.99 0.58 0.52 0.49 0.43 0.44 0.46 0.52 0.54
|
||||
1 0.98 0.43 0.56 0.50 0.44 0.45 0.50 0.56 0.57
|
||||
2 0.99 0.41 0.57 0.51 0.45 0.47 0.48 0.60 0.61
|
||||
4 0.99 0.45 0.59 0.53 0.48 0.49 0.52 0.64 0.65
|
||||
8 1.00 0.66 0.71 0.63 0.56 0.59 0.66 0.72 0.71
|
||||
16 0.97 0.78 0.91 0.73 0.67 0.70 0.79 0.80 0.80
|
||||
32 0.95 1.17 0.98 0.87 0.82 0.86 0.89 0.90 0.90
|
||||
64 0.96 0.95 1.01 1.01 0.98 1.00 1.03 0.99 0.99
|
||||
128 0.99 1.01 1.01 1.17 1.08 1.12 1.02 0.97 1.02
|
||||
|
||||
non-critical-length: 32
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 1.03 0.97 0.75 0.65 0.58 0.58 0.56 0.70 0.70
|
||||
1 0.94 0.95 0.76 0.65 0.58 0.58 0.61 0.71 0.72
|
||||
2 0.97 0.96 0.77 0.66 0.58 0.59 0.62 0.74 0.74
|
||||
4 0.99 0.96 0.78 0.66 0.60 0.61 0.66 0.76 0.77
|
||||
8 0.99 0.99 0.84 0.70 0.64 0.66 0.71 0.80 0.80
|
||||
16 0.98 0.97 0.95 0.76 0.70 0.73 0.81 0.85 0.84
|
||||
32 1.04 1.12 1.04 0.89 0.82 0.86 0.93 0.91 0.91
|
||||
64 0.99 1.15 1.07 1.00 0.99 1.01 1.05 0.99 0.99
|
||||
128 1.00 1.21 1.20 1.22 1.25 1.31 1.12 1.10 0.99
|
||||
|
||||
non-critical-length: 128
|
||||
1 2 4 8 16 32 64 112 140
|
||||
0 1.02 1.00 0.99 0.67 0.61 0.61 0.61 0.74 0.73
|
||||
1 0.95 0.99 1.00 0.68 0.61 0.60 0.60 0.74 0.74
|
||||
2 1.00 1.04 1.00 0.68 0.59 0.61 0.65 0.76 0.76
|
||||
4 1.00 0.96 0.98 0.70 0.63 0.63 0.67 0.78 0.77
|
||||
8 1.01 1.02 0.89 0.73 0.65 0.67 0.71 0.81 0.80
|
||||
16 0.99 0.96 0.96 0.79 0.71 0.73 0.80 0.84 0.84
|
||||
32 0.99 0.95 1.05 0.89 0.84 0.85 0.94 0.92 0.91
|
||||
64 1.00 0.99 1.16 1.04 1.00 1.02 1.06 0.99 0.99
|
||||
128 1.00 1.06 0.98 1.14 1.39 1.26 1.08 1.02 0.98
|
||||
|
||||
There is a regression for large critical sections. But adaptive mutex is
|
||||
aimed for "quick" locks. Small critical section is more common when
|
||||
users choose to use adaptive pthread_mutex.
|
||||
|
||||
Signed-off-by: Wangyang Guo <wangyang.guo@intel.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
Conflicts:
|
||||
pthreadP.h
|
||||
(had been moved)
|
||||
nptl/pthread_mutex_lock.c
|
||||
(max_adaptive_count renamed)
|
||||
|
||||
---
|
||||
nptl/pthreadP.h | 1 +
|
||||
nptl/pthread_mutex_lock.c | 16 +++++++--
|
||||
sysdeps/nptl/pthread_mutex_backoff.h | 35 ++++++++++++++++++
|
||||
sysdeps/x86_64/nptl/pthread_mutex_backoff.h | 39 +++++++++++++++++++++
|
||||
4 files changed, 89 insertions(+), 2 deletions(-)
|
||||
create mode 100644 sysdeps/nptl/pthread_mutex_backoff.h
|
||||
create mode 100644 sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
|
||||
diff --git a/nptl/pthreadP.h b/nptl/pthreadP.h
|
||||
index 7ddc166c..1550e3b6 100644
|
||||
--- a/nptl/pthreadP.h
|
||||
+++ b/nptl/pthreadP.h
|
||||
@@ -33,6 +33,7 @@
|
||||
#include <kernel-features.h>
|
||||
#include <errno.h>
|
||||
#include <internal-signals.h>
|
||||
+#include <pthread_mutex_backoff.h>
|
||||
|
||||
|
||||
/* Atomic operations on TLS memory. */
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index d96a9933..c7770fc9 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -133,14 +133,26 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
||||
int cnt = 0;
|
||||
int max_cnt = MIN (MAX_ADAPTIVE_COUNT,
|
||||
mutex->__data.__spins * 2 + 10);
|
||||
+ int spin_count, exp_backoff = 1;
|
||||
+ unsigned int jitter = get_jitter ();
|
||||
do
|
||||
{
|
||||
- if (cnt++ >= max_cnt)
|
||||
+ /* In each loop, spin count is exponential backoff plus
|
||||
+ random jitter, random range is [0, exp_backoff-1]. */
|
||||
+ spin_count = exp_backoff + (jitter & (exp_backoff - 1));
|
||||
+ cnt += spin_count;
|
||||
+ if (cnt >= max_cnt)
|
||||
{
|
||||
+ /* If cnt exceeds max spin count, just go to wait
|
||||
+ queue. */
|
||||
LLL_MUTEX_LOCK (mutex);
|
||||
break;
|
||||
}
|
||||
- atomic_spin_nop ();
|
||||
+ do
|
||||
+ atomic_spin_nop ();
|
||||
+ while (--spin_count > 0);
|
||||
+ /* Prepare for next loop. */
|
||||
+ exp_backoff = get_next_backoff (exp_backoff);
|
||||
}
|
||||
while (LLL_MUTEX_READ_LOCK (mutex) != 0
|
||||
|| LLL_MUTEX_TRYLOCK (mutex) != 0);
|
||||
diff --git a/sysdeps/nptl/pthread_mutex_backoff.h b/sysdeps/nptl/pthread_mutex_backoff.h
|
||||
new file mode 100644
|
||||
index 00000000..5b26c22a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/nptl/pthread_mutex_backoff.h
|
||||
@@ -0,0 +1,35 @@
|
||||
+/* Pthread mutex backoff configuration.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
||||
+
|
||||
+static inline unsigned int
|
||||
+get_jitter (void)
|
||||
+{
|
||||
+ /* Arch dependent random jitter, return 0 disables random. */
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static inline int
|
||||
+get_next_backoff (int backoff)
|
||||
+{
|
||||
+ /* Next backoff, return 1 disables mutex backoff. */
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/nptl/pthread_mutex_backoff.h b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
new file mode 100644
|
||||
index 00000000..ec74c3d9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/nptl/pthread_mutex_backoff.h
|
||||
@@ -0,0 +1,39 @@
|
||||
+/* Pthread mutex backoff configuration.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+#ifndef _PTHREAD_MUTEX_BACKOFF_H
|
||||
+#define _PTHREAD_MUTEX_BACKOFF_H 1
|
||||
+
|
||||
+#include <fast-jitter.h>
|
||||
+
|
||||
+static inline unsigned int
|
||||
+get_jitter (void)
|
||||
+{
|
||||
+ return get_fast_jitter ();
|
||||
+}
|
||||
+
|
||||
+#define MAX_BACKOFF 16
|
||||
+
|
||||
+static inline int
|
||||
+get_next_backoff (int backoff)
|
||||
+{
|
||||
+ /* Binary expontial backoff. Limiting max backoff
|
||||
+ can reduce latency in large critical section. */
|
||||
+ return (backoff < MAX_BACKOFF) ? backoff << 1 : backoff;
|
||||
+}
|
||||
+
|
||||
+#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,55 +0,0 @@
|
||||
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 15 Feb 2022 08:18:15 -0600
|
||||
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
|
||||
#28896]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
|
||||
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
|
||||
no checks around vzeroupper and would trigger spurious
|
||||
aborts. This commit fixes that.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
|
||||
AVX2 machines with and without RTM.
|
||||
|
||||
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 8 ++------
|
||||
1 file changed, 2 insertions(+), 6 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
(split into two patches due to upstream bug differences)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 28cc98b6..e267c6cb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -345,10 +345,10 @@ L(one_or_less):
|
||||
movq %LOCALE_REG, %rdx
|
||||
# endif
|
||||
jb L(ret_zero)
|
||||
-# ifdef USE_AS_WCSCMP
|
||||
/* 'nbe' covers the case where length is negative (large
|
||||
unsigned). */
|
||||
- jnbe __wcscmp_avx2
|
||||
+ jnbe OVERFLOW_STRCMP
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
movl (%rdi), %edx
|
||||
xorl %eax, %eax
|
||||
cmpl (%rsi), %edx
|
||||
@@ -357,10 +357,6 @@ L(one_or_less):
|
||||
negl %eax
|
||||
orl $1, %eax
|
||||
# else
|
||||
- /* 'nbe' covers the case where length is negative (large
|
||||
- unsigned). */
|
||||
-
|
||||
- jnbe __strcmp_avx2
|
||||
movzbl (%rdi), %eax
|
||||
movzbl (%rsi), %ecx
|
||||
TOLOWER_gpr (%rax, %eax)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,60 +0,0 @@
|
||||
From 259a17cc98058d2576511201f85d28cb5d9de2a2 Mon Sep 17 00:00:00 2001
|
||||
From: Stefan Liebler <stli@linux.ibm.com>
|
||||
Date: Mon, 28 Jun 2021 13:01:07 +0200
|
||||
Subject: s390x: Update math: redirect roundeven function
|
||||
|
||||
After recent commit
|
||||
447954a206837b5f153869cfeeeab44631c3fac9
|
||||
"math: redirect roundeven function", building on
|
||||
s390x fails with:
|
||||
Error: symbol `__roundevenl' is already defined
|
||||
|
||||
Similar to aarch64/riscv fix, this patch redirects target
|
||||
specific functions for s390x:
|
||||
commit 3213ed770cbc5821920d16caa93c85e92dd7b9f6
|
||||
"Update math: redirect roundeven function"
|
||||
|
||||
diff --git a/sysdeps/s390/fpu/s_roundeven.c b/sysdeps/s390/fpu/s_roundeven.c
|
||||
index 40b07e054b..0773adfed0 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundeven.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundeven.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <libm-alias-double.h>
|
||||
|
||||
@@ -31,7 +32,6 @@ __roundeven (double x)
|
||||
__asm__ ("fidbra %0,4,%1,4" : "=f" (y) : "f" (x));
|
||||
return y;
|
||||
}
|
||||
-hidden_def (__roundeven)
|
||||
libm_alias_double (__roundeven, roundeven)
|
||||
|
||||
#else
|
||||
diff --git a/sysdeps/s390/fpu/s_roundevenf.c b/sysdeps/s390/fpu/s_roundevenf.c
|
||||
index d2fbf3d2b6..289785bc4a 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundevenf.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundevenf.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <libm-alias-float.h>
|
||||
|
||||
diff --git a/sysdeps/s390/fpu/s_roundevenl.c b/sysdeps/s390/fpu/s_roundevenl.c
|
||||
index 29ab7a8616..94b6459ab4 100644
|
||||
--- a/sysdeps/s390/fpu/s_roundevenl.c
|
||||
+++ b/sysdeps/s390/fpu/s_roundevenl.c
|
||||
@@ -18,6 +18,7 @@
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
#ifdef HAVE_S390_MIN_Z196_ZARCH_ASM_SUPPORT
|
||||
+# define NO_MATH_REDIRECT
|
||||
# include <math.h>
|
||||
# include <math_private.h>
|
||||
# include <libm-alias-ldouble.h>
|
@ -1,74 +0,0 @@
|
||||
From 1da50d4bda07f04135dca39f40e79fc9eabed1f8 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 26 Feb 2021 05:36:59 -0800
|
||||
Subject: [PATCH] x86: Set Prefer_No_VZEROUPPER and add Prefer_AVX2_STRCMP
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
1. Set Prefer_No_VZEROUPPER if RTM is usable to avoid RTM abort triggered
|
||||
by VZEROUPPER inside a transactionally executing RTM region.
|
||||
2. Since to compare 2 32-byte strings, 256-bit EVEX strcmp requires 2
|
||||
loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp requires 1 load, 2 VPCMPEQs,
|
||||
1 VPMINU and 1 VPMOVMSKB, AVX2 strcmp is faster than EVEX strcmp. Add
|
||||
Prefer_AVX2_STRCMP to prefer AVX2 strcmp family functions.
|
||||
---
|
||||
sysdeps/x86/cpu-features.c | 20 +++++++++++++++++--
|
||||
sysdeps/x86/cpu-tunables.c | 2 ++
|
||||
...cpu-features-preferred_feature_index_1.def | 1 +
|
||||
3 files changed, 21 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 91042505..3610ee5c 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -524,8 +524,24 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
||||
|= bit_arch_Prefer_No_VZEROUPPER;
|
||||
else
|
||||
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
- |= bit_arch_Prefer_No_AVX512;
|
||||
+ {
|
||||
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
+ |= bit_arch_Prefer_No_AVX512;
|
||||
+
|
||||
+ /* Avoid RTM abort triggered by VZEROUPPER inside a
|
||||
+ transactionally executing RTM region. */
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ cpu_features->preferred[index_arch_Prefer_No_VZEROUPPER]
|
||||
+ |= bit_arch_Prefer_No_VZEROUPPER;
|
||||
+
|
||||
+ /* Since to compare 2 32-byte strings, 256-bit EVEX strcmp
|
||||
+ requires 2 loads, 3 VPCMPs and 2 KORDs while AVX2 strcmp
|
||||
+ requires 1 load, 2 VPCMPEQs, 1 VPMINU and 1 VPMOVMSKB,
|
||||
+ AVX2 strcmp is faster than EVEX strcmp. */
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
+ cpu_features->preferred[index_arch_Prefer_AVX2_STRCMP]
|
||||
+ |= bit_arch_Prefer_AVX2_STRCMP;
|
||||
+ }
|
||||
}
|
||||
/* This spells out "AuthenticAMD". */
|
||||
else if (ebx == 0x68747541 && ecx == 0x444d4163 && edx == 0x69746e65)
|
||||
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
|
||||
index 3173b2b9..73adbaba 100644
|
||||
--- a/sysdeps/x86/cpu-tunables.c
|
||||
+++ b/sysdeps/x86/cpu-tunables.c
|
||||
@@ -239,6 +239,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
|
||||
CHECK_GLIBC_IFUNC_PREFERRED_BOTH (n, cpu_features,
|
||||
Fast_Copy_Backward,
|
||||
disable, 18);
|
||||
+ CHECK_GLIBC_IFUNC_PREFERRED_NEED_BOTH
|
||||
+ (n, cpu_features, Prefer_AVX2_STRCMP, AVX2, disable, 18);
|
||||
}
|
||||
break;
|
||||
case 19:
|
||||
diff --git a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
index 17a5cc42..4ca70b40 100644
|
||||
--- a/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
+++ b/sysdeps/x86/include/cpu-features-preferred_feature_index_1.def
|
||||
@@ -32,3 +32,4 @@ BIT (Prefer_ERMS)
|
||||
BIT (Prefer_FSRM)
|
||||
BIT (Prefer_No_AVX512)
|
||||
BIT (MathVec_Prefer_No_AVX512)
|
||||
+BIT (Prefer_AVX2_STRCMP)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,26 +0,0 @@
|
||||
From 3213ed770cbc5821920d16caa93c85e92dd7b9f6 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 13:29:41 -0700
|
||||
Subject: Update math: redirect roundeven function
|
||||
|
||||
Redirect target specific roundeven functions for aarch64, ldbl-128ibm
|
||||
and riscv.
|
||||
|
||||
Conflicts:
|
||||
sysdeps/aarch64/*
|
||||
(not needed)
|
||||
sysdeps/riscv/*
|
||||
(not supported)
|
||||
|
||||
diff --git a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
index 6701970f4a..90eecf496b 100644
|
||||
--- a/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
+++ b/sysdeps/ieee754/ldbl-128ibm/s_roundevenl.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,242 +0,0 @@
|
||||
From 63ad43566f7a25d140dc723598aeb441ad657eed Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 06:46:08 -0800
|
||||
Subject: [PATCH] x86-64: Add memmove family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memmove.h to select the function optimized with 256-bit EVEX
|
||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 36 +++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 21 +++++++++--
|
||||
.../multiarch/memmove-evex-unaligned-erms.S | 33 +++++++++++++++++
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 24 ++++++++-----
|
||||
5 files changed, 104 insertions(+), 11 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 46783cd1..4563fc56 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memset-avx2-unaligned-erms \
|
||||
memset-avx512-unaligned-erms \
|
||||
memchr-evex \
|
||||
+ memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
rawmemchr-evex \
|
||||
stpcpy-evex \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 082e4da3..6bd3abfc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -80,6 +80,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memmove_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__memmove_chk_ssse3_back)
|
||||
@@ -102,6 +108,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memmove_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memmove,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memmove_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_avx512_no_vzeroupper)
|
||||
@@ -565,6 +577,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memcpy_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcpy_chk_ssse3_back)
|
||||
@@ -587,6 +605,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__memcpy_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __memcpy_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
@@ -623,6 +647,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__mempcpy_chk_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (SSSE3),
|
||||
__mempcpy_chk_ssse3_back)
|
||||
@@ -654,6 +684,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
__mempcpy_avx_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __mempcpy_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
__mempcpy_ssse3_back)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy, CPU_FEATURE_USABLE (SSSE3),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
index 5e5f0299..6f8bce5f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
@@ -29,6 +29,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3_back) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx_unaligned_erms)
|
||||
attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
||||
@@ -59,10 +63,21 @@ IFUNC_SELECTOR (void)
|
||||
|
||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx_unaligned_erms);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (evex_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx_unaligned_erms);
|
||||
|
||||
- return OPTIMIZE (avx_unaligned);
|
||||
+ return OPTIMIZE (avx_unaligned);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (!CPU_FEATURE_USABLE_P (cpu_features, SSSE3)
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
new file mode 100644
|
||||
index 00000000..0cbce8f9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-evex-unaligned-erms.S
|
||||
@@ -0,0 +1,33 @@
|
||||
+#if IS_IN (libc)
|
||||
+# define VEC_SIZE 32
|
||||
+# define XMM0 xmm16
|
||||
+# define XMM1 xmm17
|
||||
+# define YMM0 ymm16
|
||||
+# define YMM1 ymm17
|
||||
+# define VEC0 ymm16
|
||||
+# define VEC1 ymm17
|
||||
+# define VEC2 ymm18
|
||||
+# define VEC3 ymm19
|
||||
+# define VEC4 ymm20
|
||||
+# define VEC5 ymm21
|
||||
+# define VEC6 ymm22
|
||||
+# define VEC7 ymm23
|
||||
+# define VEC8 ymm24
|
||||
+# define VEC9 ymm25
|
||||
+# define VEC10 ymm26
|
||||
+# define VEC11 ymm27
|
||||
+# define VEC12 ymm28
|
||||
+# define VEC13 ymm29
|
||||
+# define VEC14 ymm30
|
||||
+# define VEC15 ymm31
|
||||
+# define VEC(i) VEC##i
|
||||
+# define VMOVNT vmovntdq
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
+
|
||||
+# define SECTION(p) p##.evex
|
||||
+# define MEMMOVE_SYMBOL(p,s) p##_evex_##s
|
||||
+
|
||||
+# include "memmove-vec-unaligned-erms.S"
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index 274aa1c7..08e21692 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -48,6 +48,14 @@
|
||||
# define MEMMOVE_CHK_SYMBOL(p,s) MEMMOVE_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
+#ifndef XMM0
|
||||
+# define XMM0 xmm0
|
||||
+#endif
|
||||
+
|
||||
+#ifndef YMM0
|
||||
+# define YMM0 ymm0
|
||||
+#endif
|
||||
+
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
@@ -277,20 +285,20 @@ L(less_vec):
|
||||
#if VEC_SIZE > 32
|
||||
L(between_32_63):
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
- vmovdqu (%rsi), %ymm0
|
||||
- vmovdqu -32(%rsi,%rdx), %ymm1
|
||||
- vmovdqu %ymm0, (%rdi)
|
||||
- vmovdqu %ymm1, -32(%rdi,%rdx)
|
||||
+ VMOVU (%rsi), %YMM0
|
||||
+ VMOVU -32(%rsi,%rdx), %YMM1
|
||||
+ VMOVU %YMM0, (%rdi)
|
||||
+ VMOVU %YMM1, -32(%rdi,%rdx)
|
||||
VZEROUPPER
|
||||
ret
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
- vmovdqu (%rsi), %xmm0
|
||||
- vmovdqu -16(%rsi,%rdx), %xmm1
|
||||
- vmovdqu %xmm0, (%rdi)
|
||||
- vmovdqu %xmm1, -16(%rdi,%rdx)
|
||||
+ VMOVU (%rsi), %XMM0
|
||||
+ VMOVU -16(%rsi,%rdx), %XMM1
|
||||
+ VMOVU %XMM0, (%rdi)
|
||||
+ VMOVU %XMM1, -16(%rdi,%rdx)
|
||||
ret
|
||||
#endif
|
||||
L(between_8_15):
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,254 +0,0 @@
|
||||
From 1b968b6b9b3aac702ac2f133e0dd16cfdbb415ee Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 07:15:03 -0800
|
||||
Subject: [PATCH] x86-64: Add memset family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
||||
with 256-bit EVEX instructions using YMM16-YMM31 registers to avoid RTM
|
||||
abort with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
||||
function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 1 +
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 22 +++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 24 +++++++++++++++----
|
||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 13 ++++++----
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 24 +++++++++++++++++++
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 20 +++++++++++-----
|
||||
6 files changed, 90 insertions(+), 14 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 4563fc56..1cc0a10e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -43,6 +43,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memchr-evex \
|
||||
memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
+ memset-evex-unaligned-erms \
|
||||
rawmemchr-evex \
|
||||
stpcpy-evex \
|
||||
stpncpy-evex \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 6bd3abfc..7cf83485 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -160,6 +160,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__memset_chk_avx2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_chk_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memset_chk_avx512_unaligned_erms)
|
||||
@@ -185,6 +193,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__memset_avx2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, memset,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ __memset_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memset_avx512_unaligned_erms)
|
||||
@@ -555,6 +571,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__wmemset_avx2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __wmemset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__wmemset_avx512_unaligned))
|
||||
@@ -723,6 +742,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__wmemset_chk_avx2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ __wmemset_chk_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__wmemset_chk_avx512_unaligned))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
index 708bd72e..6f31f4dc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
@@ -27,6 +27,10 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned_erms)
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned_erms)
|
||||
attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned)
|
||||
attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned_erms)
|
||||
@@ -56,10 +60,22 @@ IFUNC_SELECTOR (void)
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx2_unaligned_erms);
|
||||
- else
|
||||
- return OPTIMIZE (avx2_unaligned);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (evex_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx2_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE (avx2_unaligned);
|
||||
+ }
|
||||
}
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
index eb242210..9290c4bf 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
@@ -20,6 +20,7 @@
|
||||
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_unaligned) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_unaligned) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx512_unaligned) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
@@ -27,14 +28,18 @@ IFUNC_SELECTOR (void)
|
||||
{
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx512_unaligned);
|
||||
- else
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
return OPTIMIZE (avx2_unaligned);
|
||||
}
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
new file mode 100644
|
||||
index 00000000..ae0a4d6e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -0,0 +1,24 @@
|
||||
+#if IS_IN (libc)
|
||||
+# define VEC_SIZE 32
|
||||
+# define XMM0 xmm16
|
||||
+# define YMM0 ymm16
|
||||
+# define VEC0 ymm16
|
||||
+# define VEC(i) VEC##i
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ movq r, %rax; \
|
||||
+ vpbroadcastb d, %VEC0
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ movq r, %rax; \
|
||||
+ vpbroadcastd d, %VEC0
|
||||
+
|
||||
+# define SECTION(p) p##.evex
|
||||
+# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
+# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
+
|
||||
+# include "memset-vec-unaligned-erms.S"
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 9a0fd818..71e91a8f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -34,6 +34,14 @@
|
||||
# define WMEMSET_CHK_SYMBOL(p,s) WMEMSET_SYMBOL(p, s)
|
||||
#endif
|
||||
|
||||
+#ifndef XMM0
|
||||
+# define XMM0 xmm0
|
||||
+#endif
|
||||
+
|
||||
+#ifndef YMM0
|
||||
+# define YMM0 ymm0
|
||||
+#endif
|
||||
+
|
||||
#ifndef VZEROUPPER
|
||||
# if VEC_SIZE > 16
|
||||
# define VZEROUPPER vzeroupper
|
||||
@@ -67,7 +75,7 @@
|
||||
ENTRY (__bzero)
|
||||
mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
- pxor %xmm0, %xmm0
|
||||
+ pxor %XMM0, %XMM0
|
||||
jmp L(entry_from_bzero)
|
||||
END (__bzero)
|
||||
weak_alias (__bzero, bzero)
|
||||
@@ -223,7 +231,7 @@ L(less_vec):
|
||||
cmpb $16, %dl
|
||||
jae L(between_16_31)
|
||||
# endif
|
||||
- MOVQ %xmm0, %rcx
|
||||
+ MOVQ %XMM0, %rcx
|
||||
cmpb $8, %dl
|
||||
jae L(between_8_15)
|
||||
cmpb $4, %dl
|
||||
@@ -238,16 +246,16 @@ L(less_vec):
|
||||
# if VEC_SIZE > 32
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- vmovdqu %ymm0, -32(%rdi,%rdx)
|
||||
- vmovdqu %ymm0, (%rdi)
|
||||
+ VMOVU %YMM0, -32(%rdi,%rdx)
|
||||
+ VMOVU %YMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
# endif
|
||||
# if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
- vmovdqu %xmm0, -16(%rdi,%rdx)
|
||||
- vmovdqu %xmm0, (%rdi)
|
||||
+ VMOVU %XMM0, -16(%rdi,%rdx)
|
||||
+ VMOVU %XMM0, (%rdi)
|
||||
VZEROUPPER
|
||||
ret
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,561 +0,0 @@
|
||||
From 91264fe3577fe887b4860923fa6142b5274c8965 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 5 Mar 2021 07:20:28 -0800
|
||||
Subject: [PATCH] x86-64: Add memcmp family functions with 256-bit EVEX
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memcmp.h to select the function optimized with 256-bit EVEX
|
||||
instructions using YMM16-YMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL, AVX512BW and MOVBE since VZEROUPPER isn't needed at function
|
||||
exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 10 +
|
||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 13 +-
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 440 ++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S | 4 +
|
||||
5 files changed, 467 insertions(+), 4 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
create mode 100644 sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 1cc0a10e..9d79b138 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -41,6 +41,7 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
memset-avx2-unaligned-erms \
|
||||
memset-avx512-unaligned-erms \
|
||||
memchr-evex \
|
||||
+ memcmp-evex-movbe \
|
||||
memmove-evex-unaligned-erms \
|
||||
memrchr-evex \
|
||||
memset-evex-unaligned-erms \
|
||||
@@ -81,7 +82,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
wcsncmp-evex \
|
||||
wcsnlen-evex \
|
||||
wcsrchr-evex \
|
||||
- wmemchr-evex
|
||||
+ wmemchr-evex \
|
||||
+ wmemcmp-evex-movbe
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),debug)
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 7cf83485..c8da910e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -56,6 +56,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_avx2_movbe)
|
||||
+ IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
||||
+ __memcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
__memcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
@@ -558,6 +563,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_avx2_movbe)
|
||||
+ IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (MOVBE)),
|
||||
+ __wmemcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
__wmemcmp_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSSE3),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
index 6c1f3153..3ca1f0a6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
@@ -23,17 +23,24 @@ extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (ssse3) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_movbe) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_movbe) attribute_hidden;
|
||||
|
||||
static inline void *
|
||||
IFUNC_SELECTOR (void)
|
||||
{
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
- return OPTIMIZE (avx2_movbe);
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ return OPTIMIZE (evex_movbe);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ return OPTIMIZE (avx2_movbe);
|
||||
+ }
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
return OPTIMIZE (sse4_1);
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
new file mode 100644
|
||||
index 00000000..9c093972
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -0,0 +1,440 @@
|
||||
+/* memcmp/wmemcmp optimized with 256-bit EVEX instructions.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#if IS_IN (libc)
|
||||
+
|
||||
+/* memcmp/wmemcmp is implemented as:
|
||||
+ 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
||||
+ to avoid branches.
|
||||
+ 2. Use overlapping compare to avoid branch.
|
||||
+ 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
||||
+ bytes for wmemcmp.
|
||||
+ 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
+ 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
+ area.
|
||||
+ 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
||||
+ 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
||||
+ 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
||||
+
|
||||
+# include <sysdep.h>
|
||||
+
|
||||
+# ifndef MEMCMP
|
||||
+# define MEMCMP __memcmp_evex_movbe
|
||||
+# endif
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+# define VPCMPEQ vpcmpeqd
|
||||
+# else
|
||||
+# define VPCMPEQ vpcmpeqb
|
||||
+# endif
|
||||
+
|
||||
+# define XMM1 xmm17
|
||||
+# define XMM2 xmm18
|
||||
+# define YMM1 ymm17
|
||||
+# define YMM2 ymm18
|
||||
+# define YMM3 ymm19
|
||||
+# define YMM4 ymm20
|
||||
+# define YMM5 ymm21
|
||||
+# define YMM6 ymm22
|
||||
+
|
||||
+# define VEC_SIZE 32
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+# define VEC_MASK 0xff
|
||||
+# define XMM_MASK 0xf
|
||||
+# else
|
||||
+# define VEC_MASK 0xffffffff
|
||||
+# define XMM_MASK 0xffff
|
||||
+# endif
|
||||
+
|
||||
+/* Warning!
|
||||
+ wmemcmp has to use SIGNED comparison for elements.
|
||||
+ memcmp has to use UNSIGNED comparison for elements.
|
||||
+*/
|
||||
+
|
||||
+ .section .text.evex,"ax",@progbits
|
||||
+ENTRY (MEMCMP)
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
+ jb L(less_vec)
|
||||
+
|
||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ jbe L(last_vec)
|
||||
+
|
||||
+ /* More than 2 * VEC. */
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+ ja L(more_8x_vec)
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jb L(last_4x_vec)
|
||||
+
|
||||
+ /* From 4 * VEC to 8 * VEC, inclusively. */
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+
|
||||
+ kandd %k1, %k2, %k5
|
||||
+ kandd %k3, %k4, %k6
|
||||
+ kandd %k5, %k6, %k6
|
||||
+
|
||||
+ kmovd %k6, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+
|
||||
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+ kandd %k1, %k2, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+ kandd %k3, %k5, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+ kandd %k4, %k5, %k5
|
||||
+
|
||||
+ kmovd %k5, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_2x_vec):
|
||||
+ /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+L(last_vec):
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
||||
+ leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec):
|
||||
+ /* A byte or int32 is different within 16 or 32 bytes. */
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (%rdi, %rcx, 4), %edx
|
||||
+ cmpl (%rsi, %rcx, 4), %edx
|
||||
+L(wmemcmp_return):
|
||||
+ setl %al
|
||||
+ negl %eax
|
||||
+ orl $1, %eax
|
||||
+# else
|
||||
+ movzbl (%rdi, %rcx), %eax
|
||||
+ movzbl (%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ .p2align 4
|
||||
+L(4):
|
||||
+ xorl %eax, %eax
|
||||
+ movl (%rdi), %edx
|
||||
+ cmpl (%rsi), %edx
|
||||
+ jne L(wmemcmp_return)
|
||||
+ ret
|
||||
+# else
|
||||
+ .p2align 4
|
||||
+L(between_4_7):
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches. */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ je L(exit)
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit):
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ shll $8, %eax
|
||||
+ shll $8, %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ movb -1(%rdi, %rdx), %al
|
||||
+ movb -1(%rsi, %rdx), %cl
|
||||
+ /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(1):
|
||||
+ movzbl (%rdi), %eax
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+# endif
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(less_vec):
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
||||
+ cmpb $4, %dl
|
||||
+ je L(4)
|
||||
+ jb L(zero)
|
||||
+# else
|
||||
+ cmpb $1, %dl
|
||||
+ je L(1)
|
||||
+ jb L(zero)
|
||||
+ cmpb $4, %dl
|
||||
+ jb L(between_2_3)
|
||||
+ cmpb $8, %dl
|
||||
+ jb L(between_4_7)
|
||||
+# endif
|
||||
+ cmpb $16, %dl
|
||||
+ jae L(between_16_31)
|
||||
+ /* It is between 8 and 15 bytes. */
|
||||
+ vmovq (%rdi), %XMM1
|
||||
+ vmovq (%rsi), %XMM2
|
||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -8(%rdi, %rdx), %rdi
|
||||
+ leaq -8(%rsi, %rdx), %rsi
|
||||
+ vmovq (%rdi), %XMM1
|
||||
+ vmovq (%rsi), %XMM2
|
||||
+ VPCMPEQ %XMM1, %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(between_16_31):
|
||||
+ /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
+ VMOVU (%rsi), %XMM2
|
||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -16(%rdi, %rdx), %rdi
|
||||
+ leaq -16(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %XMM2
|
||||
+ VPCMPEQ (%rdi), %XMM2, %k2
|
||||
+ kmovw %k2, %eax
|
||||
+ subl $XMM_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(more_8x_vec):
|
||||
+ /* More than 8 * VEC. Check the first VEC. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Align the first memory area for aligned loads in the loop.
|
||||
+ Compute how much the first memory area is misaligned. */
|
||||
+ movq %rdi, %rcx
|
||||
+ andl $(VEC_SIZE - 1), %ecx
|
||||
+ /* Get the negative of offset for alignment. */
|
||||
+ subq $VEC_SIZE, %rcx
|
||||
+ /* Adjust the second memory area. */
|
||||
+ subq %rcx, %rsi
|
||||
+ /* Adjust the first memory area which should be aligned now. */
|
||||
+ subq %rcx, %rdi
|
||||
+ /* Adjust length. */
|
||||
+ addq %rcx, %rdx
|
||||
+
|
||||
+L(loop_4x_vec):
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+ kandd %k2, %k1, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+ kandd %k3, %k5, %k5
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+ kandd %k4, %k5, %k5
|
||||
+
|
||||
+ kmovd %k5, %eax
|
||||
+ cmpl $VEC_MASK, %eax
|
||||
+ jne L(4x_vec_end)
|
||||
+
|
||||
+ addq $(VEC_SIZE * 4), %rdi
|
||||
+ addq $(VEC_SIZE * 4), %rsi
|
||||
+
|
||||
+ subq $(VEC_SIZE * 4), %rdx
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jae L(loop_4x_vec)
|
||||
+
|
||||
+ /* Less than 4 * VEC. */
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jbe L(last_vec)
|
||||
+ cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
+
|
||||
+L(last_4x_vec):
|
||||
+ /* From 2 * VEC to 4 * VEC. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ addq $VEC_SIZE, %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ /* Use overlapping loads to avoid branches. */
|
||||
+ leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
+ leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ addq $VEC_SIZE, %rsi
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMPEQ (%rdi), %YMM2, %k2
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(4x_vec_end):
|
||||
+ kmovd %k1, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec)
|
||||
+ kmovd %k2, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec_x1)
|
||||
+ kmovd %k3, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ jnz L(first_vec_x2)
|
||||
+ kmovd %k4, %eax
|
||||
+ subl $VEC_MASK, %eax
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl VEC_SIZE(%rdi, %rcx, 4), %edx
|
||||
+ cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl VEC_SIZE(%rdi, %rcx), %eax
|
||||
+ movzbl VEC_SIZE(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ xorl %eax, %eax
|
||||
+ movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
|
||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
|
||||
+ jmp L(wmemcmp_return)
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
+ sub %edx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+END (MEMCMP)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
new file mode 100644
|
||||
index 00000000..4726d74a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/wmemcmp-evex-movbe.S
|
||||
@@ -0,0 +1,4 @@
|
||||
+#define MEMCMP __wmemcmp_evex_movbe
|
||||
+#define USE_AS_WMEMCMP 1
|
||||
+
|
||||
+#include "memcmp-evex-movbe.S"
|
||||
--
|
||||
GitLab
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,735 +0,0 @@
|
||||
From 4bd660be40967cd69072f69ebc2ad32bfcc1f206 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Tue, 23 Feb 2021 06:33:10 -0800
|
||||
Subject: [PATCH] x86: Add string/memory function tests in RTM region
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
At function exit, AVX optimized string/memory functions have VZEROUPPER
|
||||
which triggers RTM abort. When such functions are called inside a
|
||||
transactionally executing RTM region, RTM abort causes severe performance
|
||||
degradation. Add tests to verify that string/memory functions won't
|
||||
cause RTM abort in RTM region.
|
||||
---
|
||||
sysdeps/x86/Makefile | 23 +++++++++++
|
||||
sysdeps/x86/tst-memchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memcmp-rtm.c | 52 +++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memmove-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memrchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-memset-rtm.c | 45 ++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strchr-rtm.c | 54 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strcpy-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-string-rtm.h | 72 +++++++++++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strlen-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 52 +++++++++++++++++++++++++
|
||||
sysdeps/x86/tst-strrchr-rtm.c | 53 ++++++++++++++++++++++++++
|
||||
12 files changed, 618 insertions(+)
|
||||
create mode 100644 sysdeps/x86/tst-memchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memcmp-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memmove-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memrchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-memset-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strchr-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strcpy-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-string-rtm.h
|
||||
create mode 100644 sysdeps/x86/tst-strlen-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strncmp-rtm.c
|
||||
create mode 100644 sysdeps/x86/tst-strrchr-rtm.c
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 59e928e9..5be71ada 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -17,6 +17,29 @@ endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
sysdep_routines += cacheinfo
|
||||
+
|
||||
+tests += \
|
||||
+ tst-memchr-rtm \
|
||||
+ tst-memcmp-rtm \
|
||||
+ tst-memmove-rtm \
|
||||
+ tst-memrchr-rtm \
|
||||
+ tst-memset-rtm \
|
||||
+ tst-strchr-rtm \
|
||||
+ tst-strcpy-rtm \
|
||||
+ tst-strlen-rtm \
|
||||
+ tst-strncmp-rtm \
|
||||
+ tst-strrchr-rtm
|
||||
+
|
||||
+CFLAGS-tst-memchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memcmp-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memmove-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memrchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-memset-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strcpy-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strlen-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strncmp-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strrchr-rtm.c += -mrtm
|
||||
endif
|
||||
|
||||
ifneq ($(enable-cet),no)
|
||||
diff --git a/sysdeps/x86/tst-memchr-rtm.c b/sysdeps/x86/tst-memchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e4749401
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for memchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = memchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memcmp-rtm.c b/sysdeps/x86/tst-memcmp-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e4c8a623
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memcmp-rtm.c
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Test case for memcmp inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ memset (string2, 'a', STRING_SIZE);
|
||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (memcmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memcmp", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memmove-rtm.c b/sysdeps/x86/tst-memmove-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..4bf97ef1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memmove-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for memmove inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (memmove (string2, string1, STRING_SIZE) == string2
|
||||
+ && memcmp (string2, string1, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memmove", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memrchr-rtm.c b/sysdeps/x86/tst-memrchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..a57a5a8e
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memrchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for memrchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = memrchr (string1, 'c', STRING_SIZE);
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memrchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-memset-rtm.c b/sysdeps/x86/tst-memset-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..bf343a4d
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-memset-rtm.c
|
||||
@@ -0,0 +1,45 @@
|
||||
+/* Test case for memset inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ return EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE);
|
||||
+ return 0;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("memset", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strchr-rtm.c b/sysdeps/x86/tst-strchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..a82e29c0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strchr-rtm.c
|
||||
@@ -0,0 +1,54 @@
|
||||
+/* Test case for strchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[100] = 'c';
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = strchr (string1, 'c');
|
||||
+ if (p == &string1[100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = strchr (string1, 'c');
|
||||
+ if (p == &string1[100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strchr", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strcpy-rtm.c b/sysdeps/x86/tst-strcpy-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..2b2a583f
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strcpy-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strcpy inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ if (strcpy (string2, string1) == string2
|
||||
+ && strcmp (string2, string1) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (strcpy (string2, string1) == string2
|
||||
+ && strcmp (string2, string1) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strcpy", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-string-rtm.h b/sysdeps/x86/tst-string-rtm.h
|
||||
new file mode 100644
|
||||
index 00000000..d2470afa
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-string-rtm.h
|
||||
@@ -0,0 +1,72 @@
|
||||
+/* Test string function in a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <string.h>
|
||||
+#include <x86intrin.h>
|
||||
+#include <sys/platform/x86.h>
|
||||
+#include <support/check.h>
|
||||
+#include <support/test-driver.h>
|
||||
+
|
||||
+static int
|
||||
+do_test_1 (const char *name, unsigned int loop, int (*prepare) (void),
|
||||
+ int (*function) (void))
|
||||
+{
|
||||
+ if (!CPU_FEATURE_USABLE (RTM))
|
||||
+ return EXIT_UNSUPPORTED;
|
||||
+
|
||||
+ int status = prepare ();
|
||||
+ if (status != EXIT_SUCCESS)
|
||||
+ return status;
|
||||
+
|
||||
+ unsigned int i;
|
||||
+ unsigned int naborts = 0;
|
||||
+ unsigned int failed = 0;
|
||||
+ for (i = 0; i < loop; i++)
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ if (_xbegin() == _XBEGIN_STARTED)
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ _xend();
|
||||
+ }
|
||||
+ else
|
||||
+ {
|
||||
+ failed |= function ();
|
||||
+ ++naborts;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (failed)
|
||||
+ FAIL_EXIT1 ("%s() failed", name);
|
||||
+
|
||||
+ if (naborts)
|
||||
+ {
|
||||
+ /* NB: Low single digit (<= 5%) noise-level aborts are normal for
|
||||
+ TSX. */
|
||||
+ double rate = 100 * ((double) naborts) / ((double) loop);
|
||||
+ if (rate > 5)
|
||||
+ FAIL_EXIT1 ("TSX abort rate: %.2f%% (%d out of %d)",
|
||||
+ rate, naborts, loop);
|
||||
+ }
|
||||
+
|
||||
+ return EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+static int do_test (void);
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86/tst-strlen-rtm.c b/sysdeps/x86/tst-strlen-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..0dcf14db
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strlen-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strlen inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[STRING_SIZE - 100] = '\0';
|
||||
+ size_t len = strlen (string1);
|
||||
+ if (len == STRING_SIZE - 100)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ size_t len = strlen (string1);
|
||||
+ if (len == STRING_SIZE - 100)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strlen", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..236ad951
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Test case for strncmp inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+char string2[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ memset (string2, 'a', STRING_SIZE - 1);
|
||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strncmp", LOOP, prepare, function);
|
||||
+}
|
||||
diff --git a/sysdeps/x86/tst-strrchr-rtm.c b/sysdeps/x86/tst-strrchr-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..e32bfaf5
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-strrchr-rtm.c
|
||||
@@ -0,0 +1,53 @@
|
||||
+/* Test case for strrchr inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <tst-string-rtm.h>
|
||||
+
|
||||
+#define LOOP 3000
|
||||
+#define STRING_SIZE 1024
|
||||
+char string1[STRING_SIZE];
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+prepare (void)
|
||||
+{
|
||||
+ memset (string1, 'a', STRING_SIZE - 1);
|
||||
+ string1[STRING_SIZE - 100] = 'c';
|
||||
+ char *p = strrchr (string1, 'c');
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return EXIT_SUCCESS;
|
||||
+ else
|
||||
+ return EXIT_FAILURE;
|
||||
+}
|
||||
+
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function (void)
|
||||
+{
|
||||
+ char *p = strrchr (string1, 'c');
|
||||
+ if (p == &string1[STRING_SIZE - 100])
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+do_test (void)
|
||||
+{
|
||||
+ return do_test_1 ("strrchr", LOOP, prepare, function);
|
||||
+}
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,148 +0,0 @@
|
||||
From 4e2d8f352774b56078c34648b14a2412c38384f4 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sun, 7 Mar 2021 09:44:18 -0800
|
||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memset family functions
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memset.h/ifunc-wmemset.h to select the function optimized
|
||||
with AVX512 instructions using ZMM16-ZMM31 registers to avoid RTM abort
|
||||
with usable AVX512VL and AVX512BW since VZEROUPPER isn't needed at
|
||||
function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 14 +++++++++-----
|
||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 13 ++++++++-----
|
||||
sysdeps/x86_64/multiarch/ifunc-wmemset.h | 12 ++++++------
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 16 ++++++++--------
|
||||
4 files changed, 31 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index c1efeec0..d969a156 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -211,10 +211,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -252,10 +254,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
__memset_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -719,7 +723,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512VL),
|
||||
__wmemset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__wmemset_avx512_unaligned))
|
||||
|
||||
#ifdef SHARED
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
index 6f3375cc..19795938 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
@@ -53,13 +53,16 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx512_unaligned_erms);
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+ }
|
||||
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
||||
}
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wmemset.h b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
index bdc94c6c..98c5d406 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wmemset.h
|
||||
@@ -33,13 +33,13 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512)
|
||||
- && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
-
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
- return OPTIMIZE (evex_unaligned);
|
||||
+ {
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+
|
||||
+ return OPTIMIZE (evex_unaligned);
|
||||
+ }
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
return OPTIMIZE (avx2_unaligned_rtm);
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 0783979c..22e7b187 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -1,22 +1,22 @@
|
||||
#if IS_IN (libc)
|
||||
# define VEC_SIZE 64
|
||||
-# define VEC(i) zmm##i
|
||||
+# define XMM0 xmm16
|
||||
+# define YMM0 ymm16
|
||||
+# define VEC0 zmm16
|
||||
+# define VEC(i) VEC##i
|
||||
# define VMOVU vmovdqu64
|
||||
# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
movq r, %rax; \
|
||||
- vpbroadcastb %xmm0, %xmm0; \
|
||||
- vpbroadcastq %xmm0, %zmm0
|
||||
+ vpbroadcastb d, %VEC0
|
||||
|
||||
# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
movq r, %rax; \
|
||||
- vpbroadcastd %xmm0, %xmm0; \
|
||||
- vpbroadcastq %xmm0, %zmm0
|
||||
+ vpbroadcastd d, %VEC0
|
||||
|
||||
-# define SECTION(p) p##.avx512
|
||||
+# define SECTION(p) p##.evex512
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,230 +0,0 @@
|
||||
From b304fc201d2f6baf52ea790df8643e99772243cd Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:25:56 -0800
|
||||
Subject: [PATCH] x86-64 memcmp/wmemcmp: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes memcmp/wmemcmp for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S: Use RDX_LP for
|
||||
length. Clear the upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memcmp-sse4.S: Likewise.
|
||||
* sysdeps/x86_64/multiarch/memcmp-ssse3.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcmp and
|
||||
tst-size_t-wmemcmp.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memcmp.c: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wmemcmp.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 7 +-
|
||||
sysdeps/x86_64/multiarch/memcmp-sse4.S | 9 ++-
|
||||
sysdeps/x86_64/multiarch/memcmp-ssse3.S | 7 +-
|
||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memcmp.c | 76 ++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wmemcmp.c | 20 ++++++
|
||||
6 files changed, 114 insertions(+), 9 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
index 30f764c3..e3a35b89 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
@@ -58,9 +58,12 @@
|
||||
.section .text.avx,"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
# endif
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
|
||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-sse4.S b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
index 8e164f2c..302900f5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-sse4.S
|
||||
@@ -42,13 +42,16 @@
|
||||
.section .text.sse4.1,"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
# endif
|
||||
pxor %xmm0, %xmm0
|
||||
- cmp $79, %rdx
|
||||
+ cmp $79, %RDX_LP
|
||||
ja L(79bytesormore)
|
||||
# ifndef USE_AS_WMEMCMP
|
||||
- cmp $1, %rdx
|
||||
+ cmp $1, %RDX_LP
|
||||
je L(firstbyte)
|
||||
# endif
|
||||
add %rdx, %rsi
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-ssse3.S b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
index 6f76c641..69d030fc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-ssse3.S
|
||||
@@ -33,9 +33,12 @@
|
||||
atom_text_section
|
||||
ENTRY (MEMCMP)
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %rdx
|
||||
- test %rdx, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz L(equal)
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
# endif
|
||||
mov %rdx, %rcx
|
||||
mov %rdi, %rdx
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index 7d528889..ddec7f04 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -6,9 +6,9 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
-tests += tst-size_t-memchr
|
||||
+tests += tst-size_t-memchr tst-size_t-memcmp
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
-tests += tst-size_t-wmemchr
|
||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcmp.c b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..9bd6fdb4
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcmp.c
|
||||
@@ -0,0 +1,76 @@
|
||||
+/* Test memcmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_MAIN
|
||||
+#ifdef WIDE
|
||||
+# define TEST_NAME "wmemcmp"
|
||||
+#else
|
||||
+# define TEST_NAME "memcmp"
|
||||
+#endif
|
||||
+
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# include <inttypes.h>
|
||||
+# include <wchar.h>
|
||||
+
|
||||
+# define MEMCMP wmemcmp
|
||||
+# define CHAR wchar_t
|
||||
+#else
|
||||
+# define MEMCMP memcmp
|
||||
+# define CHAR char
|
||||
+#endif
|
||||
+
|
||||
+IMPL (MEMCMP, 1)
|
||||
+
|
||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
||||
+
|
||||
+static int
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memcmp (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size / sizeof (CHAR) }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ memcpy (buf1, buf2, page_size);
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ int res = do_memcmp (dest, src);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
new file mode 100644
|
||||
index 00000000..e8b5ffd0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemcmp.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wmemcmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-memcmp.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,164 +0,0 @@
|
||||
From e4fda4631017e49d4ee5a2755db34289b6860fa4 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sun, 7 Mar 2021 09:45:23 -0800
|
||||
Subject: [PATCH] x86-64: Use ZMM16-ZMM31 in AVX512 memmove family functions
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update ifunc-memmove.h to select the function optimized with AVX512
|
||||
instructions using ZMM16-ZMM31 registers to avoid RTM abort with usable
|
||||
AVX512VL since VZEROUPPER isn't needed at function exit.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 24 +++++++++---------
|
||||
sysdeps/x86_64/multiarch/ifunc-memmove.h | 12 +++++----
|
||||
.../multiarch/memmove-avx512-unaligned-erms.S | 25 +++++++++++++++++--
|
||||
3 files changed, 42 insertions(+), 19 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index d969a156..fec384f6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -83,10 +83,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memmove_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -148,10 +148,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memmove_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memmove,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memmove_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memmove, CPU_FEATURE_USABLE (SSSE3),
|
||||
__memmove_ssse3_back)
|
||||
@@ -733,10 +733,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memcpy_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -802,10 +802,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__memcpy_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__memcpy_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, 1, __memcpy_sse2_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memcpy, 1,
|
||||
@@ -819,10 +819,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__mempcpy_chk_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __mempcpy_chk,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
@@ -864,10 +864,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
__mempcpy_avx512_no_vzeroupper)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ CPU_FEATURE_USABLE (AVX512VL),
|
||||
__mempcpy_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, mempcpy,
|
||||
CPU_FEATURE_USABLE (AVX),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memmove.h b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
index fa09b9fb..014e95c7 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memmove.h
|
||||
@@ -56,13 +56,15 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
- if (CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx512_no_vzeroupper);
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE (avx512_unaligned_erms);
|
||||
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
- return OPTIMIZE (avx512_unaligned_erms);
|
||||
+ return OPTIMIZE (avx512_unaligned);
|
||||
+ }
|
||||
|
||||
- return OPTIMIZE (avx512_unaligned);
|
||||
+ return OPTIMIZE (avx512_no_vzeroupper);
|
||||
}
|
||||
|
||||
if (CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
index aac1515c..848848ab 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-unaligned-erms.S
|
||||
@@ -1,11 +1,32 @@
|
||||
#if IS_IN (libc)
|
||||
# define VEC_SIZE 64
|
||||
-# define VEC(i) zmm##i
|
||||
+# define XMM0 xmm16
|
||||
+# define XMM1 xmm17
|
||||
+# define YMM0 ymm16
|
||||
+# define YMM1 ymm17
|
||||
+# define VEC0 zmm16
|
||||
+# define VEC1 zmm17
|
||||
+# define VEC2 zmm18
|
||||
+# define VEC3 zmm19
|
||||
+# define VEC4 zmm20
|
||||
+# define VEC5 zmm21
|
||||
+# define VEC6 zmm22
|
||||
+# define VEC7 zmm23
|
||||
+# define VEC8 zmm24
|
||||
+# define VEC9 zmm25
|
||||
+# define VEC10 zmm26
|
||||
+# define VEC11 zmm27
|
||||
+# define VEC12 zmm28
|
||||
+# define VEC13 zmm29
|
||||
+# define VEC14 zmm30
|
||||
+# define VEC15 zmm31
|
||||
+# define VEC(i) VEC##i
|
||||
# define VMOVNT vmovntdq
|
||||
# define VMOVU vmovdqu64
|
||||
# define VMOVA vmovdqa64
|
||||
+# define VZEROUPPER
|
||||
|
||||
-# define SECTION(p) p##.avx512
|
||||
+# define SECTION(p) p##.evex512
|
||||
# define MEMMOVE_SYMBOL(p,s) p##_avx512_##s
|
||||
|
||||
# include "memmove-vec-unaligned-erms.S"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,71 +0,0 @@
|
||||
From 595c22ecd8e87a27fd19270ed30fdbae9ad25426 Mon Sep 17 00:00:00 2001
|
||||
From: Sunil K Pandey <skpgkp2@gmail.com>
|
||||
Date: Thu, 1 Apr 2021 15:47:04 -0700
|
||||
Subject: [PATCH] x86-64: Fix ifdef indentation in strlen-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Fix some indentations of ifdef in file strlen-evex.S which are off by 1
|
||||
and confusing to read.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-evex.S | 16 ++++++++--------
|
||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
index cd022509..05838190 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
@@ -276,10 +276,10 @@ L(last_2x_vec):
|
||||
.p2align 4
|
||||
L(first_vec_x0_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -293,10 +293,10 @@ L(first_vec_x0_check):
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -311,10 +311,10 @@ L(first_vec_x1_check):
|
||||
.p2align 4
|
||||
L(first_vec_x2_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
@@ -329,10 +329,10 @@ L(first_vec_x2_check):
|
||||
.p2align 4
|
||||
L(first_vec_x3_check):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %eax
|
||||
-# endif
|
||||
+# endif
|
||||
/* Check the end of data. */
|
||||
cmpq %rax, %rsi
|
||||
jbe L(max)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,51 +0,0 @@
|
||||
From 55bf411b451c13f0fb7ff3d3bf9a820020b45df1 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 07:07:21 -0700
|
||||
Subject: [PATCH] x86-64: Require BMI2 for __strlen_evex and __strnlen_evex
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since __strlen_evex and __strnlen_evex added by
|
||||
|
||||
commit 1fd8c163a83d96ace1ff78fa6bac7aee084f6f77
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Fri Mar 5 06:24:52 2021 -0800
|
||||
|
||||
x86-64: Add ifunc-avx2.h functions with 256-bit EVEX
|
||||
|
||||
use sarx:
|
||||
|
||||
c4 e2 6a f7 c0 sarx %edx,%eax,%eax
|
||||
|
||||
require BMI2 for __strlen_evex and __strnlen_evex in ifunc-impl-list.c.
|
||||
ifunc-avx2.h already requires BMI2 for EVEX implementation.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 ++++--
|
||||
1 file changed, 4 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index fec384f6..cbfc1a5d 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -293,7 +293,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__strlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strlen_evex)
|
||||
IFUNC_IMPL_ADD (array, i, strlen, 1, __strlen_sse2))
|
||||
|
||||
@@ -308,7 +309,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__strnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strnlen_evex)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen, 1, __strnlen_sse2))
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,584 +0,0 @@
|
||||
From acfd088a1963ba51cd83c78f95c0ab25ead79e04 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 3 May 2021 03:01:58 -0400
|
||||
Subject: [PATCH] x86: Optimize memchr-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memchr-avx2.S. The optimizations include
|
||||
replacing some branches with cmovcc, avoiding some branches entirely
|
||||
in the less_4x_vec case, making the page cross logic less strict,
|
||||
and saving a few instructions in the loop's return path. test-memchr,
|
||||
test-rawmemchr, and test-wmemchr are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 425 ++++++++++++++-----------
|
||||
1 file changed, 247 insertions(+), 178 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index cf893e77..b377f22e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -26,8 +26,22 @@
|
||||
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
+# define VPBROADCAST vpbroadcastd
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
+# define VPBROADCAST vpbroadcastb
|
||||
+# define CHAR_SIZE 1
|
||||
+# endif
|
||||
+
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# define ERAW_PTR_REG ecx
|
||||
+# define RRAW_PTR_REG rcx
|
||||
+# define ALGN_PTR_REG rdi
|
||||
+# else
|
||||
+# define ERAW_PTR_REG edi
|
||||
+# define RRAW_PTR_REG rdi
|
||||
+# define ALGN_PTR_REG rcx
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -39,6 +53,7 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
@@ -47,295 +62,349 @@ ENTRY (MEMCHR)
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(null)
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
- /* Broadcast CHAR to YMM0. */
|
||||
- vmovd %esi, %xmm0
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
shl $2, %RDX_LP
|
||||
- vpbroadcastd %xmm0, %ymm0
|
||||
# else
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
- vpbroadcastb %xmm0, %ymm0
|
||||
# endif
|
||||
+ /* Broadcast CHAR to YMMMATCH. */
|
||||
+ vmovd %esi, %xmm0
|
||||
+ VPBROADCAST %xmm0, %ymm0
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rdx
|
||||
- jbe L(zero)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < CHAR_PER_VEC handle special. */
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
+ .p2align 5
|
||||
+L(first_vec_x0):
|
||||
+ /* Check if first match was before length. */
|
||||
+ tzcntl %eax, %eax
|
||||
+ xorl %ecx, %ecx
|
||||
+ cmpl %eax, %edx
|
||||
+ leaq (%rdi, %rax), %rax
|
||||
+ cmovle %rcx, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+L(null):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
-
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+L(cross_page_boundary):
|
||||
+ /* Save pointer before aligning as its original value is necessary
|
||||
+ for computer return address if byte is found or adjusting length
|
||||
+ if it is not and this is memchr. */
|
||||
+ movq %rdi, %rcx
|
||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
||||
+ rdi for rawmemchr. */
|
||||
+ orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
||||
+ VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Calculate length until end of page (length checked for a
|
||||
+ match). */
|
||||
+ leaq 1(%ALGN_PTR_REG), %rsi
|
||||
+ subq %RRAW_PTR_REG, %rsi
|
||||
+# endif
|
||||
/* Remove the leading bytes. */
|
||||
- sarl %cl, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
- tzcntl %eax, %eax
|
||||
+ sarxl %ERAW_PTR_REG, %eax, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ cmpq %rsi, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(cross_page_continue)
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq %RRAW_PTR_REG, %rax
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
||||
- overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ incq %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rdx
|
||||
- jbe L(zero)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE * 2 + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+L(cross_page_continue):
|
||||
+ /* Align data to VEC_SIZE - 1. */
|
||||
+ xorl %ecx, %ecx
|
||||
+ subl %edi, %ecx
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ /* esi is for adjusting length to see if near the end. */
|
||||
+ leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
||||
+# else
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+L(cross_page_continue):
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rsi, %rdx
|
||||
+ jbe L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
||||
subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
||||
+ /* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
||||
+ length. */
|
||||
+ incq %rdi
|
||||
+ movl %edi, %ecx
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
addq %rcx, %rdx
|
||||
+# else
|
||||
+ /* Align data to VEC_SIZE * 4 - 1 for loop. */
|
||||
+ incq %rdi
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
# endif
|
||||
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
||||
-
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm2
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm3
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm4
|
||||
vpor %ymm1, %ymm2, %ymm5
|
||||
vpor %ymm3, %ymm4, %ymm6
|
||||
vpor %ymm5, %ymm6, %ymm5
|
||||
|
||||
- vpmovmskb %ymm5, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
- jmp L(loop_4x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(loop_4x_vec)
|
||||
# else
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- ja L(loop_4x_vec)
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(loop_4x_vec_end)
|
||||
|
||||
-L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- jle L(last_2x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ subq $(VEC_SIZE * 4), %rdx
|
||||
+ ja L(loop_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
||||
+ */
|
||||
+ VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+ .p2align 4
|
||||
+L(last_4x_vec_or_less):
|
||||
+ /* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. */
|
||||
+ addl $(VEC_SIZE * 2), %edx
|
||||
+ jg L(last_4x_vec)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+L(last_2x_vec):
|
||||
+ /* If remaining length < VEC_SIZE. */
|
||||
+ addl $VEC_SIZE, %edx
|
||||
+ jle L(zero_end)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
+ /* Check VEC2 and compare any match with remaining length. */
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
- jnz L(first_vec_x3_check)
|
||||
- xorl %eax, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+L(zero_end):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+L(loop_4x_vec_end):
|
||||
+# endif
|
||||
+ /* rawmemchr will fall through into this if match was found in
|
||||
+ loop. */
|
||||
+
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
+ jnz L(last_vec_x1_return)
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
-
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- xorl %eax, %eax
|
||||
- VZEROUPPER_RETURN
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
- tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* Combine VEC3 matches (eax) with VEC4 matches (ecx). */
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
+# else
|
||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1_check):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ /* Adjust length. */
|
||||
+ subl $-(VEC_SIZE * 4), %edx
|
||||
+ /* Check if match within remaining length. */
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ incq %rdi
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+ .p2align 4
|
||||
+L(set_zero_end):
|
||||
+ xorl %eax, %eax
|
||||
+ VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_x1_return):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+# else
|
||||
+ incq %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x2_return):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
+# else
|
||||
+ subq $-(VEC_SIZE + 1), %rdi
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
.p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- jmp L(return_vzeroupper)
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ /* Check first VEC regardless. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
||||
+ addl $(VEC_SIZE * 2), %edx
|
||||
+ jle L(last_2x_vec)
|
||||
.p2align 4
|
||||
-L(null):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
+L(last_4x_vec):
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0):
|
||||
- tzcntl %eax, %eax
|
||||
- addq %rdi, %rax
|
||||
- VZEROUPPER_RETURN
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x1):
|
||||
- tzcntl %eax, %eax
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- VZEROUPPER_RETURN
|
||||
+ /* Create mask for possible matches within remaining length. */
|
||||
+ movq $-1, %rcx
|
||||
+ bzhiq %rdx, %rcx, %rcx
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
+ /* Test matches in data against length match. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ /* if remaining length <= VEC_SIZE * 3 (Note this is after
|
||||
+ remaining length was found to be > VEC_SIZE * 2. */
|
||||
+ subl $VEC_SIZE, %edx
|
||||
+ jbe L(zero_end2)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Shift remaining length mask for last VEC. */
|
||||
+ shrq $32, %rcx
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(zero_end2)
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
||||
addq %rdi, %rax
|
||||
+L(zero_end2):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- vpmovmskb %ymm4, %eax
|
||||
- testl %eax, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+ subq $-(VEC_SIZE * 2 + 1), %rdi
|
||||
addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
END (MEMCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,388 +0,0 @@
|
||||
From 645a158978f9520e74074e8c14047503be4db0f0 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 9 Jun 2021 16:25:32 -0400
|
||||
Subject: [PATCH] x86: Fix overflow bug with wmemchr-sse2 and wmemchr-avx2 [BZ
|
||||
#27974]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This commit fixes the bug mentioned in the previous commit.
|
||||
|
||||
The previous implementations of wmemchr in these files relied
|
||||
on n * sizeof(wchar_t) which was not guaranteed by the standard.
|
||||
|
||||
The new overflow tests added in the previous commit now
|
||||
pass (as well as all the other tests).
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/memchr.S | 77 +++++++++++++++++++-------
|
||||
sysdeps/x86_64/multiarch/memchr-avx2.S | 58 +++++++++++++------
|
||||
2 files changed, 98 insertions(+), 37 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memchr.S b/sysdeps/x86_64/memchr.S
|
||||
index cb320257..24f9a0c5 100644
|
||||
--- a/sysdeps/x86_64/memchr.S
|
||||
+++ b/sysdeps/x86_64/memchr.S
|
||||
@@ -21,9 +21,11 @@
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
# define MEMCHR wmemchr
|
||||
# define PCMPEQ pcmpeqd
|
||||
+# define CHAR_PER_VEC 4
|
||||
#else
|
||||
# define MEMCHR memchr
|
||||
# define PCMPEQ pcmpeqb
|
||||
+# define CHAR_PER_VEC 16
|
||||
#endif
|
||||
|
||||
/* fast SSE2 version with using pmaxub and 64 byte loop */
|
||||
@@ -33,15 +35,14 @@ ENTRY(MEMCHR)
|
||||
movd %esi, %xmm1
|
||||
mov %edi, %ecx
|
||||
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+#endif
|
||||
#ifdef USE_AS_WMEMCHR
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
- shl $2, %RDX_LP
|
||||
#else
|
||||
-# ifdef __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %edx, %edx
|
||||
-# endif
|
||||
punpcklbw %xmm1, %xmm1
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(return_null)
|
||||
@@ -60,13 +61,16 @@ ENTRY(MEMCHR)
|
||||
test %eax, %eax
|
||||
|
||||
jnz L(matches_1)
|
||||
- sub $16, %rdx
|
||||
+ sub $CHAR_PER_VEC, %rdx
|
||||
jbe L(return_null)
|
||||
add $16, %rdi
|
||||
and $15, %ecx
|
||||
and $-16, %rdi
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
add %rcx, %rdx
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
jmp L(loop_prolog)
|
||||
|
||||
@@ -77,16 +81,21 @@ L(crosscache):
|
||||
movdqa (%rdi), %xmm0
|
||||
|
||||
PCMPEQ %xmm1, %xmm0
|
||||
-/* Check if there is a match. */
|
||||
+ /* Check if there is a match. */
|
||||
pmovmskb %xmm0, %eax
|
||||
-/* Remove the leading bytes. */
|
||||
+ /* Remove the leading bytes. */
|
||||
sar %cl, %eax
|
||||
test %eax, %eax
|
||||
je L(unaligned_no_match)
|
||||
-/* Check which byte is a match. */
|
||||
+ /* Check which byte is a match. */
|
||||
bsf %eax, %eax
|
||||
-
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
add %rdi, %rax
|
||||
add %rcx, %rax
|
||||
@@ -94,15 +103,18 @@ L(crosscache):
|
||||
|
||||
.p2align 4
|
||||
L(unaligned_no_match):
|
||||
- /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
||||
+ /* "rcx" is less than 16. Calculate "rdx + rcx - 16" by using
|
||||
"rdx - (16 - rcx)" instead of "(rdx + rcx) - 16" to void
|
||||
possible addition overflow. */
|
||||
neg %rcx
|
||||
add $16, %rcx
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
sub %rcx, %rdx
|
||||
jbe L(return_null)
|
||||
add $16, %rdi
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
|
||||
.p2align 4
|
||||
@@ -135,7 +147,7 @@ L(loop_prolog):
|
||||
test $0x3f, %rdi
|
||||
jz L(align64_loop)
|
||||
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
@@ -167,11 +179,14 @@ L(loop_prolog):
|
||||
mov %rdi, %rcx
|
||||
and $-64, %rdi
|
||||
and $63, %ecx
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ shr $2, %ecx
|
||||
+#endif
|
||||
add %rcx, %rdx
|
||||
|
||||
.p2align 4
|
||||
L(align64_loop):
|
||||
- sub $64, %rdx
|
||||
+ sub $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(exit_loop)
|
||||
movdqa (%rdi), %xmm0
|
||||
movdqa 16(%rdi), %xmm2
|
||||
@@ -218,7 +233,7 @@ L(align64_loop):
|
||||
|
||||
.p2align 4
|
||||
L(exit_loop):
|
||||
- add $32, %edx
|
||||
+ add $(CHAR_PER_VEC * 2), %edx
|
||||
jle L(exit_loop_32)
|
||||
|
||||
movdqa (%rdi), %xmm0
|
||||
@@ -238,7 +253,7 @@ L(exit_loop):
|
||||
pmovmskb %xmm3, %eax
|
||||
test %eax, %eax
|
||||
jnz L(matches32_1)
|
||||
- sub $16, %edx
|
||||
+ sub $CHAR_PER_VEC, %edx
|
||||
jle L(return_null)
|
||||
|
||||
PCMPEQ 48(%rdi), %xmm1
|
||||
@@ -250,13 +265,13 @@ L(exit_loop):
|
||||
|
||||
.p2align 4
|
||||
L(exit_loop_32):
|
||||
- add $32, %edx
|
||||
+ add $(CHAR_PER_VEC * 2), %edx
|
||||
movdqa (%rdi), %xmm0
|
||||
PCMPEQ %xmm1, %xmm0
|
||||
pmovmskb %xmm0, %eax
|
||||
test %eax, %eax
|
||||
jnz L(matches_1)
|
||||
- sub $16, %edx
|
||||
+ sub $CHAR_PER_VEC, %edx
|
||||
jbe L(return_null)
|
||||
|
||||
PCMPEQ 16(%rdi), %xmm1
|
||||
@@ -293,7 +308,13 @@ L(matches32):
|
||||
.p2align 4
|
||||
L(matches_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
add %rdi, %rax
|
||||
ret
|
||||
@@ -301,7 +322,13 @@ L(matches_1):
|
||||
.p2align 4
|
||||
L(matches16_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 16(%rdi, %rax), %rax
|
||||
ret
|
||||
@@ -309,7 +336,13 @@ L(matches16_1):
|
||||
.p2align 4
|
||||
L(matches32_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 32(%rdi, %rax), %rax
|
||||
ret
|
||||
@@ -317,7 +350,13 @@ L(matches32_1):
|
||||
.p2align 4
|
||||
L(matches48_1):
|
||||
bsf %eax, %eax
|
||||
+#ifdef USE_AS_WMEMCHR
|
||||
+ mov %eax, %esi
|
||||
+ shr $2, %esi
|
||||
+ sub %rsi, %rdx
|
||||
+#else
|
||||
sub %rax, %rdx
|
||||
+#endif
|
||||
jbe L(return_null)
|
||||
lea 48(%rdi, %rax), %rax
|
||||
ret
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-avx2.S b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
index b377f22e..16027abb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-avx2.S
|
||||
@@ -54,21 +54,19 @@
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check for zero length. */
|
||||
- test %RDX_LP, %RDX_LP
|
||||
- jz L(null)
|
||||
-# endif
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %RDX_LP
|
||||
-# else
|
||||
# ifdef __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %edx, %edx
|
||||
+ /* Clear upper bits. */
|
||||
+ and %RDX_LP, %RDX_LP
|
||||
+# else
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
# endif
|
||||
+ jz L(null)
|
||||
# endif
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
vmovd %esi, %xmm0
|
||||
@@ -84,7 +82,7 @@ ENTRY (MEMCHR)
|
||||
vpmovmskb %ymm1, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* If length < CHAR_PER_VEC handle special. */
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+ cmpq $CHAR_PER_VEC, %rdx
|
||||
jbe L(first_vec_x0)
|
||||
# endif
|
||||
testl %eax, %eax
|
||||
@@ -98,6 +96,10 @@ ENTRY (MEMCHR)
|
||||
L(first_vec_x0):
|
||||
/* Check if first match was before length. */
|
||||
tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
xorl %ecx, %ecx
|
||||
cmpl %eax, %edx
|
||||
leaq (%rdi, %rax), %rax
|
||||
@@ -110,12 +112,12 @@ L(null):
|
||||
# endif
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
- /* Save pointer before aligning as its original value is necessary
|
||||
- for computer return address if byte is found or adjusting length
|
||||
- if it is not and this is memchr. */
|
||||
+ /* Save pointer before aligning as its original value is
|
||||
+ necessary for computer return address if byte is found or
|
||||
+ adjusting length if it is not and this is memchr. */
|
||||
movq %rdi, %rcx
|
||||
- /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr and
|
||||
- rdi for rawmemchr. */
|
||||
+ /* Align data to VEC_SIZE - 1. ALGN_PTR_REG is rcx for memchr
|
||||
+ and rdi for rawmemchr. */
|
||||
orq $(VEC_SIZE - 1), %ALGN_PTR_REG
|
||||
VPCMPEQ -(VEC_SIZE - 1)(%ALGN_PTR_REG), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
@@ -124,6 +126,10 @@ L(cross_page_boundary):
|
||||
match). */
|
||||
leaq 1(%ALGN_PTR_REG), %rsi
|
||||
subq %RRAW_PTR_REG, %rsi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
+ shrl $2, %esi
|
||||
+# endif
|
||||
# endif
|
||||
/* Remove the leading bytes. */
|
||||
sarxl %ERAW_PTR_REG, %eax, %eax
|
||||
@@ -181,6 +187,10 @@ L(cross_page_continue):
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
/* esi is for adjusting length to see if near the end. */
|
||||
leal (VEC_SIZE * 4 + 1)(%rdi, %rcx), %esi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %esi
|
||||
+# endif
|
||||
# else
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
L(cross_page_continue):
|
||||
@@ -213,7 +223,7 @@ L(cross_page_continue):
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check if at last VEC_SIZE * 4 length. */
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(last_4x_vec_or_less_cmpeq)
|
||||
/* Align data to VEC_SIZE * 4 - 1 for the loop and readjust
|
||||
length. */
|
||||
@@ -221,6 +231,10 @@ L(cross_page_continue):
|
||||
movl %edi, %ecx
|
||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
addq %rcx, %rdx
|
||||
# else
|
||||
/* Align data to VEC_SIZE * 4 - 1 for loop. */
|
||||
@@ -250,15 +264,19 @@ L(loop_4x_vec):
|
||||
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
ja L(loop_4x_vec)
|
||||
|
||||
- /* Fall through into less than 4 remaining vectors of length case.
|
||||
- */
|
||||
+ /* Fall through into less than 4 remaining vectors of length
|
||||
+ case. */
|
||||
VPCMPEQ (VEC_SIZE * 0 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
.p2align 4
|
||||
L(last_4x_vec_or_less):
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
/* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1_check)
|
||||
@@ -355,6 +373,10 @@ L(last_vec_x2_return):
|
||||
L(last_4x_vec_or_less_cmpeq):
|
||||
VPCMPEQ (VEC_SIZE * 4 + 1)(%rdi), %ymm0, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %edx
|
||||
+# endif
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
/* Check first VEC regardless. */
|
||||
testl %eax, %eax
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,767 +0,0 @@
|
||||
From aaa23c35071537e2dcf5807e956802ed215210aa Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 19:36:07 -0400
|
||||
Subject: [PATCH] x86: Optimize strlen-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strlen-avx2.S. The optimizations are
|
||||
mostly small things but they add up to roughly 10-30% performance
|
||||
improvement for strlen. The results for strnlen are a bit more
|
||||
ambiguous. test-strlen, test-strnlen, test-wcslen, and test-wcsnlen
|
||||
are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 16 +-
|
||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 532 +++++++++++++--------
|
||||
2 files changed, 334 insertions(+), 214 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index cbfc1a5d..f1a6460a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -285,10 +285,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
||||
IFUNC_IMPL (i, name, strlen,
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strlen,
|
||||
@@ -301,10 +303,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strnlen.c. */
|
||||
IFUNC_IMPL (i, name, strnlen,
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strnlen,
|
||||
@@ -640,10 +644,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/wcslen.c. */
|
||||
IFUNC_IMPL (i, name, wcslen,
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcslen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wcslen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
@@ -656,10 +662,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
||||
IFUNC_IMPL (i, name, wcsnlen,
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcsnlen_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wcsnlen_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
index 82826e10..be8a5db5 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
@@ -27,9 +27,11 @@
|
||||
# ifdef USE_AS_WCSLEN
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
# define VPMINU vpminud
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
# define VPMINU vpminub
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# ifndef VZEROUPPER
|
||||
@@ -41,349 +43,459 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check for zero length. */
|
||||
+ /* Check zero length. */
|
||||
test %RSI_LP, %RSI_LP
|
||||
jz L(zero)
|
||||
+ /* Store max len in R8_LP before adjusting if using WCSLEN. */
|
||||
+ mov %RSI_LP, %R8_LP
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shl $2, %RSI_LP
|
||||
# elif defined __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %esi, %esi
|
||||
# endif
|
||||
- mov %RSI_LP, %R8_LP
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
+ movl %edi, %eax
|
||||
movq %rdi, %rdx
|
||||
vpxor %xmm0, %xmm0, %xmm0
|
||||
-
|
||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||
+ cross check. */
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
+ VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rsi
|
||||
- jbe L(max)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < VEC_SIZE handle special. */
|
||||
+ cmpq $VEC_SIZE, %rsi
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ /* If empty continue to aligned_more. Otherwise return bit
|
||||
+ position of first match. */
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rsi
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x0):
|
||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
||||
+ and position of first match. */
|
||||
+ btsq %rsi, %rax
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- /* Remove the leading bytes. */
|
||||
- sarl %cl, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
+L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 4 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ incl %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ shrl $2, %eax
|
||||
# endif
|
||||
-L(return_vzeroupper):
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
||||
- to void possible addition overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
-
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 3 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE * 2 + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE * 2 + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
+# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ subl $(VEC_SIZE + 1), %ecx
|
||||
+ addl %ecx, %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+ addl $(VEC_SIZE * 3 + 1), %edi
|
||||
+ addl %edi, %eax
|
||||
# endif
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
+ /* Align data to VEC_SIZE - 1. This is the same number of
|
||||
+ instructions as using andq with -VEC_SIZE but saves 4 bytes of
|
||||
+ code on the x4 check. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
||||
+ it simplies the logic in last_4x_vec_or_less. */
|
||||
+ leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rcx, %rsi
|
||||
+ jb L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
+ /* Align data to VEC_SIZE * 4 - 1. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
+ /* Before adjusting length check if at last VEC_SIZE * 4. */
|
||||
+ cmpq $(VEC_SIZE * 4 - 1), %rsi
|
||||
+ jbe L(last_4x_vec_or_less_load)
|
||||
+ incq %rdi
|
||||
+ movl %edi, %ecx
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+ /* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
+# else
|
||||
+ incq %rdi
|
||||
+ orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
# endif
|
||||
-
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- vmovdqa (%rdi), %ymm1
|
||||
- vmovdqa VEC_SIZE(%rdi), %ymm2
|
||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm3
|
||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm4
|
||||
- VPMINU %ymm1, %ymm2, %ymm5
|
||||
- VPMINU %ymm3, %ymm4, %ymm6
|
||||
- VPMINU %ymm5, %ymm6, %ymm5
|
||||
-
|
||||
- VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
- vpmovmskb %ymm5, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_STRNLEN
|
||||
- jmp L(loop_4x_vec)
|
||||
-# else
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Break if at end of length. */
|
||||
subq $(VEC_SIZE * 4), %rsi
|
||||
- ja L(loop_4x_vec)
|
||||
-
|
||||
-L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- jle L(last_2x_vec)
|
||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
||||
+# endif
|
||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
+ */
|
||||
+ vmovdqa 1(%rdi), %ymm1
|
||||
+ VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
||||
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
||||
+ VPMINU (VEC_SIZE * 3 + 1)(%rdi), %ymm3, %ymm4
|
||||
+
|
||||
+ VPMINU %ymm2, %ymm4, %ymm5
|
||||
+ VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(loop_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ subq %rdx, %rdi
|
||||
testl %eax, %eax
|
||||
+ jnz L(last_vec_return_x0)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
-
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
-
|
||||
- jnz L(first_vec_x3_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+ jnz L(last_vec_return_x1)
|
||||
+
|
||||
+ /* Combine last 2 VEC. */
|
||||
+ VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* rcx has combined result from all 4 VEC. It will only be used if
|
||||
+ the first 3 other VEC all did not contain a match. */
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+ subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
+ addq %rdi, %rax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
+L(last_4x_vec_or_less_load):
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
+L(last_4x_vec_or_less):
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
+ VEC_SIZE * 4. */
|
||||
+ testl $(VEC_SIZE * 2), %esi
|
||||
+ jnz L(last_4x_vec)
|
||||
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ /* length may have been negative or positive by an offset of
|
||||
+ VEC_SIZE * 4 depending on where this was called from. This fixes
|
||||
+ that. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- VZEROUPPER_RETURN
|
||||
+ jnz L(last_vec_x1_check)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ subl $VEC_SIZE, %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_return_x0):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_return_x1):
|
||||
tzcntl %eax, %eax
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x1_check):
|
||||
+
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ incl %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
L(max):
|
||||
movq %r8, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_4x_vec):
|
||||
+ /* Test first 2x VEC normally. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x1)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2)
|
||||
+
|
||||
+ /* Normalize length. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %esi
|
||||
+ VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ subl $(VEC_SIZE * 3), %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE * 3 + 1), %eax
|
||||
+ addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(last_vec_x1):
|
||||
+ /* essentially duplicates of first_vec_x1 but use 64 bit
|
||||
+ instructions. */
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
+ incl %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
+L(last_vec_x2):
|
||||
+ /* essentially duplicates of first_vec_x1 but use 64 bit
|
||||
+ instructions. */
|
||||
tzcntl %eax, %eax
|
||||
- addq $VEC_SIZE, %rax
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
+ subl $(VEC_SIZE * 2), %esi
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+ subq %rdx, %rdi
|
||||
+ addl $(VEC_SIZE * 2 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
shrq $2, %rax
|
||||
-# endif
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+L(max_end):
|
||||
+ movq %r8, %rax
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
+L(cross_page_boundary):
|
||||
+ /* Align data to VEC_SIZE - 1. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
||||
+ so no need to manually mod rdx. */
|
||||
+ sarxl %edx, %eax, %eax
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
- vpmovmskb %ymm3, %eax
|
||||
+ jnz L(cross_page_less_vec)
|
||||
+ leaq 1(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+ /* Check length. */
|
||||
+ cmpq %rsi, %rcx
|
||||
+ jb L(cross_page_continue)
|
||||
+ movq %r8, %rax
|
||||
+# else
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- VPCMPEQ %ymm4, %ymm0, %ymm4
|
||||
- vpmovmskb %ymm4, %eax
|
||||
-L(first_vec_x3):
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
# endif
|
||||
+L(return_vzeroupper):
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ .p2align 4
|
||||
+L(cross_page_less_vec):
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpq %rax, %rsi
|
||||
+ cmovb %esi, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrl $2, %eax
|
||||
+# endif
|
||||
VZEROUPPER_RETURN
|
||||
+# endif
|
||||
|
||||
END (STRLEN)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,701 +0,0 @@
|
||||
From 2a76821c3081d2c0231ecd2618f52662cb48fccd Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 3 May 2021 03:03:19 -0400
|
||||
Subject: [PATCH] x86: Optimize memchr-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memchr-evex.S. The optimizations include
|
||||
replacing some branches with cmovcc, avoiding some branches entirely
|
||||
in the less_4x_vec case, making the page cross logic less strict,
|
||||
saving some ALU in the alignment process, and most importantly
|
||||
increasing ILP in the 4x loop. test-memchr, test-rawmemchr, and
|
||||
test-wmemchr are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-evex.S | 547 +++++++++++++++----------
|
||||
1 file changed, 322 insertions(+), 225 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
index 6dd5d67b..81d5cd64 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
@@ -26,14 +26,28 @@
|
||||
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
-# define VPCMP vpcmpd
|
||||
-# define SHIFT_REG r8d
|
||||
+# define VPMINU vpminud
|
||||
+# define VPCMP vpcmpd
|
||||
+# define VPCMPEQ vpcmpeqd
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
-# define VPCMP vpcmpb
|
||||
-# define SHIFT_REG ecx
|
||||
+# define VPMINU vpminub
|
||||
+# define VPCMP vpcmpb
|
||||
+# define VPCMPEQ vpcmpeqb
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# define RAW_PTR_REG rcx
|
||||
+# define ALGN_PTR_REG rdi
|
||||
+# else
|
||||
+# define RAW_PTR_REG rdi
|
||||
+# define ALGN_PTR_REG rcx
|
||||
+# endif
|
||||
+
|
||||
+# define XMMZERO xmm23
|
||||
+# define YMMZERO ymm23
|
||||
# define XMMMATCH xmm16
|
||||
# define YMMMATCH ymm16
|
||||
# define YMM1 ymm17
|
||||
@@ -44,6 +58,8 @@
|
||||
# define YMM6 ymm22
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
@@ -51,11 +67,7 @@ ENTRY (MEMCHR)
|
||||
/* Check for zero length. */
|
||||
test %RDX_LP, %RDX_LP
|
||||
jz L(zero)
|
||||
-# endif
|
||||
- movl %edi, %ecx
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- shl $2, %RDX_LP
|
||||
-# else
|
||||
+
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
@@ -64,318 +76,403 @@ ENTRY (MEMCHR)
|
||||
/* Broadcast CHAR to YMMMATCH. */
|
||||
VPBROADCAST %esi, %YMMMATCH
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rdx
|
||||
- jbe L(zero)
|
||||
+ /* If length <= CHAR_PER_VEC handle specially. */
|
||||
+ cmpq $CHAR_PER_VEC, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ ret
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
-
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
- jmp L(more_4x_vec)
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
+ .p2align 5
|
||||
+L(first_vec_x0):
|
||||
+ /* Check if first match was before length. */
|
||||
+ tzcntl %eax, %eax
|
||||
+ xorl %ecx, %ecx
|
||||
+ cmpl %eax, %edx
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ cmovle %rcx, %rax
|
||||
+ ret
|
||||
+# else
|
||||
+ /* NB: first_vec_x0 is 17 bytes which will leave
|
||||
+ cross_page_boundary (which is relatively cold) close enough
|
||||
+ to ideal alignment. So only realign L(cross_page_boundary) if
|
||||
+ rawmemchr. */
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
+# endif
|
||||
+L(cross_page_boundary):
|
||||
+ /* Save pointer before aligning as its original value is
|
||||
+ necessary for computing the return address if byte is found or
|
||||
+ adjusting length if it is not and this is memchr. */
|
||||
+ movq %rdi, %rcx
|
||||
+ /* Align data to VEC_SIZE. ALGN_PTR_REG is rcx for memchr and rdi
|
||||
+ for rawmemchr. */
|
||||
+ andq $-VEC_SIZE, %ALGN_PTR_REG
|
||||
+ VPCMP $0, (%ALGN_PTR_REG), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %r8d
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
+ /* NB: Divide shift count by 4 since each bit in K0 represents 4
|
||||
bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+ sarl $2, %eax
|
||||
+# endif
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ movl $(PAGE_SIZE / CHAR_SIZE), %esi
|
||||
+ subl %eax, %esi
|
||||
# endif
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Remove the leading bytes. */
|
||||
- sarxl %SHIFT_REG, %eax, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
- tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+ andl $(CHAR_PER_VEC - 1), %eax
|
||||
# endif
|
||||
+ /* Remove the leading bytes. */
|
||||
+ sarxl %eax, %r8d, %eax
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
+ cmpq %rsi, %rdx
|
||||
+ jbe L(first_vec_x0)
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(cross_page_continue)
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (%RAW_PTR_REG, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ addq %RAW_PTR_REG, %rax
|
||||
# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Calculate "rdx + rcx - VEC_SIZE" with "rdx - (VEC_SIZE - rcx)"
|
||||
- instead of "(rdx + rcx) - VEC_SIZE" to void possible addition
|
||||
- overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rdx
|
||||
- jbe L(zero)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ .p2align 4
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+L(cross_page_continue):
|
||||
+ xorl %ecx, %ecx
|
||||
+ subl %edi, %ecx
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ /* esi is for adjusting length to see if near the end. */
|
||||
+ leal (VEC_SIZE * 5)(%rdi, %rcx), %esi
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %esi
|
||||
+# endif
|
||||
+# else
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+L(cross_page_continue):
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VPCMP $0, (VEC_SIZE)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rsi, %rdx
|
||||
+ jbe L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
+
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
+ /* Check if at last CHAR_PER_VEC * 4 length. */
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
+ jbe L(last_4x_vec_or_less_cmpeq)
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
+ /* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
||||
+ */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ movl %edi, %ecx
|
||||
andq $-(4 * VEC_SIZE), %rdi
|
||||
-
|
||||
-# ifndef USE_AS_RAWMEMCHR
|
||||
- /* Adjust length. */
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
addq %rcx, %rdx
|
||||
+# else
|
||||
+ addq %rdi, %rdx
|
||||
+ andq $-(4 * VEC_SIZE), %rdi
|
||||
+ subq %rdi, %rdx
|
||||
+# endif
|
||||
+# else
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
+ andq $-(4 * VEC_SIZE), %rdi
|
||||
# endif
|
||||
|
||||
+ vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
+
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k2
|
||||
- kord %k1, %k2, %k5
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k3
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k4
|
||||
-
|
||||
- kord %k3, %k4, %k6
|
||||
- kortestd %k5, %k6
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
+ /* It would be possible to save some instructions using 4x VPCMP
|
||||
+ but bottleneck on port 5 makes it not worth it. */
|
||||
+ VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
||||
+ /* xor will zero the bytes that match CHAR (held in YMMMATCH). */
|
||||
+ vpxorq (VEC_SIZE * 5)(%rdi), %YMMMATCH, %YMM2
|
||||
+ vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
||||
+ VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
||||
+ /* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||||
+ VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
||||
+ VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
- jmp L(loop_4x_vec)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ kortestd %k2, %k3
|
||||
+ jz L(loop_4x_vec)
|
||||
# else
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ kortestd %k2, %k3
|
||||
+ jnz L(loop_4x_vec_end)
|
||||
+
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+
|
||||
+ subq $(CHAR_PER_VEC * 4), %rdx
|
||||
ja L(loop_4x_vec)
|
||||
|
||||
+ /* Fall through into less than 4 remaining vectors of length case.
|
||||
+ */
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ addq $(VEC_SIZE * 3), %rdi
|
||||
+ .p2align 4
|
||||
L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- jle L(last_2x_vec)
|
||||
-
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ /* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(first_vec_x1_check)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ /* If remaining length > CHAR_PER_VEC * 2. */
|
||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
||||
+ jg L(last_4x_vec)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
+L(last_2x_vec):
|
||||
+ /* If remaining length < CHAR_PER_VEC. */
|
||||
+ addl $CHAR_PER_VEC, %edx
|
||||
+ jle L(zero_end)
|
||||
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+ /* Check VEC2 and compare any match with remaining length. */
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+L(zero_end):
|
||||
+ ret
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
|
||||
- jnz L(first_vec_x3_check)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x1_check):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Adjust length. */
|
||||
+ subl $-(CHAR_PER_VEC * 4), %edx
|
||||
+ /* Check if match within remaining length. */
|
||||
+ cmpl %eax, %edx
|
||||
+ jbe L(set_zero_end)
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+L(set_zero_end):
|
||||
xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %edx
|
||||
- VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
+L(loop_4x_vec_end):
|
||||
+# endif
|
||||
+ /* rawmemchr will fall through into this if match was found in
|
||||
+ loop. */
|
||||
+
|
||||
+ /* k1 has the inverse (NOT) of the matches with VEC1. */
|
||||
kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ subl $((1 << CHAR_PER_VEC) - 1), %eax
|
||||
+# else
|
||||
+ incl %eax
|
||||
+# endif
|
||||
+ jnz L(last_vec_x1_return)
|
||||
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %edx
|
||||
- jle L(zero)
|
||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2_return)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMMATCH, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ kmovd %k2, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
+ jnz L(last_vec_x3_return)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ kmovd %k3, %eax
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_x1_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $VEC_SIZE, %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
addq %rdi, %rax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# endif
|
||||
+# else
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x2_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+# else
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rdx
|
||||
- jbe L(zero)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(last_vec_x3_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (%rdi, %rax, 4), %rax
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- addq %rdi, %rax
|
||||
+ /* NB: Multiply wchar_t count by CHAR_SIZE to get the number of bytes. */
|
||||
+ leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# endif
|
||||
ret
|
||||
|
||||
+
|
||||
+# ifndef USE_AS_RAWMEMCHR
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMP $0, (VEC_SIZE * 5)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ /* Check first VEC regardless. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1_check)
|
||||
+
|
||||
+ /* If remaining length <= CHAR_PER_VEC * 2. */
|
||||
+ addl $(CHAR_PER_VEC * 2), %edx
|
||||
+ jle L(last_2x_vec)
|
||||
+
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
+L(last_4x_vec):
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x2)
|
||||
+
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Create mask for possible matches within remaining length. */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ movl $((1 << (CHAR_PER_VEC * 2)) - 1), %ecx
|
||||
+ bzhil %edx, %ecx, %ecx
|
||||
+# else
|
||||
+ movq $-1, %rcx
|
||||
+ bzhiq %rdx, %rcx, %rcx
|
||||
+# endif
|
||||
+ /* Test matches in data against length match. */
|
||||
+ andl %ecx, %eax
|
||||
+ jnz L(last_vec_x3)
|
||||
+
|
||||
+ /* if remaining length <= CHAR_PER_VEC * 3 (Note this is after
|
||||
+ remaining length was found to be > CHAR_PER_VEC * 2). */
|
||||
+ subl $CHAR_PER_VEC, %edx
|
||||
+ jbe L(zero_end2)
|
||||
+
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Shift remaining length mask for last VEC. */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ shrl $CHAR_PER_VEC, %ecx
|
||||
+# else
|
||||
+ shrq $CHAR_PER_VEC, %rcx
|
||||
+# endif
|
||||
+ andl %ecx, %eax
|
||||
+ jz L(zero_end2)
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+L(zero_end2):
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
+L(last_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- kmovd %k2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- kmovd %k3, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- kmovd %k4, %eax
|
||||
- testl %eax, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
+# endif
|
||||
|
||||
END (MEMCHR)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,30 +0,0 @@
|
||||
From 6ea916adfa0ab9af6e7dc6adcf6f977dfe017835 Mon Sep 17 00:00:00 2001
|
||||
From: Alice Xu <alice.d.xu@gmail.com>
|
||||
Date: Fri, 7 May 2021 19:03:21 -0700
|
||||
Subject: [PATCH] x86-64: Fix an unknown vector operation in memchr-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
An unknown vector operation occurred in commit 2a76821c308. Fixed it
|
||||
by using "ymm{k1}{z}" but not "ymm {k1} {z}".
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memchr-evex.S | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
index 81d5cd64..f3fdad4f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
@@ -271,7 +271,7 @@ L(loop_4x_vec):
|
||||
vpxorq (VEC_SIZE * 6)(%rdi), %YMMMATCH, %YMM3
|
||||
VPCMP $0, (VEC_SIZE * 7)(%rdi), %YMMMATCH, %k3
|
||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||||
- VPMINU %YMM2, %YMM3, %YMM3 {%k1} {z}
|
||||
+ VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,566 +0,0 @@
|
||||
From a0db678071c60b6c47c468d231dd0b3694ba7a98 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Tue, 22 Jun 2021 20:42:10 -0700
|
||||
Subject: [PATCH] x86-64: Move strlen.S to multiarch/strlen-vec.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since strlen.S contains SSE2 version of strlen/strnlen and SSE4.1
|
||||
version of wcslen/wcsnlen, move strlen.S to multiarch/strlen-vec.S
|
||||
and include multiarch/strlen-vec.S from SSE2 and SSE4.1 variants.
|
||||
This also removes the unused symbols, __GI___strlen_sse2 and
|
||||
__GI___wcsnlen_sse4_1.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-sse2.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/strlen-vec.S | 257 ++++++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S | 2 +-
|
||||
sysdeps/x86_64/strlen.S | 243 +-------------------
|
||||
4 files changed, 262 insertions(+), 242 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
||||
(Copyright dates, URL)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-sse2.S b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
index 7bc57b8d..449c8a7f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-sse2.S
|
||||
@@ -20,4 +20,4 @@
|
||||
# define strlen __strlen_sse2
|
||||
#endif
|
||||
|
||||
-#include "../strlen.S"
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
new file mode 100644
|
||||
index 00000000..8f660bb9
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
@@ -0,0 +1,257 @@
|
||||
+/* SSE2 version of strlen and SSE4.1 version of wcslen.
|
||||
+ Copyright (C) 2012-2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+
|
||||
+#ifdef AS_WCSLEN
|
||||
+# define PMINU pminud
|
||||
+# define PCMPEQ pcmpeqd
|
||||
+# define SHIFT_RETURN shrq $2, %rax
|
||||
+#else
|
||||
+# define PMINU pminub
|
||||
+# define PCMPEQ pcmpeqb
|
||||
+# define SHIFT_RETURN
|
||||
+#endif
|
||||
+
|
||||
+/* Long lived register in strlen(s), strnlen(s, n) are:
|
||||
+
|
||||
+ %xmm3 - zero
|
||||
+ %rdi - s
|
||||
+ %r10 (s+n) & (~(64-1))
|
||||
+ %r11 s+n
|
||||
+*/
|
||||
+
|
||||
+
|
||||
+.text
|
||||
+ENTRY(strlen)
|
||||
+
|
||||
+/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
||||
+#define FIND_ZERO \
|
||||
+ PCMPEQ (%rax), %xmm0; \
|
||||
+ PCMPEQ 16(%rax), %xmm1; \
|
||||
+ PCMPEQ 32(%rax), %xmm2; \
|
||||
+ PCMPEQ 48(%rax), %xmm3; \
|
||||
+ pmovmskb %xmm0, %esi; \
|
||||
+ pmovmskb %xmm1, %edx; \
|
||||
+ pmovmskb %xmm2, %r8d; \
|
||||
+ pmovmskb %xmm3, %ecx; \
|
||||
+ salq $16, %rdx; \
|
||||
+ salq $16, %rcx; \
|
||||
+ orq %rsi, %rdx; \
|
||||
+ orq %r8, %rcx; \
|
||||
+ salq $32, %rcx; \
|
||||
+ orq %rcx, %rdx;
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* Do not read anything when n==0. */
|
||||
+ test %RSI_LP, %RSI_LP
|
||||
+ jne L(n_nonzero)
|
||||
+ xor %rax, %rax
|
||||
+ ret
|
||||
+L(n_nonzero):
|
||||
+# ifdef AS_WCSLEN
|
||||
+ shl $2, %RSI_LP
|
||||
+# endif
|
||||
+
|
||||
+/* Initialize long lived registers. */
|
||||
+
|
||||
+ add %RDI_LP, %RSI_LP
|
||||
+ mov %RSI_LP, %R10_LP
|
||||
+ and $-64, %R10_LP
|
||||
+ mov %RSI_LP, %R11_LP
|
||||
+#endif
|
||||
+
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ pxor %xmm1, %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+ movq %rdi, %rax
|
||||
+ movq %rdi, %rcx
|
||||
+ andq $4095, %rcx
|
||||
+/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
||||
+ cmpq $4047, %rcx
|
||||
+/* We cannot unify this branching as it would be ~6 cycles slower. */
|
||||
+ ja L(cross_page)
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* Test if end is among first 64 bytes. */
|
||||
+# define STRNLEN_PROLOG \
|
||||
+ mov %r11, %rsi; \
|
||||
+ subq %rax, %rsi; \
|
||||
+ andq $-64, %rax; \
|
||||
+ testq $-64, %rsi; \
|
||||
+ je L(strnlen_ret)
|
||||
+#else
|
||||
+# define STRNLEN_PROLOG andq $-64, %rax;
|
||||
+#endif
|
||||
+
|
||||
+/* Ignore bits in mask that come before start of string. */
|
||||
+#define PROLOG(lab) \
|
||||
+ movq %rdi, %rcx; \
|
||||
+ xorq %rax, %rcx; \
|
||||
+ STRNLEN_PROLOG; \
|
||||
+ sarq %cl, %rdx; \
|
||||
+ test %rdx, %rdx; \
|
||||
+ je L(lab); \
|
||||
+ bsfq %rdx, %rax; \
|
||||
+ SHIFT_RETURN; \
|
||||
+ ret
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+ andq $-16, %rax
|
||||
+ FIND_ZERO
|
||||
+#else
|
||||
+ /* Test first 16 bytes unaligned. */
|
||||
+ movdqu (%rax), %xmm4
|
||||
+ PCMPEQ %xmm0, %xmm4
|
||||
+ pmovmskb %xmm4, %edx
|
||||
+ test %edx, %edx
|
||||
+ je L(next48_bytes)
|
||||
+ bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+L(next48_bytes):
|
||||
+/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
||||
+ andq $-16, %rax
|
||||
+ PCMPEQ 16(%rax), %xmm1
|
||||
+ PCMPEQ 32(%rax), %xmm2
|
||||
+ PCMPEQ 48(%rax), %xmm3
|
||||
+ pmovmskb %xmm1, %edx
|
||||
+ pmovmskb %xmm2, %r8d
|
||||
+ pmovmskb %xmm3, %ecx
|
||||
+ salq $16, %rdx
|
||||
+ salq $16, %rcx
|
||||
+ orq %r8, %rcx
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rdx
|
||||
+#endif
|
||||
+
|
||||
+ /* When no zero byte is found xmm1-3 are zero so we do not have to
|
||||
+ zero them. */
|
||||
+ PROLOG(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(cross_page):
|
||||
+ andq $-64, %rax
|
||||
+ FIND_ZERO
|
||||
+ PROLOG(loop_init)
|
||||
+
|
||||
+#ifdef AS_STRNLEN
|
||||
+/* We must do this check to correctly handle strnlen (s, -1). */
|
||||
+L(strnlen_ret):
|
||||
+ bts %rsi, %rdx
|
||||
+ sarq %cl, %rdx
|
||||
+ test %rdx, %rdx
|
||||
+ je L(loop_init)
|
||||
+ bsfq %rdx, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+#endif
|
||||
+ .p2align 4
|
||||
+L(loop_init):
|
||||
+ pxor %xmm1, %xmm1
|
||||
+ pxor %xmm2, %xmm2
|
||||
+ pxor %xmm3, %xmm3
|
||||
+#ifdef AS_STRNLEN
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+
|
||||
+ addq $64, %rax
|
||||
+ cmpq %rax, %r10
|
||||
+ je L(exit_end)
|
||||
+
|
||||
+ movdqa (%rax), %xmm0
|
||||
+ PMINU 16(%rax), %xmm0
|
||||
+ PMINU 32(%rax), %xmm0
|
||||
+ PMINU 48(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit)
|
||||
+ jmp L(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit_end):
|
||||
+ cmp %rax, %r11
|
||||
+ je L(first) /* Do not read when end is at page boundary. */
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+L(first):
|
||||
+ bts %r11, %rdx
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit):
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+#else
|
||||
+
|
||||
+ /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+
|
||||
+ movdqa 64(%rax), %xmm0
|
||||
+ PMINU 80(%rax), %xmm0
|
||||
+ PMINU 96(%rax), %xmm0
|
||||
+ PMINU 112(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit64)
|
||||
+
|
||||
+ subq $-128, %rax
|
||||
+
|
||||
+ movdqa (%rax), %xmm0
|
||||
+ PMINU 16(%rax), %xmm0
|
||||
+ PMINU 32(%rax), %xmm0
|
||||
+ PMINU 48(%rax), %xmm0
|
||||
+ PCMPEQ %xmm3, %xmm0
|
||||
+ pmovmskb %xmm0, %edx
|
||||
+ testl %edx, %edx
|
||||
+ jne L(exit0)
|
||||
+ jmp L(loop)
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(exit64):
|
||||
+ addq $64, %rax
|
||||
+L(exit0):
|
||||
+ pxor %xmm0, %xmm0
|
||||
+ FIND_ZERO
|
||||
+
|
||||
+ bsfq %rdx, %rdx
|
||||
+ addq %rdx, %rax
|
||||
+ subq %rdi, %rax
|
||||
+ SHIFT_RETURN
|
||||
+ ret
|
||||
+
|
||||
+#endif
|
||||
+
|
||||
+END(strlen)
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
index a8cab0cb..5fa51fe0 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen-sse4_1.S
|
||||
@@ -2,4 +2,4 @@
|
||||
#define AS_STRNLEN
|
||||
#define strlen __wcsnlen_sse4_1
|
||||
|
||||
-#include "../strlen.S"
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
||||
index f845f3d4..ad047d84 100644
|
||||
--- a/sysdeps/x86_64/strlen.S
|
||||
+++ b/sysdeps/x86_64/strlen.S
|
||||
@@ -1,5 +1,5 @@
|
||||
-/* SSE2 version of strlen/wcslen.
|
||||
- Copyright (C) 2012-2018 Free Software Foundation, Inc.
|
||||
+/* SSE2 version of strlen.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
This file is part of the GNU C Library.
|
||||
|
||||
The GNU C Library is free software; you can redistribute it and/or
|
||||
@@ -16,243 +16,6 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
-#include <sysdep.h>
|
||||
+#include "multiarch/strlen-vec.S"
|
||||
|
||||
-#ifdef AS_WCSLEN
|
||||
-# define PMINU pminud
|
||||
-# define PCMPEQ pcmpeqd
|
||||
-# define SHIFT_RETURN shrq $2, %rax
|
||||
-#else
|
||||
-# define PMINU pminub
|
||||
-# define PCMPEQ pcmpeqb
|
||||
-# define SHIFT_RETURN
|
||||
-#endif
|
||||
-
|
||||
-/* Long lived register in strlen(s), strnlen(s, n) are:
|
||||
-
|
||||
- %xmm3 - zero
|
||||
- %rdi - s
|
||||
- %r10 (s+n) & (~(64-1))
|
||||
- %r11 s+n
|
||||
-*/
|
||||
-
|
||||
-
|
||||
-.text
|
||||
-ENTRY(strlen)
|
||||
-
|
||||
-/* Test 64 bytes from %rax for zero. Save result as bitmask in %rdx. */
|
||||
-#define FIND_ZERO \
|
||||
- PCMPEQ (%rax), %xmm0; \
|
||||
- PCMPEQ 16(%rax), %xmm1; \
|
||||
- PCMPEQ 32(%rax), %xmm2; \
|
||||
- PCMPEQ 48(%rax), %xmm3; \
|
||||
- pmovmskb %xmm0, %esi; \
|
||||
- pmovmskb %xmm1, %edx; \
|
||||
- pmovmskb %xmm2, %r8d; \
|
||||
- pmovmskb %xmm3, %ecx; \
|
||||
- salq $16, %rdx; \
|
||||
- salq $16, %rcx; \
|
||||
- orq %rsi, %rdx; \
|
||||
- orq %r8, %rcx; \
|
||||
- salq $32, %rcx; \
|
||||
- orq %rcx, %rdx;
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* Do not read anything when n==0. */
|
||||
- test %RSI_LP, %RSI_LP
|
||||
- jne L(n_nonzero)
|
||||
- xor %rax, %rax
|
||||
- ret
|
||||
-L(n_nonzero):
|
||||
-# ifdef AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# endif
|
||||
-
|
||||
-/* Initialize long lived registers. */
|
||||
-
|
||||
- add %RDI_LP, %RSI_LP
|
||||
- mov %RSI_LP, %R10_LP
|
||||
- and $-64, %R10_LP
|
||||
- mov %RSI_LP, %R11_LP
|
||||
-#endif
|
||||
-
|
||||
- pxor %xmm0, %xmm0
|
||||
- pxor %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- pxor %xmm3, %xmm3
|
||||
- movq %rdi, %rax
|
||||
- movq %rdi, %rcx
|
||||
- andq $4095, %rcx
|
||||
-/* Offsets 4032-4047 will be aligned into 4032 thus fit into page. */
|
||||
- cmpq $4047, %rcx
|
||||
-/* We cannot unify this branching as it would be ~6 cycles slower. */
|
||||
- ja L(cross_page)
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* Test if end is among first 64 bytes. */
|
||||
-# define STRNLEN_PROLOG \
|
||||
- mov %r11, %rsi; \
|
||||
- subq %rax, %rsi; \
|
||||
- andq $-64, %rax; \
|
||||
- testq $-64, %rsi; \
|
||||
- je L(strnlen_ret)
|
||||
-#else
|
||||
-# define STRNLEN_PROLOG andq $-64, %rax;
|
||||
-#endif
|
||||
-
|
||||
-/* Ignore bits in mask that come before start of string. */
|
||||
-#define PROLOG(lab) \
|
||||
- movq %rdi, %rcx; \
|
||||
- xorq %rax, %rcx; \
|
||||
- STRNLEN_PROLOG; \
|
||||
- sarq %cl, %rdx; \
|
||||
- test %rdx, %rdx; \
|
||||
- je L(lab); \
|
||||
- bsfq %rdx, %rax; \
|
||||
- SHIFT_RETURN; \
|
||||
- ret
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
- andq $-16, %rax
|
||||
- FIND_ZERO
|
||||
-#else
|
||||
- /* Test first 16 bytes unaligned. */
|
||||
- movdqu (%rax), %xmm4
|
||||
- PCMPEQ %xmm0, %xmm4
|
||||
- pmovmskb %xmm4, %edx
|
||||
- test %edx, %edx
|
||||
- je L(next48_bytes)
|
||||
- bsf %edx, %eax /* If eax is zeroed 16bit bsf can be used. */
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-L(next48_bytes):
|
||||
-/* Same as FIND_ZERO except we do not check first 16 bytes. */
|
||||
- andq $-16, %rax
|
||||
- PCMPEQ 16(%rax), %xmm1
|
||||
- PCMPEQ 32(%rax), %xmm2
|
||||
- PCMPEQ 48(%rax), %xmm3
|
||||
- pmovmskb %xmm1, %edx
|
||||
- pmovmskb %xmm2, %r8d
|
||||
- pmovmskb %xmm3, %ecx
|
||||
- salq $16, %rdx
|
||||
- salq $16, %rcx
|
||||
- orq %r8, %rcx
|
||||
- salq $32, %rcx
|
||||
- orq %rcx, %rdx
|
||||
-#endif
|
||||
-
|
||||
- /* When no zero byte is found xmm1-3 are zero so we do not have to
|
||||
- zero them. */
|
||||
- PROLOG(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(cross_page):
|
||||
- andq $-64, %rax
|
||||
- FIND_ZERO
|
||||
- PROLOG(loop_init)
|
||||
-
|
||||
-#ifdef AS_STRNLEN
|
||||
-/* We must do this check to correctly handle strnlen (s, -1). */
|
||||
-L(strnlen_ret):
|
||||
- bts %rsi, %rdx
|
||||
- sarq %cl, %rdx
|
||||
- test %rdx, %rdx
|
||||
- je L(loop_init)
|
||||
- bsfq %rdx, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-#endif
|
||||
- .p2align 4
|
||||
-L(loop_init):
|
||||
- pxor %xmm1, %xmm1
|
||||
- pxor %xmm2, %xmm2
|
||||
- pxor %xmm3, %xmm3
|
||||
-#ifdef AS_STRNLEN
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
-
|
||||
- addq $64, %rax
|
||||
- cmpq %rax, %r10
|
||||
- je L(exit_end)
|
||||
-
|
||||
- movdqa (%rax), %xmm0
|
||||
- PMINU 16(%rax), %xmm0
|
||||
- PMINU 32(%rax), %xmm0
|
||||
- PMINU 48(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit)
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit_end):
|
||||
- cmp %rax, %r11
|
||||
- je L(first) /* Do not read when end is at page boundary. */
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
-L(first):
|
||||
- bts %r11, %rdx
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit):
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-#else
|
||||
-
|
||||
- /* Main loop. Unrolled twice to improve L2 cache performance on core2. */
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
-
|
||||
- movdqa 64(%rax), %xmm0
|
||||
- PMINU 80(%rax), %xmm0
|
||||
- PMINU 96(%rax), %xmm0
|
||||
- PMINU 112(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit64)
|
||||
-
|
||||
- subq $-128, %rax
|
||||
-
|
||||
- movdqa (%rax), %xmm0
|
||||
- PMINU 16(%rax), %xmm0
|
||||
- PMINU 32(%rax), %xmm0
|
||||
- PMINU 48(%rax), %xmm0
|
||||
- PCMPEQ %xmm3, %xmm0
|
||||
- pmovmskb %xmm0, %edx
|
||||
- testl %edx, %edx
|
||||
- jne L(exit0)
|
||||
- jmp L(loop)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(exit64):
|
||||
- addq $64, %rax
|
||||
-L(exit0):
|
||||
- pxor %xmm0, %xmm0
|
||||
- FIND_ZERO
|
||||
-
|
||||
- bsfq %rdx, %rdx
|
||||
- addq %rdx, %rax
|
||||
- subq %rdi, %rax
|
||||
- SHIFT_RETURN
|
||||
- ret
|
||||
-
|
||||
-#endif
|
||||
-
|
||||
-END(strlen)
|
||||
libc_hidden_builtin_def (strlen)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,181 +0,0 @@
|
||||
From 6f573a27b6c8b4236445810a44660612323f5a73 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 01:19:34 -0400
|
||||
Subject: [PATCH] x86-64: Add wcslen optimize for sse4.1
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit adds the ifunc / build infrastructure
|
||||
necessary for wcslen to prefer the sse4.1 implementation
|
||||
in strlen-vec.S. test-wcslen.c is passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 4 +-
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 3 ++
|
||||
sysdeps/x86_64/multiarch/ifunc-wcslen.h | 52 ++++++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/wcslen-sse4_1.S | 4 ++
|
||||
sysdeps/x86_64/multiarch/wcslen.c | 2 +-
|
||||
sysdeps/x86_64/multiarch/wcsnlen.c | 34 +-------------
|
||||
6 files changed, 63 insertions(+), 36 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
create mode 100644 sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 491c7698..65fde4eb 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -93,8 +93,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
wcscpy-ssse3 wcscpy-c \
|
||||
wcschr-sse2 wcschr-avx2 \
|
||||
wcsrchr-sse2 wcsrchr-avx2 \
|
||||
- wcsnlen-sse4_1 wcsnlen-c \
|
||||
- wcslen-sse2 wcslen-avx2 wcsnlen-avx2 \
|
||||
+ wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
|
||||
+ wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
|
||||
wcschr-avx2-rtm \
|
||||
wcscmp-avx2-rtm \
|
||||
wcslen-avx2-rtm \
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index f1a6460a..580913ca 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -657,6 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcslen_evex)
|
||||
+ IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
+ CPU_FEATURE_USABLE (SSE4_1),
|
||||
+ __wcsnlen_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-wcslen.h b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
new file mode 100644
|
||||
index 00000000..39e33473
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-wcslen.h
|
||||
@@ -0,0 +1,52 @@
|
||||
+/* Common definition for ifunc selections for wcslen and wcsnlen
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <init-arch.h>
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ return OPTIMIZE (evex);
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ return OPTIMIZE (avx2_rtm);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ return OPTIMIZE (avx2);
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
+ return OPTIMIZE (sse4_1);
|
||||
+
|
||||
+ return OPTIMIZE (sse2);
|
||||
+}
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcslen-sse4_1.S b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
new file mode 100644
|
||||
index 00000000..7e62621a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/wcslen-sse4_1.S
|
||||
@@ -0,0 +1,4 @@
|
||||
+#define AS_WCSLEN
|
||||
+#define strlen __wcslen_sse4_1
|
||||
+
|
||||
+#include "strlen-vec.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcslen.c b/sysdeps/x86_64/multiarch/wcslen.c
|
||||
index 6d06e47c..3b04b75b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcslen.c
|
||||
+++ b/sysdeps/x86_64/multiarch/wcslen.c
|
||||
@@ -24,7 +24,7 @@
|
||||
# undef __wcslen
|
||||
|
||||
# define SYMBOL_NAME wcslen
|
||||
-# include "ifunc-avx2.h"
|
||||
+# include "ifunc-wcslen.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_wcslen, __wcslen, IFUNC_SELECTOR ());
|
||||
weak_alias (__wcslen, wcslen);
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsnlen.c b/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
index 20b731ae..06736410 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsnlen.c
|
||||
@@ -24,39 +24,7 @@
|
||||
# undef __wcsnlen
|
||||
|
||||
# define SYMBOL_NAME wcsnlen
|
||||
-# include <init-arch.h>
|
||||
-
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (sse4_1) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
-extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
-
|
||||
-static inline void *
|
||||
-IFUNC_SELECTOR (void)
|
||||
-{
|
||||
- const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
- && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
- {
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
- return OPTIMIZE (evex);
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
- return OPTIMIZE (avx2_rtm);
|
||||
-
|
||||
- if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
- return OPTIMIZE (avx2);
|
||||
- }
|
||||
-
|
||||
- if (CPU_FEATURE_USABLE_P (cpu_features, SSE4_1))
|
||||
- return OPTIMIZE (sse4_1);
|
||||
-
|
||||
- return OPTIMIZE (sse2);
|
||||
-}
|
||||
+# include "ifunc-wcslen.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_wcsnlen, __wcsnlen, IFUNC_SELECTOR ());
|
||||
weak_alias (__wcsnlen, wcsnlen);
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,396 +0,0 @@
|
||||
From 231c56760c1e2ded21ad96bbb860b1f08c556c7a Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:27:25 -0800
|
||||
Subject: [PATCH] x86-64 memcpy: Properly handle the length parameter [BZ#
|
||||
24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes memcpy for x32. Tested on x86-64 and x32. On x86-64,
|
||||
libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/memcpy-ssse3-back.S: Use RDX_LP for
|
||||
length. Clear the upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memcpy-ssse3.S: Likewise.
|
||||
* sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S:
|
||||
Likewise.
|
||||
* sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S:
|
||||
Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memcpy.
|
||||
tst-size_t-wmemchr.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memcpy.c: New file.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcpy-ssse3-back.S | 17 ++++--
|
||||
sysdeps/x86_64/multiarch/memcpy-ssse3.S | 17 ++++--
|
||||
.../multiarch/memmove-avx512-no-vzeroupper.S | 16 +++--
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 54 +++++++++--------
|
||||
sysdeps/x86_64/x32/Makefile | 2 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memcpy.c | 58 +++++++++++++++++++
|
||||
6 files changed, 122 insertions(+), 42 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
index 3cd11233..568eebd3 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3-back.S
|
||||
@@ -45,28 +45,33 @@
|
||||
.section .text.ssse3,"ax",@progbits
|
||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
||||
ENTRY (MEMPCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMPCPY_CHK)
|
||||
|
||||
ENTRY (MEMPCPY)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY)
|
||||
#endif
|
||||
|
||||
#if !defined USE_AS_BCOPY
|
||||
ENTRY (MEMCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMCPY_CHK)
|
||||
#endif
|
||||
|
||||
ENTRY (MEMCPY)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
#ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
+#endif
|
||||
+
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
#endif
|
||||
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcpy-ssse3.S b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
index 0240bfa3..0bd5ee99 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcpy-ssse3.S
|
||||
@@ -45,28 +45,33 @@
|
||||
.section .text.ssse3,"ax",@progbits
|
||||
#if !defined USE_AS_MEMPCPY && !defined USE_AS_MEMMOVE
|
||||
ENTRY (MEMPCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMPCPY_CHK)
|
||||
|
||||
ENTRY (MEMPCPY)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY)
|
||||
#endif
|
||||
|
||||
#if !defined USE_AS_BCOPY
|
||||
ENTRY (MEMCPY_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMCPY_CHK)
|
||||
#endif
|
||||
|
||||
ENTRY (MEMCPY)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
#ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
+#endif
|
||||
+
|
||||
+#ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
#endif
|
||||
|
||||
#ifdef USE_AS_MEMMOVE
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
index effc3ac2..6ca2bbc9 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-avx512-no-vzeroupper.S
|
||||
@@ -24,27 +24,31 @@
|
||||
|
||||
.section .text.avx512,"ax",@progbits
|
||||
ENTRY (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__mempcpy_avx512_no_vzeroupper)
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (__mempcpy_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_chk_avx512_no_vzeroupper)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_avx512_no_vzeroupper)
|
||||
|
||||
ENTRY (__memmove_avx512_no_vzeroupper)
|
||||
- mov %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
# ifdef USE_AS_MEMPCPY
|
||||
- add %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
# endif
|
||||
L(start):
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
+# endif
|
||||
lea (%rsi, %rdx), %rcx
|
||||
lea (%rdi, %rdx), %r9
|
||||
cmp $512, %rdx
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index c952576c..274aa1c7 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -95,20 +95,20 @@
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start)
|
||||
END (MEMPCPY_SYMBOL (__mempcpy, unaligned))
|
||||
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
#endif
|
||||
@@ -116,9 +116,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned))
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
movq %rdi, %rax
|
||||
L(start):
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(more_2x_vec)
|
||||
#if !defined USE_MULTIARCH || !IS_IN (libc)
|
||||
L(last_2x_vec):
|
||||
@@ -138,38 +142,38 @@ END (MEMMOVE_SYMBOL (__memmove, unaligned))
|
||||
|
||||
# if VEC_SIZE == 16
|
||||
ENTRY (__mempcpy_chk_erms)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__mempcpy_chk_erms)
|
||||
|
||||
/* Only used to measure performance of REP MOVSB. */
|
||||
ENTRY (__mempcpy_erms)
|
||||
- movq %rdi, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
/* Skip zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
- addq %rdx, %rax
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start_movsb)
|
||||
END (__mempcpy_erms)
|
||||
|
||||
ENTRY (__memmove_chk_erms)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memmove_chk_erms)
|
||||
|
||||
ENTRY (__memmove_erms)
|
||||
movq %rdi, %rax
|
||||
/* Skip zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jz 2f
|
||||
L(start_movsb):
|
||||
- movq %rdx, %rcx
|
||||
- cmpq %rsi, %rdi
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ cmp %RSI_LP, %RDI_LP
|
||||
jb 1f
|
||||
/* Source == destination is less common. */
|
||||
je 2f
|
||||
- leaq (%rsi,%rcx), %rdx
|
||||
- cmpq %rdx, %rdi
|
||||
+ lea (%rsi,%rcx), %RDX_LP
|
||||
+ cmp %RDX_LP, %RDI_LP
|
||||
jb L(movsb_backward)
|
||||
1:
|
||||
rep movsb
|
||||
@@ -189,20 +193,20 @@ strong_alias (__memmove_chk_erms, __memcpy_chk_erms)
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__mempcpy_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
- movq %rdi, %rax
|
||||
- addq %rdx, %rax
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ add %RDX_LP, %RAX_LP
|
||||
jmp L(start_erms)
|
||||
END (MEMMOVE_SYMBOL (__mempcpy, unaligned_erms))
|
||||
|
||||
# ifdef SHARED
|
||||
ENTRY (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
# endif
|
||||
@@ -210,9 +214,13 @@ END (MEMMOVE_CHK_SYMBOL (__memmove_chk, unaligned_erms))
|
||||
ENTRY (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
movq %rdi, %rax
|
||||
L(start_erms):
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(movsb_more_2x_vec)
|
||||
L(last_2x_vec):
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
@@ -236,7 +244,7 @@ L(movsb):
|
||||
/* Avoid slow backward REP MOVSB. */
|
||||
jb L(more_8x_vec_backward)
|
||||
1:
|
||||
- movq %rdx, %rcx
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
rep movsb
|
||||
L(nop):
|
||||
ret
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index ddec7f04..2fe1e5ac 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -6,7 +6,7 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
-tests += tst-size_t-memchr tst-size_t-memcmp
|
||||
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memcpy.c b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
new file mode 100644
|
||||
index 00000000..66b71e17
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memcpy.c
|
||||
@@ -0,0 +1,58 @@
|
||||
+/* Test memcpy with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_NAME "memcpy"
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+IMPL (memcpy, 1)
|
||||
+
|
||||
+typedef void *(*proto_t) (void *, const void *, size_t);
|
||||
+
|
||||
+static void *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memcpy (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ do_memcpy (dest, src);
|
||||
+ int res = memcmp (dest.p, src.p, dest.len);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,497 +0,0 @@
|
||||
From a775a7a3eb1e85b54af0b4ee5ff4dcf66772a1fb Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 01:56:29 -0400
|
||||
Subject: [PATCH] x86: Fix overflow bug in wcsnlen-sse4_1 and wcsnlen-avx2 [BZ
|
||||
#27974]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This commit fixes the bug mentioned in the previous commit.
|
||||
|
||||
The previous implementations of wcsnlen in these files relied
|
||||
on maxlen * sizeof(wchar_t) not overflowing, which was not guaranteed by the standard.
|
||||
|
||||
The new overflow tests added in the previous commit now
|
||||
pass (As well as all the other tests).
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 130 ++++++++++++++++++-------
|
||||
sysdeps/x86_64/multiarch/strlen-vec.S | 15 ++-
|
||||
2 files changed, 107 insertions(+), 38 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
index be8a5db5..37688966 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
@@ -44,21 +44,21 @@
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Check zero length. */
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear upper bits. */
|
||||
+ and %RSI_LP, %RSI_LP
|
||||
+# else
|
||||
test %RSI_LP, %RSI_LP
|
||||
+# endif
|
||||
jz L(zero)
|
||||
/* Store max len in R8_LP before adjusting if using WCSLEN. */
|
||||
mov %RSI_LP, %R8_LP
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# elif defined __ILP32__
|
||||
- /* Clear the upper 32 bits. */
|
||||
- movl %esi, %esi
|
||||
-# endif
|
||||
# endif
|
||||
movl %edi, %eax
|
||||
movq %rdi, %rdx
|
||||
@@ -72,10 +72,10 @@ ENTRY (STRLEN)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. */
|
||||
VPCMPEQ (%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* If length < VEC_SIZE handle special. */
|
||||
- cmpq $VEC_SIZE, %rsi
|
||||
+ cmpq $CHAR_PER_VEC, %rsi
|
||||
jbe L(first_vec_x0)
|
||||
# endif
|
||||
/* If empty continue to aligned_more. Otherwise return bit
|
||||
@@ -84,6 +84,7 @@ ENTRY (STRLEN)
|
||||
jz L(aligned_more)
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -97,9 +98,14 @@ L(zero):
|
||||
L(first_vec_x0):
|
||||
/* Set bit for max len so that tzcnt will return min of max len
|
||||
and position of first match. */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
btsq %rsi, %rax
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -113,14 +119,19 @@ L(first_vec_x1):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 4 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 4 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
incl %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -133,14 +144,19 @@ L(first_vec_x2):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 3 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 3 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -153,14 +169,19 @@ L(first_vec_x3):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 2 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE * 2 + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE * 2 + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -173,14 +194,19 @@ L(first_vec_x4):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Use ecx which was computed earlier to compute correct value.
|
||||
*/
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ leal -(VEC_SIZE * 1 + 1)(%rax, %rcx, 4), %eax
|
||||
+# else
|
||||
subl $(VEC_SIZE + 1), %ecx
|
||||
addl %ecx, %eax
|
||||
+# endif
|
||||
# else
|
||||
subl %edx, %edi
|
||||
addl $(VEC_SIZE * 3 + 1), %edi
|
||||
addl %edi, %eax
|
||||
# endif
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -195,10 +221,14 @@ L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE because
|
||||
- it simplies the logic in last_4x_vec_or_less. */
|
||||
+ /* + 1 because rdi is aligned to VEC_SIZE - 1. + CHAR_SIZE
|
||||
+ because it simplies the logic in last_4x_vec_or_less. */
|
||||
leaq (VEC_SIZE * 4 + CHAR_SIZE + 1)(%rdi), %rcx
|
||||
subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
# endif
|
||||
/* Load first VEC regardless. */
|
||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
@@ -207,34 +237,38 @@ L(cross_page_continue):
|
||||
subq %rcx, %rsi
|
||||
jb L(last_4x_vec_or_less)
|
||||
# endif
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x4)
|
||||
|
||||
/* Align data to VEC_SIZE * 4 - 1. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Before adjusting length check if at last VEC_SIZE * 4. */
|
||||
- cmpq $(VEC_SIZE * 4 - 1), %rsi
|
||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
||||
jbe L(last_4x_vec_or_less_load)
|
||||
incq %rdi
|
||||
movl %edi, %ecx
|
||||
orq $(VEC_SIZE * 4 - 1), %rdi
|
||||
andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
/* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
# else
|
||||
@@ -246,13 +280,13 @@ L(cross_page_continue):
|
||||
L(loop_4x_vec):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Break if at end of length. */
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
||||
jb L(last_4x_vec_or_less_cmpeq)
|
||||
# endif
|
||||
- /* Save some code size by microfusing VPMINU with the load. Since
|
||||
- the matches in ymm2/ymm4 can only be returned if there where no
|
||||
- matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
- */
|
||||
+ /* Save some code size by microfusing VPMINU with the load.
|
||||
+ Since the matches in ymm2/ymm4 can only be returned if there
|
||||
+ where no matches in ymm1/ymm3 respectively there is no issue
|
||||
+ with overlap. */
|
||||
vmovdqa 1(%rdi), %ymm1
|
||||
VPMINU (VEC_SIZE + 1)(%rdi), %ymm1, %ymm2
|
||||
vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm3
|
||||
@@ -260,7 +294,7 @@ L(loop_4x_vec):
|
||||
|
||||
VPMINU %ymm2, %ymm4, %ymm5
|
||||
VPCMPEQ %ymm5, %ymm0, %ymm5
|
||||
- vpmovmskb %ymm5, %ecx
|
||||
+ vpmovmskb %ymm5, %ecx
|
||||
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
testl %ecx, %ecx
|
||||
@@ -268,27 +302,28 @@ L(loop_4x_vec):
|
||||
|
||||
|
||||
VPCMPEQ %ymm1, %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
subq %rdx, %rdi
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_return_x0)
|
||||
|
||||
VPCMPEQ %ymm2, %ymm0, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_return_x1)
|
||||
|
||||
/* Combine last 2 VEC. */
|
||||
VPCMPEQ %ymm3, %ymm0, %ymm3
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- /* rcx has combined result from all 4 VEC. It will only be used if
|
||||
- the first 3 other VEC all did not contain a match. */
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* rcx has combined result from all 4 VEC. It will only be used
|
||||
+ if the first 3 other VEC all did not contain a match. */
|
||||
salq $32, %rcx
|
||||
orq %rcx, %rax
|
||||
tzcntq %rax, %rax
|
||||
subq $(VEC_SIZE * 2 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -297,15 +332,19 @@ L(loop_4x_vec):
|
||||
# ifdef USE_AS_STRNLEN
|
||||
.p2align 4
|
||||
L(last_4x_vec_or_less_load):
|
||||
- /* Depending on entry adjust rdi / prepare first VEC in ymm1. */
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in ymm1.
|
||||
+ */
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
L(last_4x_vec_or_less_cmpeq):
|
||||
VPCMPEQ 1(%rdi), %ymm0, %ymm1
|
||||
L(last_4x_vec_or_less):
|
||||
-
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
- VEC_SIZE * 4. */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off
|
||||
+ by VEC_SIZE * 4. */
|
||||
testl $(VEC_SIZE * 2), %esi
|
||||
jnz L(last_4x_vec)
|
||||
|
||||
@@ -320,7 +359,7 @@ L(last_4x_vec_or_less):
|
||||
jb L(max)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
cmpl %eax, %esi
|
||||
@@ -329,6 +368,7 @@ L(last_4x_vec_or_less):
|
||||
addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -340,6 +380,7 @@ L(last_vec_return_x0):
|
||||
subq $(VEC_SIZE * 4 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -350,6 +391,7 @@ L(last_vec_return_x1):
|
||||
subq $(VEC_SIZE * 3 - 1), %rdi
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -366,6 +408,7 @@ L(last_vec_x1_check):
|
||||
incl %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -381,14 +424,14 @@ L(last_4x_vec):
|
||||
jnz L(last_vec_x1)
|
||||
|
||||
VPCMPEQ (VEC_SIZE + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x2)
|
||||
|
||||
/* Normalize length. */
|
||||
andl $(VEC_SIZE * 4 - 1), %esi
|
||||
VPCMPEQ (VEC_SIZE * 2 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x3)
|
||||
|
||||
@@ -396,7 +439,7 @@ L(last_4x_vec):
|
||||
jb L(max)
|
||||
|
||||
VPCMPEQ (VEC_SIZE * 3 + 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
tzcntl %eax, %eax
|
||||
/* Check the end of data. */
|
||||
cmpl %eax, %esi
|
||||
@@ -405,6 +448,7 @@ L(last_4x_vec):
|
||||
addl $(VEC_SIZE * 3 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -419,6 +463,7 @@ L(last_vec_x1):
|
||||
incl %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -432,6 +477,7 @@ L(last_vec_x2):
|
||||
addl $(VEC_SIZE + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -447,6 +493,7 @@ L(last_vec_x3):
|
||||
addl $(VEC_SIZE * 2 + 1), %eax
|
||||
addq %rdi, %rax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
shrq $2, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -455,13 +502,13 @@ L(max_end):
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
|
||||
- /* Cold case for crossing page with first load. */
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
/* Align data to VEC_SIZE - 1. */
|
||||
orq $(VEC_SIZE - 1), %rdi
|
||||
VPCMPEQ -(VEC_SIZE - 1)(%rdi), %ymm0, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
/* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
||||
so no need to manually mod rdx. */
|
||||
sarxl %edx, %eax, %eax
|
||||
@@ -470,6 +517,10 @@ L(cross_page_boundary):
|
||||
jnz L(cross_page_less_vec)
|
||||
leaq 1(%rdi), %rcx
|
||||
subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get wchar_t count. */
|
||||
+ shrl $2, %ecx
|
||||
+# endif
|
||||
/* Check length. */
|
||||
cmpq %rsi, %rcx
|
||||
jb L(cross_page_continue)
|
||||
@@ -479,6 +530,7 @@ L(cross_page_boundary):
|
||||
jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide length by 4 to get wchar_t count. */
|
||||
shrl $2, %eax
|
||||
# endif
|
||||
# endif
|
||||
@@ -489,6 +541,10 @@ L(return_vzeroupper):
|
||||
.p2align 4
|
||||
L(cross_page_less_vec):
|
||||
tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Multiply length by 4 to get byte count. */
|
||||
+ sall $2, %esi
|
||||
+# endif
|
||||
cmpq %rax, %rsi
|
||||
cmovb %esi, %eax
|
||||
# ifdef USE_AS_WCSLEN
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
index 8f660bb9..439e486a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
@@ -65,12 +65,25 @@ ENTRY(strlen)
|
||||
ret
|
||||
L(n_nonzero):
|
||||
# ifdef AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
+/* Check for overflow from maxlen * sizeof(wchar_t). If it would
|
||||
+ overflow the only way this program doesn't have undefined behavior
|
||||
+ is if there is a null terminator in valid memory so wcslen will
|
||||
+ suffice. */
|
||||
+ mov %RSI_LP, %R10_LP
|
||||
+ sar $62, %R10_LP
|
||||
+ test %R10_LP, %R10_LP
|
||||
+ jnz __wcslen_sse4_1
|
||||
+ sal $2, %RSI_LP
|
||||
# endif
|
||||
|
||||
+
|
||||
/* Initialize long lived registers. */
|
||||
|
||||
add %RDI_LP, %RSI_LP
|
||||
+# ifdef AS_WCSLEN
|
||||
+/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
||||
+ jbe __wcslen_sse4_1
|
||||
+# endif
|
||||
mov %RSI_LP, %R10_LP
|
||||
and $-64, %R10_LP
|
||||
mov %RSI_LP, %R11_LP
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,745 +0,0 @@
|
||||
From 4ba65586847751372520a36757c17f114588794e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 19:36:06 -0400
|
||||
Subject: [PATCH] x86: Optimize strlen-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strlen-evex.S. The
|
||||
optimizations are mostly small things but they add up to roughly
|
||||
10-30% performance improvement for strlen. The results for strnlen are
|
||||
a bit more ambiguous. test-strlen, test-strnlen, test-wcslen, and
|
||||
test-wcsnlen are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-evex.S | 581 ++++++++++++++-----------
|
||||
1 file changed, 317 insertions(+), 264 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-evex.S b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
index 05838190..4bf6874b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-evex.S
|
||||
@@ -29,11 +29,13 @@
|
||||
# ifdef USE_AS_WCSLEN
|
||||
# define VPCMP vpcmpd
|
||||
# define VPMINU vpminud
|
||||
-# define SHIFT_REG r9d
|
||||
+# define SHIFT_REG ecx
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPCMP vpcmpb
|
||||
# define VPMINU vpminub
|
||||
-# define SHIFT_REG ecx
|
||||
+# define SHIFT_REG edx
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# define XMMZERO xmm16
|
||||
@@ -46,132 +48,165 @@
|
||||
# define YMM6 ymm22
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check for zero length. */
|
||||
+ /* Check zero length. */
|
||||
test %RSI_LP, %RSI_LP
|
||||
jz L(zero)
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shl $2, %RSI_LP
|
||||
-# elif defined __ILP32__
|
||||
+# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %esi, %esi
|
||||
# endif
|
||||
mov %RSI_LP, %R8_LP
|
||||
# endif
|
||||
- movl %edi, %ecx
|
||||
- movq %rdi, %rdx
|
||||
+ movl %edi, %eax
|
||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
-
|
||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||
+ cross check. */
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. Each bit in K0 represents a
|
||||
null byte. */
|
||||
VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- jnz L(first_vec_x0_check)
|
||||
- /* Adjust length and check the end of data. */
|
||||
- subq $VEC_SIZE, %rsi
|
||||
- jbe L(max)
|
||||
-# else
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* If length < CHAR_PER_VEC handle special. */
|
||||
+ cmpq $CHAR_PER_VEC, %rsi
|
||||
+ jbe L(first_vec_x0)
|
||||
# endif
|
||||
-
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
+ tzcntl %eax, %eax
|
||||
+ ret
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rsi
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x0):
|
||||
+ /* Set bit for max len so that tzcnt will return min of max len
|
||||
+ and position of first match. */
|
||||
+ btsq %rsi, %rax
|
||||
+ tzcntl %eax, %eax
|
||||
+ ret
|
||||
# endif
|
||||
- jmp L(more_4x_vec)
|
||||
|
||||
.p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
||||
- bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+L(first_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 4 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal CHAR_PER_VEC(%rdi, %rax), %eax
|
||||
# endif
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
+ ret
|
||||
|
||||
- /* Remove the leading bytes. */
|
||||
- sarxl %SHIFT_REG, %eax, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
+ .p2align 4
|
||||
+L(first_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
-# endif
|
||||
- addq %rdi, %rax
|
||||
- addq %rcx, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 3 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 2)(%rdi, %rax), %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(aligned_more):
|
||||
+L(first_vec_x3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* "rcx" is less than VEC_SIZE. Calculate "rdx + rcx - VEC_SIZE"
|
||||
- with "rdx - (VEC_SIZE - rcx)" instead of "(rdx + rcx) - VEC_SIZE"
|
||||
- to void possible addition overflow. */
|
||||
- negq %rcx
|
||||
- addq $VEC_SIZE, %rcx
|
||||
-
|
||||
- /* Check the end of data. */
|
||||
- subq %rcx, %rsi
|
||||
- jbe L(max)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC * 2 + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 3)(%rdi, %rax), %eax
|
||||
# endif
|
||||
+ ret
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
+ .p2align 4
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Safe to use 32 bit instructions as these are only called for
|
||||
+ size = [1, 159]. */
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
+ /* Use ecx which was computed earlier to compute correct value.
|
||||
+ */
|
||||
+ leal -(CHAR_PER_VEC + 1)(%rcx, %rax), %eax
|
||||
+# else
|
||||
+ subl %edx, %edi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %edi
|
||||
+# endif
|
||||
+ leal (CHAR_PER_VEC * 4)(%rdi, %rax), %eax
|
||||
# endif
|
||||
+ ret
|
||||
|
||||
-L(more_4x_vec):
|
||||
+ .p2align 5
|
||||
+L(aligned_more):
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+ andq $-(VEC_SIZE), %rdi
|
||||
+L(cross_page_continue):
|
||||
/* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
since data is only aligned to VEC_SIZE. */
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* + CHAR_SIZE because it simplies the logic in
|
||||
+ last_4x_vec_or_less. */
|
||||
+ leaq (VEC_SIZE * 5 + CHAR_SIZE)(%rdi), %rcx
|
||||
+ subq %rdx, %rcx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
+# endif
|
||||
+ /* Load first VEC regardless. */
|
||||
VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Adjust length. If near end handle specially. */
|
||||
+ subq %rcx, %rsi
|
||||
+ jb L(last_4x_vec_or_less)
|
||||
+# endif
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
+ test %eax, %eax
|
||||
jnz L(first_vec_x2)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
@@ -179,258 +214,276 @@ L(more_4x_vec):
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x3)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
-# ifdef USE_AS_STRNLEN
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- jbe L(last_4x_vec_or_less)
|
||||
-# endif
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
# ifdef USE_AS_STRNLEN
|
||||
- /* Adjust length. */
|
||||
+ /* Check if at last VEC_SIZE * 4 length. */
|
||||
+ cmpq $(CHAR_PER_VEC * 4 - 1), %rsi
|
||||
+ jbe L(last_4x_vec_or_less_load)
|
||||
+ movl %edi, %ecx
|
||||
+ andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarl $2, %ecx
|
||||
+# endif
|
||||
+ /* Readjust length. */
|
||||
addq %rcx, %rsi
|
||||
# endif
|
||||
+ /* Align data to VEC_SIZE * 4. */
|
||||
+ andq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VMOVA (%rdi), %YMM1
|
||||
- VMOVA VEC_SIZE(%rdi), %YMM2
|
||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM3
|
||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM4
|
||||
-
|
||||
- VPMINU %YMM1, %YMM2, %YMM5
|
||||
- VPMINU %YMM3, %YMM4, %YMM6
|
||||
+ /* Load first VEC regardless. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+ /* Break if at end of length. */
|
||||
+ subq $(CHAR_PER_VEC * 4), %rsi
|
||||
+ jb L(last_4x_vec_or_less_cmpeq)
|
||||
+# endif
|
||||
+ /* Save some code size by microfusing VPMINU with the load. Since
|
||||
+ the matches in ymm2/ymm4 can only be returned if there where no
|
||||
+ matches in ymm1/ymm3 respectively there is no issue with overlap.
|
||||
+ */
|
||||
+ VPMINU (VEC_SIZE * 5)(%rdi), %YMM1, %YMM2
|
||||
+ VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
||||
+ VPMINU (VEC_SIZE * 7)(%rdi), %YMM3, %YMM4
|
||||
+
|
||||
+ VPCMP $0, %YMM2, %YMMZERO, %k0
|
||||
+ VPCMP $0, %YMM4, %YMMZERO, %k1
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ kortestd %k0, %k1
|
||||
+ jz L(loop_4x_vec)
|
||||
+
|
||||
+ /* Check if end was in first half. */
|
||||
+ kmovd %k0, %eax
|
||||
+ subq %rdx, %rdi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ shrq $2, %rdi
|
||||
+# endif
|
||||
+ testl %eax, %eax
|
||||
+ jz L(second_vec_return)
|
||||
|
||||
- VPMINU %YMM5, %YMM6, %YMM5
|
||||
- VPCMP $0, %YMM5, %YMMZERO, %k0
|
||||
- ktestd %k0, %k0
|
||||
- jnz L(4x_vec_end)
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k2
|
||||
+ kmovd %k2, %edx
|
||||
+ /* Combine VEC1 matches (edx) with VEC2 matches (eax). */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ sall $CHAR_PER_VEC, %eax
|
||||
+ orl %edx, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+# else
|
||||
+ salq $CHAR_PER_VEC, %rax
|
||||
+ orq %rdx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# endif
|
||||
+ addq %rdi, %rax
|
||||
+ ret
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
|
||||
-# ifndef USE_AS_STRNLEN
|
||||
- jmp L(loop_4x_vec)
|
||||
-# else
|
||||
- subq $(VEC_SIZE * 4), %rsi
|
||||
- ja L(loop_4x_vec)
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
|
||||
+L(last_4x_vec_or_less_load):
|
||||
+ /* Depending on entry adjust rdi / prepare first VEC in YMM1. */
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+L(last_4x_vec_or_less_cmpeq):
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k0
|
||||
+ addq $(VEC_SIZE * 3), %rdi
|
||||
L(last_4x_vec_or_less):
|
||||
- /* Less than 4 * VEC and aligned to VEC_SIZE. */
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
- jle L(last_2x_vec)
|
||||
-
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
+ /* If remaining length > VEC_SIZE * 2. This works if esi is off by
|
||||
+ VEC_SIZE * 4. */
|
||||
+ testl $(CHAR_PER_VEC * 2), %esi
|
||||
+ jnz L(last_4x_vec)
|
||||
+
|
||||
+ /* length may have been negative or positive by an offset of
|
||||
+ CHAR_PER_VEC * 4 depending on where this was called from. This
|
||||
+ fixes that. */
|
||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(last_vec_x1_check)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ /* Check the end of data. */
|
||||
+ subl $CHAR_PER_VEC, %esi
|
||||
+ jb L(max)
|
||||
|
||||
VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
|
||||
- VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x3_check)
|
||||
+ subq %rdx, %rdi
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
+# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
+ ret
|
||||
+L(max):
|
||||
movq %r8, %rax
|
||||
+ ret
|
||||
+# endif
|
||||
+
|
||||
+ /* Placed here in strnlen so that the jcc L(last_4x_vec_or_less)
|
||||
+ in the 4x VEC loop can use 2 byte encoding. */
|
||||
+ .p2align 4
|
||||
+L(second_vec_return):
|
||||
+ VPCMP $0, %YMM3, %YMMZERO, %k0
|
||||
+ /* Combine YMM3 matches (k0) with YMM4 matches (k1). */
|
||||
+# ifdef USE_AS_WCSLEN
|
||||
+ kunpckbw %k0, %k1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
+# else
|
||||
+ kunpckdq %k0, %k1, %k0
|
||||
+ kmovq %k0, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
+ ret
|
||||
+
|
||||
+
|
||||
+# ifdef USE_AS_STRNLEN
|
||||
+L(last_vec_x1_check):
|
||||
+ tzcntl %eax, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max)
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- addl $(VEC_SIZE * 2), %esi
|
||||
+L(last_4x_vec):
|
||||
+ /* Test first 2x VEC normally. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(last_vec_x1)
|
||||
|
||||
- VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0_check)
|
||||
- subl $VEC_SIZE, %esi
|
||||
- jle L(max)
|
||||
+ jnz L(last_vec_x2)
|
||||
|
||||
- VPCMP $0, VEC_SIZE(%rdi), %YMMZERO, %k0
|
||||
+ /* Normalize length. */
|
||||
+ andl $(CHAR_PER_VEC * 4 - 1), %esi
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdi), %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1_check)
|
||||
- movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- ret
|
||||
+ jnz L(last_vec_x3)
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x0_check):
|
||||
+ /* Check the end of data. */
|
||||
+ subl $(CHAR_PER_VEC * 3), %esi
|
||||
+ jb L(max)
|
||||
+
|
||||
+ VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 4)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1_check):
|
||||
+L(last_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2_check):
|
||||
+L(last_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- /* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 2)(%rdi, %rax), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x3_check):
|
||||
+L(last_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
+ subl $(CHAR_PER_VEC * 2), %esi
|
||||
/* Check the end of data. */
|
||||
- cmpq %rax, %rsi
|
||||
- jbe L(max)
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+ cmpl %eax, %esi
|
||||
+ jb L(max_end)
|
||||
+ subq %rdx, %rdi
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
+ sarq $2, %rdi
|
||||
# endif
|
||||
+ leaq (CHAR_PER_VEC * 3)(%rdi, %rax), %rax
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(max):
|
||||
+L(max_end):
|
||||
movq %r8, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
ret
|
||||
# endif
|
||||
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
+L(cross_page_boundary):
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ VPCMP $0, (%rdi), %YMMZERO, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Remove the leading bytes. */
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
+ /* NB: Divide shift count by 4 since each bit in K0 represent 4
|
||||
+ bytes. */
|
||||
+ movl %edx, %ecx
|
||||
+ shrl $2, %ecx
|
||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
||||
# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x1):
|
||||
+ /* SHIFT_REG is ecx for USE_AS_WCSLEN and edx otherwise. */
|
||||
+ sarxl %SHIFT_REG, %eax, %eax
|
||||
+ testl %eax, %eax
|
||||
+# ifndef USE_AS_STRNLEN
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(first_vec_x2):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
+# else
|
||||
+ jnz L(cross_page_less_vec)
|
||||
+# ifndef USE_AS_WCSLEN
|
||||
+ movl %edx, %ecx
|
||||
+ andl $(CHAR_PER_VEC - 1), %ecx
|
||||
+# endif
|
||||
+ movl $CHAR_PER_VEC, %eax
|
||||
+ subl %ecx, %eax
|
||||
+ /* Check the end of data. */
|
||||
+ cmpq %rax, %rsi
|
||||
+ ja L(cross_page_continue)
|
||||
+ movl %esi, %eax
|
||||
ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(4x_vec_end):
|
||||
- VPCMP $0, %YMM1, %YMMZERO, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
- VPCMP $0, %YMM2, %YMMZERO, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
- kmovd %k2, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- VPCMP $0, %YMM4, %YMMZERO, %k3
|
||||
- kmovd %k3, %eax
|
||||
-L(first_vec_x3):
|
||||
+L(cross_page_less_vec):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- sall $2, %eax
|
||||
-# endif
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
- addq %rdi, %rax
|
||||
- subq %rdx, %rax
|
||||
-# ifdef USE_AS_WCSLEN
|
||||
- shrq $2, %rax
|
||||
-# endif
|
||||
+ /* Select min of length and position of first null. */
|
||||
+ cmpq %rax, %rsi
|
||||
+ cmovb %esi, %eax
|
||||
ret
|
||||
+# endif
|
||||
|
||||
END (STRLEN)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,158 +0,0 @@
|
||||
From ea8e465a6b8d0f26c72bcbe453a854de3abf68ec Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Wed, 30 Jun 2021 10:47:06 -0700
|
||||
Subject: [PATCH] x86: Check RTM_ALWAYS_ABORT for RTM [BZ #28033]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
From
|
||||
|
||||
https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
|
||||
|
||||
* Intel TSX will be disabled by default.
|
||||
* The processor will force abort all Restricted Transactional Memory (RTM)
|
||||
transactions by default.
|
||||
* A new CPUID bit CPUID.07H.0H.EDX[11](RTM_ALWAYS_ABORT) will be enumerated,
|
||||
which is set to indicate to updated software that the loaded microcode is
|
||||
forcing RTM abort.
|
||||
* On processors that enumerate support for RTM, the CPUID enumeration bits
|
||||
for Intel TSX (CPUID.07H.0H.EBX[11] and CPUID.07H.0H.EBX[4]) continue to
|
||||
be set by default after microcode update.
|
||||
* Workloads that were benefited from Intel TSX might experience a change
|
||||
in performance.
|
||||
* System software may use a new bit in Model-Specific Register (MSR) 0x10F
|
||||
TSX_FORCE_ABORT[TSX_CPUID_CLEAR] functionality to clear the Hardware Lock
|
||||
Elision (HLE) and RTM bits to indicate to software that Intel TSX is
|
||||
disabled.
|
||||
|
||||
1. Add RTM_ALWAYS_ABORT to CPUID features.
|
||||
2. Set RTM usable only if RTM_ALWAYS_ABORT isn't set. This skips the
|
||||
string/tst-memchr-rtm etc. testcases on the affected processors, which
|
||||
always fail after a microcode update.
|
||||
3. Check RTM feature, instead of usability, against /proc/cpuinfo.
|
||||
|
||||
This fixes BZ #28033.
|
||||
---
|
||||
manual/platform.texi | 3 +++
|
||||
sysdeps/x86/cpu-features.c | 5 ++++-
|
||||
sysdeps/x86/sys/platform/x86.h | 6 +++---
|
||||
sysdeps/x86/tst-cpu-features-supports.c | 2 +-
|
||||
sysdeps/x86/tst-get-cpu-features.c | 2 ++
|
||||
5 files changed, 13 insertions(+), 5 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86/bits/platform/x86.h
|
||||
(doesn't exist)
|
||||
sysdeps/x86/bits/platform/x86.h
|
||||
(account for lack of upstream renames)
|
||||
|
||||
diff --git a/manual/platform.texi b/manual/platform.texi
|
||||
index 8fec2933..b7e8aef7 100644
|
||||
--- a/manual/platform.texi
|
||||
+++ b/manual/platform.texi
|
||||
@@ -510,6 +510,9 @@ capability.
|
||||
@item
|
||||
@code{RTM} -- RTM instruction extensions.
|
||||
|
||||
+@item
|
||||
+@code{RTM_ALWAYS_ABORT} -- Transactions always abort, making RTM unusable.
|
||||
+
|
||||
@item
|
||||
@code{SDBG} -- IA32_DEBUG_INTERFACE MSR for silicon debug.
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 3610ee5c..4889f062 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -74,7 +74,6 @@ update_usable (struct cpu_features *cpu_features)
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, HLE);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, BMI2);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, ERMS);
|
||||
- CPU_FEATURE_SET_USABLE (cpu_features, RTM);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, RDSEED);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, ADX);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, CLFLUSHOPT);
|
||||
@@ -90,6 +89,7 @@ update_usable (struct cpu_features *cpu_features)
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIRI);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, MOVDIR64B);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, FSRM);
|
||||
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM_ALWAYS_ABORT);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, SERIALIZE);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, TSXLDTRK);
|
||||
CPU_FEATURE_SET_USABLE (cpu_features, LAHF64_SAHF64);
|
||||
@@ -779,6 +779,9 @@ no_cpuid:
|
||||
GLRO(dl_platform) = "i586";
|
||||
#endif
|
||||
|
||||
+ if (!CPU_FEATURES_CPU_P (cpu_features, RTM_ALWAYS_ABORT))
|
||||
+ CPU_FEATURE_SET_USABLE (cpu_features, RTM);
|
||||
+
|
||||
#if CET_ENABLED
|
||||
# if HAVE_TUNABLES
|
||||
TUNABLE_GET (x86_ibt, tunable_val_t *,
|
||||
diff --git a/sysdeps/x86/sys/platform/x86.h b/sysdeps/x86/sys/platform/x86.h
|
||||
index e5cc7c68..7a434926 100644
|
||||
--- a/sysdeps/x86/sys/platform/x86.h
|
||||
+++ b/sysdeps/x86/sys/platform/x86.h
|
||||
@@ -247,7 +247,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
||||
#define bit_cpu_AVX512_VP2INTERSECT (1u << 8)
|
||||
#define bit_cpu_INDEX_7_EDX_9 (1u << 9)
|
||||
#define bit_cpu_MD_CLEAR (1u << 10)
|
||||
-#define bit_cpu_INDEX_7_EDX_11 (1u << 11)
|
||||
+#define bit_cpu_RTM_ALWAYS_ABORT (1u << 11)
|
||||
#define bit_cpu_INDEX_7_EDX_12 (1u << 12)
|
||||
#define bit_cpu_INDEX_7_EDX_13 (1u << 13)
|
||||
#define bit_cpu_SERIALIZE (1u << 14)
|
||||
@@ -471,7 +471,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
||||
#define index_cpu_AVX512_VP2INTERSECT COMMON_CPUID_INDEX_7
|
||||
#define index_cpu_INDEX_7_EDX_9 COMMON_CPUID_INDEX_7
|
||||
#define index_cpu_MD_CLEAR COMMON_CPUID_INDEX_7
|
||||
-#define index_cpu_INDEX_7_EDX_11 COMMON_CPUID_INDEX_7
|
||||
+#define index_cpu_RTM_ALWAYS_ABORT COMMON_CPUID_INDEX_7
|
||||
#define index_cpu_INDEX_7_EDX_12 COMMON_CPUID_INDEX_7
|
||||
#define index_cpu_INDEX_7_EDX_13 COMMON_CPUID_INDEX_7
|
||||
#define index_cpu_SERIALIZE COMMON_CPUID_INDEX_7
|
||||
@@ -695,7 +695,7 @@ extern const struct cpu_features *__x86_get_cpu_features (unsigned int)
|
||||
#define reg_AVX512_VP2INTERSECT edx
|
||||
#define reg_INDEX_7_EDX_9 edx
|
||||
#define reg_MD_CLEAR edx
|
||||
-#define reg_INDEX_7_EDX_11 edx
|
||||
+#define reg_RTM_ALWAYS_ABORT edx
|
||||
#define reg_INDEX_7_EDX_12 edx
|
||||
#define reg_INDEX_7_EDX_13 edx
|
||||
#define reg_SERIALIZE edx
|
||||
diff --git a/sysdeps/x86/tst-cpu-features-supports.c b/sysdeps/x86/tst-cpu-features-supports.c
|
||||
index 287cf01f..8100a319 100644
|
||||
--- a/sysdeps/x86/tst-cpu-features-supports.c
|
||||
+++ b/sysdeps/x86/tst-cpu-features-supports.c
|
||||
@@ -152,7 +152,7 @@ do_test (int argc, char **argv)
|
||||
fails += CHECK_SUPPORTS (rdpid, RDPID);
|
||||
fails += CHECK_SUPPORTS (rdrnd, RDRAND);
|
||||
fails += CHECK_SUPPORTS (rdseed, RDSEED);
|
||||
- fails += CHECK_SUPPORTS (rtm, RTM);
|
||||
+ fails += CHECK_CPU_SUPPORTS (rtm, RTM);
|
||||
fails += CHECK_SUPPORTS (serialize, SERIALIZE);
|
||||
fails += CHECK_SUPPORTS (sha, SHA);
|
||||
fails += CHECK_CPU_SUPPORTS (shstk, SHSTK);
|
||||
diff --git a/sysdeps/x86/tst-get-cpu-features.c b/sysdeps/x86/tst-get-cpu-features.c
|
||||
index 2763deb6..0717e5d8 100644
|
||||
--- a/sysdeps/x86/tst-get-cpu-features.c
|
||||
+++ b/sysdeps/x86/tst-get-cpu-features.c
|
||||
@@ -183,6 +183,7 @@ do_test (void)
|
||||
CHECK_CPU_FEATURE (UINTR);
|
||||
CHECK_CPU_FEATURE (AVX512_VP2INTERSECT);
|
||||
CHECK_CPU_FEATURE (MD_CLEAR);
|
||||
+ CHECK_CPU_FEATURE (RTM_ALWAYS_ABORT);
|
||||
CHECK_CPU_FEATURE (SERIALIZE);
|
||||
CHECK_CPU_FEATURE (HYBRID);
|
||||
CHECK_CPU_FEATURE (TSXLDTRK);
|
||||
@@ -344,6 +345,7 @@ do_test (void)
|
||||
CHECK_CPU_FEATURE_USABLE (FSRM);
|
||||
CHECK_CPU_FEATURE_USABLE (AVX512_VP2INTERSECT);
|
||||
CHECK_CPU_FEATURE_USABLE (MD_CLEAR);
|
||||
+ CHECK_CPU_FEATURE_USABLE (RTM_ALWAYS_ABORT);
|
||||
CHECK_CPU_FEATURE_USABLE (SERIALIZE);
|
||||
CHECK_CPU_FEATURE_USABLE (HYBRID);
|
||||
CHECK_CPU_FEATURE_USABLE (TSXLDTRK);
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,51 +0,0 @@
|
||||
From 0679442defedf7e52a94264975880ab8674736b2 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 8 Jul 2021 16:13:19 -0400
|
||||
Subject: [PATCH] x86: Remove wcsnlen-sse4_1 from wcslen ifunc-impl-list [BZ
|
||||
#28064]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The following commit
|
||||
|
||||
commit 6f573a27b6c8b4236445810a44660612323f5a73
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed Jun 23 01:19:34 2021 -0400
|
||||
|
||||
x86-64: Add wcslen optimize for sse4.1
|
||||
|
||||
Added wcsnlen-sse4.1 to the wcslen ifunc implementation list and did
|
||||
not add wcslen-sse4.1 to wcslen ifunc implementation list. This commit
|
||||
fixes that by removing wcsnlen-sse4.1 from the wcslen ifunc
|
||||
implementation list and adding wcslen-sse4.1 to the ifunc
|
||||
implementation list.
|
||||
|
||||
Testing:
|
||||
test-wcslen.c, test-rsi-wcslen.c, and test-rsi-strlen.c are passing as
|
||||
well as all other tests in wcsmbs and string.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 580913ca..695cdba6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -657,9 +657,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcslen_evex)
|
||||
- IFUNC_IMPL_ADD (array, i, wcsnlen,
|
||||
+ IFUNC_IMPL_ADD (array, i, wcslen,
|
||||
CPU_FEATURE_USABLE (SSE4_1),
|
||||
- __wcsnlen_sse4_1)
|
||||
+ __wcslen_sse4_1)
|
||||
IFUNC_IMPL_ADD (array, i, wcslen, 1, __wcslen_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/wcsnlen.c. */
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,135 +0,0 @@
|
||||
From c6272098323153db373f2986c67786ea8c85f1cf Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 15 Feb 2022 08:18:15 -0600
|
||||
Subject: [PATCH] x86: Fallback {str|wcs}cmp RTM in the ncmp overflow case [BZ
|
||||
#28896]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
|
||||
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
|
||||
no checks around vzeroupper and would trigger spurious
|
||||
aborts. This commit fixes that.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
|
||||
AVX2 machines with and without RTM.
|
||||
|
||||
Co-authored-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86/Makefile | 2 +-
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 17 ++++++++++++++++-
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S | 1 +
|
||||
sysdeps/x86_64/multiarch/strncmp-avx2.S | 1 +
|
||||
sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S | 2 +-
|
||||
sysdeps/x86_64/multiarch/wcsncmp-avx2.S | 2 +-
|
||||
7 files changed, 22 insertions(+), 5 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
(split into two patches due to upstream bug differences)
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 5be71ada..2d814915 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -38,7 +38,7 @@ CFLAGS-tst-memset-rtm.c += -mrtm
|
||||
CFLAGS-tst-strchr-rtm.c += -mrtm
|
||||
CFLAGS-tst-strcpy-rtm.c += -mrtm
|
||||
CFLAGS-tst-strlen-rtm.c += -mrtm
|
||||
-CFLAGS-tst-strncmp-rtm.c += -mrtm
|
||||
+CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
|
||||
CFLAGS-tst-strrchr-rtm.c += -mrtm
|
||||
endif
|
||||
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
index 236ad951..4d0004b5 100644
|
||||
--- a/sysdeps/x86/tst-strncmp-rtm.c
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -16,6 +16,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<https://www.gnu.org/licenses/>. */
|
||||
|
||||
+#include <stdint.h>
|
||||
#include <tst-string-rtm.h>
|
||||
|
||||
#define LOOP 3000
|
||||
@@ -45,8 +46,22 @@ function (void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function_overflow (void)
|
||||
+{
|
||||
+ if (strncmp (string1, string2, SIZE_MAX) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
do_test (void)
|
||||
{
|
||||
- return do_test_1 ("strncmp", LOOP, prepare, function);
|
||||
+ int status = do_test_1 ("strncmp", LOOP, prepare, function);
|
||||
+ if (status != EXIT_SUCCESS)
|
||||
+ return status;
|
||||
+ status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
|
||||
+ return status;
|
||||
}
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 5d1c9d90..433ae047 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -95,7 +95,7 @@ ENTRY (STRCMP)
|
||||
length to bound a valid memory region. In these cases just use
|
||||
'wcscmp'. */
|
||||
shrq $56, %rcx
|
||||
- jnz __wcscmp_avx2
|
||||
+ jnz OVERFLOW_STRCMP
|
||||
# endif
|
||||
/* Convert units: from wide to byte char. */
|
||||
shl $2, %RDX_LP
|
||||
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
|
||||
index 37d1224b..68bad365 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2-rtm.S
|
||||
@@ -1,3 +1,4 @@
|
||||
#define STRCMP __strncmp_avx2_rtm
|
||||
#define USE_AS_STRNCMP 1
|
||||
+#define OVERFLOW_STRCMP __strcmp_avx2_rtm
|
||||
#include "strcmp-avx2-rtm.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/strncmp-avx2.S b/sysdeps/x86_64/multiarch/strncmp-avx2.S
|
||||
index 1678bcc2..f138e9f1 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strncmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strncmp-avx2.S
|
||||
@@ -1,3 +1,4 @@
|
||||
#define STRCMP __strncmp_avx2
|
||||
#define USE_AS_STRNCMP 1
|
||||
+#define OVERFLOW_STRCMP __strcmp_avx2
|
||||
#include "strcmp-avx2.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
|
||||
index 4e88c70c..f467582c 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2-rtm.S
|
||||
@@ -1,5 +1,5 @@
|
||||
#define STRCMP __wcsncmp_avx2_rtm
|
||||
#define USE_AS_STRNCMP 1
|
||||
#define USE_AS_WCSCMP 1
|
||||
-
|
||||
+#define OVERFLOW_STRCMP __wcscmp_avx2_rtm
|
||||
#include "strcmp-avx2-rtm.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
|
||||
index 4fa1de4d..e9ede522 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/wcsncmp-avx2.S
|
||||
@@ -1,5 +1,5 @@
|
||||
#define STRCMP __wcsncmp_avx2
|
||||
#define USE_AS_STRNCMP 1
|
||||
#define USE_AS_WCSCMP 1
|
||||
-
|
||||
+#define OVERFLOW_STRCMP __wcscmp_avx2
|
||||
#include "strcmp-avx2.S"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,51 +0,0 @@
|
||||
From 55c7bcc71b84123d5d4bd2814366a6b05fcf8ebd Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sat, 9 May 2020 12:04:23 -0700
|
||||
Subject: [PATCH] x86-64: Use RDX_LP on __x86_shared_non_temporal_threshold [BZ
|
||||
#25966]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since __x86_shared_non_temporal_threshold is defined as
|
||||
|
||||
long int __x86_shared_non_temporal_threshold;
|
||||
|
||||
and long int is 4 bytes for x32, use RDX_LP to compare against
|
||||
__x86_shared_non_temporal_threshold in assembly code.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index 71f5954d..673b73aa 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -245,7 +245,7 @@ L(return):
|
||||
#endif
|
||||
|
||||
L(movsb):
|
||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
jae L(more_8x_vec)
|
||||
cmpq %rsi, %rdi
|
||||
jb 1f
|
||||
@@ -397,7 +397,7 @@ L(more_8x_vec):
|
||||
addq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
/* Check non-temporal store threshold. */
|
||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
ja L(large_forward)
|
||||
#endif
|
||||
L(loop_4x_vec_forward):
|
||||
@@ -448,7 +448,7 @@ L(more_8x_vec_backward):
|
||||
subq %r8, %rdx
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
/* Check non-temporal store threshold. */
|
||||
- cmpq __x86_shared_non_temporal_threshold(%rip), %rdx
|
||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
ja L(large_backward)
|
||||
#endif
|
||||
L(loop_4x_vec_backward):
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,44 +0,0 @@
|
||||
From a35a59036ebae3efcdf5e8167610e0656fca9770 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Thu, 11 Jun 2020 12:41:18 -0700
|
||||
Subject: [PATCH] x86_64: Use %xmmN with vpxor to clear a vector register
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since "vpxor %xmmN, %xmmN, %xmmN" clears the whole vector register, use
|
||||
%xmmN, instead of %ymmN, with vpxor to clear a vector register.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 4 ++--
|
||||
sysdeps/x86_64/multiarch/strrchr-avx2.S | 2 +-
|
||||
2 files changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 433ae047..70d8499b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -105,8 +105,8 @@ ENTRY (STRCMP)
|
||||
# endif
|
||||
movl %edi, %eax
|
||||
xorl %edx, %edx
|
||||
- /* Make %ymm7 all zeros in this function. */
|
||||
- vpxor %ymm7, %ymm7, %ymm7
|
||||
+ /* Make %xmm7 (%ymm7) all zeros in this function. */
|
||||
+ vpxor %xmm7, %xmm7, %xmm7
|
||||
orl %esi, %eax
|
||||
andl $(PAGE_SIZE - 1), %eax
|
||||
cmpl $(PAGE_SIZE - (VEC_SIZE * 4)), %eax
|
||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
index 9f22a15e..c949410b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
|
||||
@@ -48,7 +48,7 @@ ENTRY (STRRCHR)
|
||||
movl %edi, %ecx
|
||||
/* Broadcast CHAR to YMM4. */
|
||||
VPBROADCAST %xmm4, %ymm4
|
||||
- vpxor %ymm0, %ymm0, %ymm0
|
||||
+ vpxor %xmm0, %xmm0, %xmm0
|
||||
|
||||
/* Check if we may cross page boundary with one vector load. */
|
||||
andl $(2 * VEC_SIZE - 1), %ecx
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,359 +0,0 @@
|
||||
From 1f745ecc2109890886b161d4791e1406fdfc29b8 Mon Sep 17 00:00:00 2001
|
||||
From: noah <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 3 Feb 2021 00:38:59 -0500
|
||||
Subject: [PATCH] x86-64: Refactor and improve performance of strchr-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. Just seemed the performance could be improved a bit. Observed
|
||||
and expected behavior are unchanged. Optimized body of main
|
||||
loop. Updated page cross logic and optimized accordingly. Made a few
|
||||
minor instruction selection modifications. No regressions in test
|
||||
suite. Both test-strchrnul and test-strchr passed.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strchr-avx2.S | 225 ++++++++++++-------------
|
||||
sysdeps/x86_64/multiarch/strchr.c | 4 +-
|
||||
2 files changed, 114 insertions(+), 115 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/strchr.c
|
||||
(account for missing upstream macros)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
index da7d2620..919d256c 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
@@ -27,10 +27,12 @@
|
||||
# ifdef USE_AS_WCSCHR
|
||||
# define VPBROADCAST vpbroadcastd
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
+# define VPMINU vpminud
|
||||
# define CHAR_REG esi
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
+# define VPMINU vpminub
|
||||
# define CHAR_REG sil
|
||||
# endif
|
||||
|
||||
@@ -43,71 +45,54 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRCHR)
|
||||
movl %edi, %ecx
|
||||
- /* Broadcast CHAR to YMM0. */
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ xorl %edx, %edx
|
||||
+# endif
|
||||
+
|
||||
+ /* Broadcast CHAR to YMM0. */
|
||||
vmovd %esi, %xmm0
|
||||
vpxor %xmm9, %xmm9, %xmm9
|
||||
VPBROADCAST %xmm0, %ymm0
|
||||
- /* Check if we may cross page boundary with one vector load. */
|
||||
- andl $(2 * VEC_SIZE - 1), %ecx
|
||||
- cmpl $VEC_SIZE, %ecx
|
||||
- ja L(cros_page_boundary)
|
||||
|
||||
- /* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
||||
- null byte. */
|
||||
- vmovdqu (%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
- vpor %ymm1, %ymm2, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ /* Check if we cross page boundary with one vector load. */
|
||||
+ andl $(PAGE_SIZE - 1), %ecx
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-
|
||||
- jmp L(more_4x_vec)
|
||||
-
|
||||
- .p2align 4
|
||||
-L(cros_page_boundary):
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
+ /* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
||||
+ null byte. */
|
||||
vmovdqu (%rdi), %ymm8
|
||||
VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
vpor %ymm1, %ymm2, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- /* Remove the leading bytes. */
|
||||
- sarl %cl, %eax
|
||||
testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
- /* Found CHAR or the null byte. */
|
||||
+ jz L(more_vecs)
|
||||
tzcntl %eax, %eax
|
||||
- addq %rcx, %rax
|
||||
-# ifdef USE_AS_STRCHRNUL
|
||||
+ /* Found CHAR or the null byte. */
|
||||
addq %rdi, %rax
|
||||
-# else
|
||||
- xorl %edx, %edx
|
||||
- leaq (%rdi, %rax), %rax
|
||||
- cmp (%rax), %CHAR_REG
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
+L(more_vecs):
|
||||
+ /* Align data for aligned loads in the loop. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
L(aligned_more):
|
||||
- addq $VEC_SIZE, %rdi
|
||||
|
||||
-L(more_4x_vec):
|
||||
- /* Check the first 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
- since data is only aligned to VEC_SIZE. */
|
||||
- vmovdqa (%rdi), %ymm8
|
||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
+ since data is only aligned to VEC_SIZE. */
|
||||
+ vmovdqa VEC_SIZE(%rdi), %ymm8
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
vpor %ymm1, %ymm2, %ymm1
|
||||
@@ -137,61 +122,24 @@ L(more_4x_vec):
|
||||
vpor %ymm1, %ymm2, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x3)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(4 * VEC_SIZE - 1), %ecx
|
||||
- andq $-(4 * VEC_SIZE), %rdi
|
||||
-
|
||||
- .p2align 4
|
||||
-L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- vmovdqa (%rdi), %ymm5
|
||||
- vmovdqa VEC_SIZE(%rdi), %ymm6
|
||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
|
||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
||||
-
|
||||
- VPCMPEQ %ymm5, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm6, %ymm0, %ymm2
|
||||
- VPCMPEQ %ymm7, %ymm0, %ymm3
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm4
|
||||
-
|
||||
- VPCMPEQ %ymm5, %ymm9, %ymm5
|
||||
- VPCMPEQ %ymm6, %ymm9, %ymm6
|
||||
- VPCMPEQ %ymm7, %ymm9, %ymm7
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm8
|
||||
-
|
||||
- vpor %ymm1, %ymm5, %ymm1
|
||||
- vpor %ymm2, %ymm6, %ymm2
|
||||
- vpor %ymm3, %ymm7, %ymm3
|
||||
- vpor %ymm4, %ymm8, %ymm4
|
||||
-
|
||||
- vpor %ymm1, %ymm2, %ymm5
|
||||
- vpor %ymm3, %ymm4, %ymm6
|
||||
-
|
||||
- vpor %ymm5, %ymm6, %ymm5
|
||||
-
|
||||
- vpmovmskb %ymm5, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ jz L(prep_loop_4x)
|
||||
|
||||
- jmp L(loop_4x_vec)
|
||||
+ tzcntl %eax, %eax
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
+ cmovne %rdx, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER
|
||||
+ ret
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x0):
|
||||
- /* Found CHAR or the null byte. */
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_STRCHRNUL
|
||||
+ /* Found CHAR or the null byte. */
|
||||
addq %rdi, %rax
|
||||
-# else
|
||||
- xorl %edx, %edx
|
||||
- leaq (%rdi, %rax), %rax
|
||||
- cmp (%rax), %CHAR_REG
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -199,13 +147,9 @@ L(first_vec_x0):
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_STRCHRNUL
|
||||
- addq $VEC_SIZE, %rax
|
||||
- addq %rdi, %rax
|
||||
-# else
|
||||
- xorl %edx, %edx
|
||||
leaq VEC_SIZE(%rdi, %rax), %rax
|
||||
- cmp (%rax), %CHAR_REG
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
@@ -213,42 +157,97 @@ L(first_vec_x1):
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_STRCHRNUL
|
||||
- addq $(VEC_SIZE * 2), %rax
|
||||
- addq %rdi, %rax
|
||||
-# else
|
||||
- xorl %edx, %edx
|
||||
+ /* Found CHAR or the null byte. */
|
||||
leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
- cmp (%rax), %CHAR_REG
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+L(prep_loop_4x):
|
||||
+ /* Align data to 4 * VEC_SIZE. */
|
||||
+ andq $-(VEC_SIZE * 4), %rdi
|
||||
+
|
||||
.p2align 4
|
||||
-L(4x_vec_end):
|
||||
+L(loop_4x_vec):
|
||||
+ /* Compare 4 * VEC at a time forward. */
|
||||
+ vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
|
||||
+ vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
|
||||
+ vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
|
||||
+ vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
|
||||
+
|
||||
+ /* Leaves only CHARS matching esi as 0. */
|
||||
+ vpxor %ymm5, %ymm0, %ymm1
|
||||
+ vpxor %ymm6, %ymm0, %ymm2
|
||||
+ vpxor %ymm7, %ymm0, %ymm3
|
||||
+ vpxor %ymm8, %ymm0, %ymm4
|
||||
+
|
||||
+ VPMINU %ymm1, %ymm5, %ymm1
|
||||
+ VPMINU %ymm2, %ymm6, %ymm2
|
||||
+ VPMINU %ymm3, %ymm7, %ymm3
|
||||
+ VPMINU %ymm4, %ymm8, %ymm4
|
||||
+
|
||||
+ VPMINU %ymm1, %ymm2, %ymm5
|
||||
+ VPMINU %ymm3, %ymm4, %ymm6
|
||||
+
|
||||
+ VPMINU %ymm5, %ymm6, %ymm5
|
||||
+
|
||||
+ VPCMPEQ %ymm5, %ymm9, %ymm5
|
||||
+ vpmovmskb %ymm5, %eax
|
||||
+
|
||||
+ addq $(VEC_SIZE * 4), %rdi
|
||||
+ testl %eax, %eax
|
||||
+ jz L(loop_4x_vec)
|
||||
+
|
||||
+ VPCMPEQ %ymm1, %ymm9, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x0)
|
||||
+
|
||||
+ VPCMPEQ %ymm2, %ymm9, %ymm2
|
||||
vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(first_vec_x1)
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
+
|
||||
+ VPCMPEQ %ymm3, %ymm9, %ymm3
|
||||
+ VPCMPEQ %ymm4, %ymm9, %ymm4
|
||||
+ vpmovmskb %ymm3, %ecx
|
||||
vpmovmskb %ymm4, %eax
|
||||
+ salq $32, %rax
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
+ cmovne %rdx, %rax
|
||||
+# endif
|
||||
+ VZEROUPPER
|
||||
+ ret
|
||||
+
|
||||
+ /* Cold case for crossing page with first load. */
|
||||
+ .p2align 4
|
||||
+L(cross_page_boundary):
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ andl $(VEC_SIZE - 1), %ecx
|
||||
+
|
||||
+ vmovdqa (%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ vpor %ymm1, %ymm2, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* Remove the leading bits. */
|
||||
+ sarxl %ecx, %eax, %eax
|
||||
testl %eax, %eax
|
||||
-L(first_vec_x3):
|
||||
+ jz L(aligned_more)
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_STRCHRNUL
|
||||
- addq $(VEC_SIZE * 3), %rax
|
||||
+ addq %rcx, %rdi
|
||||
addq %rdi, %rax
|
||||
-# else
|
||||
- xorl %edx, %edx
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
||||
- cmp (%rax), %CHAR_REG
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
cmovne %rdx, %rax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
END (STRCHR)
|
||||
-#endif
|
||||
+# endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/strchr.c b/sysdeps/x86_64/multiarch/strchr.c
|
||||
index 7e582f02..5225bd4f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strchr.c
|
||||
+++ b/sysdeps/x86_64/multiarch/strchr.c
|
||||
@@ -38,11 +38,11 @@ IFUNC_SELECTOR (void)
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,67 +0,0 @@
|
||||
From 3ec5d83d2a237d39e7fd6ef7a0bc8ac4c171a4a5 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sat, 25 Jan 2020 14:19:40 -0800
|
||||
Subject: [PATCH] x86-64: Avoid rep movsb with short distance [BZ #27130]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
When copying with "rep movsb", if the distance between source and
|
||||
destination is N*4GB + [1..63] with N >= 0, performance may be very
|
||||
slow. This patch updates memmove-vec-unaligned-erms.S for AVX and
|
||||
AVX512 versions with the distance in RCX:
|
||||
|
||||
cmpl $63, %ecx
|
||||
// Don't use "rep movsb" if ECX <= 63
|
||||
jbe L(Don't use "rep movsb")
|
||||
Use "rep movsb"
|
||||
|
||||
Benchtests data with bench-memcpy, bench-memcpy-large, bench-memcpy-random
|
||||
and bench-memcpy-walk on Skylake, Ice Lake and Tiger Lake show that its
|
||||
performance impact is within noise range as "rep movsb" is only used for
|
||||
data size >= 4KB.
|
||||
---
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 21 +++++++++++++++++++
|
||||
1 file changed, 21 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index 673b73aa..c475fed4 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -64,6 +64,13 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
+/* Avoid short distance rep movsb only with non-SSE vector. */
|
||||
+#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
|
||||
+# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
|
||||
+#else
|
||||
+# define AVOID_SHORT_DISTANCE_REP_MOVSB 0
|
||||
+#endif
|
||||
+
|
||||
#ifndef PREFETCH
|
||||
# define PREFETCH(addr) prefetcht0 addr
|
||||
#endif
|
||||
@@ -255,7 +262,21 @@ L(movsb):
|
||||
cmpq %r9, %rdi
|
||||
/* Avoid slow backward REP MOVSB. */
|
||||
jb L(more_8x_vec_backward)
|
||||
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
||||
+ movq %rdi, %rcx
|
||||
+ subq %rsi, %rcx
|
||||
+ jmp 2f
|
||||
+# endif
|
||||
1:
|
||||
+# if AVOID_SHORT_DISTANCE_REP_MOVSB
|
||||
+ movq %rsi, %rcx
|
||||
+ subq %rdi, %rcx
|
||||
+2:
|
||||
+/* Avoid "rep movsb" if RCX, the distance between source and destination,
|
||||
+ is N*4GB + [1..63] with N >= 0. */
|
||||
+ cmpl $63, %ecx
|
||||
+ jbe L(more_2x_vec) /* Avoid "rep movsb" if ECX <= 63. */
|
||||
+# endif
|
||||
mov %RDX_LP, %RCX_LP
|
||||
rep movsb
|
||||
L(nop):
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,449 +0,0 @@
|
||||
From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
|
||||
From: noah <goldstein.w.n@gmail.com>
|
||||
Date: Sat, 3 Apr 2021 04:12:15 -0400
|
||||
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No Bug. This commit updates the large memcpy case (no overlap). The
|
||||
update is to perform memcpy on either 2 or 4 contiguous pages at
|
||||
once. This 1) helps to alleviate the effects of false memory aliasing
|
||||
when destination and source have a close 4k alignment and 2) In most
|
||||
cases and for most DRAM units is a modestly more efficient access
|
||||
pattern. These changes are a clear performance improvement for
|
||||
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
|
||||
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
|
||||
pass.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
.../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++----
|
||||
1 file changed, 265 insertions(+), 73 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
(different number of sections)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index c475fed4..3e2dd6bc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -32,7 +32,16 @@
|
||||
overlapping addresses.
|
||||
6. If size >= __x86_shared_non_temporal_threshold and there is no
|
||||
overlap between destination and source, use non-temporal store
|
||||
- instead of aligned store. */
|
||||
+ instead of aligned store copying from either 2 or 4 pages at
|
||||
+ once.
|
||||
+ 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold
|
||||
+ and source and destination do not page alias, copy from 2 pages
|
||||
+ at once using non-temporal stores. Page aliasing in this case is
|
||||
+ considered true if destination's page alignment - sources' page
|
||||
+ alignment is less than 8 * VEC_SIZE.
|
||||
+ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source
|
||||
+ and destination do page alias copy from 4 pages at once using
|
||||
+ non-temporal stores. */
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
@@ -64,6 +73,34 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
+#ifndef PAGE_SIZE
|
||||
+# define PAGE_SIZE 4096
|
||||
+#endif
|
||||
+
|
||||
+#if PAGE_SIZE != 4096
|
||||
+# error Unsupported PAGE_SIZE
|
||||
+#endif
|
||||
+
|
||||
+#ifndef LOG_PAGE_SIZE
|
||||
+# define LOG_PAGE_SIZE 12
|
||||
+#endif
|
||||
+
|
||||
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
|
||||
+# error Invalid LOG_PAGE_SIZE
|
||||
+#endif
|
||||
+
|
||||
+/* Byte per page for large_memcpy inner loop. */
|
||||
+#if VEC_SIZE == 64
|
||||
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
|
||||
+#else
|
||||
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
|
||||
+#endif
|
||||
+
|
||||
+/* Amount to shift rdx by to compare for memcpy_large_4x. */
|
||||
+#ifndef LOG_4X_MEMCPY_THRESH
|
||||
+# define LOG_4X_MEMCPY_THRESH 4
|
||||
+#endif
|
||||
+
|
||||
/* Avoid short distance rep movsb only with non-SSE vector. */
|
||||
#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
|
||||
# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
|
||||
@@ -103,6 +140,28 @@
|
||||
# error Unsupported PREFETCH_SIZE!
|
||||
#endif
|
||||
|
||||
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
|
||||
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
|
||||
+ VMOVU (offset)base, vec0; \
|
||||
+ VMOVU ((offset) + VEC_SIZE)base, vec1;
|
||||
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
|
||||
+ VMOVNT vec0, (offset)base; \
|
||||
+ VMOVNT vec1, ((offset) + VEC_SIZE)base;
|
||||
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
|
||||
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
||||
+ VMOVU (offset)base, vec0; \
|
||||
+ VMOVU ((offset) + VEC_SIZE)base, vec1; \
|
||||
+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \
|
||||
+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;
|
||||
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
|
||||
+ VMOVNT vec0, (offset)base; \
|
||||
+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \
|
||||
+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \
|
||||
+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;
|
||||
+#else
|
||||
+# error Invalid LARGE_LOAD_SIZE
|
||||
+#endif
|
||||
+
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
@@ -390,6 +449,15 @@ L(last_4x_vec):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
L(more_8x_vec):
|
||||
+ /* Check if non-temporal move candidate. */
|
||||
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
+ /* Check non-temporal store threshold. */
|
||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
+ ja L(large_memcpy_2x)
|
||||
+#endif
|
||||
+ /* Entry if rdx is greater than non-temporal threshold but there
|
||||
+ is overlap. */
|
||||
+L(more_8x_vec_check):
|
||||
cmpq %rsi, %rdi
|
||||
ja L(more_8x_vec_backward)
|
||||
/* Source == destination is less common. */
|
||||
@@ -416,24 +484,21 @@ L(more_8x_vec):
|
||||
subq %r8, %rdi
|
||||
/* Adjust length. */
|
||||
addq %r8, %rdx
|
||||
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
- /* Check non-temporal store threshold. */
|
||||
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
- ja L(large_forward)
|
||||
-#endif
|
||||
+
|
||||
+ .p2align 4
|
||||
L(loop_4x_vec_forward):
|
||||
/* Copy 4 * VEC a time forward. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
- addq $(VEC_SIZE * 4), %rsi
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ subq $-(VEC_SIZE * 4), %rsi
|
||||
+ addq $-(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%rdi)
|
||||
VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||||
VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_forward)
|
||||
/* Store the last 4 * VEC. */
|
||||
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
|
||||
subq %r8, %r9
|
||||
/* Adjust length. */
|
||||
subq %r8, %rdx
|
||||
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
- /* Check non-temporal store threshold. */
|
||||
- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
|
||||
- ja L(large_backward)
|
||||
-#endif
|
||||
+
|
||||
+ .p2align 4
|
||||
L(loop_4x_vec_backward):
|
||||
/* Copy 4 * VEC a time backward. */
|
||||
VMOVU (%rcx), %VEC(0)
|
||||
VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
- subq $(VEC_SIZE * 4), %rcx
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
+ addq $-(VEC_SIZE * 4), %rcx
|
||||
+ addq $-(VEC_SIZE * 4), %rdx
|
||||
VMOVA %VEC(0), (%r9)
|
||||
VMOVA %VEC(1), -VEC_SIZE(%r9)
|
||||
VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
- subq $(VEC_SIZE * 4), %r9
|
||||
+ addq $-(VEC_SIZE * 4), %r9
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
ja L(loop_4x_vec_backward)
|
||||
/* Store the first 4 * VEC. */
|
||||
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
|
||||
-L(large_forward):
|
||||
+ .p2align 4
|
||||
+L(large_memcpy_2x):
|
||||
+ /* Compute absolute value of difference between source and
|
||||
+ destination. */
|
||||
+ movq %rdi, %r9
|
||||
+ subq %rsi, %r9
|
||||
+ movq %r9, %r8
|
||||
+ leaq -1(%r9), %rcx
|
||||
+ sarq $63, %r8
|
||||
+ xorq %r8, %r9
|
||||
+ subq %r8, %r9
|
||||
/* Don't use non-temporal store if there is overlap between
|
||||
- destination and source since destination may be in cache
|
||||
- when source is loaded. */
|
||||
- leaq (%rdi, %rdx), %r10
|
||||
- cmpq %r10, %rsi
|
||||
- jb L(loop_4x_vec_forward)
|
||||
-L(loop_large_forward):
|
||||
+ destination and source since destination may be in cache when
|
||||
+ source is loaded. */
|
||||
+ cmpq %r9, %rdx
|
||||
+ ja L(more_8x_vec_check)
|
||||
+
|
||||
+ /* Cache align destination. First store the first 64 bytes then
|
||||
+ adjust alignments. */
|
||||
+ VMOVU (%rsi), %VEC(8)
|
||||
+#if VEC_SIZE < 64
|
||||
+ VMOVU VEC_SIZE(%rsi), %VEC(9)
|
||||
+#if VEC_SIZE < 32
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)
|
||||
+#endif
|
||||
+#endif
|
||||
+ VMOVU %VEC(8), (%rdi)
|
||||
+#if VEC_SIZE < 64
|
||||
+ VMOVU %VEC(9), VEC_SIZE(%rdi)
|
||||
+#if VEC_SIZE < 32
|
||||
+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)
|
||||
+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)
|
||||
+#endif
|
||||
+#endif
|
||||
+ /* Adjust source, destination, and size. */
|
||||
+ movq %rdi, %r8
|
||||
+ andq $63, %r8
|
||||
+ /* Get the negative of offset for alignment. */
|
||||
+ subq $64, %r8
|
||||
+ /* Adjust source. */
|
||||
+ subq %r8, %rsi
|
||||
+ /* Adjust destination which should be aligned now. */
|
||||
+ subq %r8, %rdi
|
||||
+ /* Adjust length. */
|
||||
+ addq %r8, %rdx
|
||||
+
|
||||
+ /* Test if source and destination addresses will alias. If they do
|
||||
+ the larger pipeline in large_memcpy_4x alleviated the
|
||||
+ performance drop. */
|
||||
+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx
|
||||
+ jz L(large_memcpy_4x)
|
||||
+
|
||||
+ movq %rdx, %r10
|
||||
+ shrq $LOG_4X_MEMCPY_THRESH, %r10
|
||||
+ cmp __x86_shared_non_temporal_threshold(%rip), %r10
|
||||
+ jae L(large_memcpy_4x)
|
||||
+
|
||||
+ /* edx will store remainder size for copying tail. */
|
||||
+ andl $(PAGE_SIZE * 2 - 1), %edx
|
||||
+ /* r10 stores outer loop counter. */
|
||||
+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
|
||||
+ /* Copy 4x VEC at a time from 2 pages. */
|
||||
+ .p2align 4
|
||||
+L(loop_large_memcpy_2x_outer):
|
||||
+ /* ecx stores inner loop counter. */
|
||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
||||
+L(loop_large_memcpy_2x_inner):
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
|
||||
+ /* Load vectors from rsi. */
|
||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
||||
+ /* Non-temporal store vectors to rdi. */
|
||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
||||
+ decl %ecx
|
||||
+ jnz L(loop_large_memcpy_2x_inner)
|
||||
+ addq $PAGE_SIZE, %rdi
|
||||
+ addq $PAGE_SIZE, %rsi
|
||||
+ decq %r10
|
||||
+ jne L(loop_large_memcpy_2x_outer)
|
||||
+ sfence
|
||||
+
|
||||
+ /* Check if only last 4 loads are needed. */
|
||||
+ cmpl $(VEC_SIZE * 4), %edx
|
||||
+ jbe L(large_memcpy_2x_end)
|
||||
+
|
||||
+ /* Handle the last 2 * PAGE_SIZE bytes. */
|
||||
+L(loop_large_memcpy_2x_tail):
|
||||
/* Copy 4 * VEC a time forward with non-temporal stores. */
|
||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
|
||||
- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
|
||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
- addq $PREFETCHED_LOAD_SIZE, %rsi
|
||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
- VMOVNT %VEC(0), (%rdi)
|
||||
- VMOVNT %VEC(1), VEC_SIZE(%rdi)
|
||||
- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
- addq $PREFETCHED_LOAD_SIZE, %rdi
|
||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
- ja L(loop_large_forward)
|
||||
- sfence
|
||||
+ subq $-(VEC_SIZE * 4), %rsi
|
||||
+ addl $-(VEC_SIZE * 4), %edx
|
||||
+ VMOVA %VEC(0), (%rdi)
|
||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ cmpl $(VEC_SIZE * 4), %edx
|
||||
+ ja L(loop_large_memcpy_2x_tail)
|
||||
+
|
||||
+L(large_memcpy_2x_end):
|
||||
/* Store the last 4 * VEC. */
|
||||
- VMOVU %VEC(5), (%rcx)
|
||||
- VMOVU %VEC(6), -VEC_SIZE(%rcx)
|
||||
- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)
|
||||
- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)
|
||||
- /* Store the first VEC. */
|
||||
- VMOVU %VEC(4), (%r11)
|
||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
||||
+
|
||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
-L(large_backward):
|
||||
- /* Don't use non-temporal store if there is overlap between
|
||||
- destination and source since destination may be in cache
|
||||
- when source is loaded. */
|
||||
- leaq (%rcx, %rdx), %r10
|
||||
- cmpq %r10, %r9
|
||||
- jb L(loop_4x_vec_backward)
|
||||
-L(loop_large_backward):
|
||||
- /* Copy 4 * VEC a time backward with non-temporal stores. */
|
||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
|
||||
- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
|
||||
- VMOVU (%rcx), %VEC(0)
|
||||
- VMOVU -VEC_SIZE(%rcx), %VEC(1)
|
||||
- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)
|
||||
- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)
|
||||
- subq $PREFETCHED_LOAD_SIZE, %rcx
|
||||
- subq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
- VMOVNT %VEC(0), (%r9)
|
||||
- VMOVNT %VEC(1), -VEC_SIZE(%r9)
|
||||
- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)
|
||||
- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)
|
||||
- subq $PREFETCHED_LOAD_SIZE, %r9
|
||||
- cmpq $PREFETCHED_LOAD_SIZE, %rdx
|
||||
- ja L(loop_large_backward)
|
||||
+ .p2align 4
|
||||
+L(large_memcpy_4x):
|
||||
+ movq %rdx, %r10
|
||||
+ /* edx will store remainder size for copying tail. */
|
||||
+ andl $(PAGE_SIZE * 4 - 1), %edx
|
||||
+ /* r10 stores outer loop counter. */
|
||||
+ shrq $(LOG_PAGE_SIZE + 2), %r10
|
||||
+ /* Copy 4x VEC at a time from 4 pages. */
|
||||
+ .p2align 4
|
||||
+L(loop_large_memcpy_4x_outer):
|
||||
+ /* ecx stores inner loop counter. */
|
||||
+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
|
||||
+L(loop_large_memcpy_4x_inner):
|
||||
+ /* Only one prefetch set per page as doing 4 pages give more time
|
||||
+ for prefetcher to keep up. */
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
|
||||
+ /* Load vectors from rsi. */
|
||||
+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
||||
+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
||||
+ subq $-LARGE_LOAD_SIZE, %rsi
|
||||
+ /* Non-temporal store vectors to rdi. */
|
||||
+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
|
||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
|
||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
|
||||
+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
|
||||
+ subq $-LARGE_LOAD_SIZE, %rdi
|
||||
+ decl %ecx
|
||||
+ jnz L(loop_large_memcpy_4x_inner)
|
||||
+ addq $(PAGE_SIZE * 3), %rdi
|
||||
+ addq $(PAGE_SIZE * 3), %rsi
|
||||
+ decq %r10
|
||||
+ jne L(loop_large_memcpy_4x_outer)
|
||||
sfence
|
||||
- /* Store the first 4 * VEC. */
|
||||
- VMOVU %VEC(4), (%rdi)
|
||||
- VMOVU %VEC(5), VEC_SIZE(%rdi)
|
||||
- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)
|
||||
- /* Store the last VEC. */
|
||||
- VMOVU %VEC(8), (%r11)
|
||||
+ /* Check if only last 4 loads are needed. */
|
||||
+ cmpl $(VEC_SIZE * 4), %edx
|
||||
+ jbe L(large_memcpy_4x_end)
|
||||
+
|
||||
+ /* Handle the last 4 * PAGE_SIZE bytes. */
|
||||
+L(loop_large_memcpy_4x_tail):
|
||||
+ /* Copy 4 * VEC a time forward with non-temporal stores. */
|
||||
+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
|
||||
+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
|
||||
+ VMOVU (%rsi), %VEC(0)
|
||||
+ VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)
|
||||
+ subq $-(VEC_SIZE * 4), %rsi
|
||||
+ addl $-(VEC_SIZE * 4), %edx
|
||||
+ VMOVA %VEC(0), (%rdi)
|
||||
+ VMOVA %VEC(1), VEC_SIZE(%rdi)
|
||||
+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)
|
||||
+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ cmpl $(VEC_SIZE * 4), %edx
|
||||
+ ja L(loop_large_memcpy_4x_tail)
|
||||
+
|
||||
+L(large_memcpy_4x_end):
|
||||
+ /* Store the last 4 * VEC. */
|
||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
|
||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
|
||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
|
||||
+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)
|
||||
+
|
||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,151 +0,0 @@
|
||||
From ecd8b842cf37ea112e59cd9085ff1f1b6e208ae0 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:29:58 -0800
|
||||
Subject: [PATCH] x86-64 memrchr: Properly handle the length parameter [BZ#
|
||||
24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes memrchr for x32. Tested on x86-64 and x32. On x86-64,
|
||||
libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/memrchr.S: Use RDX_LP for length.
|
||||
* sysdeps/x86_64/multiarch/memrchr-avx2.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-memrchr.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memrchr.c: New file.
|
||||
---
|
||||
sysdeps/x86_64/memrchr.S | 4 +-
|
||||
sysdeps/x86_64/multiarch/memrchr-avx2.S | 4 +-
|
||||
sysdeps/x86_64/x32/Makefile | 3 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memrchr.c | 57 +++++++++++++++++++++++++
|
||||
4 files changed, 63 insertions(+), 5 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
|
||||
index b8e3fa1d..dc82f8f7 100644
|
||||
--- a/sysdeps/x86_64/memrchr.S
|
||||
+++ b/sysdeps/x86_64/memrchr.S
|
||||
@@ -24,13 +24,13 @@
|
||||
ENTRY (__memrchr)
|
||||
movd %esi, %xmm1
|
||||
|
||||
- sub $16, %rdx
|
||||
+ sub $16, %RDX_LP
|
||||
jbe L(length_less16)
|
||||
|
||||
punpcklbw %xmm1, %xmm1
|
||||
punpcklbw %xmm1, %xmm1
|
||||
|
||||
- add %rdx, %rdi
|
||||
+ add %RDX_LP, %RDI_LP
|
||||
pshufd $0, %xmm1, %xmm1
|
||||
|
||||
movdqu (%rdi), %xmm0
|
||||
diff --git a/sysdeps/x86_64/multiarch/memrchr-avx2.S b/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
||||
index b41a58bc..ce488dd9 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memrchr-avx2.S
|
||||
@@ -32,10 +32,10 @@ ENTRY (__memrchr_avx2)
|
||||
vmovd %esi, %xmm0
|
||||
vpbroadcastb %xmm0, %ymm0
|
||||
|
||||
- subq $VEC_SIZE, %rdx
|
||||
+ sub $VEC_SIZE, %RDX_LP
|
||||
jbe L(last_vec_or_less)
|
||||
|
||||
- addq %rdx, %rdi
|
||||
+ add %RDX_LP, %RDI_LP
|
||||
|
||||
/* Check the last VEC_SIZE bytes. */
|
||||
vpcmpeqb (%rdi), %ymm0, %ymm1
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index 2fe1e5ac..e99dbd7c 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -6,7 +6,8 @@ CFLAGS-s_llround.c += -fno-builtin-lround
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
-tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy
|
||||
+tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
||||
+ tst-size_t-memrchr
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memrchr.c b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
||||
new file mode 100644
|
||||
index 00000000..c83699c0
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memrchr.c
|
||||
@@ -0,0 +1,57 @@
|
||||
+/* Test memrchr with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_NAME "memrchr"
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+IMPL (memchr, 1)
|
||||
+
|
||||
+typedef void * (*proto_t) (const void *, int, size_t);
|
||||
+
|
||||
+static void *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memrchr (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t src = { { page_size }, buf2 };
|
||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) 0x12 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ c.fn = impl->fn;
|
||||
+ void * res = do_memrchr (src, c);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %p != NULL",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,92 +0,0 @@
|
||||
From 83c5b368226c34a2f0a5287df40fc290b2b34359 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 10:45:07 -0700
|
||||
Subject: [PATCH] x86-64: Require BMI2 for strchr-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Since strchr-avx2.S updated by
|
||||
|
||||
commit 1f745ecc2109890886b161d4791e1406fdfc29b8
|
||||
Author: noah <goldstein.w.n@gmail.com>
|
||||
Date: Wed Feb 3 00:38:59 2021 -0500
|
||||
|
||||
x86-64: Refactor and improve performance of strchr-avx2.S
|
||||
|
||||
uses sarx:
|
||||
|
||||
c4 e2 72 f7 c0 sarx %ecx,%eax,%eax
|
||||
|
||||
for strchr-avx2 family functions, require BMI2 in ifunc-impl-list.c and
|
||||
ifunc-avx2.h.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-avx2.h | 4 ++--
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 12 +++++++++---
|
||||
2 files changed, 11 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-avx2.h b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
||||
index e0f30e61..ef72b73f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-avx2.h
|
||||
@@ -30,11 +30,11 @@ IFUNC_SELECTOR (void)
|
||||
const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
return OPTIMIZE (evex);
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 695cdba6..85b8863a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -400,10 +400,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strchr.c. */
|
||||
IFUNC_IMPL (i, name, strchr,
|
||||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strchr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strchr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strchr,
|
||||
@@ -417,10 +419,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/strchrnul.c. */
|
||||
IFUNC_IMPL (i, name, strchrnul,
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__strchrnul_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__strchrnul_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, strchrnul,
|
||||
@@ -574,10 +578,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
/* Support sysdeps/x86_64/multiarch/wcschr.c. */
|
||||
IFUNC_IMPL (i, name, wcschr,
|
||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
- CPU_FEATURE_USABLE (AVX2),
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wcschr_avx2)
|
||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wcschr_avx2_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wcschr,
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,265 +0,0 @@
|
||||
From f53790272ce7bdc5ecd14b45f65d0464d2a61a3a Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 19 Apr 2021 17:48:10 -0400
|
||||
Subject: [PATCH] x86: Optimize less_vec evex and avx512
|
||||
memset-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit adds optimized cases for less_vec memset case that
|
||||
uses the avx512vl/avx512bw mask store avoiding the excessive
|
||||
branches. test-memset and test-wmemset are passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 40 ++++++++++-----
|
||||
sysdeps/x86_64/multiarch/ifunc-memset.h | 6 ++-
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 2 +-
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 2 +-
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 51 +++++++++++++++----
|
||||
5 files changed, 74 insertions(+), 27 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 85b8863a..d59d65f8 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -204,19 +204,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__memset_chk_avx2_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_chk_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_chk_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_chk_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_chk_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __memset_chk,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -247,19 +251,23 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__memset_avx2_unaligned_erms_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_evex_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_avx512_unaligned_erms)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
- && CPU_FEATURE_USABLE (AVX512BW)),
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__memset_avx512_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, memset,
|
||||
CPU_FEATURE_USABLE (AVX512F),
|
||||
@@ -739,10 +747,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wmemset_avx2_unaligned_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
- CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wmemset_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, wmemset,
|
||||
- CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wmemset_avx512_unaligned))
|
||||
|
||||
#ifdef SHARED
|
||||
@@ -946,10 +958,14 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
CPU_FEATURE_USABLE (AVX2),
|
||||
__wmemset_chk_avx2_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512VL),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wmemset_chk_evex_unaligned)
|
||||
IFUNC_IMPL_ADD (array, i, __wmemset_chk,
|
||||
- CPU_FEATURE_USABLE (AVX512F),
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
__wmemset_chk_avx512_unaligned))
|
||||
#endif
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memset.h b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
index 19795938..100e3707 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memset.h
|
||||
@@ -54,7 +54,8 @@ IFUNC_SELECTOR (void)
|
||||
&& !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
return OPTIMIZE (avx512_unaligned_erms);
|
||||
@@ -68,7 +69,8 @@ IFUNC_SELECTOR (void)
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
- && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
return OPTIMIZE (evex_unaligned_erms);
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 22e7b187..8ad842fc 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -19,6 +19,6 @@
|
||||
# define SECTION(p) p##.evex512
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
-
|
||||
+# define USE_LESS_VEC_MASK_STORE 1
|
||||
# include "memset-vec-unaligned-erms.S"
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index ae0a4d6e..640f0929 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -19,6 +19,6 @@
|
||||
# define SECTION(p) p##.evex
|
||||
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
# define WMEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
-
|
||||
+# define USE_LESS_VEC_MASK_STORE 1
|
||||
# include "memset-vec-unaligned-erms.S"
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index bae5cba4..f877ac9d 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -63,6 +63,8 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
+#define PAGE_SIZE 4096
|
||||
+
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
@@ -213,11 +215,38 @@ L(loop):
|
||||
cmpq %rcx, %rdx
|
||||
jne L(loop)
|
||||
VZEROUPPER_SHORT_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
# endif
|
||||
+# ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ /* Clear high bits from edi. Only keeping bits relevant to page
|
||||
+ cross check. Note that we are using rax which is set in
|
||||
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
|
||||
+ */
|
||||
+ andl $(PAGE_SIZE - 1), %edi
|
||||
+ /* Check if VEC_SIZE store cross page. Mask stores suffer serious
|
||||
+ performance degradation when it has to fault suppress. */
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
||||
+ ja L(cross_page)
|
||||
+# if VEC_SIZE > 32
|
||||
+ movq $-1, %rcx
|
||||
+ bzhiq %rdx, %rcx, %rcx
|
||||
+ kmovq %rcx, %k1
|
||||
+# else
|
||||
+ movl $-1, %ecx
|
||||
+ bzhil %edx, %ecx, %ecx
|
||||
+ kmovd %ecx, %k1
|
||||
+# endif
|
||||
+ vmovdqu8 %VEC(0), (%rax) {%k1}
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(cross_page):
|
||||
+# endif
|
||||
# if VEC_SIZE > 32
|
||||
cmpb $32, %dl
|
||||
jae L(between_32_63)
|
||||
@@ -234,36 +263,36 @@ L(less_vec):
|
||||
cmpb $1, %dl
|
||||
ja L(between_2_3)
|
||||
jb 1f
|
||||
- movb %cl, (%rdi)
|
||||
+ movb %cl, (%rax)
|
||||
1:
|
||||
VZEROUPPER_RETURN
|
||||
# if VEC_SIZE > 32
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- VMOVU %YMM0, -32(%rdi,%rdx)
|
||||
- VMOVU %YMM0, (%rdi)
|
||||
+ VMOVU %YMM0, -32(%rax,%rdx)
|
||||
+ VMOVU %YMM0, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
# if VEC_SIZE > 16
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
L(between_16_31):
|
||||
- VMOVU %XMM0, -16(%rdi,%rdx)
|
||||
- VMOVU %XMM0, (%rdi)
|
||||
+ VMOVU %XMM0, -16(%rax,%rdx)
|
||||
+ VMOVU %XMM0, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
# endif
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
L(between_8_15):
|
||||
- movq %rcx, -8(%rdi,%rdx)
|
||||
- movq %rcx, (%rdi)
|
||||
+ movq %rcx, -8(%rax,%rdx)
|
||||
+ movq %rcx, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
- movl %ecx, -4(%rdi,%rdx)
|
||||
- movl %ecx, (%rdi)
|
||||
+ movl %ecx, -4(%rax,%rdx)
|
||||
+ movl %ecx, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
- movw %cx, -2(%rdi,%rdx)
|
||||
- movw %cx, (%rdi)
|
||||
+ movw %cx, -2(%rax,%rdx)
|
||||
+ movw %cx, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,396 +0,0 @@
|
||||
From ccabe7971f508709d034b63b8672f6f751a3d356 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 23 Apr 2021 15:56:24 -0400
|
||||
Subject: [PATCH] x86: Optimize strchr-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strchr-avx2.S. The optimizations are all
|
||||
small things such as save an ALU in the alignment process, saving a
|
||||
few instructions in the loop return, saving some bytes in the main
|
||||
loop, and increasing the ILP in the return cases. test-strchr,
|
||||
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strchr-avx2.S | 290 +++++++++++++++----------
|
||||
1 file changed, 170 insertions(+), 120 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
(rearranged to account for branch changes)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strchr-avx2.S b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
index 919d256c..5884726b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strchr-avx2.S
|
||||
@@ -49,133 +49,144 @@
|
||||
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
ENTRY (STRCHR)
|
||||
- movl %edi, %ecx
|
||||
-# ifndef USE_AS_STRCHRNUL
|
||||
- xorl %edx, %edx
|
||||
-# endif
|
||||
-
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
vmovd %esi, %xmm0
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ VPBROADCAST %xmm0, %ymm0
|
||||
vpxor %xmm9, %xmm9, %xmm9
|
||||
- VPBROADCAST %xmm0, %ymm0
|
||||
|
||||
/* Check if we cross page boundary with one vector load. */
|
||||
- andl $(PAGE_SIZE - 1), %ecx
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
||||
- ja L(cross_page_boundary)
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
||||
null byte. */
|
||||
vmovdqu (%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
vpor %ymm1, %ymm2, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
- jz L(more_vecs)
|
||||
+ jz L(aligned_more)
|
||||
tzcntl %eax, %eax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
/* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
+# endif
|
||||
addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
|
||||
+ alignment % 32 was either 16 or 0. As well this makes the
|
||||
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
|
||||
+ easier. */
|
||||
+ .p2align 5
|
||||
+L(first_vec_x4):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $(VEC_SIZE * 3 + 1), %rdi
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
-L(return_vzeroupper):
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(more_vecs):
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-L(aligned_more):
|
||||
-
|
||||
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
- since data is only aligned to VEC_SIZE. */
|
||||
- vmovdqa VEC_SIZE(%rdi), %ymm8
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
- vpor %ymm1, %ymm2, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
- vmovdqa VEC_SIZE(%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
- vpor %ymm1, %ymm2, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
-
|
||||
- vmovdqa (VEC_SIZE * 2)(%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
- vpor %ymm1, %ymm2, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
-
|
||||
- vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
- vpor %ymm1, %ymm2, %ymm1
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- testl %eax, %eax
|
||||
- jz L(prep_loop_4x)
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- tzcntl %eax, %eax
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
- VZEROUPPER
|
||||
- ret
|
||||
+
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
- addq %rdi, %rax
|
||||
+ incq %rdi
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
+ addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
+L(first_vec_x2):
|
||||
tzcntl %eax, %eax
|
||||
- leaq VEC_SIZE(%rdi, %rax), %rax
|
||||
+ addq $(VEC_SIZE + 1), %rdi
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
+ addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2):
|
||||
+L(first_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
+ addq $(VEC_SIZE * 2 + 1), %rdi
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
+ addq %rdi, %rax
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
-L(prep_loop_4x):
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
- andq $-(VEC_SIZE * 4), %rdi
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Align data to VEC_SIZE - 1. This is the same number of
|
||||
+ instructions as using andq -VEC_SIZE but saves 4 bytes of code
|
||||
+ on x4 check. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+L(cross_page_continue):
|
||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
+ since data is only aligned to VEC_SIZE. */
|
||||
+ vmovdqa 1(%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ vpor %ymm1, %ymm2, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1)
|
||||
+
|
||||
+ vmovdqa (VEC_SIZE + 1)(%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ vpor %ymm1, %ymm2, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x2)
|
||||
+
|
||||
+ vmovdqa (VEC_SIZE * 2 + 1)(%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ vpor %ymm1, %ymm2, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x3)
|
||||
|
||||
+ vmovdqa (VEC_SIZE * 3 + 1)(%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ vpor %ymm1, %ymm2, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x4)
|
||||
+ /* Align data to VEC_SIZE * 4 - 1. */
|
||||
+ addq $(VEC_SIZE * 4 + 1), %rdi
|
||||
+ andq $-(VEC_SIZE * 4), %rdi
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
/* Compare 4 * VEC at a time forward. */
|
||||
- vmovdqa (VEC_SIZE * 4)(%rdi), %ymm5
|
||||
- vmovdqa (VEC_SIZE * 5)(%rdi), %ymm6
|
||||
- vmovdqa (VEC_SIZE * 6)(%rdi), %ymm7
|
||||
- vmovdqa (VEC_SIZE * 7)(%rdi), %ymm8
|
||||
+ vmovdqa (%rdi), %ymm5
|
||||
+ vmovdqa (VEC_SIZE)(%rdi), %ymm6
|
||||
+ vmovdqa (VEC_SIZE * 2)(%rdi), %ymm7
|
||||
+ vmovdqa (VEC_SIZE * 3)(%rdi), %ymm8
|
||||
|
||||
/* Leaves only CHARS matching esi as 0. */
|
||||
vpxor %ymm5, %ymm0, %ymm1
|
||||
@@ -191,63 +202,102 @@ L(loop_4x_vec):
|
||||
VPMINU %ymm1, %ymm2, %ymm5
|
||||
VPMINU %ymm3, %ymm4, %ymm6
|
||||
|
||||
- VPMINU %ymm5, %ymm6, %ymm5
|
||||
+ VPMINU %ymm5, %ymm6, %ymm6
|
||||
|
||||
- VPCMPEQ %ymm5, %ymm9, %ymm5
|
||||
- vpmovmskb %ymm5, %eax
|
||||
+ VPCMPEQ %ymm6, %ymm9, %ymm6
|
||||
+ vpmovmskb %ymm6, %ecx
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jz L(loop_4x_vec)
|
||||
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
- testl %eax, %eax
|
||||
- jz L(loop_4x_vec)
|
||||
|
||||
- VPCMPEQ %ymm1, %ymm9, %ymm1
|
||||
+ VPCMPEQ %ymm1, %ymm9, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(last_vec_x0)
|
||||
+
|
||||
|
||||
- VPCMPEQ %ymm2, %ymm9, %ymm2
|
||||
+ VPCMPEQ %ymm5, %ymm9, %ymm2
|
||||
vpmovmskb %ymm2, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
+ jnz L(last_vec_x1)
|
||||
+
|
||||
+ VPCMPEQ %ymm3, %ymm9, %ymm3
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* rcx has combined result from all 4 VEC. It will only be used
|
||||
+ if the first 3 other VEC all did not contain a match. */
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+ subq $(VEC_SIZE * 2), %rdi
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
+# endif
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_vec_x0):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq $-(VEC_SIZE * 4), %rdi
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
+# endif
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
- VPCMPEQ %ymm3, %ymm9, %ymm3
|
||||
- VPCMPEQ %ymm4, %ymm9, %ymm4
|
||||
- vpmovmskb %ymm3, %ecx
|
||||
- vpmovmskb %ymm4, %eax
|
||||
- salq $32, %rax
|
||||
- orq %rcx, %rax
|
||||
- tzcntq %rax, %rax
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+L(zero_end):
|
||||
+ xorl %eax, %eax
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
- VZEROUPPER
|
||||
- ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+ subq $(VEC_SIZE * 3), %rdi
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdi, %rax), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
+# endif
|
||||
+ addq %rdi, %rax
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
|
||||
/* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
-
|
||||
- vmovdqa (%rdi), %ymm8
|
||||
- VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
- VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align rdi to VEC_SIZE - 1. */
|
||||
+ orq $(VEC_SIZE - 1), %rdi
|
||||
+ vmovdqa -(VEC_SIZE - 1)(%rdi), %ymm8
|
||||
+ VPCMPEQ %ymm8, %ymm0, %ymm1
|
||||
+ VPCMPEQ %ymm8, %ymm9, %ymm2
|
||||
vpor %ymm1, %ymm2, %ymm1
|
||||
vpmovmskb %ymm1, %eax
|
||||
- /* Remove the leading bits. */
|
||||
- sarxl %ecx, %eax, %eax
|
||||
+ /* Remove the leading bytes. sarxl only uses bits [5:0] of COUNT
|
||||
+ so no need to manually mod edx. */
|
||||
+ sarxl %edx, %eax, %eax
|
||||
testl %eax, %eax
|
||||
- jz L(aligned_more)
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
- addq %rcx, %rdi
|
||||
- addq %rdi, %rax
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ xorl %ecx, %ecx
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rdx, %rax), %CHAR_REG
|
||||
+ leaq (%rdx, %rax), %rax
|
||||
+ cmovne %rcx, %rax
|
||||
+# else
|
||||
+ addq %rdx, %rax
|
||||
# endif
|
||||
- VZEROUPPER_RETURN
|
||||
+L(return_vzeroupper):
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
END (STRCHR)
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,532 +0,0 @@
|
||||
From 7f3e7c262cab4e2401e4331a6ef29c428de02044 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 23 Apr 2021 15:56:25 -0400
|
||||
Subject: [PATCH] x86: Optimize strchr-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes strchr-evex.S. The optimizations are
|
||||
mostly small things such as save an ALU in the alignment process,
|
||||
saving a few instructions in the loop return. The one significant
|
||||
change is saving 2 instructions in the 4x loop. test-strchr,
|
||||
test-strchrnul, test-wcschr, and test-wcschrnul are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strchr-evex.S | 392 ++++++++++++++-----------
|
||||
1 file changed, 218 insertions(+), 174 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strchr-evex.S b/sysdeps/x86_64/multiarch/strchr-evex.S
|
||||
index ddc86a70..7f9d4ee4 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strchr-evex.S
|
||||
@@ -32,13 +32,15 @@
|
||||
# define VPCMP vpcmpd
|
||||
# define VPMINU vpminud
|
||||
# define CHAR_REG esi
|
||||
-# define SHIFT_REG r8d
|
||||
+# define SHIFT_REG ecx
|
||||
+# define CHAR_SIZE 4
|
||||
# else
|
||||
# define VPBROADCAST vpbroadcastb
|
||||
# define VPCMP vpcmpb
|
||||
# define VPMINU vpminub
|
||||
# define CHAR_REG sil
|
||||
-# define SHIFT_REG ecx
|
||||
+# define SHIFT_REG edx
|
||||
+# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
# define XMMZERO xmm16
|
||||
@@ -56,23 +58,20 @@
|
||||
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (STRCHR)
|
||||
- movl %edi, %ecx
|
||||
-# ifndef USE_AS_STRCHRNUL
|
||||
- xorl %edx, %edx
|
||||
-# endif
|
||||
-
|
||||
/* Broadcast CHAR to YMM0. */
|
||||
- VPBROADCAST %esi, %YMM0
|
||||
-
|
||||
+ VPBROADCAST %esi, %YMM0
|
||||
+ movl %edi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
|
||||
- /* Check if we cross page boundary with one vector load. */
|
||||
- andl $(PAGE_SIZE - 1), %ecx
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %ecx
|
||||
- ja L(cross_page_boundary)
|
||||
+ /* Check if we cross page boundary with one vector load.
|
||||
+ Otherwise it is safe to use an unaligned load. */
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ ja L(cross_page_boundary)
|
||||
|
||||
/* Check the first VEC_SIZE bytes. Search for both CHAR and the
|
||||
null bytes. */
|
||||
@@ -83,251 +82,296 @@ ENTRY (STRCHR)
|
||||
VPMINU %YMM2, %YMM1, %YMM2
|
||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
- ktestd %k0, %k0
|
||||
- jz L(more_vecs)
|
||||
kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jz L(aligned_more)
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (%rdi, %rax, 4), %rax
|
||||
+ /* NB: Multiply wchar_t count by 4 to get the number of bytes.
|
||||
+ */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
addq %rdi, %rax
|
||||
# endif
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (%rax), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(more_vecs):
|
||||
- /* Align data for aligned loads in the loop. */
|
||||
- andq $-VEC_SIZE, %rdi
|
||||
-L(aligned_more):
|
||||
-
|
||||
- /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time
|
||||
- since data is only aligned to VEC_SIZE. */
|
||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
||||
- addq $VEC_SIZE, %rdi
|
||||
-
|
||||
- /* Leaves only CHARS matching esi as 0. */
|
||||
- vpxorq %YMM1, %YMM0, %YMM2
|
||||
- VPMINU %YMM2, %YMM1, %YMM2
|
||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
-
|
||||
- VMOVA VEC_SIZE(%rdi), %YMM1
|
||||
- /* Leaves only CHARS matching esi as 0. */
|
||||
- vpxorq %YMM1, %YMM0, %YMM2
|
||||
- VPMINU %YMM2, %YMM1, %YMM2
|
||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
-
|
||||
- VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
||||
- /* Leaves only CHARS matching esi as 0. */
|
||||
- vpxorq %YMM1, %YMM0, %YMM2
|
||||
- VPMINU %YMM2, %YMM1, %YMM2
|
||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
-
|
||||
- VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
||||
- /* Leaves only CHARS matching esi as 0. */
|
||||
- vpxorq %YMM1, %YMM0, %YMM2
|
||||
- VPMINU %YMM2, %YMM1, %YMM2
|
||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
- ktestd %k0, %k0
|
||||
- jz L(prep_loop_4x)
|
||||
-
|
||||
- kmovd %k0, %eax
|
||||
+ /* .p2align 5 helps keep performance more consistent if ENTRY()
|
||||
+ alignment % 32 was either 16 or 0. As well this makes the
|
||||
+ alignment % 32 of the loop_4x_vec fixed which makes tuning it
|
||||
+ easier. */
|
||||
+ .p2align 5
|
||||
+L(first_vec_x3):
|
||||
tzcntl %eax, %eax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
/* Found CHAR or the null byte. */
|
||||
-# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax), %rax
|
||||
+ cmp (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
-# endif
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
ret
|
||||
+# endif
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x0):
|
||||
+L(first_vec_x4):
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
||||
+ kmovd %k0, %eax
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
-# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (%rdi, %rax, 4), %rax
|
||||
+ kmovd %k1, %ecx
|
||||
+ /* bzhil will not be 0 if first match was null. */
|
||||
+ bzhil %eax, %ecx, %ecx
|
||||
+ jne L(zero)
|
||||
# else
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
-# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Combine CHAR and null matches. */
|
||||
+ kord %k0, %k1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x1):
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
-# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq VEC_SIZE(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq VEC_SIZE(%rdi, %rax), %rax
|
||||
-# endif
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Found CHAR or the null byte. */
|
||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero)
|
||||
+
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(first_vec_x2):
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Check to see if first match was CHAR (k0) or null (k1). */
|
||||
+ kmovd %k0, %eax
|
||||
tzcntl %eax, %eax
|
||||
- /* Found CHAR or the null byte. */
|
||||
-# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
||||
+ kmovd %k1, %ecx
|
||||
+ /* bzhil will not be 0 if first match was null. */
|
||||
+ bzhil %eax, %ecx, %ecx
|
||||
+ jne L(zero)
|
||||
# else
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
-# endif
|
||||
-# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Combine CHAR and null matches. */
|
||||
+ kord %k0, %k1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
-L(prep_loop_4x):
|
||||
- /* Align data to 4 * VEC_SIZE. */
|
||||
+ .p2align 4
|
||||
+L(aligned_more):
|
||||
+ /* Align data to VEC_SIZE. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+L(cross_page_continue):
|
||||
+ /* Check the next 4 * VEC_SIZE. Only one VEC_SIZE at a time since
|
||||
+ data is only aligned to VEC_SIZE. Use two alternating methods
|
||||
+ for checking VEC to balance latency and port contention. */
|
||||
+
|
||||
+ /* This method has higher latency but has better port
|
||||
+ distribution. */
|
||||
+ VMOVA (VEC_SIZE)(%rdi), %YMM1
|
||||
+ /* Leaves only CHARS matching esi as 0. */
|
||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x1)
|
||||
+
|
||||
+ /* This method has higher latency but has better port
|
||||
+ distribution. */
|
||||
+ VMOVA (VEC_SIZE * 2)(%rdi), %YMM1
|
||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
||||
+ /* Each bit in K1 represents a null CHAR in YMM1. */
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
||||
+ kortestd %k0, %k1
|
||||
+ jnz L(first_vec_x2)
|
||||
+
|
||||
+ VMOVA (VEC_SIZE * 3)(%rdi), %YMM1
|
||||
+ /* Leaves only CHARS matching esi as 0. */
|
||||
+ vpxorq %YMM1, %YMM0, %YMM2
|
||||
+ VPMINU %YMM2, %YMM1, %YMM2
|
||||
+ /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(first_vec_x3)
|
||||
+
|
||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
+ /* Each bit in K0 represents a CHAR in YMM1. */
|
||||
+ VPCMP $0, %YMM1, %YMM0, %k0
|
||||
+ /* Each bit in K1 represents a null CHAR in YMM1. */
|
||||
+ VPCMP $0, %YMM1, %YMMZERO, %k1
|
||||
+ kortestd %k0, %k1
|
||||
+ jnz L(first_vec_x4)
|
||||
+
|
||||
+ /* Align data to VEC_SIZE * 4 for the loop. */
|
||||
+ addq $VEC_SIZE, %rdi
|
||||
andq $-(VEC_SIZE * 4), %rdi
|
||||
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
+ /* Check 4x VEC at a time. No penalty to imm32 offset with evex
|
||||
+ encoding. */
|
||||
VMOVA (VEC_SIZE * 4)(%rdi), %YMM1
|
||||
VMOVA (VEC_SIZE * 5)(%rdi), %YMM2
|
||||
VMOVA (VEC_SIZE * 6)(%rdi), %YMM3
|
||||
VMOVA (VEC_SIZE * 7)(%rdi), %YMM4
|
||||
|
||||
- /* Leaves only CHARS matching esi as 0. */
|
||||
+ /* For YMM1 and YMM3 use xor to set the CHARs matching esi to
|
||||
+ zero. */
|
||||
vpxorq %YMM1, %YMM0, %YMM5
|
||||
- vpxorq %YMM2, %YMM0, %YMM6
|
||||
+ /* For YMM2 and YMM4 cmp not equals to CHAR and store result in
|
||||
+ k register. It's possible to save either 1 or 2 instructions
|
||||
+ using cmp not equals method for either YMM1 or YMM1 and YMM3
|
||||
+ respectively but bottleneck on p5 makes it not worth it. */
|
||||
+ VPCMP $4, %YMM0, %YMM2, %k2
|
||||
vpxorq %YMM3, %YMM0, %YMM7
|
||||
- vpxorq %YMM4, %YMM0, %YMM8
|
||||
-
|
||||
- VPMINU %YMM5, %YMM1, %YMM5
|
||||
- VPMINU %YMM6, %YMM2, %YMM6
|
||||
- VPMINU %YMM7, %YMM3, %YMM7
|
||||
- VPMINU %YMM8, %YMM4, %YMM8
|
||||
-
|
||||
- VPMINU %YMM5, %YMM6, %YMM1
|
||||
- VPMINU %YMM7, %YMM8, %YMM2
|
||||
-
|
||||
- VPMINU %YMM1, %YMM2, %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a CHAR or a null byte. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
-
|
||||
- ktestd %k0, %k0
|
||||
+ VPCMP $4, %YMM0, %YMM4, %k4
|
||||
+
|
||||
+ /* Use min to select all zeros from either xor or end of string).
|
||||
+ */
|
||||
+ VPMINU %YMM1, %YMM5, %YMM1
|
||||
+ VPMINU %YMM3, %YMM7, %YMM3
|
||||
+
|
||||
+ /* Use min + zeromask to select for zeros. Since k2 and k4 will
|
||||
+ have 0 as positions that matched with CHAR which will set
|
||||
+ zero in the corresponding destination bytes in YMM2 / YMM4.
|
||||
+ */
|
||||
+ VPMINU %YMM1, %YMM2, %YMM2{%k2}{z}
|
||||
+ VPMINU %YMM3, %YMM4, %YMM4
|
||||
+ VPMINU %YMM2, %YMM4, %YMM4{%k4}{z}
|
||||
+
|
||||
+ VPCMP $0, %YMMZERO, %YMM4, %k1
|
||||
+ kmovd %k1, %ecx
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ testl %ecx, %ecx
|
||||
jz L(loop_4x_vec)
|
||||
|
||||
- /* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k0
|
||||
+ VPCMP $0, %YMMZERO, %YMM1, %k0
|
||||
kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x0)
|
||||
+ jnz L(last_vec_x1)
|
||||
|
||||
- /* Each bit in K1 represents a CHAR or a null byte in YMM2. */
|
||||
- VPCMP $0, %YMMZERO, %YMM6, %k1
|
||||
- kmovd %k1, %eax
|
||||
+ VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
+ kmovd %k0, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
-
|
||||
- /* Each bit in K2 represents a CHAR or a null byte in YMM3. */
|
||||
- VPCMP $0, %YMMZERO, %YMM7, %k2
|
||||
- /* Each bit in K3 represents a CHAR or a null byte in YMM4. */
|
||||
- VPCMP $0, %YMMZERO, %YMM8, %k3
|
||||
+ jnz L(last_vec_x2)
|
||||
|
||||
+ VPCMP $0, %YMMZERO, %YMM3, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ /* Combine YMM3 matches (eax) with YMM4 matches (ecx). */
|
||||
# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Each bit in K2/K3 represents 4-byte element. */
|
||||
- kshiftlw $8, %k3, %k1
|
||||
+ sall $8, %ecx
|
||||
+ orl %ecx, %eax
|
||||
+ tzcntl %eax, %eax
|
||||
# else
|
||||
- kshiftlq $32, %k3, %k1
|
||||
+ salq $32, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
# endif
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Check if match was CHAR or null. */
|
||||
+ cmp (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
+# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- korq %k1, %k2, %k1
|
||||
- kmovq %k1, %rax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+L(zero_end):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+# endif
|
||||
|
||||
- tzcntq %rax, %rax
|
||||
-# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, 4), %rax
|
||||
-# else
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax), %rax
|
||||
+ .p2align 4
|
||||
+L(last_vec_x1):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Check if match was null. */
|
||||
+ cmp (%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_vec_x2):
|
||||
+ tzcntl %eax, %eax
|
||||
# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ /* Check if match was null. */
|
||||
+ cmp (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
# endif
|
||||
+ /* NB: Multiply sizeof char type (1 or 4) to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
|
||||
/* Cold case for crossing page with first load. */
|
||||
.p2align 4
|
||||
L(cross_page_boundary):
|
||||
+ movq %rdi, %rdx
|
||||
+ /* Align rdi. */
|
||||
andq $-VEC_SIZE, %rdi
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
-
|
||||
VMOVA (%rdi), %YMM1
|
||||
-
|
||||
/* Leaves only CHARS matching esi as 0. */
|
||||
vpxorq %YMM1, %YMM0, %YMM2
|
||||
VPMINU %YMM2, %YMM1, %YMM2
|
||||
/* Each bit in K0 represents a CHAR or a null byte in YMM1. */
|
||||
VPCMP $0, %YMMZERO, %YMM2, %k0
|
||||
kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
-
|
||||
+ /* Remove the leading bits. */
|
||||
# ifdef USE_AS_WCSCHR
|
||||
+ movl %edx, %SHIFT_REG
|
||||
/* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
bytes. */
|
||||
- movl %ecx, %SHIFT_REG
|
||||
- sarl $2, %SHIFT_REG
|
||||
+ sarl $2, %SHIFT_REG
|
||||
+ andl $(CHAR_PER_VEC - 1), %SHIFT_REG
|
||||
# endif
|
||||
-
|
||||
- /* Remove the leading bits. */
|
||||
sarxl %SHIFT_REG, %eax, %eax
|
||||
+ /* If eax is zero continue. */
|
||||
testl %eax, %eax
|
||||
-
|
||||
- jz L(aligned_more)
|
||||
+ jz L(cross_page_continue)
|
||||
tzcntl %eax, %eax
|
||||
- addq %rcx, %rdi
|
||||
+# ifndef USE_AS_STRCHRNUL
|
||||
+ /* Check to see if match was CHAR or null. */
|
||||
+ cmp (%rdx, %rax, CHAR_SIZE), %CHAR_REG
|
||||
+ jne L(zero_end)
|
||||
+# endif
|
||||
# ifdef USE_AS_WCSCHR
|
||||
- /* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
- leaq (%rdi, %rax, 4), %rax
|
||||
+ /* NB: Multiply wchar_t count by 4 to get the number of
|
||||
+ bytes. */
|
||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
-# ifndef USE_AS_STRCHRNUL
|
||||
- cmp (%rax), %CHAR_REG
|
||||
- cmovne %rdx, %rax
|
||||
+ addq %rdx, %rax
|
||||
# endif
|
||||
ret
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,536 +0,0 @@
|
||||
From 104c7b1967c3e78435c6f7eab5e225a7eddf9c6e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 4 May 2021 19:02:40 -0400
|
||||
Subject: [PATCH] x86: Add EVEX optimized memchr family not safe for RTM
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
This commit adds a new implementation for EVEX memchr that is not safe
|
||||
for RTM because it uses vzeroupper. The benefit is that by using
|
||||
ymm0-ymm15 it can use vpcmpeq and vpternlogd in the 4x loop which is
|
||||
faster than the RTM safe version which cannot use vpcmpeq because
|
||||
there is no EVEX encoding for the instruction. All parts of the
|
||||
implementation aside from the 4x loop are the same for the two
|
||||
versions and the optimization is only relevant for large sizes.
|
||||
|
||||
Tigerlake:
|
||||
size , algn , Pos , Cur T , New T , Win , Dif
|
||||
512 , 6 , 192 , 9.2 , 9.04 , no-RTM , 0.16
|
||||
512 , 7 , 224 , 9.19 , 8.98 , no-RTM , 0.21
|
||||
2048 , 0 , 256 , 10.74 , 10.54 , no-RTM , 0.2
|
||||
2048 , 0 , 512 , 14.81 , 14.87 , RTM , 0.06
|
||||
2048 , 0 , 1024 , 22.97 , 22.57 , no-RTM , 0.4
|
||||
2048 , 0 , 2048 , 37.49 , 34.51 , no-RTM , 2.98 <--
|
||||
|
||||
Icelake:
|
||||
size , algn , Pos , Cur T , New T , Win , Dif
|
||||
512 , 6 , 192 , 7.6 , 7.3 , no-RTM , 0.3
|
||||
512 , 7 , 224 , 7.63 , 7.27 , no-RTM , 0.36
|
||||
2048 , 0 , 256 , 8.48 , 8.38 , no-RTM , 0.1
|
||||
2048 , 0 , 512 , 11.57 , 11.42 , no-RTM , 0.15
|
||||
2048 , 0 , 1024 , 17.92 , 17.38 , no-RTM , 0.54
|
||||
2048 , 0 , 2048 , 30.37 , 27.34 , no-RTM , 3.03 <--
|
||||
|
||||
test-memchr, test-wmemchr, and test-rawmemchr are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 7 +-
|
||||
sysdeps/x86_64/multiarch/ifunc-evex.h | 55 ++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 15 ++
|
||||
sysdeps/x86_64/multiarch/memchr-evex-rtm.S | 8 +
|
||||
sysdeps/x86_64/multiarch/memchr-evex.S | 161 ++++++++++++++----
|
||||
sysdeps/x86_64/multiarch/memchr.c | 2 +-
|
||||
sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S | 3 +
|
||||
sysdeps/x86_64/multiarch/rawmemchr.c | 2 +-
|
||||
sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S | 3 +
|
||||
sysdeps/x86_64/multiarch/wmemchr.c | 2 +-
|
||||
10 files changed, 217 insertions(+), 41 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/ifunc-evex.h
|
||||
create mode 100644 sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||||
create mode 100644 sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||||
create mode 100644 sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 65fde4eb..26be4095 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -77,7 +77,9 @@ sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
strncmp-evex \
|
||||
strncpy-evex \
|
||||
strnlen-evex \
|
||||
- strrchr-evex
|
||||
+ strrchr-evex \
|
||||
+ memchr-evex-rtm \
|
||||
+ rawmemchr-evex-rtm
|
||||
CFLAGS-varshift.c += -msse4
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
@@ -110,7 +112,8 @@ sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
wcsnlen-evex \
|
||||
wcsrchr-evex \
|
||||
wmemchr-evex \
|
||||
- wmemcmp-evex-movbe
|
||||
+ wmemcmp-evex-movbe \
|
||||
+ wmemchr-evex-rtm
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),debug)
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-evex.h b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
||||
new file mode 100644
|
||||
index 00000000..fc391edb
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-evex.h
|
||||
@@ -0,0 +1,55 @@
|
||||
+/* Common definition for ifunc selection optimized with EVEX.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <init-arch.h>
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (sse2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (avx2_rtm) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE (evex_rtm) attribute_hidden;
|
||||
+
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
+ && CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ return OPTIMIZE (evex_rtm);
|
||||
+
|
||||
+ return OPTIMIZE (evex);
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ return OPTIMIZE (avx2_rtm);
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ return OPTIMIZE (avx2);
|
||||
+ }
|
||||
+
|
||||
+ return OPTIMIZE (sse2);
|
||||
+}
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index d59d65f8..ac097e8d 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -52,6 +52,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__memchr_evex)
|
||||
+ IFUNC_IMPL_ADD (array, i, memchr,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __memchr_evex_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memchr, 1, __memchr_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/memcmp.c. */
|
||||
@@ -288,6 +293,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__rawmemchr_evex)
|
||||
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __rawmemchr_evex_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr, 1, __rawmemchr_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/strlen.c. */
|
||||
@@ -711,6 +721,11 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
&& CPU_FEATURE_USABLE (BMI2)),
|
||||
__wmemchr_evex)
|
||||
+ IFUNC_IMPL_ADD (array, i, wmemchr,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __wmemchr_evex_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemchr, 1, __wmemchr_sse2))
|
||||
|
||||
/* Support sysdeps/x86_64/multiarch/wmemcmp.c. */
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex-rtm.S b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||||
new file mode 100644
|
||||
index 00000000..19871882
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex-rtm.S
|
||||
@@ -0,0 +1,8 @@
|
||||
+#ifndef MEMCHR
|
||||
+# define MEMCHR __memchr_evex_rtm
|
||||
+#endif
|
||||
+
|
||||
+#define USE_IN_RTM 1
|
||||
+#define SECTION(p) p##.evex.rtm
|
||||
+
|
||||
+#include "memchr-evex.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr-evex.S b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
index f3fdad4f..4d0ed6d1 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr-evex.S
|
||||
@@ -38,10 +38,32 @@
|
||||
# define CHAR_SIZE 1
|
||||
# endif
|
||||
|
||||
+ /* In the 4x loop the RTM and non-RTM versions have data pointer
|
||||
+ off by VEC_SIZE * 4 with RTM version being VEC_SIZE * 4 greater.
|
||||
+ This is represented by BASE_OFFSET. As well because the RTM
|
||||
+ version uses vpcmp which stores a bit per element compared where
|
||||
+ the non-RTM version uses vpcmpeq which stores a bit per byte
|
||||
+ compared RET_SCALE of CHAR_SIZE is only relevant for the RTM
|
||||
+ version. */
|
||||
+# ifdef USE_IN_RTM
|
||||
+# define VZEROUPPER
|
||||
+# define BASE_OFFSET (VEC_SIZE * 4)
|
||||
+# define RET_SCALE CHAR_SIZE
|
||||
+# else
|
||||
+# define VZEROUPPER vzeroupper
|
||||
+# define BASE_OFFSET 0
|
||||
+# define RET_SCALE 1
|
||||
+# endif
|
||||
+
|
||||
+ /* In the return from 4x loop memchr and rawmemchr versions have
|
||||
+ data pointers off by VEC_SIZE * 4 with memchr version being
|
||||
+ VEC_SIZE * 4 greater. */
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
+# define RET_OFFSET (BASE_OFFSET - (VEC_SIZE * 4))
|
||||
# define RAW_PTR_REG rcx
|
||||
# define ALGN_PTR_REG rdi
|
||||
# else
|
||||
+# define RET_OFFSET BASE_OFFSET
|
||||
# define RAW_PTR_REG rdi
|
||||
# define ALGN_PTR_REG rcx
|
||||
# endif
|
||||
@@ -57,11 +79,15 @@
|
||||
# define YMM5 ymm21
|
||||
# define YMM6 ymm22
|
||||
|
||||
+# ifndef SECTION
|
||||
+# define SECTION(p) p##.evex
|
||||
+# endif
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
# define PAGE_SIZE 4096
|
||||
|
||||
- .section .text.evex,"ax",@progbits
|
||||
+ .section SECTION(.text),"ax",@progbits
|
||||
ENTRY (MEMCHR)
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
/* Check for zero length. */
|
||||
@@ -237,14 +263,15 @@ L(cross_page_continue):
|
||||
/* Check if at last CHAR_PER_VEC * 4 length. */
|
||||
subq $(CHAR_PER_VEC * 4), %rdx
|
||||
jbe L(last_4x_vec_or_less_cmpeq)
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ /* +VEC_SIZE if USE_IN_RTM otherwise +VEC_SIZE * 5. */
|
||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
||||
|
||||
/* Align data to VEC_SIZE * 4 for the loop and readjust length.
|
||||
*/
|
||||
# ifdef USE_AS_WMEMCHR
|
||||
movl %edi, %ecx
|
||||
andq $-(4 * VEC_SIZE), %rdi
|
||||
- andl $(VEC_SIZE * 4 - 1), %ecx
|
||||
+ subl %edi, %ecx
|
||||
/* NB: Divide bytes by 4 to get the wchar_t count. */
|
||||
sarl $2, %ecx
|
||||
addq %rcx, %rdx
|
||||
@@ -254,15 +281,28 @@ L(cross_page_continue):
|
||||
subq %rdi, %rdx
|
||||
# endif
|
||||
# else
|
||||
- addq $VEC_SIZE, %rdi
|
||||
+ addq $(VEC_SIZE + (VEC_SIZE * 4 - BASE_OFFSET)), %rdi
|
||||
andq $-(4 * VEC_SIZE), %rdi
|
||||
# endif
|
||||
-
|
||||
+# ifdef USE_IN_RTM
|
||||
vpxorq %XMMZERO, %XMMZERO, %XMMZERO
|
||||
+# else
|
||||
+ /* copy ymmmatch to ymm0 so we can use vpcmpeq which is not
|
||||
+ encodable with EVEX registers (ymm16-ymm31). */
|
||||
+ vmovdqa64 %YMMMATCH, %ymm0
|
||||
+# endif
|
||||
|
||||
/* Compare 4 * VEC at a time forward. */
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
+ /* Two versions of the loop. One that does not require
|
||||
+ vzeroupper by not using ymm0-ymm15 and another does that require
|
||||
+ vzeroupper because it uses ymm0-ymm15. The reason why ymm0-ymm15
|
||||
+ is used at all is because there is no EVEX encoding vpcmpeq and
|
||||
+ with vpcmpeq this loop can be performed more efficiently. The
|
||||
+ non-vzeroupper version is safe for RTM while the vzeroupper
|
||||
+ version should be prefered if RTM are not supported. */
|
||||
+# ifdef USE_IN_RTM
|
||||
/* It would be possible to save some instructions using 4x VPCMP
|
||||
but bottleneck on port 5 makes it not woth it. */
|
||||
VPCMP $4, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k1
|
||||
@@ -273,12 +313,55 @@ L(loop_4x_vec):
|
||||
/* Reduce VEC2 / VEC3 with min and VEC1 with zero mask. */
|
||||
VPMINU %YMM2, %YMM3, %YMM3{%k1}{z}
|
||||
VPCMP $0, %YMM3, %YMMZERO, %k2
|
||||
+# else
|
||||
+ /* Since vptern can only take 3x vectors fastest to do 1 vec
|
||||
+ seperately with EVEX vpcmp. */
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* vptern can only accept masks for epi32/epi64 so can only save
|
||||
+ instruction using not equals mask on vptern with wmemchr. */
|
||||
+ VPCMP $4, (%rdi), %YMMMATCH, %k1
|
||||
+# else
|
||||
+ VPCMP $0, (%rdi), %YMMMATCH, %k1
|
||||
+# endif
|
||||
+ /* Compare 3x with vpcmpeq and or them all together with vptern.
|
||||
+ */
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm0, %ymm2
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm0, %ymm3
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm0, %ymm4
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* This takes the not of or between ymm2, ymm3, ymm4 as well as
|
||||
+ combines result from VEC0 with zero mask. */
|
||||
+ vpternlogd $1, %ymm2, %ymm3, %ymm4{%k1}{z}
|
||||
+ vpmovmskb %ymm4, %ecx
|
||||
+# else
|
||||
+ /* 254 is mask for oring ymm2, ymm3, ymm4 into ymm4. */
|
||||
+ vpternlogd $254, %ymm2, %ymm3, %ymm4
|
||||
+ vpmovmskb %ymm4, %ecx
|
||||
+ kmovd %k1, %eax
|
||||
+# endif
|
||||
+# endif
|
||||
+
|
||||
# ifdef USE_AS_RAWMEMCHR
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
+# endif
|
||||
+# ifdef USE_IN_RTM
|
||||
kortestd %k2, %k3
|
||||
+# else
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
+ /* ecx contains not of matches. All 1s means no matches. incl will
|
||||
+ overflow and set zeroflag if that is the case. */
|
||||
+ incl %ecx
|
||||
+# else
|
||||
+ /* If either VEC1 (eax) or VEC2-VEC4 (ecx) are not zero. Adding
|
||||
+ to ecx is not an issue because if eax is non-zero it will be
|
||||
+ used for returning the match. If it is zero the add does
|
||||
+ nothing. */
|
||||
+ addq %rax, %rcx
|
||||
+# endif
|
||||
+# endif
|
||||
+# ifdef USE_AS_RAWMEMCHR
|
||||
jz L(loop_4x_vec)
|
||||
# else
|
||||
- kortestd %k2, %k3
|
||||
jnz L(loop_4x_vec_end)
|
||||
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
@@ -288,10 +371,11 @@ L(loop_4x_vec):
|
||||
|
||||
/* Fall through into less than 4 remaining vectors of length case.
|
||||
*/
|
||||
- VPCMP $0, (VEC_SIZE * 4)(%rdi), %YMMMATCH, %k0
|
||||
+ VPCMP $0, BASE_OFFSET(%rdi), %YMMMATCH, %k0
|
||||
+ addq $(BASE_OFFSET - VEC_SIZE), %rdi
|
||||
kmovd %k0, %eax
|
||||
- addq $(VEC_SIZE * 3), %rdi
|
||||
- .p2align 4
|
||||
+ VZEROUPPER
|
||||
+
|
||||
L(last_4x_vec_or_less):
|
||||
/* Check if first VEC contained match. */
|
||||
testl %eax, %eax
|
||||
@@ -338,73 +422,78 @@ L(loop_4x_vec_end):
|
||||
/* rawmemchr will fall through into this if match was found in
|
||||
loop. */
|
||||
|
||||
+# if defined USE_IN_RTM || defined USE_AS_WMEMCHR
|
||||
/* k1 has not of matches with VEC1. */
|
||||
kmovd %k1, %eax
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
+# ifdef USE_AS_WMEMCHR
|
||||
subl $((1 << CHAR_PER_VEC) - 1), %eax
|
||||
-# else
|
||||
+# else
|
||||
incl %eax
|
||||
+# endif
|
||||
+# else
|
||||
+ /* eax already has matches for VEC1. */
|
||||
+ testl %eax, %eax
|
||||
# endif
|
||||
jnz L(last_vec_x1_return)
|
||||
|
||||
+# ifdef USE_IN_RTM
|
||||
VPCMP $0, %YMM2, %YMMZERO, %k0
|
||||
kmovd %k0, %eax
|
||||
+# else
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
+# endif
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x2_return)
|
||||
|
||||
+# ifdef USE_IN_RTM
|
||||
kmovd %k2, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(last_vec_x3_return)
|
||||
|
||||
kmovd %k3, %eax
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_RAWMEMCHR
|
||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ leaq (VEC_SIZE * 3 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- leaq (VEC_SIZE * 7)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ /* Combine matches in VEC3 (eax) with matches in VEC4 (ecx). */
|
||||
+ salq $VEC_SIZE, %rcx
|
||||
+ orq %rcx, %rax
|
||||
+ tzcntq %rax, %rax
|
||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax), %rax
|
||||
+ VZEROUPPER
|
||||
# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x1_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_RAWMEMCHR
|
||||
-# ifdef USE_AS_WMEMCHR
|
||||
+# if defined USE_AS_WMEMCHR || RET_OFFSET != 0
|
||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq (%rdi, %rax, CHAR_SIZE), %rax
|
||||
-# else
|
||||
- addq %rdi, %rax
|
||||
-# endif
|
||||
+ leaq RET_OFFSET(%rdi, %rax, CHAR_SIZE), %rax
|
||||
# else
|
||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq (VEC_SIZE * 4)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
+ addq %rdi, %rax
|
||||
# endif
|
||||
+ VZEROUPPER
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(last_vec_x2_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_RAWMEMCHR
|
||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq VEC_SIZE(%rdi, %rax, CHAR_SIZE), %rax
|
||||
-# else
|
||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq (VEC_SIZE * 5)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
-# endif
|
||||
+ /* NB: Multiply bytes by RET_SCALE to get the wchar_t count
|
||||
+ if relevant (RET_SCALE = CHAR_SIZE if USE_AS_WMEMCHAR and
|
||||
+ USE_IN_RTM are both defined. Otherwise RET_SCALE = 1. */
|
||||
+ leaq (VEC_SIZE + RET_OFFSET)(%rdi, %rax, RET_SCALE), %rax
|
||||
+ VZEROUPPER
|
||||
ret
|
||||
|
||||
+# ifdef USE_IN_RTM
|
||||
.p2align 4
|
||||
L(last_vec_x3_return):
|
||||
tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_RAWMEMCHR
|
||||
- /* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
-# else
|
||||
/* NB: Multiply bytes by CHAR_SIZE to get the wchar_t count. */
|
||||
- leaq (VEC_SIZE * 6)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
-# endif
|
||||
+ leaq (VEC_SIZE * 2 + RET_OFFSET)(%rdi, %rax, CHAR_SIZE), %rax
|
||||
ret
|
||||
-
|
||||
+# endif
|
||||
|
||||
# ifndef USE_AS_RAWMEMCHR
|
||||
L(last_4x_vec_or_less_cmpeq):
|
||||
diff --git a/sysdeps/x86_64/multiarch/memchr.c b/sysdeps/x86_64/multiarch/memchr.c
|
||||
index 016f5784..f28aea77 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memchr.c
|
||||
+++ b/sysdeps/x86_64/multiarch/memchr.c
|
||||
@@ -24,7 +24,7 @@
|
||||
# undef memchr
|
||||
|
||||
# define SYMBOL_NAME memchr
|
||||
-# include "ifunc-avx2.h"
|
||||
+# include "ifunc-evex.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_memchr, memchr, IFUNC_SELECTOR ());
|
||||
strong_alias (memchr, __memchr)
|
||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||||
new file mode 100644
|
||||
index 00000000..deda1ca3
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr-evex-rtm.S
|
||||
@@ -0,0 +1,3 @@
|
||||
+#define MEMCHR __rawmemchr_evex_rtm
|
||||
+#define USE_AS_RAWMEMCHR 1
|
||||
+#include "memchr-evex-rtm.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/rawmemchr.c b/sysdeps/x86_64/multiarch/rawmemchr.c
|
||||
index 8a0bc313..1f764f35 100644
|
||||
--- a/sysdeps/x86_64/multiarch/rawmemchr.c
|
||||
+++ b/sysdeps/x86_64/multiarch/rawmemchr.c
|
||||
@@ -26,7 +26,7 @@
|
||||
# undef __rawmemchr
|
||||
|
||||
# define SYMBOL_NAME rawmemchr
|
||||
-# include "ifunc-avx2.h"
|
||||
+# include "ifunc-evex.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_rawmemchr, __rawmemchr,
|
||||
IFUNC_SELECTOR ());
|
||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||||
new file mode 100644
|
||||
index 00000000..a346cd35
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/wmemchr-evex-rtm.S
|
||||
@@ -0,0 +1,3 @@
|
||||
+#define MEMCHR __wmemchr_evex_rtm
|
||||
+#define USE_AS_WMEMCHR 1
|
||||
+#include "memchr-evex-rtm.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/wmemchr.c b/sysdeps/x86_64/multiarch/wmemchr.c
|
||||
index 6d833702..f9c91915 100644
|
||||
--- a/sysdeps/x86_64/multiarch/wmemchr.c
|
||||
+++ b/sysdeps/x86_64/multiarch/wmemchr.c
|
||||
@@ -26,7 +26,7 @@
|
||||
# undef __wmemchr
|
||||
|
||||
# define SYMBOL_NAME wmemchr
|
||||
-# include "ifunc-avx2.h"
|
||||
+# include "ifunc-evex.h"
|
||||
|
||||
libc_ifunc_redirected (__redirect_wmemchr, __wmemchr, IFUNC_SELECTOR ());
|
||||
weak_alias (__wmemchr, wmemchr)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,873 +0,0 @@
|
||||
From 16d12015c57701b08d7bbed6ec536641bcafb428 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 17 May 2021 13:56:52 -0400
|
||||
Subject: [PATCH] x86: Optimize memcmp-avx2-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memcmp-avx2-movbe.S. The optimizations include
|
||||
adding a new vec compare path for small sizes, reorganizing the entry
|
||||
control flow, and removing some unnecessary ALU instructions from the
|
||||
main loop. test-memcmp and test-wmemcmp are both passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 6 +
|
||||
sysdeps/x86_64/multiarch/ifunc-memcmp.h | 1 +
|
||||
sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S | 676 +++++++++++--------
|
||||
3 files changed, 402 insertions(+), 281 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index ac097e8d..8be0d78a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -63,16 +63,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL (i, name, memcmp,
|
||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_avx2_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__memcmp_avx2_movbe_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__memcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, memcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
@@ -732,16 +735,19 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
IFUNC_IMPL (i, name, wmemcmp,
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_avx2_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
(CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)
|
||||
&& CPU_FEATURE_USABLE (RTM)),
|
||||
__wmemcmp_avx2_movbe_rtm)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp,
|
||||
(CPU_FEATURE_USABLE (AVX512VL)
|
||||
&& CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)
|
||||
&& CPU_FEATURE_USABLE (MOVBE)),
|
||||
__wmemcmp_evex_movbe)
|
||||
IFUNC_IMPL_ADD (array, i, wmemcmp, CPU_FEATURE_USABLE (SSE4_1),
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-memcmp.h b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
index 8043c635..690dffe8 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-memcmp.h
|
||||
@@ -33,6 +33,7 @@ IFUNC_SELECTOR (void)
|
||||
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX2)
|
||||
&& CPU_FEATURE_USABLE_P (cpu_features, MOVBE)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2)
|
||||
&& CPU_FEATURES_ARCH_P (cpu_features, AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
index 9d5c9c72..16fc673e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-avx2-movbe.S
|
||||
@@ -19,17 +19,23 @@
|
||||
#if IS_IN (libc)
|
||||
|
||||
/* memcmp/wmemcmp is implemented as:
|
||||
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
||||
- to avoid branches.
|
||||
- 2. Use overlapping compare to avoid branch.
|
||||
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
||||
- bytes for wmemcmp.
|
||||
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
+ 1. Use ymm vector compares when possible. The only case where
|
||||
+ vector compares is not possible for when size < VEC_SIZE
|
||||
+ and loading from either s1 or s2 would cause a page cross.
|
||||
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
|
||||
+ with movbe and bswap to avoid branches.
|
||||
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
|
||||
+ size >= 8 bytes for wmemcmp.
|
||||
+ 4. Optimistically compare up to first 4 * VEC_SIZE one at a
|
||||
+ to check for early mismatches. Only do this if its guranteed the
|
||||
+ work is not wasted.
|
||||
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
area.
|
||||
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
||||
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
||||
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
||||
+ 7. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
||||
+ 8. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
||||
+ 9. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
||||
+
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
@@ -38,8 +44,10 @@
|
||||
# endif
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
+# define CHAR_SIZE 4
|
||||
# define VPCMPEQ vpcmpeqd
|
||||
# else
|
||||
+# define CHAR_SIZE 1
|
||||
# define VPCMPEQ vpcmpeqb
|
||||
# endif
|
||||
|
||||
@@ -52,7 +60,7 @@
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
-# define VEC_MASK ((1 << VEC_SIZE) - 1)
|
||||
+# define PAGE_SIZE 4096
|
||||
|
||||
/* Warning!
|
||||
wmemcmp has to use SIGNED comparison for elements.
|
||||
@@ -71,136 +79,359 @@ ENTRY (MEMCMP)
|
||||
jb L(less_vec)
|
||||
|
||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+ vmovdqu (%rsi), %ymm1
|
||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ /* NB: eax must be destination register if going to
|
||||
+ L(return_vec_[0,2]). For L(return_vec_3) destination register
|
||||
+ must be ecx. */
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
- jbe L(last_vec)
|
||||
-
|
||||
- VPCMPEQ %ymm0, %ymm0, %ymm0
|
||||
- /* More than 2 * VEC. */
|
||||
- cmpq $(VEC_SIZE * 8), %rdx
|
||||
- ja L(more_8x_vec)
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- jb L(last_4x_vec)
|
||||
-
|
||||
- /* From 4 * VEC to 8 * VEC, inclusively. */
|
||||
- vmovdqu (%rsi), %ymm1
|
||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
+ jbe L(last_1x_vec)
|
||||
|
||||
+ /* Check second VEC no matter what. */
|
||||
vmovdqu VEC_SIZE(%rsi), %ymm2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
+ /* If this VEC was equal eax will be all 1s so incl will
|
||||
+ overflow and set zero flag. */
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_1)
|
||||
|
||||
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
+ /* Less than 4 * VEC. */
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
|
||||
+ /* Check third and fourth VEC no matter what. */
|
||||
+ vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_2)
|
||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
+ vpmovmskb %ymm4, %ecx
|
||||
+ incl %ecx
|
||||
+ jnz L(return_vec_3)
|
||||
|
||||
- vpand %ymm1, %ymm2, %ymm5
|
||||
- vpand %ymm3, %ymm4, %ymm6
|
||||
- vpand %ymm5, %ymm6, %ymm5
|
||||
+ /* Go to 4x VEC loop. */
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+ ja L(more_8x_vec)
|
||||
|
||||
- vptest %ymm0, %ymm5
|
||||
- jnc L(4x_vec_end)
|
||||
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
|
||||
+ branches. */
|
||||
|
||||
+ /* Load first two VEC from s2 before adjusting addresses. */
|
||||
+ vmovdqu -(VEC_SIZE * 4)(%rsi, %rdx), %ymm1
|
||||
+ vmovdqu -(VEC_SIZE * 3)(%rsi, %rdx), %ymm2
|
||||
leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
- vmovdqu (%rsi), %ymm1
|
||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
|
||||
- vmovdqu VEC_SIZE(%rsi), %ymm2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
||||
- vpand %ymm2, %ymm1, %ymm5
|
||||
+ /* Wait to load from s1 until address is adjusted due to
|
||||
+ unlamination of microfusion with complex address mode. */
|
||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
+ VPCMPEQ (VEC_SIZE)(%rdi), %ymm2, %ymm2
|
||||
|
||||
vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
- vpand %ymm3, %ymm5, %ymm5
|
||||
-
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
- vpand %ymm4, %ymm5, %ymm5
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
|
||||
- vptest %ymm0, %ymm5
|
||||
- jnc L(4x_vec_end)
|
||||
- xorl %eax, %eax
|
||||
+ /* Reduce VEC0 - VEC3. */
|
||||
+ vpand %ymm1, %ymm2, %ymm5
|
||||
+ vpand %ymm3, %ymm4, %ymm6
|
||||
+ vpand %ymm5, %ymm6, %ymm7
|
||||
+ vpmovmskb %ymm7, %ecx
|
||||
+ incl %ecx
|
||||
+ jnz L(return_vec_0_1_2_3)
|
||||
+ /* NB: eax must be zero to reach here. */
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(return_vec_0):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (%rdi, %rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi, %rax), %ecx
|
||||
+ /* NB: no partial register stall here because xorl zero idiom
|
||||
+ above. */
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (%rsi, %rax), %ecx
|
||||
+ movzbl (%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
L(return_vzeroupper):
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+L(return_vec_1):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl VEC_SIZE(%rdi, %rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ movzbl VEC_SIZE(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(return_vec_2):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (VEC_SIZE * 2)(%rdi, %rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ /* NB: p2align 5 here to ensure 4x loop is 32 byte aligned. */
|
||||
+ .p2align 5
|
||||
+L(8x_return_vec_0_1_2_3):
|
||||
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
|
||||
+ addq %rdi, %rsi
|
||||
+L(return_vec_0_1_2_3):
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
-L(last_vec):
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
||||
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_1)
|
||||
+
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_2)
|
||||
+L(return_vec_3):
|
||||
+ tzcntl %ecx, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx), %eax
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(more_8x_vec):
|
||||
+ /* Set end of s1 in rdx. */
|
||||
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx), %rdx
|
||||
+ /* rsi stores s2 - s1. This allows loop to only update one
|
||||
+ pointer. */
|
||||
+ subq %rdi, %rsi
|
||||
+ /* Align s1 pointer. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ /* Adjust because first 4x vec were checked already. */
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ .p2align 4
|
||||
+L(loop_4x_vec):
|
||||
+ /* rsi has s2 - s1 so get correct address by adding s1 (in rdi).
|
||||
+ */
|
||||
+ vmovdqu (%rsi, %rdi), %ymm1
|
||||
+ VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
+
|
||||
+ vmovdqu VEC_SIZE(%rsi, %rdi), %ymm2
|
||||
+ VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
||||
+
|
||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdi), %ymm3
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
+
|
||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdi), %ymm4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
+
|
||||
+ vpand %ymm1, %ymm2, %ymm5
|
||||
+ vpand %ymm3, %ymm4, %ymm6
|
||||
+ vpand %ymm5, %ymm6, %ymm7
|
||||
+ vpmovmskb %ymm7, %ecx
|
||||
+ incl %ecx
|
||||
+ jnz L(8x_return_vec_0_1_2_3)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ /* Check if s1 pointer at end. */
|
||||
+ cmpq %rdx, %rdi
|
||||
+ jb L(loop_4x_vec)
|
||||
+
|
||||
+ subq %rdx, %rdi
|
||||
+ /* rdi has 4 * VEC_SIZE - remaining length. */
|
||||
+ cmpl $(VEC_SIZE * 3), %edi
|
||||
+ jae L(8x_last_1x_vec)
|
||||
+ /* Load regardless of branch. */
|
||||
+ vmovdqu (VEC_SIZE * 2)(%rsi, %rdx), %ymm3
|
||||
+ cmpl $(VEC_SIZE * 2), %edi
|
||||
+ jae L(8x_last_2x_vec)
|
||||
+
|
||||
+ /* Check last 4 VEC. */
|
||||
+ vmovdqu (%rsi, %rdx), %ymm1
|
||||
+ VPCMPEQ (%rdx), %ymm1, %ymm1
|
||||
+
|
||||
+ vmovdqu VEC_SIZE(%rsi, %rdx), %ymm2
|
||||
+ VPCMPEQ VEC_SIZE(%rdx), %ymm2, %ymm2
|
||||
+
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
||||
+
|
||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
||||
+
|
||||
+ vpand %ymm1, %ymm2, %ymm5
|
||||
+ vpand %ymm3, %ymm4, %ymm6
|
||||
+ vpand %ymm5, %ymm6, %ymm7
|
||||
+ vpmovmskb %ymm7, %ecx
|
||||
+ /* Restore s1 pointer to rdi. */
|
||||
+ movq %rdx, %rdi
|
||||
+ incl %ecx
|
||||
+ jnz L(8x_return_vec_0_1_2_3)
|
||||
+ /* NB: eax must be zero to reach here. */
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ /* Only entry is from L(more_8x_vec). */
|
||||
+ .p2align 4
|
||||
+L(8x_last_2x_vec):
|
||||
+ /* Check second to last VEC. rdx stores end pointer of s1 and
|
||||
+ ymm3 has already been loaded with second to last VEC from s2.
|
||||
+ */
|
||||
+ VPCMPEQ (VEC_SIZE * 2)(%rdx), %ymm3, %ymm3
|
||||
+ vpmovmskb %ymm3, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(8x_return_vec_2)
|
||||
+ /* Check last VEC. */
|
||||
+ .p2align 4
|
||||
+L(8x_last_1x_vec):
|
||||
+ vmovdqu (VEC_SIZE * 3)(%rsi, %rdx), %ymm4
|
||||
+ VPCMPEQ (VEC_SIZE * 3)(%rdx), %ymm4, %ymm4
|
||||
+ vpmovmskb %ymm4, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(8x_return_vec_3)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec):
|
||||
- /* A byte or int32 is different within 16 or 32 bytes. */
|
||||
- tzcntl %eax, %ecx
|
||||
+L(last_2x_vec):
|
||||
+ /* Check second to last VEC. */
|
||||
+ vmovdqu -(VEC_SIZE * 2)(%rsi, %rdx), %ymm1
|
||||
+ VPCMPEQ -(VEC_SIZE * 2)(%rdi, %rdx), %ymm1, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_1_end)
|
||||
+ /* Check last VEC. */
|
||||
+L(last_1x_vec):
|
||||
+ vmovdqu -(VEC_SIZE * 1)(%rsi, %rdx), %ymm1
|
||||
+ VPCMPEQ -(VEC_SIZE * 1)(%rdi, %rdx), %ymm1, %ymm1
|
||||
+ vpmovmskb %ymm1, %eax
|
||||
+ incl %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(8x_return_vec_2):
|
||||
+ subq $VEC_SIZE, %rdx
|
||||
+L(8x_return_vec_3):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addq %rdx, %rax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi, %rcx), %edx
|
||||
- cmpl (%rsi, %rcx), %edx
|
||||
-L(wmemcmp_return):
|
||||
- setl %al
|
||||
- negl %eax
|
||||
- orl $1, %eax
|
||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- movzbl (%rdi, %rcx), %eax
|
||||
- movzbl (%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
# endif
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
.p2align 4
|
||||
-L(4):
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %edx
|
||||
- cmpl (%rsi), %edx
|
||||
- jne L(wmemcmp_return)
|
||||
- ret
|
||||
+L(return_vec_1_end):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
+
|
||||
.p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches. */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- je L(exit)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
- ret
|
||||
+L(return_vec_0_end):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl -VEC_SIZE(%rdi, %rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl -VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(exit):
|
||||
- ret
|
||||
+L(less_vec):
|
||||
+ /* Check if one or less CHAR. This is necessary for size = 0 but
|
||||
+ is also faster for size = CHAR_SIZE. */
|
||||
+ cmpl $CHAR_SIZE, %edx
|
||||
+ jbe L(one_or_less)
|
||||
+
|
||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
+ page cross. This can have false positives but is by far the
|
||||
+ fastest method. */
|
||||
+ movl %edi, %eax
|
||||
+ orl %esi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(page_cross_less_vec)
|
||||
+
|
||||
+ /* No page cross possible. */
|
||||
+ vmovdqu (%rsi), %ymm2
|
||||
+ VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
+ vpmovmskb %ymm2, %eax
|
||||
+ incl %eax
|
||||
+ /* Result will be zero if s1 and s2 match. Otherwise first set
|
||||
+ bit will be first mismatch. */
|
||||
+ bzhil %edx, %eax, %edx
|
||||
+ jnz L(return_vec_0)
|
||||
+ xorl %eax, %eax
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(between_2_3):
|
||||
+L(page_cross_less_vec):
|
||||
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
+ bytes. */
|
||||
+ cmpl $16, %edx
|
||||
+ jae L(between_16_31)
|
||||
+# ifndef USE_AS_WMEMCMP
|
||||
+ cmpl $8, %edx
|
||||
+ jae L(between_8_15)
|
||||
+ cmpl $4, %edx
|
||||
+ jae L(between_4_7)
|
||||
+
|
||||
/* Load as big endian to avoid branches. */
|
||||
movzwl (%rdi), %eax
|
||||
movzwl (%rsi), %ecx
|
||||
@@ -208,223 +439,106 @@ L(between_2_3):
|
||||
shll $8, %ecx
|
||||
bswap %eax
|
||||
bswap %ecx
|
||||
- movb -1(%rdi, %rdx), %al
|
||||
- movb -1(%rsi, %rdx), %cl
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
/* Subtraction is okay because the upper 8 bits are zero. */
|
||||
subl %ecx, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(1):
|
||||
- movzbl (%rdi), %eax
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
subl %ecx, %eax
|
||||
- ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
+ /* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(less_vec):
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
||||
- cmpb $4, %dl
|
||||
- je L(4)
|
||||
- jb L(zero)
|
||||
-# else
|
||||
- cmpb $1, %dl
|
||||
- je L(1)
|
||||
- jb L(zero)
|
||||
- cmpb $4, %dl
|
||||
- jb L(between_2_3)
|
||||
- cmpb $8, %dl
|
||||
- jb L(between_4_7)
|
||||
+L(between_8_15):
|
||||
# endif
|
||||
- cmpb $16, %dl
|
||||
- jae L(between_16_31)
|
||||
- /* It is between 8 and 15 bytes. */
|
||||
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
vmovq (%rdi), %xmm1
|
||||
vmovq (%rsi), %xmm2
|
||||
- VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
+ VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
- subl $0xffff, %eax
|
||||
- jnz L(first_vec)
|
||||
+ subl $0xffff, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
leaq -8(%rdi, %rdx), %rdi
|
||||
leaq -8(%rsi, %rdx), %rsi
|
||||
vmovq (%rdi), %xmm1
|
||||
vmovq (%rsi), %xmm2
|
||||
- VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
+ VPCMPEQ %xmm1, %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
- subl $0xffff, %eax
|
||||
- jnz L(first_vec)
|
||||
+ subl $0xffff, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
vmovdqu (%rsi), %xmm2
|
||||
- VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
+ VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
- subl $0xffff, %eax
|
||||
- jnz L(first_vec)
|
||||
+ subl $0xffff, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
+
|
||||
+ vmovdqu -16(%rsi, %rdx), %xmm2
|
||||
leaq -16(%rdi, %rdx), %rdi
|
||||
leaq -16(%rsi, %rdx), %rsi
|
||||
- vmovdqu (%rsi), %xmm2
|
||||
- VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
+ VPCMPEQ (%rdi), %xmm2, %xmm2
|
||||
vpmovmskb %xmm2, %eax
|
||||
- subl $0xffff, %eax
|
||||
- jnz L(first_vec)
|
||||
+ subl $0xffff, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
+ /* No ymm register was touched. */
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(more_8x_vec):
|
||||
- /* More than 8 * VEC. Check the first VEC. */
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- /* Align the first memory area for aligned loads in the loop.
|
||||
- Compute how much the first memory area is misaligned. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- /* Get the negative of offset for alignment. */
|
||||
- subq $VEC_SIZE, %rcx
|
||||
- /* Adjust the second memory area. */
|
||||
- subq %rcx, %rsi
|
||||
- /* Adjust the first memory area which should be aligned now. */
|
||||
- subq %rcx, %rdi
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
-
|
||||
-L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- vmovdqu (%rsi), %ymm1
|
||||
- VPCMPEQ (%rdi), %ymm1, %ymm1
|
||||
-
|
||||
- vmovdqu VEC_SIZE(%rsi), %ymm2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %ymm2, %ymm2
|
||||
- vpand %ymm2, %ymm1, %ymm5
|
||||
-
|
||||
- vmovdqu (VEC_SIZE * 2)(%rsi), %ymm3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %ymm3, %ymm3
|
||||
- vpand %ymm3, %ymm5, %ymm5
|
||||
-
|
||||
- vmovdqu (VEC_SIZE * 3)(%rsi), %ymm4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %ymm4, %ymm4
|
||||
- vpand %ymm4, %ymm5, %ymm5
|
||||
-
|
||||
- vptest %ymm0, %ymm5
|
||||
- jnc L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
- addq $(VEC_SIZE * 4), %rsi
|
||||
-
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- jae L(loop_4x_vec)
|
||||
-
|
||||
- /* Less than 4 * VEC. */
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
- jbe L(last_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
- jbe L(last_2x_vec)
|
||||
-
|
||||
-L(last_4x_vec):
|
||||
- /* From 2 * VEC to 4 * VEC. */
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- addq $VEC_SIZE, %rsi
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- addq $VEC_SIZE, %rsi
|
||||
- vmovdqu (%rsi), %ymm2
|
||||
- VPCMPEQ (%rdi), %ymm2, %ymm2
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
- VZEROUPPER_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(4x_vec_end):
|
||||
- vpmovmskb %ymm1, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
- vpmovmskb %ymm2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- vpmovmskb %ymm3, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- vpmovmskb %ymm4, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- tzcntl %eax, %ecx
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (VEC_SIZE * 3)(%rdi, %rcx), %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
-# endif
|
||||
- VZEROUPPER_RETURN
|
||||
-
|
||||
.p2align 4
|
||||
-L(first_vec_x1):
|
||||
- tzcntl %eax, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl VEC_SIZE(%rdi, %rcx), %edx
|
||||
- cmpl VEC_SIZE(%rsi, %rcx), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movl (%rdi), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi), %ecx
|
||||
+ je L(zero)
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
# else
|
||||
- movzbl VEC_SIZE(%rdi, %rcx), %eax
|
||||
- movzbl VEC_SIZE(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
-# endif
|
||||
- VZEROUPPER_RETURN
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2):
|
||||
- tzcntl %eax, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (VEC_SIZE * 2)(%rdi, %rcx), %edx
|
||||
- cmpl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
+L(between_4_7):
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
||||
+ */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ jz L(zero_4_7)
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+L(zero_4_7):
|
||||
+ /* No ymm register was touched. */
|
||||
+ ret
|
||||
# endif
|
||||
- VZEROUPPER_RETURN
|
||||
+
|
||||
END (MEMCMP)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,851 +0,0 @@
|
||||
From 4ad473e97acdc5f6d811755b67c09f2128a644ce Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 17 May 2021 13:57:24 -0400
|
||||
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit optimizes memcmp-evex-movbe.S. The optimizations include
|
||||
adding a new vec compare path for small sizes, reorganizing the entry
|
||||
control flow, removing some unnecessary ALU instructions from the main
|
||||
loop, and most importantly replacing the heavy use of vpcmp + kand
|
||||
logic with vpxor + vptern. test-memcmp and test-wmemcmp are both
|
||||
passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 710 +++++++++++--------
|
||||
1 file changed, 408 insertions(+), 302 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 9c093972..654dc7ac 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -19,17 +19,22 @@
|
||||
#if IS_IN (libc)
|
||||
|
||||
/* memcmp/wmemcmp is implemented as:
|
||||
- 1. For size from 2 to 7 bytes, load as big endian with movbe and bswap
|
||||
- to avoid branches.
|
||||
- 2. Use overlapping compare to avoid branch.
|
||||
- 3. Use vector compare when size >= 4 bytes for memcmp or size >= 8
|
||||
- bytes for wmemcmp.
|
||||
- 4. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
- 5. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
+ 1. Use ymm vector compares when possible. The only case where
|
||||
+ vector compares are not possible is when size < CHAR_PER_VEC
|
||||
+ and loading from either s1 or s2 would cause a page cross.
|
||||
+ 2. For size from 2 to 7 bytes on page cross, load as big endian
|
||||
+ with movbe and bswap to avoid branches.
|
||||
+ 3. Use xmm vector compare when size >= 4 bytes for memcmp or
|
||||
+ size >= 8 bytes for wmemcmp.
|
||||
+ 4. Optimistically compare up to first 4 * CHAR_PER_VEC one at a time
|
||||
+ to check for early mismatches. Only do this if it's guaranteed the
|
||||
+ work is not wasted.
|
||||
+ 5. If size is 8 * VEC_SIZE or less, unroll the loop.
|
||||
+ 6. Compare 4 * VEC_SIZE at a time with the aligned first memory
|
||||
area.
|
||||
- 6. Use 2 vector compares when size is 2 * VEC_SIZE or less.
|
||||
- 7. Use 4 vector compares when size is 4 * VEC_SIZE or less.
|
||||
- 8. Use 8 vector compares when size is 8 * VEC_SIZE or less. */
|
||||
+ 7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
|
||||
+ 8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
|
||||
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
@@ -40,11 +45,21 @@
|
||||
# define VMOVU vmovdqu64
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
-# define VPCMPEQ vpcmpeqd
|
||||
+# define CHAR_SIZE 4
|
||||
+# define VPCMP vpcmpd
|
||||
# else
|
||||
-# define VPCMPEQ vpcmpeqb
|
||||
+# define CHAR_SIZE 1
|
||||
+# define VPCMP vpcmpub
|
||||
# endif
|
||||
|
||||
+# define VEC_SIZE 32
|
||||
+# define PAGE_SIZE 4096
|
||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
+
|
||||
+# define XMM0 xmm16
|
||||
+# define XMM1 xmm17
|
||||
+# define XMM2 xmm18
|
||||
+# define YMM0 ymm16
|
||||
# define XMM1 xmm17
|
||||
# define XMM2 xmm18
|
||||
# define YMM1 ymm17
|
||||
@@ -54,15 +69,6 @@
|
||||
# define YMM5 ymm21
|
||||
# define YMM6 ymm22
|
||||
|
||||
-# define VEC_SIZE 32
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
-# define VEC_MASK 0xff
|
||||
-# define XMM_MASK 0xf
|
||||
-# else
|
||||
-# define VEC_MASK 0xffffffff
|
||||
-# define XMM_MASK 0xffff
|
||||
-# endif
|
||||
-
|
||||
/* Warning!
|
||||
wmemcmp has to use SIGNED comparison for elements.
|
||||
memcmp has to use UNSIGNED comparison for elemnts.
|
||||
@@ -70,145 +76,370 @@
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
ENTRY (MEMCMP)
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- shl $2, %RDX_LP
|
||||
-# elif defined __ILP32__
|
||||
+# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
- cmp $VEC_SIZE, %RDX_LP
|
||||
+ cmp $CHAR_PER_VEC, %RDX_LP
|
||||
jb L(less_vec)
|
||||
|
||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k1
|
||||
+ VMOVU (%rsi), %YMM1
|
||||
+ /* Use compare not equals to directly check for mismatch. */
|
||||
+ VPCMP $4, (%rdi), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
- jbe L(last_vec)
|
||||
-
|
||||
- /* More than 2 * VEC. */
|
||||
- cmpq $(VEC_SIZE * 8), %rdx
|
||||
- ja L(more_8x_vec)
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- jb L(last_4x_vec)
|
||||
+ /* NB: eax must be destination register if going to
|
||||
+ L(return_vec_[0,2]). For L(return_vec_3 destination register
|
||||
+ must be ecx. */
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
- /* From 4 * VEC to 8 * VEC, inclusively. */
|
||||
- VMOVU (%rsi), %YMM1
|
||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+ cmpq $(CHAR_PER_VEC * 2), %rdx
|
||||
+ jbe L(last_1x_vec)
|
||||
|
||||
+ /* Check second VEC no matter what. */
|
||||
VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
+ VPCMP $4, VEC_SIZE(%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_1)
|
||||
+
|
||||
+ /* Less than 4 * VEC. */
|
||||
+ cmpq $(CHAR_PER_VEC * 4), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
|
||||
+ /* Check third and fourth VEC no matter what. */
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
+ VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_2)
|
||||
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
+ VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
|
||||
+ kmovd %k1, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(return_vec_3)
|
||||
|
||||
- kandd %k1, %k2, %k5
|
||||
- kandd %k3, %k4, %k6
|
||||
- kandd %k5, %k6, %k6
|
||||
+ /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
|
||||
+ compare with zero to get a mask is needed. */
|
||||
+ vpxorq %XMM0, %XMM0, %XMM0
|
||||
|
||||
- kmovd %k6, %eax
|
||||
- cmpl $VEC_MASK, %eax
|
||||
- jne L(4x_vec_end)
|
||||
+ /* Go to 4x VEC loop. */
|
||||
+ cmpq $(CHAR_PER_VEC * 8), %rdx
|
||||
+ ja L(more_8x_vec)
|
||||
|
||||
- leaq -(4 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
- leaq -(4 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
- VMOVU (%rsi), %YMM1
|
||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
||||
+ /* Handle remainder of size = 4 * VEC + 1 to 8 * VEC without any
|
||||
+ branches. */
|
||||
|
||||
- VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
- kandd %k1, %k2, %k5
|
||||
+ /* Load first two VEC from s2 before adjusting addresses. */
|
||||
+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
||||
+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx, CHAR_SIZE), %YMM2
|
||||
+ leaq -(4 * VEC_SIZE)(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
+ leaq -(4 * VEC_SIZE)(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
+
|
||||
+ /* Wait to load from s1 until addressed adjust due to
|
||||
+ unlamination of microfusion with complex address mode. */
|
||||
+
|
||||
+ /* vpxor will be all 0s if s1 and s2 are equal. Otherwise it
|
||||
+ will have some 1s. */
|
||||
+ vpxorq (%rdi), %YMM1, %YMM1
|
||||
+ vpxorq (VEC_SIZE)(%rdi), %YMM2, %YMM2
|
||||
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
- kandd %k3, %k5, %k5
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
+ /* Or together YMM1, YMM2, and YMM3 into YMM3. */
|
||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
- kandd %k4, %k5, %k5
|
||||
+ /* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
|
||||
+ oring with YMM3. Result is stored in YMM4. */
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
+ /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
|
||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ kmovd %k1, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(return_vec_0_1_2_3)
|
||||
+ /* NB: eax must be zero to reach here. */
|
||||
+ ret
|
||||
|
||||
- kmovd %k5, %eax
|
||||
- cmpl $VEC_MASK, %eax
|
||||
- jne L(4x_vec_end)
|
||||
- xorl %eax, %eax
|
||||
+ /* NB: aligning 32 here allows for the rest of the jump targets
|
||||
+ to be tuned for 32 byte alignment. Most important this ensures
|
||||
+ the L(more_8x_vec) loop is 32 byte aligned. */
|
||||
+ .p2align 5
|
||||
+L(less_vec):
|
||||
+ /* Check if one or less CHAR. This is necessary for size = 0 but
|
||||
+ is also faster for size = CHAR_SIZE. */
|
||||
+ cmpl $1, %edx
|
||||
+ jbe L(one_or_less)
|
||||
+
|
||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
+ page cross. This can have false positives but is by far the
|
||||
+ fastest method. */
|
||||
+ movl %edi, %eax
|
||||
+ orl %esi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(page_cross_less_vec)
|
||||
+
|
||||
+ /* No page cross possible. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMP $4, (%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Create mask in ecx for potentially in bound matches. */
|
||||
+ bzhil %edx, %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(last_2x_vec):
|
||||
- /* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+L(return_vec_0):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ /* NB: no partial register stall here because xorl zero idiom
|
||||
+ above. */
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (%rsi, %rax), %ecx
|
||||
+ movzbl (%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
|
||||
-L(last_vec):
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- leaq -VEC_SIZE(%rdi, %rdx), %rdi
|
||||
- leaq -VEC_SIZE(%rsi, %rdx), %rsi
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+ /* NB: No p2align necessary. Alignment % 16 is naturally 1
|
||||
+ which is good enough for a target not in a loop. */
|
||||
+L(return_vec_1):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ movzbl VEC_SIZE(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec):
|
||||
- /* A byte or int32 is different within 16 or 32 bytes. */
|
||||
- tzcntl %eax, %ecx
|
||||
+ /* NB: No p2align necessary. Alignment % 16 is naturally 2
|
||||
+ which is good enough for a target not in a loop. */
|
||||
+L(return_vec_2):
|
||||
+ tzcntl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi, %rcx, 4), %edx
|
||||
- cmpl (%rsi, %rcx, 4), %edx
|
||||
-L(wmemcmp_return):
|
||||
- setl %al
|
||||
- negl %eax
|
||||
- orl $1, %eax
|
||||
+ movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- movzbl (%rdi, %rcx), %eax
|
||||
- movzbl (%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
+ movzbl (VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
+ .p2align 4
|
||||
+L(8x_return_vec_0_1_2_3):
|
||||
+ /* Returning from L(more_8x_vec) requires restoring rsi. */
|
||||
+ addq %rdi, %rsi
|
||||
+L(return_vec_0_1_2_3):
|
||||
+ VPCMP $4, %YMM1, %YMM0, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
+
|
||||
+ VPCMP $4, %YMM2, %YMM0, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_1)
|
||||
+
|
||||
+ VPCMP $4, %YMM3, %YMM0, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_2)
|
||||
+L(return_vec_3):
|
||||
+ tzcntl %ecx, %ecx
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
-L(4):
|
||||
- xorl %eax, %eax
|
||||
- movl (%rdi), %edx
|
||||
- cmpl (%rsi), %edx
|
||||
- jne L(wmemcmp_return)
|
||||
+L(more_8x_vec):
|
||||
+ /* Set end of s1 in rdx. */
|
||||
+ leaq -(VEC_SIZE * 4)(%rdi, %rdx, CHAR_SIZE), %rdx
|
||||
+ /* rsi stores s2 - s1. This allows loop to only update one
|
||||
+ pointer. */
|
||||
+ subq %rdi, %rsi
|
||||
+ /* Align s1 pointer. */
|
||||
+ andq $-VEC_SIZE, %rdi
|
||||
+ /* Adjust because first 4x vec where check already. */
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ .p2align 4
|
||||
+L(loop_4x_vec):
|
||||
+ VMOVU (%rsi, %rdi), %YMM1
|
||||
+ vpxorq (%rdi), %YMM1, %YMM1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
|
||||
+ vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ kmovd %k1, %ecx
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(8x_return_vec_0_1_2_3)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ cmpq %rdx, %rdi
|
||||
+ jb L(loop_4x_vec)
|
||||
+
|
||||
+ subq %rdx, %rdi
|
||||
+ /* rdi has 4 * VEC_SIZE - remaining length. */
|
||||
+ cmpl $(VEC_SIZE * 3), %edi
|
||||
+ jae L(8x_last_1x_vec)
|
||||
+ /* Load regardless of branch. */
|
||||
+ VMOVU (VEC_SIZE * 2)(%rsi, %rdx), %YMM3
|
||||
+ cmpl $(VEC_SIZE * 2), %edi
|
||||
+ jae L(8x_last_2x_vec)
|
||||
+
|
||||
+ VMOVU (%rsi, %rdx), %YMM1
|
||||
+ vpxorq (%rdx), %YMM1, %YMM1
|
||||
+
|
||||
+ VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
||||
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
||||
+
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
||||
+ vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
+
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
|
||||
+ VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ kmovd %k1, %ecx
|
||||
+ /* Restore s1 pointer to rdi. */
|
||||
+ movq %rdx, %rdi
|
||||
+ testl %ecx, %ecx
|
||||
+ jnz L(8x_return_vec_0_1_2_3)
|
||||
+ /* NB: eax must be zero to reach here. */
|
||||
+ ret
|
||||
+
|
||||
+ /* Only entry is from L(more_8x_vec). */
|
||||
+ .p2align 4
|
||||
+L(8x_last_2x_vec):
|
||||
+ VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(8x_return_vec_2)
|
||||
+ /* Naturally aligned to 16 bytes. */
|
||||
+L(8x_last_1x_vec):
|
||||
+ VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
|
||||
+ VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(8x_return_vec_3)
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(last_2x_vec):
|
||||
+ /* Check second to last VEC. */
|
||||
+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
||||
+ VPCMP $4, -(VEC_SIZE * 2)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_1_end)
|
||||
+
|
||||
+ /* Check last VEC. */
|
||||
+ .p2align 4
|
||||
+L(last_1x_vec):
|
||||
+ VMOVU -(VEC_SIZE * 1)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
||||
+ VPCMP $4, -(VEC_SIZE * 1)(%rdi, %rdx, CHAR_SIZE), %YMM1, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
ret
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(8x_return_vec_2):
|
||||
+ subq $VEC_SIZE, %rdx
|
||||
+L(8x_return_vec_3):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
+ addq %rdx, %rax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches. */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- je L(exit)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
+L(return_vec_0_end):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl -VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl -VEC_SIZE(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl -VEC_SIZE(%rsi, %rax), %ecx
|
||||
+ movzbl -VEC_SIZE(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(exit):
|
||||
+L(return_vec_1_end):
|
||||
+ tzcntl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
ret
|
||||
|
||||
+
|
||||
.p2align 4
|
||||
+L(page_cross_less_vec):
|
||||
+ /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
+ bytes. */
|
||||
+ cmpl $(16 / CHAR_SIZE), %edx
|
||||
+ jae L(between_16_31)
|
||||
+# ifndef USE_AS_WMEMCMP
|
||||
+ cmpl $8, %edx
|
||||
+ jae L(between_8_15)
|
||||
+ cmpl $4, %edx
|
||||
+ jae L(between_4_7)
|
||||
L(between_2_3):
|
||||
/* Load as big endian to avoid branches. */
|
||||
movzwl (%rdi), %eax
|
||||
@@ -217,224 +448,99 @@ L(between_2_3):
|
||||
shll $8, %ecx
|
||||
bswap %eax
|
||||
bswap %ecx
|
||||
- movb -1(%rdi, %rdx), %al
|
||||
- movb -1(%rsi, %rdx), %cl
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
/* Subtraction is okay because the upper 8 bits are zero. */
|
||||
subl %ecx, %eax
|
||||
ret
|
||||
-
|
||||
.p2align 4
|
||||
-L(1):
|
||||
- movzbl (%rdi), %eax
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
subl %ecx, %eax
|
||||
ret
|
||||
-# endif
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
|
||||
.p2align 4
|
||||
-L(less_vec):
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- /* It can only be 0, 4, 8, 12, 16, 20, 24, 28 bytes. */
|
||||
- cmpb $4, %dl
|
||||
- je L(4)
|
||||
- jb L(zero)
|
||||
-# else
|
||||
- cmpb $1, %dl
|
||||
- je L(1)
|
||||
- jb L(zero)
|
||||
- cmpb $4, %dl
|
||||
- jb L(between_2_3)
|
||||
- cmpb $8, %dl
|
||||
- jb L(between_4_7)
|
||||
+L(between_8_15):
|
||||
# endif
|
||||
- cmpb $16, %dl
|
||||
- jae L(between_16_31)
|
||||
- /* It is between 8 and 15 bytes. */
|
||||
+ /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
vmovq (%rdi), %XMM1
|
||||
vmovq (%rsi), %XMM2
|
||||
- VPCMPEQ %XMM1, %XMM2, %k2
|
||||
- kmovw %k2, %eax
|
||||
- subl $XMM_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+ VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- leaq -8(%rdi, %rdx), %rdi
|
||||
- leaq -8(%rsi, %rdx), %rsi
|
||||
+ leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
+ leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
vmovq (%rdi), %XMM1
|
||||
vmovq (%rsi), %XMM2
|
||||
- VPCMPEQ %XMM1, %XMM2, %k2
|
||||
- kmovw %k2, %eax
|
||||
- subl $XMM_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+ VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(between_16_31):
|
||||
- /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
- VMOVU (%rsi), %XMM2
|
||||
- VPCMPEQ (%rdi), %XMM2, %k2
|
||||
- kmovw %k2, %eax
|
||||
- subl $XMM_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- leaq -16(%rdi, %rdx), %rdi
|
||||
- leaq -16(%rsi, %rdx), %rsi
|
||||
- VMOVU (%rsi), %XMM2
|
||||
- VPCMPEQ (%rdi), %XMM2, %k2
|
||||
- kmovw %k2, %eax
|
||||
- subl $XMM_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(more_8x_vec):
|
||||
- /* More than 8 * VEC. Check the first VEC. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- /* Align the first memory area for aligned loads in the loop.
|
||||
- Compute how much the first memory area is misaligned. */
|
||||
- movq %rdi, %rcx
|
||||
- andl $(VEC_SIZE - 1), %ecx
|
||||
- /* Get the negative of offset for alignment. */
|
||||
- subq $VEC_SIZE, %rcx
|
||||
- /* Adjust the second memory area. */
|
||||
- subq %rcx, %rsi
|
||||
- /* Adjust the first memory area which should be aligned now. */
|
||||
- subq %rcx, %rdi
|
||||
- /* Adjust length. */
|
||||
- addq %rcx, %rdx
|
||||
-
|
||||
-L(loop_4x_vec):
|
||||
- /* Compare 4 * VEC at a time forward. */
|
||||
- VMOVU (%rsi), %YMM1
|
||||
- VPCMPEQ (%rdi), %YMM1, %k1
|
||||
-
|
||||
- VMOVU VEC_SIZE(%rsi), %YMM2
|
||||
- VPCMPEQ VEC_SIZE(%rdi), %YMM2, %k2
|
||||
- kandd %k2, %k1, %k5
|
||||
-
|
||||
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
- VPCMPEQ (VEC_SIZE * 2)(%rdi), %YMM3, %k3
|
||||
- kandd %k3, %k5, %k5
|
||||
-
|
||||
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
- VPCMPEQ (VEC_SIZE * 3)(%rdi), %YMM4, %k4
|
||||
- kandd %k4, %k5, %k5
|
||||
-
|
||||
- kmovd %k5, %eax
|
||||
- cmpl $VEC_MASK, %eax
|
||||
- jne L(4x_vec_end)
|
||||
-
|
||||
- addq $(VEC_SIZE * 4), %rdi
|
||||
- addq $(VEC_SIZE * 4), %rsi
|
||||
-
|
||||
- subq $(VEC_SIZE * 4), %rdx
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- jae L(loop_4x_vec)
|
||||
-
|
||||
- /* Less than 4 * VEC. */
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
- jbe L(last_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
- jbe L(last_2x_vec)
|
||||
-
|
||||
-L(last_4x_vec):
|
||||
- /* From 2 * VEC to 4 * VEC. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
-
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- addq $VEC_SIZE, %rsi
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
+L(between_16_31):
|
||||
+ /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
+ VMOVU (%rsi), %XMM2
|
||||
+ VPCMP $4, (%rdi), %XMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- leaq -(3 * VEC_SIZE)(%rdi, %rdx), %rdi
|
||||
- leaq -(3 * VEC_SIZE)(%rsi, %rdx), %rsi
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
|
||||
- addq $VEC_SIZE, %rdi
|
||||
- addq $VEC_SIZE, %rsi
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMPEQ (%rdi), %YMM2, %k2
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(4x_vec_end):
|
||||
+ VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
|
||||
+ leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
+ leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
+ VPCMP $4, (%rdi), %XMM2, %k1
|
||||
kmovd %k1, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec)
|
||||
- kmovd %k2, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec_x1)
|
||||
- kmovd %k3, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- jnz L(first_vec_x2)
|
||||
- kmovd %k4, %eax
|
||||
- subl $VEC_MASK, %eax
|
||||
- tzcntl %eax, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (VEC_SIZE * 3)(%rdi, %rcx, 4), %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, 4), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
-# endif
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(first_vec_x1):
|
||||
- tzcntl %eax, %ecx
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl VEC_SIZE(%rdi, %rcx, 4), %edx
|
||||
- cmpl VEC_SIZE(%rsi, %rcx, 4), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
-# else
|
||||
- movzbl VEC_SIZE(%rdi, %rcx), %eax
|
||||
- movzbl VEC_SIZE(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
-# endif
|
||||
+ .p2align 4
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movl (%rdi), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi), %ecx
|
||||
+ je L(zero)
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
ret
|
||||
+# else
|
||||
|
||||
.p2align 4
|
||||
-L(first_vec_x2):
|
||||
- tzcntl %eax, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- xorl %eax, %eax
|
||||
- movl (VEC_SIZE * 2)(%rdi, %rcx, 4), %edx
|
||||
- cmpl (VEC_SIZE * 2)(%rsi, %rcx, 4), %edx
|
||||
- jmp L(wmemcmp_return)
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 2)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 2)(%rsi, %rcx), %edx
|
||||
- sub %edx, %eax
|
||||
-# endif
|
||||
+L(between_4_7):
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
||||
+ */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ jz L(zero_4_7)
|
||||
+ sbbl %eax, %eax
|
||||
+ orl $1, %eax
|
||||
+L(zero_4_7):
|
||||
ret
|
||||
+# endif
|
||||
+
|
||||
END (MEMCMP)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,104 +0,0 @@
|
||||
From 6abf27980a947f9b6e514d6b33b83059d39566ae Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 20 May 2021 13:13:51 -0400
|
||||
Subject: [PATCH] x86: Improve memset-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. This commit makes a few small improvements to
|
||||
memset-vec-unaligned-erms.S. The changes are 1) only aligning to 64
|
||||
instead of 128. Either alignment will perform equally well in a loop
|
||||
and 128 just increases the odds of having to do an extra iteration
|
||||
which can be significant overhead for small values. 2) Align some
|
||||
targets and the loop. 3) Remove an ALU from the alignment process. 4)
|
||||
Reorder the last 4x VEC so that they are stored after the loop. 5)
|
||||
Move the condition for leq 8x VEC to before the alignment
|
||||
process. test-memset and test-wmemset are both passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 50 +++++++++++--------
|
||||
1 file changed, 28 insertions(+), 22 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index f877ac9d..909c33f6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -173,17 +173,22 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+ .p2align 4
|
||||
L(stosb_more_2x_vec):
|
||||
cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
||||
ja L(stosb)
|
||||
+#else
|
||||
+ .p2align 4
|
||||
#endif
|
||||
L(more_2x_vec):
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- ja L(loop_start)
|
||||
+ /* Stores to first 2x VEC before cmp as any path forward will
|
||||
+ require it. */
|
||||
VMOVU %VEC(0), (%rdi)
|
||||
VMOVU %VEC(0), VEC_SIZE(%rdi)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ ja L(loop_start)
|
||||
VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
+ VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
L(return):
|
||||
#if VEC_SIZE > 16
|
||||
ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
@@ -192,28 +197,29 @@ L(return):
|
||||
#endif
|
||||
|
||||
L(loop_start):
|
||||
- leaq (VEC_SIZE * 4)(%rdi), %rcx
|
||||
- VMOVU %VEC(0), (%rdi)
|
||||
- andq $-(VEC_SIZE * 4), %rcx
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
- VMOVU %VEC(0), VEC_SIZE(%rdi)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rdi,%rdx)
|
||||
VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi,%rdx)
|
||||
- addq %rdi, %rdx
|
||||
- andq $-(VEC_SIZE * 4), %rdx
|
||||
- cmpq %rdx, %rcx
|
||||
- je L(return)
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+ jbe L(loop_end)
|
||||
+ andq $-(VEC_SIZE * 2), %rdi
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
|
||||
+ .p2align 4
|
||||
L(loop):
|
||||
- VMOVA %VEC(0), (%rcx)
|
||||
- VMOVA %VEC(0), VEC_SIZE(%rcx)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rcx)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rcx)
|
||||
- addq $(VEC_SIZE * 4), %rcx
|
||||
- cmpq %rcx, %rdx
|
||||
- jne L(loop)
|
||||
+ VMOVA %VEC(0), (%rdi)
|
||||
+ VMOVA %VEC(0), VEC_SIZE(%rdi)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
+ subq $-(VEC_SIZE * 4), %rdi
|
||||
+ cmpq %rcx, %rdi
|
||||
+ jb L(loop)
|
||||
+L(loop_end):
|
||||
+ /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
|
||||
+ rdx as length is also unchanged. */
|
||||
+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
|
||||
+ VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
|
||||
+ VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
|
||||
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
VZEROUPPER_SHORT_RETURN
|
||||
|
||||
.p2align 4
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,84 +0,0 @@
|
||||
From 1b992204f68af851e905c16016756fd4421e1934 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 23 May 2021 19:43:24 -0400
|
||||
Subject: [PATCH] x86: Improve memmove-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This patch changes the condition for copy 4x VEC so that if length is
|
||||
exactly equal to 4 * VEC_SIZE it will use the 4x VEC case instead of
|
||||
8x VEC case.
|
||||
|
||||
Results For Skylake memcpy-avx2-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 0 , 9.137 , 6.873 , New , 75.22
|
||||
128 , 7 , 0 , 12.933 , 7.732 , New , 59.79
|
||||
128 , 0 , 7 , 11.852 , 6.76 , New , 57.04
|
||||
128 , 7 , 7 , 12.587 , 6.808 , New , 54.09
|
||||
|
||||
Results For Icelake memcpy-evex-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 0 , 9.963 , 5.416 , New , 54.36
|
||||
128 , 7 , 0 , 16.467 , 8.061 , New , 48.95
|
||||
128 , 0 , 7 , 14.388 , 7.644 , New , 53.13
|
||||
128 , 7 , 7 , 14.546 , 7.642 , New , 52.54
|
||||
|
||||
Results For Tigerlake memcpy-evex-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 0 , 8.979 , 4.95 , New , 55.13
|
||||
128 , 7 , 0 , 14.245 , 7.122 , New , 50.0
|
||||
128 , 0 , 7 , 12.668 , 6.675 , New , 52.69
|
||||
128 , 7 , 7 , 13.042 , 6.802 , New , 52.15
|
||||
|
||||
Results For Skylake memmove-avx2-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 32 , 6.181 , 5.691 , New , 92.07
|
||||
128 , 32 , 0 , 6.165 , 5.752 , New , 93.3
|
||||
128 , 0 , 7 , 13.923 , 9.37 , New , 67.3
|
||||
128 , 7 , 0 , 12.049 , 10.182 , New , 84.5
|
||||
|
||||
Results For Icelake memmove-evex-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 32 , 5.479 , 4.889 , New , 89.23
|
||||
128 , 32 , 0 , 5.127 , 4.911 , New , 95.79
|
||||
128 , 0 , 7 , 18.885 , 13.547 , New , 71.73
|
||||
128 , 7 , 0 , 15.565 , 14.436 , New , 92.75
|
||||
|
||||
Results For Tigerlake memmove-evex-erms
|
||||
size, al1 , al2 , Cur T , New T , Win , New / Cur
|
||||
128 , 0 , 32 , 5.275 , 4.815 , New , 91.28
|
||||
128 , 32 , 0 , 5.376 , 4.565 , New , 84.91
|
||||
128 , 0 , 7 , 19.426 , 14.273 , New , 73.47
|
||||
128 , 7 , 0 , 15.924 , 14.951 , New , 93.89
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S | 6 +++---
|
||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
index 3e2dd6bc..572cef04 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
|
||||
@@ -417,8 +417,8 @@ L(more_2x_vec):
|
||||
cmpq $(VEC_SIZE * 8), %rdx
|
||||
ja L(more_8x_vec)
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
- jb L(last_4x_vec)
|
||||
- /* Copy from 4 * VEC to 8 * VEC, inclusively. */
|
||||
+ jbe L(last_4x_vec)
|
||||
+ /* Copy from 4 * VEC + 1 to 8 * VEC, inclusively. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)
|
||||
@@ -437,7 +437,7 @@ L(more_2x_vec):
|
||||
VMOVU %VEC(7), -(VEC_SIZE * 4)(%rdi,%rdx)
|
||||
VZEROUPPER_RETURN
|
||||
L(last_4x_vec):
|
||||
- /* Copy from 2 * VEC to 4 * VEC. */
|
||||
+ /* Copy from 2 * VEC + 1 to 4 * VEC, inclusively. */
|
||||
VMOVU (%rsi), %VEC(0)
|
||||
VMOVU VEC_SIZE(%rsi), %VEC(1)
|
||||
VMOVU -VEC_SIZE(%rsi,%rdx), %VEC(2)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,55 +0,0 @@
|
||||
From 08cbcd4dbc686bb38ec3093aff2f919fbff5ec17 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Wed, 23 Jun 2021 19:19:34 -0400
|
||||
Subject: [PATCH] x86: Remove unnecessary overflow check from wcsnlen-sse4_1.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug. The way wcsnlen will check if near the end of maxlen
|
||||
is the following macro:
|
||||
|
||||
mov %r11, %rsi; \
|
||||
subq %rax, %rsi; \
|
||||
andq $-64, %rax; \
|
||||
testq $-64, %rsi; \
|
||||
je L(strnlen_ret)
|
||||
|
||||
Which works independently of s + maxlen overflowing. So the
|
||||
second overflow check is unnecessary for correctness and
|
||||
just extra overhead in the common no overflow case.
|
||||
|
||||
test-strlen.c, test-wcslen.c, test-strnlen.c and test-wcsnlen.c are
|
||||
all passing
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-vec.S | 7 -------
|
||||
1 file changed, 7 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-vec.S b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
index 439e486a..b7657282 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-vec.S
|
||||
@@ -71,19 +71,12 @@ L(n_nonzero):
|
||||
suffice. */
|
||||
mov %RSI_LP, %R10_LP
|
||||
sar $62, %R10_LP
|
||||
- test %R10_LP, %R10_LP
|
||||
jnz __wcslen_sse4_1
|
||||
sal $2, %RSI_LP
|
||||
# endif
|
||||
|
||||
-
|
||||
/* Initialize long lived registers. */
|
||||
-
|
||||
add %RDI_LP, %RSI_LP
|
||||
-# ifdef AS_WCSLEN
|
||||
-/* Check for overflow again from s + maxlen * sizeof(wchar_t). */
|
||||
- jbe __wcslen_sse4_1
|
||||
-# endif
|
||||
mov %RSI_LP, %R10_LP
|
||||
and $-64, %R10_LP
|
||||
mov %RSI_LP, %R11_LP
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,290 +0,0 @@
|
||||
From 82d0b4a4d76db554eb6757acb790fcea30b19965 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:32:24 -0800
|
||||
Subject: [PATCH] x86-64 memset/wmemset: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes memset/wmemset for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S: Use
|
||||
RDX_LP for length. Clear the upper 32 bits of RDX register.
|
||||
* sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-wmemset.
|
||||
* sysdeps/x86_64/x32/tst-size_t-memset.c: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wmemset.c: Likewise.
|
||||
---
|
||||
.../multiarch/memset-avx512-no-vzeroupper.S | 6 +-
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 34 +++++----
|
||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-memset.c | 73 +++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wmemset.c | 20 +++++
|
||||
5 files changed, 121 insertions(+), 16 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-memset.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
||||
index 689cc119..99e25519 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-no-vzeroupper.S
|
||||
@@ -29,12 +29,16 @@
|
||||
.section .text.avx512,"ax",@progbits
|
||||
#if defined PIC
|
||||
ENTRY (MEMSET_CHK)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (MEMSET_CHK)
|
||||
#endif
|
||||
|
||||
ENTRY (MEMSET)
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
+# endif
|
||||
vpxor %xmm0, %xmm0, %xmm0
|
||||
vmovd %esi, %xmm1
|
||||
lea (%rdi, %rdx), %rsi
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 270a1d49..9a0fd818 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -65,8 +65,8 @@
|
||||
.section SECTION(.text),"ax",@progbits
|
||||
#if VEC_SIZE == 16 && IS_IN (libc)
|
||||
ENTRY (__bzero)
|
||||
- movq %rdi, %rax /* Set return value. */
|
||||
- movq %rsi, %rdx /* Set n. */
|
||||
+ mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
+ mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
pxor %xmm0, %xmm0
|
||||
jmp L(entry_from_bzero)
|
||||
END (__bzero)
|
||||
@@ -76,13 +76,13 @@ weak_alias (__bzero, bzero)
|
||||
#if IS_IN (libc)
|
||||
# if defined SHARED
|
||||
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||||
# endif
|
||||
|
||||
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
- shlq $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
jmp L(entry_from_bzero)
|
||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
@@ -90,13 +90,17 @@ END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
+# endif
|
||||
L(entry_from_bzero):
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
jb L(less_vec)
|
||||
@@ -112,14 +116,14 @@ END (MEMSET_SYMBOL (__memset, unaligned))
|
||||
|
||||
# if VEC_SIZE == 16
|
||||
ENTRY (__memset_chk_erms)
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END (__memset_chk_erms)
|
||||
|
||||
/* Only used to measure performance of REP STOSB. */
|
||||
ENTRY (__memset_erms)
|
||||
/* Skip zero length. */
|
||||
- testq %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
jnz L(stosb)
|
||||
movq %rdi, %rax
|
||||
ret
|
||||
@@ -131,11 +135,11 @@ ENTRY (MEMSET_SYMBOL (__memset, erms))
|
||||
L(stosb):
|
||||
/* Issue vzeroupper before rep stosb. */
|
||||
VZEROUPPER
|
||||
- movq %rdx, %rcx
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
movzbl %sil, %eax
|
||||
- movq %rdi, %rdx
|
||||
+ mov %RDI_LP, %RDX_LP
|
||||
rep stosb
|
||||
- movq %rdx, %rax
|
||||
+ mov %RDX_LP, %RAX_LP
|
||||
ret
|
||||
# if VEC_SIZE == 16
|
||||
END (__memset_erms)
|
||||
@@ -145,16 +149,20 @@ END (MEMSET_SYMBOL (__memset, erms))
|
||||
|
||||
# if defined SHARED && IS_IN (libc)
|
||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
- cmpq %rdx, %rcx
|
||||
+ cmp %RDX_LP, %RCX_LP
|
||||
jb HIDDEN_JUMPTARGET (__chk_fail)
|
||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
- cmpq $VEC_SIZE, %rdx
|
||||
+# ifdef __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ mov %edx, %edx
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
- cmpq $(VEC_SIZE * 2), %rdx
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(stosb_more_2x_vec)
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index e99dbd7c..98bd9ae9 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -7,9 +7,9 @@ endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
||||
- tst-size_t-memrchr
|
||||
+ tst-size_t-memrchr tst-size_t-memset
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp
|
||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-memset.c b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
||||
new file mode 100644
|
||||
index 00000000..2c367af6
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-memset.c
|
||||
@@ -0,0 +1,73 @@
|
||||
+/* Test memset with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# define TEST_NAME "wmemset"
|
||||
+#else
|
||||
+# define TEST_NAME "memset"
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# include <wchar.h>
|
||||
+# define MEMSET wmemset
|
||||
+# define CHAR wchar_t
|
||||
+#else
|
||||
+# define MEMSET memset
|
||||
+# define CHAR char
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+IMPL (MEMSET, 1)
|
||||
+
|
||||
+typedef CHAR *(*proto_t) (CHAR *, int, size_t);
|
||||
+
|
||||
+static void *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_memset (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, (uintptr_t) b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ CHAR ch = 0x23;
|
||||
+ parameter_t src = { { page_size / sizeof (CHAR) }, buf2 };
|
||||
+ parameter_t c = { { 0 }, (void *) (uintptr_t) ch };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ c.fn = impl->fn;
|
||||
+ CHAR *p = (CHAR *) do_memset (src, c);
|
||||
+ size_t i;
|
||||
+ for (i = 0; i < src.len; i++)
|
||||
+ if (p[i] != ch)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s", impl->name);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wmemset.c b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
||||
new file mode 100644
|
||||
index 00000000..955eb488
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wmemset.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wmemset with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-memset.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,43 +0,0 @@
|
||||
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
|
||||
Author: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com> 2021-05-23 21:43:10
|
||||
Committer: H.J. Lu <hjl.tools@gmail.com> 2021-06-27 10:56:57
|
||||
Parent: 2c16cb88a6e5ace0fb7cedca86860ea7bde522a7 (Linux: Move timer helper routines from librt to libc)
|
||||
Child: 1683249d17e14827b6579529742eb895027dfa84 (x86_64: roundeven with sse4.1 support)
|
||||
Branches: master, remotes/origin/master and many more (41)
|
||||
Follows: glibc-2.33.9000
|
||||
Precedes: glibc-2.34
|
||||
|
||||
math: redirect roundeven function
|
||||
|
||||
This patch redirects the roundeven function for further changes.
|
||||
|
||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
|
||||
Conflicts:
|
||||
*
|
||||
(rewritten for older branch)
|
||||
|
||||
diff --git a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
||||
index 7bbbb2dc..8728d0f2 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
||||
+++ b/sysdeps/ieee754/dbl-64/wordsize-64/s_roundeven.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
#include <libm-alias-double.h>
|
||||
@@ -67,5 +68,6 @@ __roundeven (double x)
|
||||
INSERT_WORDS64 (x, ix);
|
||||
return x;
|
||||
}
|
||||
-hidden_def (__roundeven)
|
||||
+#ifndef __roundeven
|
||||
libm_alias_double (__roundeven, roundeven)
|
||||
+#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,118 +0,0 @@
|
||||
From 447954a206837b5f153869cfeeeab44631c3fac9 Mon Sep 17 00:00:00 2001
|
||||
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
||||
Date: Mon, 24 May 2021 09:43:10 +0800
|
||||
Subject: [PATCH] math: redirect roundeven function
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This patch redirects the roundeven function for further changes.
|
||||
|
||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
include/math.h | 3 ++-
|
||||
sysdeps/ieee754/dbl-64/s_roundeven.c | 4 +++-
|
||||
sysdeps/ieee754/float128/s_roundevenf128.c | 1 +
|
||||
sysdeps/ieee754/flt-32/s_roundevenf.c | 3 +++
|
||||
sysdeps/ieee754/ldbl-128/s_roundevenl.c | 1 +
|
||||
sysdeps/ieee754/ldbl-96/s_roundevenl.c | 1 +
|
||||
6 files changed, 11 insertions(+), 2 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
include/math.h
|
||||
(missing MATH_REDIRECT macros)
|
||||
|
||||
diff --git a/include/math.h b/include/math.h
|
||||
index e21d34b8..1f9f9a54 100644
|
||||
--- a/include/math.h
|
||||
+++ b/include/math.h
|
||||
@@ -38,7 +38,6 @@ libm_hidden_proto (__issignaling)
|
||||
libm_hidden_proto (__issignalingf)
|
||||
libm_hidden_proto (__exp)
|
||||
libm_hidden_proto (__expf)
|
||||
-libm_hidden_proto (__roundeven)
|
||||
|
||||
# ifndef __NO_LONG_DOUBLE_MATH
|
||||
libm_hidden_proto (__fpclassifyl)
|
||||
@@ -56,6 +55,8 @@ libm_hidden_proto (__expm1f128)
|
||||
|
||||
# if !(defined __FINITE_MATH_ONLY__ && __FINITE_MATH_ONLY__ > 0)
|
||||
# ifndef NO_MATH_REDIRECT
|
||||
+float (roundevenf) (float) asm ("__roundevenf");
|
||||
+double (roundeven) (double) asm ("__roundeven");
|
||||
/* Declare sqrt for use within GLIBC. Compilers typically inline sqrt as a
|
||||
single instruction. Use an asm to avoid use of PLTs if it doesn't. */
|
||||
float (sqrtf) (float) asm ("__ieee754_sqrtf");
|
||||
diff --git a/sysdeps/ieee754/dbl-64/s_roundeven.c b/sysdeps/ieee754/dbl-64/s_roundeven.c
|
||||
index 1438e81d..61962184 100644
|
||||
--- a/sysdeps/ieee754/dbl-64/s_roundeven.c
|
||||
+++ b/sysdeps/ieee754/dbl-64/s_roundeven.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
#include <libm-alias-double.h>
|
||||
@@ -101,5 +102,6 @@ __roundeven (double x)
|
||||
INSERT_WORDS (x, hx, lx);
|
||||
return x;
|
||||
}
|
||||
-hidden_def (__roundeven)
|
||||
+#ifndef __roundeven
|
||||
libm_alias_double (__roundeven, roundeven)
|
||||
+#endif
|
||||
diff --git a/sysdeps/ieee754/float128/s_roundevenf128.c b/sysdeps/ieee754/float128/s_roundevenf128.c
|
||||
index 5a9b3f39..e0faf727 100644
|
||||
--- a/sysdeps/ieee754/float128/s_roundevenf128.c
|
||||
+++ b/sysdeps/ieee754/float128/s_roundevenf128.c
|
||||
@@ -1,2 +1,3 @@
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <float128_private.h>
|
||||
#include "../ldbl-128/s_roundevenl.c"
|
||||
diff --git a/sysdeps/ieee754/flt-32/s_roundevenf.c b/sysdeps/ieee754/flt-32/s_roundevenf.c
|
||||
index 90f991d5..a661875e 100644
|
||||
--- a/sysdeps/ieee754/flt-32/s_roundevenf.c
|
||||
+++ b/sysdeps/ieee754/flt-32/s_roundevenf.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
#include <libm-alias-float.h>
|
||||
@@ -67,4 +68,6 @@ __roundevenf (float x)
|
||||
SET_FLOAT_WORD (x, ix);
|
||||
return x;
|
||||
}
|
||||
+#ifndef __roundevenf
|
||||
libm_alias_float (__roundeven, roundeven)
|
||||
+#endif
|
||||
diff --git a/sysdeps/ieee754/ldbl-128/s_roundevenl.c b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
||||
index 5fc59af4..b9375b6c 100644
|
||||
--- a/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
||||
+++ b/sysdeps/ieee754/ldbl-128/s_roundevenl.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
#include <libm-alias-ldouble.h>
|
||||
diff --git a/sysdeps/ieee754/ldbl-96/s_roundevenl.c b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
||||
index be2e4fa4..65031ab7 100644
|
||||
--- a/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
||||
+++ b/sysdeps/ieee754/ldbl-96/s_roundevenl.c
|
||||
@@ -17,6 +17,7 @@
|
||||
License along with the GNU C Library; if not, see
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
+#define NO_MATH_REDIRECT
|
||||
#include <math.h>
|
||||
#include <math_private.h>
|
||||
#include <libm-alias-ldouble.h>
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,242 +0,0 @@
|
||||
From 1683249d17e14827b6579529742eb895027dfa84 Mon Sep 17 00:00:00 2001
|
||||
From: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
||||
Date: Mon, 24 May 2021 09:43:11 +0800
|
||||
Subject: [PATCH] x86_64: roundeven with sse4.1 support
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This patch adds support for the sse4.1 hardware floating point
|
||||
roundeven.
|
||||
|
||||
Here are some benchmark results on my systems:
|
||||
|
||||
=AMD Ryzen 9 3900X 12-Core Processor=
|
||||
|
||||
* benchmark result before this commit
|
||||
| | roundeven | roundevenf |
|
||||
|------------|--------------|--------------|
|
||||
| duration | 3.75587e+09 | 3.75114e+09 |
|
||||
| iterations | 3.93053e+08 | 4.35402e+08 |
|
||||
| max | 52.592 | 58.71 |
|
||||
| min | 7.98 | 7.22 |
|
||||
| mean | 9.55563 | 8.61535 |
|
||||
|
||||
* benchmark result after this commit
|
||||
| | roundeven | roundevenf |
|
||||
|------------|---------------|--------------|
|
||||
| duration | 3.73815e+09 | 3.73738e+09 |
|
||||
| iterations | 5.82692e+08 | 5.91498e+08 |
|
||||
| max | 56.468 | 51.642 |
|
||||
| min | 6.27 | 6.156 |
|
||||
| mean | 6.41532 | 6.3185 |
|
||||
|
||||
=Intel(R) Pentium(R) CPU D1508 @ 2.20GHz=
|
||||
|
||||
* benchmark result before this commit
|
||||
| | roundeven | roundevenf |
|
||||
|------------|--------------|--------------|
|
||||
| duration | 2.18208e+09 | 2.18258e+09 |
|
||||
| iterations | 2.39932e+08 | 2.46924e+08 |
|
||||
| max | 96.378 | 98.035 |
|
||||
| min | 6.776 | 5.94 |
|
||||
| mean | 9.09456 | 8.83907 |
|
||||
|
||||
* benchmark result after this commit
|
||||
| | roundeven | roundevenf |
|
||||
|------------|--------------|--------------|
|
||||
| duration | 2.17415e+09 | 2.17005e+09 |
|
||||
| iterations | 3.56193e+08 | 4.09824e+08 |
|
||||
| max | 51.693 | 97.192 |
|
||||
| min | 5.926 | 5.093 |
|
||||
| mean | 6.10385 | 5.29507 |
|
||||
|
||||
Signed-off-by: Shen-Ta Hsieh <ibmibmibm.tw@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/fpu/multiarch/Makefile | 5 +--
|
||||
sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c | 2 ++
|
||||
.../x86_64/fpu/multiarch/s_roundeven-sse4_1.S | 24 ++++++++++++++
|
||||
sysdeps/x86_64/fpu/multiarch/s_roundeven.c | 31 +++++++++++++++++++
|
||||
sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c | 3 ++
|
||||
.../fpu/multiarch/s_roundevenf-sse4_1.S | 24 ++++++++++++++
|
||||
sysdeps/x86_64/fpu/multiarch/s_roundevenf.c | 31 +++++++++++++++++++
|
||||
7 files changed, 118 insertions(+), 2 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
||||
create mode 100644 sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
||||
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
index 9f387248..6ddd1c01 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
|
||||
@@ -1,11 +1,12 @@
|
||||
ifeq ($(subdir),math)
|
||||
libm-sysdep_routines += s_floor-c s_ceil-c s_floorf-c s_ceilf-c \
|
||||
s_rint-c s_rintf-c s_nearbyint-c s_nearbyintf-c \
|
||||
- s_trunc-c s_truncf-c
|
||||
+ s_roundeven-c s_roundevenf-c s_trunc-c s_truncf-c
|
||||
|
||||
libm-sysdep_routines += s_ceil-sse4_1 s_ceilf-sse4_1 s_floor-sse4_1 \
|
||||
s_floorf-sse4_1 s_nearbyint-sse4_1 \
|
||||
- s_nearbyintf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
|
||||
+ s_nearbyintf-sse4_1 s_roundeven-sse4_1 \
|
||||
+ s_roundevenf-sse4_1 s_rint-sse4_1 s_rintf-sse4_1 \
|
||||
s_trunc-sse4_1 s_truncf-sse4_1
|
||||
|
||||
libm-sysdep_routines += e_exp-fma e_log-fma e_pow-fma s_atan-fma \
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
||||
new file mode 100644
|
||||
index 00000000..c7be43cb
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-c.c
|
||||
@@ -0,0 +1,2 @@
|
||||
+#define __roundeven __roundeven_c
|
||||
+#include <sysdeps/ieee754/dbl-64/s_roundeven.c>
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
||||
new file mode 100644
|
||||
index 00000000..6ae8f6b1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven-sse4_1.S
|
||||
@@ -0,0 +1,24 @@
|
||||
+/* Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+
|
||||
+ .section .text.sse4.1,"ax",@progbits
|
||||
+ENTRY(__roundeven_sse41)
|
||||
+ roundsd $8, %xmm0, %xmm0
|
||||
+ ret
|
||||
+END(__roundeven_sse41)
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundeven.c b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
||||
new file mode 100644
|
||||
index 00000000..d92eda65
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundeven.c
|
||||
@@ -0,0 +1,31 @@
|
||||
+/* Multiple versions of __roundeven.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <libm-alias-double.h>
|
||||
+
|
||||
+#define roundeven __redirect_roundeven
|
||||
+#define __roundeven __redirect___roundeven
|
||||
+#include <math.h>
|
||||
+#undef roundeven
|
||||
+#undef __roundeven
|
||||
+
|
||||
+#define SYMBOL_NAME roundeven
|
||||
+#include "ifunc-sse4_1.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_roundeven, __roundeven, IFUNC_SELECTOR ());
|
||||
+libm_alias_double (__roundeven, roundeven)
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
||||
new file mode 100644
|
||||
index 00000000..72a6e7d1
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-c.c
|
||||
@@ -0,0 +1,3 @@
|
||||
+#undef __roundevenf
|
||||
+#define __roundevenf __roundevenf_c
|
||||
+#include <sysdeps/ieee754/flt-32/s_roundevenf.c>
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
||||
new file mode 100644
|
||||
index 00000000..a76e1080
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf-sse4_1.S
|
||||
@@ -0,0 +1,24 @@
|
||||
+/* Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <sysdep.h>
|
||||
+
|
||||
+ .section .text.sse4.1,"ax",@progbits
|
||||
+ENTRY(__roundevenf_sse41)
|
||||
+ roundss $8, %xmm0, %xmm0
|
||||
+ ret
|
||||
+END(__roundevenf_sse41)
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
||||
new file mode 100644
|
||||
index 00000000..2ee196e6
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/s_roundevenf.c
|
||||
@@ -0,0 +1,31 @@
|
||||
+/* Multiple versions of __roundevenf.
|
||||
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#include <libm-alias-float.h>
|
||||
+
|
||||
+#define roundevenf __redirect_roundevenf
|
||||
+#define __roundevenf __redirect___roundevenf
|
||||
+#include <math.h>
|
||||
+#undef roundevenf
|
||||
+#undef __roundevenf
|
||||
+
|
||||
+#define SYMBOL_NAME roundevenf
|
||||
+#include "ifunc-sse4_1.h"
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect_roundevenf, __roundevenf, IFUNC_SELECTOR ());
|
||||
+libm_alias_float (__roundeven, roundeven)
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,41 +0,0 @@
|
||||
From 7e08db3359c86c94918feb33a1182cd0ff3bb10b Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 9 Jan 2022 16:02:28 -0600
|
||||
Subject: [PATCH] x86: Fix __wcsncmp_evex in strcmp-evex.S [BZ# 28755]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Fixes [BZ# 28755] for wcsncmp by redirecting length >= 2^56 to
|
||||
__wcscmp_evex. For x86_64 this covers the entire address range so any
|
||||
length larger could not possibly be used to bound `s1` or `s2`.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 10 ++++++++++
|
||||
1 file changed, 10 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index 459eeed0..d5aa6daa 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -97,6 +97,16 @@ ENTRY (STRCMP)
|
||||
je L(char0)
|
||||
jb L(zero)
|
||||
# ifdef USE_AS_WCSCMP
|
||||
+# ifndef __ILP32__
|
||||
+ movq %rdx, %rcx
|
||||
+ /* Check if length could overflow when multiplied by
|
||||
+ sizeof(wchar_t). Checking top 8 bits will cover all potential
|
||||
+ overflow cases as well as redirect cases where its impossible to
|
||||
+ length to bound a valid memory region. In these cases just use
|
||||
+ 'wcscmp'. */
|
||||
+ shrq $56, %rcx
|
||||
+ jnz __wcscmp_evex
|
||||
+# endif
|
||||
/* Convert units: from wide to byte char. */
|
||||
shl $2, %RDX_LP
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,268 +0,0 @@
|
||||
From 78c9ec9000f873abe7a15a91b87080a2e4308260 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 20 Aug 2021 06:42:24 -0700
|
||||
Subject: [PATCH] x86-64: Optimize load of all bits set into ZMM register [BZ
|
||||
#28252]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Optimize loads of all bits set into ZMM register in AVX512 SVML codes
|
||||
by replacing
|
||||
|
||||
vpbroadcastq .L_2il0floatpacket.16(%rip), %zmmX
|
||||
|
||||
and
|
||||
|
||||
vmovups .L_2il0floatpacket.13(%rip), %zmmX
|
||||
|
||||
with
|
||||
vpternlogd $0xff, %zmmX, %zmmX, %zmmX
|
||||
|
||||
This fixes BZ #28252.
|
||||
---
|
||||
.../x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_d_log8_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S | 7 +------
|
||||
.../fpu/multiarch/svml_d_sincos8_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S | 12 ++----------
|
||||
.../fpu/multiarch/svml_s_sincosf16_core_avx512.S | 7 +------
|
||||
.../x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S | 7 +------
|
||||
10 files changed, 11 insertions(+), 64 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
index 24e3b363..07dfed85 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_cos8_core_avx512.S
|
||||
@@ -265,7 +265,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
vmovaps %zmm0, %zmm8
|
||||
|
||||
/* Check for large arguments path */
|
||||
- vpbroadcastq .L_2il0floatpacket.16(%rip), %zmm2
|
||||
+ vpternlogd $0xff, %zmm2, %zmm2, %zmm2
|
||||
|
||||
/*
|
||||
ARGUMENT RANGE REDUCTION:
|
||||
@@ -456,8 +456,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_cos
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_cos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.16:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.16,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
index ae8af8d8..ddb60e5b 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_log8_core_avx512.S
|
||||
@@ -274,7 +274,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
|
||||
/* preserve mantissa, set input exponent to 2^(-10) */
|
||||
vpternlogq $248, _ExpMask(%rax), %zmm3, %zmm2
|
||||
- vpbroadcastq .L_2il0floatpacket.12(%rip), %zmm1
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm1
|
||||
vpsrlq $32, %zmm4, %zmm6
|
||||
|
||||
/* reciprocal approximation good to at least 11 bits */
|
||||
@@ -461,8 +461,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_log
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_log_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.12:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.12,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
index 2d4b14fd..529c454a 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sin8_core_avx512.S
|
||||
@@ -261,7 +261,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_d_trig_data@GOTPCREL(%rip), %rax
|
||||
- vpbroadcastq .L_2il0floatpacket.14(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm1, %zmm1, %zmm14
|
||||
vmovups __dAbsMask(%rax), %zmm7
|
||||
vmovups __dInvPI(%rax), %zmm2
|
||||
vmovups __dRShifter(%rax), %zmm1
|
||||
@@ -458,8 +458,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN4v_sin
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN8v_sin_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.14:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.14,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
index 2df626c0..e501a53a 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_d_sincos8_core_avx512.S
|
||||
@@ -430,7 +430,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN4vl8l8_sincos
|
||||
|
||||
/* SinPoly = SinR*SinPoly */
|
||||
vfmadd213pd %zmm5, %zmm5, %zmm4
|
||||
- vpbroadcastq .L_2il0floatpacket.15(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
|
||||
/* Update Cos result's sign */
|
||||
vxorpd %zmm2, %zmm1, %zmm1
|
||||
@@ -741,8 +741,3 @@ END (_ZGVeN8vvv_sincos_knl)
|
||||
ENTRY (_ZGVeN8vvv_sincos_skx)
|
||||
WRAPPER_AVX512_vvv_vl8l8 _ZGVeN8vl8l8_sincos_skx
|
||||
END (_ZGVeN8vvv_sincos_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.15:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.15,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
index 6ea1137b..377af394 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_cosf16_core_avx512.S
|
||||
@@ -278,7 +278,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
X = X - Y*PI1 - Y*PI2 - Y*PI3
|
||||
*/
|
||||
vmovaps %zmm0, %zmm6
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm12
|
||||
+ vpternlogd $0xff, %zmm12, %zmm12, %zmm12
|
||||
vmovups __sRShifter(%rax), %zmm3
|
||||
vmovups __sPI1_FMA(%rax), %zmm5
|
||||
vmovups __sA9_FMA(%rax), %zmm9
|
||||
@@ -453,8 +453,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_cosf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_cosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
index 89ba0df2..46f33d46 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_expf16_core_avx512.S
|
||||
@@ -264,7 +264,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
vmovaps %zmm0, %zmm7
|
||||
|
||||
/* compare against threshold */
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm3
|
||||
+ vpternlogd $0xff, %zmm3, %zmm3, %zmm3
|
||||
vmovups __sInvLn2(%rax), %zmm4
|
||||
vmovups __sShifter(%rax), %zmm1
|
||||
vmovups __sLn2hi(%rax), %zmm6
|
||||
@@ -440,8 +440,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_expf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_expf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
index 4cf0a96f..9e254956 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_logf16_core_avx512.S
|
||||
@@ -235,7 +235,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
andq $-64, %rsp
|
||||
subq $1280, %rsp
|
||||
movq __svml_slog_data@GOTPCREL(%rip), %rax
|
||||
- vmovups .L_2il0floatpacket.7(%rip), %zmm6
|
||||
+ vpternlogd $0xff, %zmm6, %zmm6, %zmm6
|
||||
vmovups _iBrkValue(%rax), %zmm4
|
||||
vmovups _sPoly_7(%rax), %zmm8
|
||||
|
||||
@@ -409,8 +409,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_logf
|
||||
|
||||
#endif
|
||||
END (_ZGVeN16v_logf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.7:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.7,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
index bdcd50af..e8331ba1 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_powf16_core_avx512.S
|
||||
@@ -385,7 +385,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpsrlq $32, %zmm3, %zmm2
|
||||
vpmovqd %zmm2, %ymm11
|
||||
vcvtps2pd %ymm14, %zmm13
|
||||
- vmovups .L_2il0floatpacket.23(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovaps %zmm14, %zmm26
|
||||
vpandd _ABSMASK(%rax), %zmm1, %zmm8
|
||||
vpcmpd $1, _INF(%rax), %zmm8, %k2
|
||||
@@ -427,7 +427,7 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
vpmovqd %zmm11, %ymm5
|
||||
vpxord %zmm10, %zmm10, %zmm10
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm4), %zmm10{%k3}
|
||||
- vpbroadcastq .L_2il0floatpacket.24(%rip), %zmm4
|
||||
+ vpternlogd $0xff, %zmm4, %zmm4, %zmm4
|
||||
vpxord %zmm11, %zmm11, %zmm11
|
||||
vcvtdq2pd %ymm7, %zmm7
|
||||
vgatherdpd _Log2Rcp_lookup(%rax,%ymm5), %zmm11{%k1}
|
||||
@@ -643,11 +643,3 @@ WRAPPER_IMPL_AVX512_ff _ZGVdN8vv_powf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16vv_powf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.23:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.23,@object
|
||||
-.L_2il0floatpacket.24:
|
||||
- .long 0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.24,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
index 5fa4bc41..1f46f334 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sincosf16_core_avx512.S
|
||||
@@ -317,7 +317,7 @@ WRAPPER_IMPL_AVX512_fFF _ZGVdN8vvv_sincosf
|
||||
|
||||
/* Result sign calculations */
|
||||
vpternlogd $150, %zmm0, %zmm14, %zmm1
|
||||
- vmovups .L_2il0floatpacket.13(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
|
||||
/* Add correction term 0.5 for cos() part */
|
||||
vaddps %zmm8, %zmm5, %zmm15
|
||||
@@ -748,8 +748,3 @@ END (_ZGVeN16vvv_sincosf_knl)
|
||||
ENTRY (_ZGVeN16vvv_sincosf_skx)
|
||||
WRAPPER_AVX512_vvv_vl4l4 _ZGVeN16vl4l4_sincosf_skx
|
||||
END (_ZGVeN16vvv_sincosf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.13:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.13,@object
|
||||
diff --git a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
index 141f747e..1fc9308a 100644
|
||||
--- a/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
+++ b/sysdeps/x86_64/fpu/multiarch/svml_s_sinf16_core_avx512.S
|
||||
@@ -280,7 +280,7 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
movq __svml_s_trig_data@GOTPCREL(%rip), %rax
|
||||
|
||||
/* Check for large and special values */
|
||||
- vmovups .L_2il0floatpacket.11(%rip), %zmm14
|
||||
+ vpternlogd $0xff, %zmm14, %zmm14, %zmm14
|
||||
vmovups __sAbsMask(%rax), %zmm5
|
||||
vmovups __sInvPI(%rax), %zmm1
|
||||
vmovups __sRShifter(%rax), %zmm2
|
||||
@@ -472,8 +472,3 @@ WRAPPER_IMPL_AVX512 _ZGVdN8v_sinf
|
||||
jmp .LBL_2_7
|
||||
#endif
|
||||
END (_ZGVeN16v_sinf_skx)
|
||||
-
|
||||
- .section .rodata, "a"
|
||||
-.L_2il0floatpacket.11:
|
||||
- .long 0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff,0xffffffff
|
||||
- .type .L_2il0floatpacket.11,@object
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,48 +0,0 @@
|
||||
From fc5bd179ef3a953dff8d1655bd530d0e230ffe71 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 21 Sep 2021 18:31:49 -0500
|
||||
Subject: [PATCH] x86: Modify ENTRY in sysdep.h so that p2align can be
|
||||
specified
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
This change adds a new macro ENTRY_P2ALIGN which takes a second
|
||||
argument, log2 of the desired function alignment.
|
||||
|
||||
The old ENTRY(name) macro is just ENTRY_P2ALIGN(name, 4) so this
|
||||
doesn't affect any existing functionality.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86/sysdep.h | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
|
||||
index 01bac0f6..a70bb3a2 100644
|
||||
--- a/sysdeps/x86/sysdep.h
|
||||
+++ b/sysdeps/x86/sysdep.h
|
||||
@@ -78,15 +78,18 @@ enum cf_protection_level
|
||||
#define ASM_SIZE_DIRECTIVE(name) .size name,.-name;
|
||||
|
||||
/* Define an entry point visible from C. */
|
||||
-#define ENTRY(name) \
|
||||
+#define ENTRY_P2ALIGN(name, alignment) \
|
||||
.globl C_SYMBOL_NAME(name); \
|
||||
.type C_SYMBOL_NAME(name),@function; \
|
||||
- .align ALIGNARG(4); \
|
||||
+ .align ALIGNARG(alignment); \
|
||||
C_LABEL(name) \
|
||||
cfi_startproc; \
|
||||
_CET_ENDBR; \
|
||||
CALL_MCOUNT
|
||||
|
||||
+/* Common entry 16 byte aligns. */
|
||||
+#define ENTRY(name) ENTRY_P2ALIGN (name, 4)
|
||||
+
|
||||
#undef END
|
||||
#define END(name) \
|
||||
cfi_endproc; \
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,658 +0,0 @@
|
||||
From 1bd8b8d58fc9967cc073d2c13bfb6befefca2faa Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 21 Sep 2021 18:45:03 -0500
|
||||
Subject: [PATCH] x86: Optimize memcmp-evex-movbe.S for frontend behavior and
|
||||
size
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
The frontend optimizations are to:
|
||||
1. Reorganize logically connected basic blocks so they are either in
|
||||
the same cache line or adjacent cache lines.
|
||||
2. Avoid cases when basic blocks unnecessarily cross cache lines.
|
||||
3. Try and 32 byte align any basic blocks possible without sacrificing
|
||||
code size. Smaller / Less hot basic blocks are used for this.
|
||||
|
||||
Overall code size shrunk by 168 bytes. This should make up for any
|
||||
extra costs due to aligning to 64 bytes.
|
||||
|
||||
In general performance before deviated a great deal depending on
|
||||
whether entry alignment % 64 was 0, 16, 32, or 48. These changes
|
||||
essentially make it so that the current implementation is at least
|
||||
equal to the best alignment of the original for any arguments.
|
||||
|
||||
The only additional optimization is in the page cross case. Branch on
|
||||
equals case was removed from the size == [4, 7] case. As well the [4,
|
||||
7] and [2, 3] cases were swapped as [4, 7] is likely a more hot
|
||||
argument size.
|
||||
|
||||
test-memcmp and test-wmemcmp are both passing.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 434 +++++++++++--------
|
||||
1 file changed, 242 insertions(+), 192 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 654dc7ac..2761b54f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -34,7 +34,24 @@
|
||||
area.
|
||||
7. Use 2 vector compares when size is 2 * CHAR_PER_VEC or less.
|
||||
8. Use 4 vector compares when size is 4 * CHAR_PER_VEC or less.
|
||||
- 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less. */
|
||||
+ 9. Use 8 vector compares when size is 8 * CHAR_PER_VEC or less.
|
||||
+
|
||||
+When possible the implementation tries to optimize for frontend in the
|
||||
+following ways:
|
||||
+Throughput:
|
||||
+ 1. All code sections that fit are able to run optimally out of the
|
||||
+ LSD.
|
||||
+ 2. All code sections that fit are able to run optimally out of the
|
||||
+ DSB
|
||||
+ 3. Basic blocks are contained in minimum number of fetch blocks
|
||||
+ necessary.
|
||||
+
|
||||
+Latency:
|
||||
+ 1. Logically connected basic blocks are put in the same
|
||||
+ cache-line.
|
||||
+ 2. Logically connected basic blocks that do not fit in the same
|
||||
+ cache-line are put in adjacent lines. This can get beneficial
|
||||
+ L2 spatial prefetching and L1 next-line prefetching. */
|
||||
|
||||
# include <sysdep.h>
|
||||
|
||||
@@ -47,9 +64,11 @@
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
# define CHAR_SIZE 4
|
||||
# define VPCMP vpcmpd
|
||||
+# define VPTEST vptestmd
|
||||
# else
|
||||
# define CHAR_SIZE 1
|
||||
# define VPCMP vpcmpub
|
||||
+# define VPTEST vptestmb
|
||||
# endif
|
||||
|
||||
# define VEC_SIZE 32
|
||||
@@ -75,7 +94,9 @@
|
||||
*/
|
||||
|
||||
.section .text.evex,"ax",@progbits
|
||||
-ENTRY (MEMCMP)
|
||||
+/* Cache align memcmp entry. This allows for much more thorough
|
||||
+ frontend optimization. */
|
||||
+ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
movl %edx, %edx
|
||||
@@ -89,7 +110,7 @@ ENTRY (MEMCMP)
|
||||
VPCMP $4, (%rdi), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
/* NB: eax must be destination register if going to
|
||||
- L(return_vec_[0,2]). For L(return_vec_3 destination register
|
||||
+ L(return_vec_[0,2]). For L(return_vec_3) destination register
|
||||
must be ecx. */
|
||||
testl %eax, %eax
|
||||
jnz L(return_vec_0)
|
||||
@@ -121,10 +142,6 @@ ENTRY (MEMCMP)
|
||||
testl %ecx, %ecx
|
||||
jnz L(return_vec_3)
|
||||
|
||||
- /* Zero YMM0. 4x VEC reduction is done with vpxor + vtern so
|
||||
- compare with zero to get a mask is needed. */
|
||||
- vpxorq %XMM0, %XMM0, %XMM0
|
||||
-
|
||||
/* Go to 4x VEC loop. */
|
||||
cmpq $(CHAR_PER_VEC * 8), %rdx
|
||||
ja L(more_8x_vec)
|
||||
@@ -148,47 +165,61 @@ ENTRY (MEMCMP)
|
||||
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
- /* Or together YMM1, YMM2, and YMM3 into YMM3. */
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
|
||||
- oring with YMM3. Result is stored in YMM4. */
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
- /* Compare YMM4 with 0. If any 1s s1 and s2 don't match. */
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ oring with YMM1. Result is stored in YMM4. */
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+
|
||||
+ /* Or together YMM2, YMM3, and YMM4 into YMM4. */
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+
|
||||
+ /* Test YMM4 against itself. Store any CHAR mismatches in k1.
|
||||
+ */
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
+ /* k1 must go to ecx for L(return_vec_0_1_2_3). */
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(return_vec_0_1_2_3)
|
||||
/* NB: eax must be zero to reach here. */
|
||||
ret
|
||||
|
||||
- /* NB: aligning 32 here allows for the rest of the jump targets
|
||||
- to be tuned for 32 byte alignment. Most important this ensures
|
||||
- the L(more_8x_vec) loop is 32 byte aligned. */
|
||||
- .p2align 5
|
||||
-L(less_vec):
|
||||
- /* Check if one or less CHAR. This is necessary for size = 0 but
|
||||
- is also faster for size = CHAR_SIZE. */
|
||||
- cmpl $1, %edx
|
||||
- jbe L(one_or_less)
|
||||
+ .p2align 4
|
||||
+L(8x_end_return_vec_0_1_2_3):
|
||||
+ movq %rdx, %rdi
|
||||
+L(8x_return_vec_0_1_2_3):
|
||||
+ addq %rdi, %rsi
|
||||
+L(return_vec_0_1_2_3):
|
||||
+ VPTEST %YMM1, %YMM1, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
|
||||
- /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
- page cross. This can have false positives but is by far the
|
||||
- fastest method. */
|
||||
- movl %edi, %eax
|
||||
- orl %esi, %eax
|
||||
- andl $(PAGE_SIZE - 1), %eax
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
- jg L(page_cross_less_vec)
|
||||
+ VPTEST %YMM2, %YMM2, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_1)
|
||||
|
||||
- /* No page cross possible. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMP $4, (%rdi), %YMM2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Create mask in ecx for potentially in bound matches. */
|
||||
- bzhil %edx, %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
+ VPTEST %YMM3, %YMM3, %k0
|
||||
+ kmovd %k0, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_2)
|
||||
+L(return_vec_3):
|
||||
+ /* bsf saves 1 byte from tzcnt. This keeps L(return_vec_3) in one
|
||||
+ fetch block and the entire L(*return_vec_0_1_2_3) in 1 cache
|
||||
+ line. */
|
||||
+ bsfl %ecx, %ecx
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
@@ -209,10 +240,11 @@ L(return_vec_0):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: No p2align necessary. Alignment % 16 is naturally 1
|
||||
- which is good enough for a target not in a loop. */
|
||||
+ .p2align 4
|
||||
L(return_vec_1):
|
||||
- tzcntl %eax, %eax
|
||||
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_1) in one
|
||||
+ fetch block. */
|
||||
+ bsfl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
movl VEC_SIZE(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
@@ -226,10 +258,11 @@ L(return_vec_1):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: No p2align necessary. Alignment % 16 is naturally 2
|
||||
- which is good enough for a target not in a loop. */
|
||||
+ .p2align 4,, 10
|
||||
L(return_vec_2):
|
||||
- tzcntl %eax, %eax
|
||||
+ /* bsf saves 1 byte over tzcnt and keeps L(return_vec_2) in one
|
||||
+ fetch block. */
|
||||
+ bsfl %eax, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
movl (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
@@ -243,40 +276,6 @@ L(return_vec_2):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(8x_return_vec_0_1_2_3):
|
||||
- /* Returning from L(more_8x_vec) requires restoring rsi. */
|
||||
- addq %rdi, %rsi
|
||||
-L(return_vec_0_1_2_3):
|
||||
- VPCMP $4, %YMM1, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
-
|
||||
- VPCMP $4, %YMM2, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_1)
|
||||
-
|
||||
- VPCMP $4, %YMM3, %YMM0, %k0
|
||||
- kmovd %k0, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_2)
|
||||
-L(return_vec_3):
|
||||
- tzcntl %ecx, %ecx
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (VEC_SIZE * 3)(%rdi, %rcx, CHAR_SIZE), %eax
|
||||
- xorl %edx, %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %eax
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (VEC_SIZE * 3)(%rdi, %rcx), %eax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rcx), %ecx
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
.p2align 4
|
||||
L(more_8x_vec):
|
||||
/* Set end of s1 in rdx. */
|
||||
@@ -288,21 +287,19 @@ L(more_8x_vec):
|
||||
andq $-VEC_SIZE, %rdi
|
||||
/* Adjust because first 4x vec where check already. */
|
||||
subq $-(VEC_SIZE * 4), %rdi
|
||||
+
|
||||
.p2align 4
|
||||
L(loop_4x_vec):
|
||||
VMOVU (%rsi, %rdi), %YMM1
|
||||
vpxorq (%rdi), %YMM1, %YMM1
|
||||
-
|
||||
VMOVU VEC_SIZE(%rsi, %rdi), %YMM2
|
||||
vpxorq VEC_SIZE(%rdi), %YMM2, %YMM2
|
||||
-
|
||||
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
-
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM3, %YMM4
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(8x_return_vec_0_1_2_3)
|
||||
@@ -319,28 +316,25 @@ L(loop_4x_vec):
|
||||
cmpl $(VEC_SIZE * 2), %edi
|
||||
jae L(8x_last_2x_vec)
|
||||
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
||||
+
|
||||
VMOVU (%rsi, %rdx), %YMM1
|
||||
vpxorq (%rdx), %YMM1, %YMM1
|
||||
|
||||
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
||||
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
||||
-
|
||||
- vpxorq (VEC_SIZE * 2)(%rdx), %YMM3, %YMM3
|
||||
- vpternlogd $0xfe, %YMM1, %YMM2, %YMM3
|
||||
-
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM3, %YMM4
|
||||
- VPCMP $4, %YMM4, %YMM0, %k1
|
||||
+ vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
+ vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
+ VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
- /* Restore s1 pointer to rdi. */
|
||||
- movq %rdx, %rdi
|
||||
testl %ecx, %ecx
|
||||
- jnz L(8x_return_vec_0_1_2_3)
|
||||
+ jnz L(8x_end_return_vec_0_1_2_3)
|
||||
/* NB: eax must be zero to reach here. */
|
||||
ret
|
||||
|
||||
/* Only entry is from L(more_8x_vec). */
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 10
|
||||
L(8x_last_2x_vec):
|
||||
VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
@@ -355,7 +349,31 @@ L(8x_last_1x_vec):
|
||||
jnz L(8x_return_vec_3)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ /* Not ideally aligned (at offset +9 bytes in fetch block) but
|
||||
+ not aligning keeps it in the same cache line as
|
||||
+ L(8x_last_1x/2x_vec) so likely worth it. As well, saves code
|
||||
+ size. */
|
||||
+ .p2align 4,, 4
|
||||
+L(8x_return_vec_2):
|
||||
+ subq $VEC_SIZE, %rdx
|
||||
+L(8x_return_vec_3):
|
||||
+ bsfl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
+ movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ addq %rdx, %rax
|
||||
+ movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
L(last_2x_vec):
|
||||
/* Check second to last VEC. */
|
||||
VMOVU -(VEC_SIZE * 2)(%rsi, %rdx, CHAR_SIZE), %YMM1
|
||||
@@ -374,26 +392,49 @@ L(last_1x_vec):
|
||||
jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(8x_return_vec_2):
|
||||
- subq $VEC_SIZE, %rdx
|
||||
-L(8x_return_vec_3):
|
||||
- tzcntl %eax, %eax
|
||||
+ .p2align 4,, 10
|
||||
+L(return_vec_1_end):
|
||||
+ /* Use bsf to save code size. This is necessary to have
|
||||
+ L(one_or_less) fit in aligning bytes between. */
|
||||
+ bsfl %eax, %eax
|
||||
+ addl %edx, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- leaq (%rdx, %rax, CHAR_SIZE), %rax
|
||||
- movl (VEC_SIZE * 3)(%rax), %ecx
|
||||
+ movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
- cmpl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
+ cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
setg %dl
|
||||
leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- addq %rdx, %rax
|
||||
- movzbl (VEC_SIZE * 3)(%rsi, %rax), %ecx
|
||||
- movzbl (VEC_SIZE * 3)(%rax), %eax
|
||||
+ movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
+ movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
+ /* NB: L(one_or_less) fits in alignment padding between
|
||||
+ L(return_vec_1_end) and L(return_vec_0_end). */
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movl (%rdi), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi), %ecx
|
||||
+ je L(zero)
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+ ret
|
||||
+# else
|
||||
+L(one_or_less):
|
||||
+ jb L(zero)
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
+ subl %ecx, %eax
|
||||
+ ret
|
||||
+# endif
|
||||
+L(zero):
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
.p2align 4
|
||||
L(return_vec_0_end):
|
||||
tzcntl %eax, %eax
|
||||
@@ -412,23 +453,56 @@ L(return_vec_0_end):
|
||||
ret
|
||||
|
||||
.p2align 4
|
||||
-L(return_vec_1_end):
|
||||
+L(less_vec):
|
||||
+ /* Check if one or less CHAR. This is necessary for size == 0
|
||||
+ but is also faster for size == CHAR_SIZE. */
|
||||
+ cmpl $1, %edx
|
||||
+ jbe L(one_or_less)
|
||||
+
|
||||
+ /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
+ page cross. This can have false positives but is by far the
|
||||
+ fastest method. */
|
||||
+ movl %edi, %eax
|
||||
+ orl %esi, %eax
|
||||
+ andl $(PAGE_SIZE - 1), %eax
|
||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
+ jg L(page_cross_less_vec)
|
||||
+
|
||||
+ /* No page cross possible. */
|
||||
+ VMOVU (%rsi), %YMM2
|
||||
+ VPCMP $4, (%rdi), %YMM2, %k1
|
||||
+ kmovd %k1, %eax
|
||||
+ /* Check if any matches were in bounds. Intentionally not
|
||||
+ storing result in eax to limit dependency chain if it goes to
|
||||
+ L(return_vec_0_lv). */
|
||||
+ bzhil %edx, %eax, %edx
|
||||
+ jnz L(return_vec_0_lv)
|
||||
+ xorl %eax, %eax
|
||||
+ ret
|
||||
+
|
||||
+ /* Essentially duplicate of L(return_vec_0). Ends up not costing
|
||||
+ any code as shrinks L(less_vec) by allowing 2-byte encoding of
|
||||
+ the jump and ends up fitting in aligning bytes. As well fits on
|
||||
+ same cache line as L(less_vec) so also saves a line from having
|
||||
+ to be fetched on cold calls to memcmp. */
|
||||
+ .p2align 4,, 4
|
||||
+L(return_vec_0_lv):
|
||||
tzcntl %eax, %eax
|
||||
- addl %edx, %eax
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
- movl -(VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
xorl %edx, %edx
|
||||
- cmpl -(VEC_SIZE * 2)(%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ /* NB: no partial register stall here because xorl zero idiom
|
||||
+ above. */
|
||||
setg %dl
|
||||
leal -1(%rdx, %rdx), %eax
|
||||
# else
|
||||
- movzbl -(VEC_SIZE * 2)(%rsi, %rax), %ecx
|
||||
- movzbl -(VEC_SIZE * 2)(%rdi, %rax), %eax
|
||||
+ movzbl (%rsi, %rax), %ecx
|
||||
+ movzbl (%rdi, %rax), %eax
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
|
||||
-
|
||||
.p2align 4
|
||||
L(page_cross_less_vec):
|
||||
/* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
@@ -439,108 +513,84 @@ L(page_cross_less_vec):
|
||||
cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
-L(between_2_3):
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
- .p2align 4
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movzbl (%rsi), %ecx
|
||||
- movzbl (%rdi), %eax
|
||||
- subl %ecx, %eax
|
||||
+ jb L(between_2_3)
|
||||
+
|
||||
+ /* Load as big endian with overlapping movbe to avoid branches.
|
||||
+ */
|
||||
+ movbe (%rdi), %eax
|
||||
+ movbe (%rsi), %ecx
|
||||
+ shlq $32, %rax
|
||||
+ shlq $32, %rcx
|
||||
+ movbe -4(%rdi, %rdx), %edi
|
||||
+ movbe -4(%rsi, %rdx), %esi
|
||||
+ orq %rdi, %rax
|
||||
+ orq %rsi, %rcx
|
||||
+ subq %rcx, %rax
|
||||
+ /* edx is guaranteed to be positive int32 in range [4, 7]. */
|
||||
+ cmovne %edx, %eax
|
||||
+ /* ecx is -1 if rcx > rax. Otherwise 0. */
|
||||
+ sbbl %ecx, %ecx
|
||||
+ /* If rax > rcx, then ecx is 0 and eax is positive. If rcx ==
|
||||
+ rax then eax and ecx are zero. If rax < rcx then ecx is -1 so
|
||||
+ eax doesn't matter. */
|
||||
+ orl %ecx, %eax
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 8
|
||||
L(between_8_15):
|
||||
# endif
|
||||
/* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
- vmovq (%rdi), %XMM1
|
||||
- vmovq (%rsi), %XMM2
|
||||
- VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ vmovq (%rdi), %xmm1
|
||||
+ vmovq (%rsi), %xmm2
|
||||
+ VPCMP $4, %xmm1, %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
+ jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- leaq -8(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
- leaq -8(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
- vmovq (%rdi), %XMM1
|
||||
- vmovq (%rsi), %XMM2
|
||||
- VPCMP $4, %XMM1, %XMM2, %k1
|
||||
+ vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
|
||||
+ vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ VPCMP $4, %xmm1, %xmm2, %k1
|
||||
+ addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+ .p2align 4,, 8
|
||||
L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
- VMOVU (%rsi), %XMM2
|
||||
- VPCMP $4, (%rdi), %XMM2, %k1
|
||||
+
|
||||
+ /* Use movups to save code size. */
|
||||
+ movups (%rsi), %xmm2
|
||||
+ VPCMP $4, (%rdi), %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
-
|
||||
+ jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
-
|
||||
- VMOVU -16(%rsi, %rdx, CHAR_SIZE), %XMM2
|
||||
- leaq -16(%rdi, %rdx, CHAR_SIZE), %rdi
|
||||
- leaq -16(%rsi, %rdx, CHAR_SIZE), %rsi
|
||||
- VPCMP $4, (%rdi), %XMM2, %k1
|
||||
+ movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
+ addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
- jnz L(return_vec_0)
|
||||
- ret
|
||||
-
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- .p2align 4
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movl (%rdi), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
+ jnz L(return_vec_0_end)
|
||||
ret
|
||||
-# else
|
||||
|
||||
- .p2align 4
|
||||
-L(between_4_7):
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- jz L(zero_4_7)
|
||||
- sbbl %eax, %eax
|
||||
- orl $1, %eax
|
||||
-L(zero_4_7):
|
||||
+# ifndef USE_AS_WMEMCMP
|
||||
+L(between_2_3):
|
||||
+ /* Load as big endian to avoid branches. */
|
||||
+ movzwl (%rdi), %eax
|
||||
+ movzwl (%rsi), %ecx
|
||||
+ shll $8, %eax
|
||||
+ shll $8, %ecx
|
||||
+ bswap %eax
|
||||
+ bswap %ecx
|
||||
+ movzbl -1(%rdi, %rdx), %edi
|
||||
+ movzbl -1(%rsi, %rdx), %esi
|
||||
+ orl %edi, %eax
|
||||
+ orl %esi, %ecx
|
||||
+ /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
+ subl %ecx, %eax
|
||||
ret
|
||||
# endif
|
||||
-
|
||||
END (MEMCMP)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,510 +0,0 @@
|
||||
From e59ced238482fd71f3e493717f14f6507346741e Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 20 Sep 2021 16:20:15 -0500
|
||||
Subject: [PATCH] x86: Optimize memset-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
Optimizations are
|
||||
|
||||
1. change control flow for L(more_2x_vec) to fall through to loop and
|
||||
jump for L(less_4x_vec) and L(less_8x_vec). This uses less code
|
||||
size and saves jumps for length > 4x VEC_SIZE.
|
||||
|
||||
2. For EVEX/AVX512 move L(less_vec) closer to entry.
|
||||
|
||||
3. Avoid complex address mode for length > 2x VEC_SIZE
|
||||
|
||||
4. Slightly better aligning code for the loop from the perspective of
|
||||
code size and uops.
|
||||
|
||||
5. Align targets so they make full use of their fetch block and if
|
||||
possible cache line.
|
||||
|
||||
6. Try and reduce total number of icache lines that will need to be
|
||||
pulled in for a given length.
|
||||
|
||||
7. Include "local" version of stosb target. For AVX2/EVEX/AVX512
|
||||
jumping to the stosb target in the sse2 code section will almost
|
||||
certainly be to a new page. The new version does increase code size
|
||||
marginally by duplicating the target but should get better iTLB
|
||||
behavior as a result.
|
||||
|
||||
test-memset, test-wmemset, and test-bzero are all passing.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/memset.S | 10 +-
|
||||
.../multiarch/memset-avx2-unaligned-erms.S | 10 +-
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 11 +-
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 11 +-
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 285 ++++++++++++------
|
||||
5 files changed, 232 insertions(+), 95 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/memset.S
|
||||
(GNU URL)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index b3426795..8672b030 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -18,13 +18,15 @@
|
||||
<http://www.gnu.org/licenses/>. */
|
||||
|
||||
#include <sysdep.h>
|
||||
+#define USE_WITH_SSE2 1
|
||||
|
||||
#define VEC_SIZE 16
|
||||
+#define MOV_SIZE 3
|
||||
+#define RET_SIZE 1
|
||||
+
|
||||
#define VEC(i) xmm##i
|
||||
-/* Don't use movups and movaps since it will get larger nop paddings for
|
||||
- alignment. */
|
||||
-#define VMOVU movdqu
|
||||
-#define VMOVA movdqa
|
||||
+#define VMOVU movups
|
||||
+#define VMOVA movaps
|
||||
|
||||
#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
index ae0860f3..1af668af 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
@@ -1,8 +1,14 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_AVX2 1
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
+# define MOV_SIZE 4
|
||||
+# define RET_SIZE 4
|
||||
+
|
||||
# define VEC(i) ymm##i
|
||||
-# define VMOVU vmovdqu
|
||||
-# define VMOVA vmovdqa
|
||||
+
|
||||
+# define VMOVU vmovdqu
|
||||
+# define VMOVA vmovdqa
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
vmovd d, %xmm0; \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 8ad842fc..f14d6f84 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -1,11 +1,18 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_AVX512 1
|
||||
+
|
||||
# define VEC_SIZE 64
|
||||
+# define MOV_SIZE 6
|
||||
+# define RET_SIZE 1
|
||||
+
|
||||
# define XMM0 xmm16
|
||||
# define YMM0 ymm16
|
||||
# define VEC0 zmm16
|
||||
# define VEC(i) VEC##i
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+
|
||||
# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index 640f0929..64b09e77 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -1,11 +1,18 @@
|
||||
#if IS_IN (libc)
|
||||
+# define USE_WITH_EVEX 1
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
+# define MOV_SIZE 6
|
||||
+# define RET_SIZE 1
|
||||
+
|
||||
# define XMM0 xmm16
|
||||
# define YMM0 ymm16
|
||||
# define VEC0 ymm16
|
||||
# define VEC(i) VEC##i
|
||||
-# define VMOVU vmovdqu64
|
||||
-# define VMOVA vmovdqa64
|
||||
+
|
||||
+# define VMOVU vmovdqu64
|
||||
+# define VMOVA vmovdqa64
|
||||
+
|
||||
# define VZEROUPPER
|
||||
|
||||
# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 909c33f6..f08b7323 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -63,8 +63,27 @@
|
||||
# endif
|
||||
#endif
|
||||
|
||||
+#if VEC_SIZE == 64
|
||||
+# define LOOP_4X_OFFSET (VEC_SIZE * 4)
|
||||
+#else
|
||||
+# define LOOP_4X_OFFSET (0)
|
||||
+#endif
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+# define END_REG rcx
|
||||
+# define LOOP_REG rdi
|
||||
+#else
|
||||
+# define END_REG rdi
|
||||
+# define LOOP_REG rdx
|
||||
+#endif
|
||||
+
|
||||
#define PAGE_SIZE 4096
|
||||
|
||||
+/* Macro to calculate size of small memset block for aligning
|
||||
+ purposes. */
|
||||
+#define SMALL_MEMSET_ALIGN(mov_sz, ret_sz) (2 * (mov_sz) + (ret_sz) + 1)
|
||||
+
|
||||
+
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
@@ -74,6 +93,7 @@
|
||||
ENTRY (__bzero)
|
||||
mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
+ xorl %esi, %esi
|
||||
pxor %XMM0, %XMM0
|
||||
jmp L(entry_from_bzero)
|
||||
END (__bzero)
|
||||
@@ -158,7 +178,7 @@ ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
-ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
+ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
||||
MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
@@ -168,75 +188,43 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
jb L(less_vec)
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(stosb_more_2x_vec)
|
||||
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
- VMOVU %VEC(0), (%rdi)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
||||
+ */
|
||||
+ VMOVU %VEC(0), (%rax)
|
||||
+ VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
-L(stosb_more_2x_vec):
|
||||
- cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
||||
- ja L(stosb)
|
||||
-#else
|
||||
- .p2align 4
|
||||
#endif
|
||||
-L(more_2x_vec):
|
||||
- /* Stores to first 2x VEC before cmp as any path forward will
|
||||
- require it. */
|
||||
- VMOVU %VEC(0), (%rdi)
|
||||
- VMOVU %VEC(0), VEC_SIZE(%rdi)
|
||||
- cmpq $(VEC_SIZE * 4), %rdx
|
||||
- ja L(loop_start)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rdi,%rdx)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rdi,%rdx)
|
||||
-L(return):
|
||||
-#if VEC_SIZE > 16
|
||||
- ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
+L(last_2x_vec):
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
||||
#else
|
||||
- ret
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
||||
#endif
|
||||
+ VZEROUPPER_RETURN
|
||||
|
||||
-L(loop_start):
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
- cmpq $(VEC_SIZE * 8), %rdx
|
||||
- jbe L(loop_end)
|
||||
- andq $-(VEC_SIZE * 2), %rdi
|
||||
- subq $-(VEC_SIZE * 4), %rdi
|
||||
- leaq -(VEC_SIZE * 4)(%rax, %rdx), %rcx
|
||||
- .p2align 4
|
||||
-L(loop):
|
||||
- VMOVA %VEC(0), (%rdi)
|
||||
- VMOVA %VEC(0), VEC_SIZE(%rdi)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 2)(%rdi)
|
||||
- VMOVA %VEC(0), (VEC_SIZE * 3)(%rdi)
|
||||
- subq $-(VEC_SIZE * 4), %rdi
|
||||
- cmpq %rcx, %rdi
|
||||
- jb L(loop)
|
||||
-L(loop_end):
|
||||
- /* NB: rax is set as ptr in MEMSET_VDUP_TO_VEC0_AND_SET_RETURN.
|
||||
- rdx as length is also unchanged. */
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 4)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 3)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -(VEC_SIZE * 2)(%rax, %rdx)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
- VZEROUPPER_SHORT_RETURN
|
||||
-
|
||||
- .p2align 4
|
||||
+ /* If have AVX512 mask instructions put L(less_vec) close to
|
||||
+ entry as it doesn't take much space and is likely a hot target.
|
||||
+ */
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ .p2align 4,, 10
|
||||
L(less_vec):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
# endif
|
||||
-# ifdef USE_LESS_VEC_MASK_STORE
|
||||
/* Clear high bits from edi. Only keeping bits relevant to page
|
||||
cross check. Note that we are using rax which is set in
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out.
|
||||
- */
|
||||
+ MEMSET_VDUP_TO_VEC0_AND_SET_RETURN as ptr from here on out. */
|
||||
andl $(PAGE_SIZE - 1), %edi
|
||||
- /* Check if VEC_SIZE store cross page. Mask stores suffer serious
|
||||
- performance degradation when it has to fault supress. */
|
||||
+ /* Check if VEC_SIZE store cross page. Mask stores suffer
|
||||
+ serious performance degradation when it has to fault suppress.
|
||||
+ */
|
||||
cmpl $(PAGE_SIZE - VEC_SIZE), %edi
|
||||
+ /* This is generally considered a cold target. */
|
||||
ja L(cross_page)
|
||||
# if VEC_SIZE > 32
|
||||
movq $-1, %rcx
|
||||
@@ -247,58 +235,185 @@ L(less_vec):
|
||||
bzhil %edx, %ecx, %ecx
|
||||
kmovd %ecx, %k1
|
||||
# endif
|
||||
- vmovdqu8 %VEC(0), (%rax) {%k1}
|
||||
+ vmovdqu8 %VEC(0), (%rax){%k1}
|
||||
VZEROUPPER_RETURN
|
||||
|
||||
+# if defined USE_MULTIARCH && IS_IN (libc)
|
||||
+ /* Include L(stosb_local) here if including L(less_vec) between
|
||||
+ L(stosb_more_2x_vec) and ENTRY. This is to cache align the
|
||||
+ L(stosb_more_2x_vec) target. */
|
||||
+ .p2align 4,, 10
|
||||
+L(stosb_local):
|
||||
+ movzbl %sil, %eax
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ mov %RDI_LP, %RDX_LP
|
||||
+ rep stosb
|
||||
+ mov %RDX_LP, %RAX_LP
|
||||
+ VZEROUPPER_RETURN
|
||||
+# endif
|
||||
+#endif
|
||||
+
|
||||
+#if defined USE_MULTIARCH && IS_IN (libc)
|
||||
.p2align 4
|
||||
-L(cross_page):
|
||||
+L(stosb_more_2x_vec):
|
||||
+ cmp __x86_rep_stosb_threshold(%rip), %RDX_LP
|
||||
+ ja L(stosb_local)
|
||||
+#endif
|
||||
+ /* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
||||
+ and (4x, 8x] jump to target. */
|
||||
+L(more_2x_vec):
|
||||
+
|
||||
+ /* Two different methods of setting up pointers / compare. The
|
||||
+ two methods are based on the fact that EVEX/AVX512 mov
|
||||
+ instructions take more bytes than AVX2/SSE2 mov instructions. As
|
||||
+ well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
||||
+ setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
||||
+ this saves code size and keeps a few targets in one fetch block.
|
||||
+ For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
||||
+ LOOP_4X_OFFSET) with LEA_BID. */
|
||||
+
|
||||
+ /* END_REG is rcx for EVEX/AVX512. */
|
||||
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
+#endif
|
||||
+
|
||||
+ /* Stores to first 2x VEC before cmp as any path forward will
|
||||
+ require it. */
|
||||
+ VMOVU %VEC(0), (%rax)
|
||||
+ VMOVU %VEC(0), VEC_SIZE(%rax)
|
||||
+
|
||||
+
|
||||
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
+ /* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
||||
+ addq %rdx, %END_REG
|
||||
+#endif
|
||||
+
|
||||
+ cmpq $(VEC_SIZE * 4), %rdx
|
||||
+ jbe L(last_2x_vec)
|
||||
+
|
||||
+ /* Store next 2x vec regardless. */
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
||||
+
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If LOOP_4X_OFFSET don't readjust LOOP_REG (rdi), just add
|
||||
+ extra offset to addresses in loop. Used for AVX512 to save space
|
||||
+ as no way to get (VEC_SIZE * 4) in imm8. */
|
||||
+# if LOOP_4X_OFFSET == 0
|
||||
+ subq $-(VEC_SIZE * 4), %LOOP_REG
|
||||
# endif
|
||||
-# if VEC_SIZE > 32
|
||||
- cmpb $32, %dl
|
||||
- jae L(between_32_63)
|
||||
+ /* Avoid imm32 compare here to save code size. */
|
||||
+ cmpq %rdi, %rcx
|
||||
+#else
|
||||
+ addq $-(VEC_SIZE * 4), %END_REG
|
||||
+ cmpq $(VEC_SIZE * 8), %rdx
|
||||
+#endif
|
||||
+ jbe L(last_4x_vec)
|
||||
+#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
+ /* Set LOOP_REG (rdx). */
|
||||
+ leaq (VEC_SIZE * 4)(%rax), %LOOP_REG
|
||||
+#endif
|
||||
+ /* Align dst for loop. */
|
||||
+ andq $(VEC_SIZE * -2), %LOOP_REG
|
||||
+ .p2align 4
|
||||
+L(loop):
|
||||
+ VMOVA %VEC(0), LOOP_4X_OFFSET(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ VMOVA %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%LOOP_REG)
|
||||
+ subq $-(VEC_SIZE * 4), %LOOP_REG
|
||||
+ cmpq %END_REG, %LOOP_REG
|
||||
+ jb L(loop)
|
||||
+ .p2align 4,, MOV_SIZE
|
||||
+L(last_4x_vec):
|
||||
+ VMOVU %VEC(0), LOOP_4X_OFFSET(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE + LOOP_4X_OFFSET)(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%END_REG)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%END_REG)
|
||||
+L(return):
|
||||
+#if VEC_SIZE > 16
|
||||
+ ZERO_UPPER_VEC_REGISTERS_RETURN
|
||||
+#else
|
||||
+ ret
|
||||
+#endif
|
||||
+
|
||||
+ .p2align 4,, 10
|
||||
+#ifndef USE_LESS_VEC_MASK_STORE
|
||||
+# if defined USE_MULTIARCH && IS_IN (libc)
|
||||
+ /* If no USE_LESS_VEC_MASK put L(stosb_local) here. Will be in
|
||||
+ range for 2-byte jump encoding. */
|
||||
+L(stosb_local):
|
||||
+ movzbl %sil, %eax
|
||||
+ mov %RDX_LP, %RCX_LP
|
||||
+ mov %RDI_LP, %RDX_LP
|
||||
+ rep stosb
|
||||
+ mov %RDX_LP, %RAX_LP
|
||||
+ VZEROUPPER_RETURN
|
||||
# endif
|
||||
-# if VEC_SIZE > 16
|
||||
- cmpb $16, %dl
|
||||
+ /* Define L(less_vec) only if not otherwise defined. */
|
||||
+ .p2align 4
|
||||
+L(less_vec):
|
||||
+#endif
|
||||
+L(cross_page):
|
||||
+#if VEC_SIZE > 32
|
||||
+ cmpl $32, %edx
|
||||
+ jae L(between_32_63)
|
||||
+#endif
|
||||
+#if VEC_SIZE > 16
|
||||
+ cmpl $16, %edx
|
||||
jae L(between_16_31)
|
||||
-# endif
|
||||
- MOVQ %XMM0, %rcx
|
||||
- cmpb $8, %dl
|
||||
+#endif
|
||||
+ MOVQ %XMM0, %rdi
|
||||
+ cmpl $8, %edx
|
||||
jae L(between_8_15)
|
||||
- cmpb $4, %dl
|
||||
+ cmpl $4, %edx
|
||||
jae L(between_4_7)
|
||||
- cmpb $1, %dl
|
||||
+ cmpl $1, %edx
|
||||
ja L(between_2_3)
|
||||
- jb 1f
|
||||
- movb %cl, (%rax)
|
||||
-1:
|
||||
+ jb L(return)
|
||||
+ movb %sil, (%rax)
|
||||
VZEROUPPER_RETURN
|
||||
-# if VEC_SIZE > 32
|
||||
+
|
||||
+ /* Align small targets only if not doing so would cross a fetch
|
||||
+ line. */
|
||||
+#if VEC_SIZE > 32
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- VMOVU %YMM0, -32(%rax,%rdx)
|
||||
VMOVU %YMM0, (%rax)
|
||||
+ VMOVU %YMM0, -32(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-# endif
|
||||
-# if VEC_SIZE > 16
|
||||
- /* From 16 to 31. No branch when size == 16. */
|
||||
+#endif
|
||||
+
|
||||
+#if VEC_SIZE >= 32
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
L(between_16_31):
|
||||
- VMOVU %XMM0, -16(%rax,%rdx)
|
||||
+ /* From 16 to 31. No branch when size == 16. */
|
||||
VMOVU %XMM0, (%rax)
|
||||
+ VMOVU %XMM0, -16(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
-# endif
|
||||
- /* From 8 to 15. No branch when size == 8. */
|
||||
+#endif
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
L(between_8_15):
|
||||
- movq %rcx, -8(%rax,%rdx)
|
||||
- movq %rcx, (%rax)
|
||||
+ /* From 8 to 15. No branch when size == 8. */
|
||||
+ movq %rdi, (%rax)
|
||||
+ movq %rdi, -8(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
- movl %ecx, -4(%rax,%rdx)
|
||||
- movl %ecx, (%rax)
|
||||
+ movl %edi, (%rax)
|
||||
+ movl %edi, -4(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
+
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
- movw %cx, -2(%rax,%rdx)
|
||||
- movw %cx, (%rax)
|
||||
+ movw %di, (%rax)
|
||||
+ movb %dil, -1(%rax, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,45 +0,0 @@
|
||||
From bad852b61b79503fcb3c5fc379c70f768df3e1fb Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sat, 23 Oct 2021 01:26:47 -0400
|
||||
Subject: [PATCH] x86: Replace sse2 instructions with avx in
|
||||
memcmp-evex-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
This commit replaces two usages of SSE2 'movups' with AVX 'vmovdqu'.
|
||||
|
||||
It could potentially be dangerous to use SSE2 if this function is ever
|
||||
called without using 'vzeroupper' beforehand. While compilers appear
|
||||
to use 'vzeroupper' before function calls if AVX2 has been used, using
|
||||
SSE2 here is more brittle. Since it is not absolutely necessary it
|
||||
should be avoided.
|
||||
|
||||
It costs 2-extra bytes but the extra bytes should only eat into
|
||||
alignment padding.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 2761b54f..640f6757 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -561,13 +561,13 @@ L(between_16_31):
|
||||
/* From 16 to 31 bytes. No branch when size == 16. */
|
||||
|
||||
/* Use movups to save code size. */
|
||||
- movups (%rsi), %xmm2
|
||||
+ vmovdqu (%rsi), %xmm2
|
||||
VPCMP $4, (%rdi), %xmm2, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(return_vec_0_lv)
|
||||
/* Use overlapping loads to avoid branches. */
|
||||
- movups -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
+ vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
kmovd %k1, %eax
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,695 +0,0 @@
|
||||
From c46e9afb2df5fc9e39ff4d13777e4b4c26e04e55 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 29 Oct 2021 12:40:20 -0700
|
||||
Subject: [PATCH] x86-64: Improve EVEX strcmp with masked load
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
In strcmp-evex.S, to compare 2 32-byte strings, replace
|
||||
|
||||
VMOVU (%rdi, %rdx), %YMM0
|
||||
VMOVU (%rsi, %rdx), %YMM1
|
||||
/* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
VPCMP $4, %YMM0, %YMM1, %k0
|
||||
VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
/* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
kord %k1, %k2, %k1
|
||||
/* Each bit in K1 represents a NULL or a mismatch. */
|
||||
kord %k0, %k1, %k1
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
with
|
||||
|
||||
VMOVU (%rdi, %rdx), %YMM0
|
||||
VPTESTM %YMM0, %YMM0, %k2
|
||||
/* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
in YMM0 and 32 bytes at (%rsi, %rdx). */
|
||||
VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
|
||||
kmovd %k1, %ecx
|
||||
incl %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
It makes EVEX strcmp faster than AVX2 strcmp by up to 40% on Tiger Lake
|
||||
and Ice Lake.
|
||||
|
||||
Co-Authored-By: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 461 +++++++++++++------------
|
||||
1 file changed, 243 insertions(+), 218 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index d5aa6daa..82f12ac8 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -41,6 +41,8 @@
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* Compare packed dwords. */
|
||||
# define VPCMP vpcmpd
|
||||
+# define VPMINU vpminud
|
||||
+# define VPTESTM vptestmd
|
||||
# define SHIFT_REG32 r8d
|
||||
# define SHIFT_REG64 r8
|
||||
/* 1 dword char == 4 bytes. */
|
||||
@@ -48,6 +50,8 @@
|
||||
# else
|
||||
/* Compare packed bytes. */
|
||||
# define VPCMP vpcmpb
|
||||
+# define VPMINU vpminub
|
||||
+# define VPTESTM vptestmb
|
||||
# define SHIFT_REG32 ecx
|
||||
# define SHIFT_REG64 rcx
|
||||
/* 1 byte char == 1 byte. */
|
||||
@@ -67,6 +71,9 @@
|
||||
# define YMM5 ymm22
|
||||
# define YMM6 ymm23
|
||||
# define YMM7 ymm24
|
||||
+# define YMM8 ymm25
|
||||
+# define YMM9 ymm26
|
||||
+# define YMM10 ymm27
|
||||
|
||||
/* Warning!
|
||||
wcscmp/wcsncmp have to use SIGNED comparison for elements.
|
||||
@@ -76,7 +83,7 @@
|
||||
/* The main idea of the string comparison (byte or dword) using 256-bit
|
||||
EVEX instructions consists of comparing (VPCMP) two ymm vectors. The
|
||||
latter can be on either packed bytes or dwords depending on
|
||||
- USE_AS_WCSCMP. In order to check the null char, algorithm keeps the
|
||||
+ USE_AS_WCSCMP. In order to check the null CHAR, algorithm keeps the
|
||||
matched bytes/dwords, requiring 5 EVEX instructions (3 VPCMP and 2
|
||||
KORD). In general, the costs of comparing VEC_SIZE bytes (32-bytes)
|
||||
are 3 VPCMP and 2 KORD instructions, together with VMOVU and ktestd
|
||||
@@ -123,27 +130,21 @@ ENTRY (STRCMP)
|
||||
jg L(cross_page)
|
||||
/* Start comparing 4 vectors. */
|
||||
VMOVU (%rdi), %YMM0
|
||||
- VMOVU (%rsi), %YMM1
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
|
||||
- /* Check for NULL in YMM0. */
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- /* Check for NULL in YMM1. */
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (%rsi). */
|
||||
+ VPCMP $0, (%rsi), %YMM0, %k1{%k2}
|
||||
|
||||
- /* Each bit in K1 represents:
|
||||
- 1. A mismatch in YMM0 and YMM1. Or
|
||||
- 2. A NULL in YMM0 or YMM1.
|
||||
- */
|
||||
- kord %k0, %k1, %k1
|
||||
-
|
||||
- ktestd %k1, %k1
|
||||
- je L(next_3_vectors)
|
||||
kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ je L(next_3_vectors)
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -172,9 +173,7 @@ L(return):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -210,9 +209,7 @@ L(return_vec_size):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_2_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -248,9 +245,7 @@ L(return_2_vec_size):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
L(return_3_vec_size):
|
||||
- kmovd %k1, %ecx
|
||||
tzcntl %ecx, %edx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -289,43 +284,45 @@ L(return_3_vec_size):
|
||||
.p2align 4
|
||||
L(next_3_vectors):
|
||||
VMOVU VEC_SIZE(%rdi), %YMM0
|
||||
- VMOVU VEC_SIZE(%rsi), %YMM1
|
||||
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at VEC_SIZE(%rsi). */
|
||||
+ VPCMP $0, VEC_SIZE(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_vec_size)
|
||||
|
||||
- VMOVU (VEC_SIZE * 2)(%rdi), %YMM2
|
||||
- VMOVU (VEC_SIZE * 3)(%rdi), %YMM3
|
||||
- VMOVU (VEC_SIZE * 2)(%rsi), %YMM4
|
||||
- VMOVU (VEC_SIZE * 3)(%rsi), %YMM5
|
||||
-
|
||||
- /* Each bit in K0 represents a mismatch in YMM2 and YMM4. */
|
||||
- VPCMP $4, %YMM2, %YMM4, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM2 or YMM4. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ VMOVU (VEC_SIZE * 2)(%rdi), %YMM0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rsi). */
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_2_vec_size)
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in YMM3 and YMM5. */
|
||||
- VPCMP $4, %YMM3, %YMM5, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM3 or YMM5. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- ktestd %k1, %k1
|
||||
+ VMOVU (VEC_SIZE * 3)(%rdi), %YMM0
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (VEC_SIZE * 3)(%rsi). */
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rsi), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(return_3_vec_size)
|
||||
L(main_loop_header):
|
||||
leaq (VEC_SIZE * 4)(%rdi), %rdx
|
||||
@@ -375,56 +372,51 @@ L(back_to_loop):
|
||||
VMOVA VEC_SIZE(%rax), %YMM2
|
||||
VMOVA (VEC_SIZE * 2)(%rax), %YMM4
|
||||
VMOVA (VEC_SIZE * 3)(%rax), %YMM6
|
||||
- VMOVU (%rdx), %YMM1
|
||||
- VMOVU VEC_SIZE(%rdx), %YMM3
|
||||
- VMOVU (VEC_SIZE * 2)(%rdx), %YMM5
|
||||
- VMOVU (VEC_SIZE * 3)(%rdx), %YMM7
|
||||
-
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K4 represents a NULL or a mismatch in YMM0 and
|
||||
- YMM1. */
|
||||
- kord %k0, %k1, %k4
|
||||
-
|
||||
- VPCMP $4, %YMM2, %YMM3, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K5 represents a NULL or a mismatch in YMM2 and
|
||||
- YMM3. */
|
||||
- kord %k0, %k1, %k5
|
||||
-
|
||||
- VPCMP $4, %YMM4, %YMM5, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K6 represents a NULL or a mismatch in YMM4 and
|
||||
- YMM5. */
|
||||
- kord %k0, %k1, %k6
|
||||
-
|
||||
- VPCMP $4, %YMM6, %YMM7, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM6, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM7, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K7 represents a NULL or a mismatch in YMM6 and
|
||||
- YMM7. */
|
||||
- kord %k0, %k1, %k7
|
||||
-
|
||||
- kord %k4, %k5, %k0
|
||||
- kord %k6, %k7, %k1
|
||||
-
|
||||
- /* Test each mask (32 bits) individually because for VEC_SIZE
|
||||
- == 32 is not possible to OR the four masks and keep all bits
|
||||
- in a 64-bit integer register, differing from SSE2 strcmp
|
||||
- where ORing is possible. */
|
||||
- kortestd %k0, %k1
|
||||
- je L(loop)
|
||||
- ktestd %k4, %k4
|
||||
+
|
||||
+ VPMINU %YMM0, %YMM2, %YMM8
|
||||
+ VPMINU %YMM4, %YMM6, %YMM9
|
||||
+
|
||||
+ /* A zero CHAR in YMM8 means that there is a null CHAR. */
|
||||
+ VPMINU %YMM8, %YMM9, %YMM8
|
||||
+
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM8. */
|
||||
+ VPTESTM %YMM8, %YMM8, %k1
|
||||
+
|
||||
+ /* (YMM ^ YMM): A non-zero CHAR represents a mismatch. */
|
||||
+ vpxorq (%rdx), %YMM0, %YMM1
|
||||
+ vpxorq VEC_SIZE(%rdx), %YMM2, %YMM3
|
||||
+ vpxorq (VEC_SIZE * 2)(%rdx), %YMM4, %YMM5
|
||||
+ vpxorq (VEC_SIZE * 3)(%rdx), %YMM6, %YMM7
|
||||
+
|
||||
+ vporq %YMM1, %YMM3, %YMM9
|
||||
+ vporq %YMM5, %YMM7, %YMM10
|
||||
+
|
||||
+ /* A non-zero CHAR in YMM9 represents a mismatch. */
|
||||
+ vporq %YMM9, %YMM10, %YMM9
|
||||
+
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR. */
|
||||
+ VPCMP $0, %YMMZERO, %YMM9, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ je L(loop)
|
||||
+
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM0. */
|
||||
+ VPTESTM %YMM0, %YMM0, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and (%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM1, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_vec)
|
||||
- kmovd %k4, %edi
|
||||
- tzcntl %edi, %ecx
|
||||
+ tzcntl %ecx, %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %ecx
|
||||
@@ -466,9 +458,18 @@ L(test_vec):
|
||||
cmpq $VEC_SIZE, %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- ktestd %k5, %k5
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM2. */
|
||||
+ VPTESTM %YMM2, %YMM2, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM2 and VEC_SIZE(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM3, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_2_vec)
|
||||
- kmovd %k5, %ecx
|
||||
tzcntl %ecx, %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -512,9 +513,18 @@ L(test_2_vec):
|
||||
cmpq $(VEC_SIZE * 2), %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- ktestd %k6, %k6
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM4. */
|
||||
+ VPTESTM %YMM4, %YMM4, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM4 and (VEC_SIZE * 2)(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM5, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
je L(test_3_vec)
|
||||
- kmovd %k6, %ecx
|
||||
tzcntl %ecx, %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
@@ -558,8 +568,18 @@ L(test_3_vec):
|
||||
cmpq $(VEC_SIZE * 3), %r11
|
||||
jbe L(zero)
|
||||
# endif
|
||||
- kmovd %k7, %esi
|
||||
- tzcntl %esi, %ecx
|
||||
+ /* Each bit set in K1 represents a non-null CHAR in YMM6. */
|
||||
+ VPTESTM %YMM6, %YMM6, %k1
|
||||
+ /* Each bit cleared in K0 represents a mismatch or a null CHAR
|
||||
+ in YMM6 and (VEC_SIZE * 3)(%rdx). */
|
||||
+ VPCMP $0, %YMMZERO, %YMM7, %k0{%k1}
|
||||
+ kmovd %k0, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
+ tzcntl %ecx, %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* NB: Multiply wchar_t count by 4 to get the number of bytes. */
|
||||
sall $2, %ecx
|
||||
@@ -615,39 +635,51 @@ L(loop_cross_page):
|
||||
|
||||
VMOVU (%rax, %r10), %YMM2
|
||||
VMOVU VEC_SIZE(%rax, %r10), %YMM3
|
||||
- VMOVU (%rdx, %r10), %YMM4
|
||||
- VMOVU VEC_SIZE(%rdx, %r10), %YMM5
|
||||
-
|
||||
- VPCMP $4, %YMM4, %YMM2, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM4, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch in YMM2 and
|
||||
- YMM4. */
|
||||
- kord %k0, %k1, %k1
|
||||
-
|
||||
- VPCMP $4, %YMM5, %YMM3, %k3
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k4
|
||||
- VPCMP $0, %YMMZERO, %YMM5, %k5
|
||||
- kord %k4, %k5, %k4
|
||||
- /* Each bit in K3 represents a NULL or a mismatch in YMM3 and
|
||||
- YMM5. */
|
||||
- kord %k3, %k4, %k3
|
||||
+
|
||||
+ /* Each bit set in K2 represents a non-null CHAR in YMM2. */
|
||||
+ VPTESTM %YMM2, %YMM2, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM2 and 32 bytes at (%rdx, %r10). */
|
||||
+ VPCMP $0, (%rdx, %r10), %YMM2, %k1{%k2}
|
||||
+ kmovd %k1, %r9d
|
||||
+ /* Don't use subl since it is the lower 16/32 bits of RDI
|
||||
+ below. */
|
||||
+ notl %r9d
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* Only last 8 bits are valid. */
|
||||
+ andl $0xff, %r9d
|
||||
+# endif
|
||||
+
|
||||
+ /* Each bit set in K4 represents a non-null CHAR in YMM3. */
|
||||
+ VPTESTM %YMM3, %YMM3, %k4
|
||||
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
|
||||
+ in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
|
||||
+ VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
|
||||
+ kmovd %k3, %edi
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
|
||||
+ notl %edi
|
||||
+ andl $0xff, %edi
|
||||
+# else
|
||||
+ incl %edi
|
||||
+# endif
|
||||
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* NB: Each bit in K1/K3 represents 4-byte element. */
|
||||
- kshiftlw $8, %k3, %k2
|
||||
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
|
||||
+ sall $8, %edi
|
||||
/* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
bytes. */
|
||||
movl %ecx, %SHIFT_REG32
|
||||
sarl $2, %SHIFT_REG32
|
||||
+
|
||||
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
|
||||
+ orl %r9d, %edi
|
||||
# else
|
||||
- kshiftlq $32, %k3, %k2
|
||||
-# endif
|
||||
+ salq $32, %rdi
|
||||
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- korq %k1, %k2, %k1
|
||||
- kmovq %k1, %rdi
|
||||
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
|
||||
+ orq %r9, %rdi
|
||||
+# endif
|
||||
|
||||
/* Since ECX < VEC_SIZE * 2, simply skip the first ECX bytes. */
|
||||
shrxq %SHIFT_REG64, %rdi, %rdi
|
||||
@@ -692,35 +724,45 @@ L(loop_cross_page_2_vec):
|
||||
/* The first VEC_SIZE * 2 bytes match or are ignored. */
|
||||
VMOVU (VEC_SIZE * 2)(%rax, %r10), %YMM0
|
||||
VMOVU (VEC_SIZE * 3)(%rax, %r10), %YMM1
|
||||
- VMOVU (VEC_SIZE * 2)(%rdx, %r10), %YMM2
|
||||
- VMOVU (VEC_SIZE * 3)(%rdx, %r10), %YMM3
|
||||
-
|
||||
- VPCMP $4, %YMM0, %YMM2, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM2, %k2
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch in YMM0 and
|
||||
- YMM2. */
|
||||
- kord %k0, %k1, %k1
|
||||
-
|
||||
- VPCMP $4, %YMM1, %YMM3, %k3
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k4
|
||||
- VPCMP $0, %YMMZERO, %YMM3, %k5
|
||||
- kord %k4, %k5, %k4
|
||||
- /* Each bit in K3 represents a NULL or a mismatch in YMM1 and
|
||||
- YMM3. */
|
||||
- kord %k3, %k4, %k3
|
||||
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (VEC_SIZE * 2)(%rdx, %r10). */
|
||||
+ VPCMP $0, (VEC_SIZE * 2)(%rdx, %r10), %YMM0, %k1{%k2}
|
||||
+ kmovd %k1, %r9d
|
||||
+ /* Don't use subl since it is the lower 16/32 bits of RDI
|
||||
+ below. */
|
||||
+ notl %r9d
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* NB: Each bit in K1/K3 represents 4-byte element. */
|
||||
- kshiftlw $8, %k3, %k2
|
||||
+ /* Only last 8 bits are valid. */
|
||||
+ andl $0xff, %r9d
|
||||
+# endif
|
||||
+
|
||||
+ VPTESTM %YMM1, %YMM1, %k4
|
||||
+ /* Each bit cleared in K3 represents a mismatch or a null CHAR
|
||||
+ in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
|
||||
+ VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
|
||||
+ kmovd %k3, %edi
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* Don't use subl since it is the upper 8 bits of EDI below. */
|
||||
+ notl %edi
|
||||
+ andl $0xff, %edi
|
||||
# else
|
||||
- kshiftlq $32, %k3, %k2
|
||||
+ incl %edi
|
||||
# endif
|
||||
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- korq %k1, %k2, %k1
|
||||
- kmovq %k1, %rdi
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ /* NB: Each bit in EDI/R9D represents 4-byte element. */
|
||||
+ sall $8, %edi
|
||||
+
|
||||
+ /* Each bit in EDI represents a null CHAR or a mismatch. */
|
||||
+ orl %r9d, %edi
|
||||
+# else
|
||||
+ salq $32, %rdi
|
||||
+
|
||||
+ /* Each bit in RDI represents a null CHAR or a mismatch. */
|
||||
+ orq %r9, %rdi
|
||||
+# endif
|
||||
|
||||
xorl %r8d, %r8d
|
||||
/* If ECX > VEC_SIZE * 2, skip ECX - (VEC_SIZE * 2) bytes. */
|
||||
@@ -729,12 +771,15 @@ L(loop_cross_page_2_vec):
|
||||
/* R8 has number of bytes skipped. */
|
||||
movl %ecx, %r8d
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* NB: Divide shift count by 4 since each bit in K1 represent 4
|
||||
+ /* NB: Divide shift count by 4 since each bit in RDI represent 4
|
||||
bytes. */
|
||||
sarl $2, %ecx
|
||||
-# endif
|
||||
+ /* Skip ECX bytes. */
|
||||
+ shrl %cl, %edi
|
||||
+# else
|
||||
/* Skip ECX bytes. */
|
||||
shrq %cl, %rdi
|
||||
+# endif
|
||||
1:
|
||||
/* Before jumping back to the loop, set ESI to the number of
|
||||
VEC_SIZE * 4 blocks before page crossing. */
|
||||
@@ -818,7 +863,7 @@ L(cross_page_loop):
|
||||
movzbl (%rdi, %rdx), %eax
|
||||
movzbl (%rsi, %rdx), %ecx
|
||||
# endif
|
||||
- /* Check null char. */
|
||||
+ /* Check null CHAR. */
|
||||
testl %eax, %eax
|
||||
jne L(cross_page_loop)
|
||||
/* Since %eax == 0, subtract is OK for both SIGNED and UNSIGNED
|
||||
@@ -901,18 +946,17 @@ L(cross_page):
|
||||
jg L(cross_page_1_vector)
|
||||
L(loop_1_vector):
|
||||
VMOVU (%rdi, %rdx), %YMM0
|
||||
- VMOVU (%rsi, %rdx), %YMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a mismatch in YMM0 and YMM1. */
|
||||
- VPCMP $4, %YMM0, %YMM1, %k0
|
||||
- VPCMP $0, %YMMZERO, %YMM0, %k1
|
||||
- VPCMP $0, %YMMZERO, %YMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in YMM0 or YMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
+
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in YMM0 and 32 bytes at (%rsi, %rdx). */
|
||||
+ VPCMP $0, (%rsi, %rdx), %YMM0, %k1{%k2}
|
||||
kmovd %k1, %ecx
|
||||
- testl %ecx, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xff, %ecx
|
||||
+# else
|
||||
+ incl %ecx
|
||||
+# endif
|
||||
jne L(last_vector)
|
||||
|
||||
addl $VEC_SIZE, %edx
|
||||
@@ -931,18 +975,17 @@ L(cross_page_1_vector):
|
||||
cmpl $(PAGE_SIZE - 16), %eax
|
||||
jg L(cross_page_1_xmm)
|
||||
VMOVU (%rdi, %rdx), %XMM0
|
||||
- VMOVU (%rsi, %rdx), %XMM1
|
||||
-
|
||||
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
||||
- VPCMP $4, %XMM0, %XMM1, %k0
|
||||
- VPCMP $0, %XMMZERO, %XMM0, %k1
|
||||
- VPCMP $0, %XMMZERO, %XMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
||||
- korw %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- korw %k0, %k1, %k1
|
||||
- kmovw %k1, %ecx
|
||||
- testl %ecx, %ecx
|
||||
+
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in XMM0 and 16 bytes at (%rsi, %rdx). */
|
||||
+ VPCMP $0, (%rsi, %rdx), %XMM0, %k1{%k2}
|
||||
+ kmovd %k1, %ecx
|
||||
+# ifdef USE_AS_WCSCMP
|
||||
+ subl $0xf, %ecx
|
||||
+# else
|
||||
+ subl $0xffff, %ecx
|
||||
+# endif
|
||||
jne L(last_vector)
|
||||
|
||||
addl $16, %edx
|
||||
@@ -965,25 +1008,16 @@ L(cross_page_1_xmm):
|
||||
vmovq (%rdi, %rdx), %XMM0
|
||||
vmovq (%rsi, %rdx), %XMM1
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
||||
- VPCMP $4, %XMM0, %XMM1, %k0
|
||||
- VPCMP $0, %XMMZERO, %XMM0, %k1
|
||||
- VPCMP $0, %XMMZERO, %XMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
- kmovd %k1, %ecx
|
||||
-
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in XMM0 and XMM1. */
|
||||
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
|
||||
+ kmovb %k1, %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* Only last 2 bits are valid. */
|
||||
- andl $0x3, %ecx
|
||||
+ subl $0x3, %ecx
|
||||
# else
|
||||
- /* Only last 8 bits are valid. */
|
||||
- andl $0xff, %ecx
|
||||
+ subl $0xff, %ecx
|
||||
# endif
|
||||
-
|
||||
- testl %ecx, %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
addl $8, %edx
|
||||
@@ -1002,25 +1036,16 @@ L(cross_page_8bytes):
|
||||
vmovd (%rdi, %rdx), %XMM0
|
||||
vmovd (%rsi, %rdx), %XMM1
|
||||
|
||||
- /* Each bit in K0 represents a mismatch in XMM0 and XMM1. */
|
||||
- VPCMP $4, %XMM0, %XMM1, %k0
|
||||
- VPCMP $0, %XMMZERO, %XMM0, %k1
|
||||
- VPCMP $0, %XMMZERO, %XMM1, %k2
|
||||
- /* Each bit in K1 represents a NULL in XMM0 or XMM1. */
|
||||
- kord %k1, %k2, %k1
|
||||
- /* Each bit in K1 represents a NULL or a mismatch. */
|
||||
- kord %k0, %k1, %k1
|
||||
+ VPTESTM %YMM0, %YMM0, %k2
|
||||
+ /* Each bit cleared in K1 represents a mismatch or a null CHAR
|
||||
+ in XMM0 and XMM1. */
|
||||
+ VPCMP $0, %XMM1, %XMM0, %k1{%k2}
|
||||
kmovd %k1, %ecx
|
||||
-
|
||||
# ifdef USE_AS_WCSCMP
|
||||
- /* Only the last bit is valid. */
|
||||
- andl $0x1, %ecx
|
||||
+ subl $0x1, %ecx
|
||||
# else
|
||||
- /* Only last 4 bits are valid. */
|
||||
- andl $0xf, %ecx
|
||||
+ subl $0xf, %ecx
|
||||
# endif
|
||||
-
|
||||
- testl %ecx, %ecx
|
||||
jne L(last_vector)
|
||||
|
||||
addl $4, %edx
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,300 +0,0 @@
|
||||
From ee915088a0231cd421054dbd8abab7aadf331153 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:33:52 -0800
|
||||
Subject: [PATCH] x86-64 strncmp family: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes the strncmp family for x32. Tested on x86-64 and x32.
|
||||
On x86-64, libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/strcmp-avx2.S: Use RDX_LP for length.
|
||||
* sysdeps/x86_64/multiarch/strcmp-sse42.S: Likewise.
|
||||
* sysdeps/x86_64/strcmp.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncasecmp,
|
||||
tst-size_t-strncmp and tst-size_t-wcsncmp.
|
||||
* sysdeps/x86_64/x32/tst-size_t-strncasecmp.c: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-strncmp.c: Likewise.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wcsncmp.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 6 +-
|
||||
sysdeps/x86_64/multiarch/strcmp-sse42.S | 6 +-
|
||||
sysdeps/x86_64/strcmp.S | 6 +-
|
||||
sysdeps/x86_64/x32/Makefile | 6 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-strncasecmp.c | 59 ++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-strncmp.c | 78 +++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wcsncmp.c | 20 ++++++
|
||||
7 files changed, 170 insertions(+), 11 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 327e3d87..156c1949 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -79,15 +79,15 @@
|
||||
ENTRY (STRCMP)
|
||||
# ifdef USE_AS_STRNCMP
|
||||
/* Check for simple cases (0 or 1) in offset. */
|
||||
- cmp $1, %rdx
|
||||
+ cmp $1, %RDX_LP
|
||||
je L(char0)
|
||||
jb L(zero)
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* Convert units: from wide to byte char. */
|
||||
- shl $2, %rdx
|
||||
+ shl $2, %RDX_LP
|
||||
# endif
|
||||
/* Register %r11 tracks the maximum offset. */
|
||||
- movq %rdx, %r11
|
||||
+ mov %RDX_LP, %R11_LP
|
||||
# endif
|
||||
movl %edi, %eax
|
||||
xorl %edx, %edx
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
index d3c07bd2..a1ebea46 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
@@ -156,11 +156,11 @@ STRCMP_SSE42:
|
||||
#endif
|
||||
|
||||
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
je LABEL(strcmp_exitz)
|
||||
- cmp $1, %rdx
|
||||
+ cmp $1, %RDX_LP
|
||||
je LABEL(Byte0)
|
||||
- mov %rdx, %r11
|
||||
+ mov %RDX_LP, %R11_LP
|
||||
#endif
|
||||
mov %esi, %ecx
|
||||
mov %edi, %eax
|
||||
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
|
||||
index e16945b9..f47c8ad4 100644
|
||||
--- a/sysdeps/x86_64/strcmp.S
|
||||
+++ b/sysdeps/x86_64/strcmp.S
|
||||
@@ -135,11 +135,11 @@ ENTRY (STRCMP)
|
||||
* This implementation uses SSE to compare up to 16 bytes at a time.
|
||||
*/
|
||||
#if defined USE_AS_STRNCMP || defined USE_AS_STRNCASECMP_L
|
||||
- test %rdx, %rdx
|
||||
+ test %RDX_LP, %RDX_LP
|
||||
je LABEL(strcmp_exitz)
|
||||
- cmp $1, %rdx
|
||||
+ cmp $1, %RDX_LP
|
||||
je LABEL(Byte0)
|
||||
- mov %rdx, %r11
|
||||
+ mov %RDX_LP, %R11_LP
|
||||
#endif
|
||||
mov %esi, %ecx
|
||||
mov %edi, %eax
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index 98bd9ae9..db302839 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -7,9 +7,11 @@ endif
|
||||
|
||||
ifeq ($(subdir),string)
|
||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
||||
- tst-size_t-memrchr tst-size_t-memset
|
||||
+ tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
||||
+ tst-size_t-strncmp
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
-tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset
|
||||
+tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
|
||||
+ tst-size_t-wcsncmp
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
||||
new file mode 100644
|
||||
index 00000000..86233593
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncasecmp.c
|
||||
@@ -0,0 +1,59 @@
|
||||
+/* Test strncasecmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_NAME "strncasecmp"
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+IMPL (strncasecmp, 1)
|
||||
+
|
||||
+typedef int (*proto_t) (const char *, const char *, size_t);
|
||||
+
|
||||
+static int
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_strncasecmp (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ strncpy ((char *) buf1, (const char *) buf2, page_size);
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ int res = do_strncasecmp (dest, src);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncmp.c b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
||||
new file mode 100644
|
||||
index 00000000..54e6bd83
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncmp.c
|
||||
@@ -0,0 +1,78 @@
|
||||
+/* Test strncmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# define TEST_NAME "wcsncmp"
|
||||
+#else
|
||||
+# define TEST_NAME "strncmp"
|
||||
+#endif
|
||||
+
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# include <wchar.h>
|
||||
+
|
||||
+# define STRNCMP wcsncmp
|
||||
+# define STRNCPY wcsncpy
|
||||
+# define CHAR wchar_t
|
||||
+#else
|
||||
+# define STRNCMP strncmp
|
||||
+# define STRNCPY strncpy
|
||||
+# define CHAR char
|
||||
+#endif
|
||||
+
|
||||
+IMPL (STRNCMP, 1)
|
||||
+
|
||||
+typedef int (*proto_t) (const CHAR *, const CHAR *, size_t);
|
||||
+
|
||||
+
|
||||
+static int
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_strncmp (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ size_t size = page_size / sizeof (CHAR);
|
||||
+ parameter_t dest = { { size }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ STRNCPY ((CHAR *) buf1, (const CHAR *) buf2, size);
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ int res = do_strncmp (dest, src);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
||||
new file mode 100644
|
||||
index 00000000..4829647c
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsncmp.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wcsncmp with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-strncmp.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,54 +0,0 @@
|
||||
From 6720d36b6623c5e48c070d86acf61198b33e144e Mon Sep 17 00:00:00 2001
|
||||
From: Fangrui Song <maskray@google.com>
|
||||
Date: Tue, 2 Nov 2021 20:59:52 -0700
|
||||
Subject: [PATCH] x86-64: Replace movzx with movzbl
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Clang cannot assemble movzx in the AT&T dialect mode.
|
||||
|
||||
../sysdeps/x86_64/strcmp.S:2232:16: error: invalid operand for instruction
|
||||
movzx (%rsi), %ecx
|
||||
^~~~
|
||||
|
||||
Change movzx to movzbl, which follows the AT&T dialect and is used
|
||||
elsewhere in the file.
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-sse42.S | 4 ++--
|
||||
sysdeps/x86_64/strcmp.S | 4 ++--
|
||||
2 files changed, 4 insertions(+), 4 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-sse42.S b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
index a1ebea46..d8fdeb3a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-sse42.S
|
||||
@@ -1771,8 +1771,8 @@ LABEL(strcmp_exitz):
|
||||
.p2align 4
|
||||
// XXX Same as code above
|
||||
LABEL(Byte0):
|
||||
- movzx (%rsi), %ecx
|
||||
- movzx (%rdi), %eax
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
|
||||
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
||||
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
|
||||
diff --git a/sysdeps/x86_64/strcmp.S b/sysdeps/x86_64/strcmp.S
|
||||
index f47c8ad4..aa6df898 100644
|
||||
--- a/sysdeps/x86_64/strcmp.S
|
||||
+++ b/sysdeps/x86_64/strcmp.S
|
||||
@@ -2232,8 +2232,8 @@ LABEL(strcmp_exitz):
|
||||
|
||||
.p2align 4
|
||||
LABEL(Byte0):
|
||||
- movzx (%rsi), %ecx
|
||||
- movzx (%rdi), %eax
|
||||
+ movzbl (%rsi), %ecx
|
||||
+ movzbl (%rdi), %eax
|
||||
|
||||
#if defined USE_AS_STRCASECMP_L || defined USE_AS_STRNCASECMP_L
|
||||
leaq _nl_C_LC_CTYPE_tolower+128*4(%rip), %rdx
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,56 +0,0 @@
|
||||
From cf2c57526ba4b57e6863ad4db8a868e2678adce8 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 30 Apr 2021 05:58:59 -0700
|
||||
Subject: [PATCH] x86: Set rep_movsb_threshold to 2112 on processors with FSRM
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The glibc memcpy benchmark on Intel Core i7-1065G7 (Ice Lake) showed
|
||||
that REP MOVSB became faster after 2112 bytes:
|
||||
|
||||
Vector Move REP MOVSB
|
||||
length=2112, align1=0, align2=0: 24.20 24.40
|
||||
length=2112, align1=1, align2=0: 26.07 23.13
|
||||
length=2112, align1=0, align2=1: 27.18 28.13
|
||||
length=2112, align1=1, align2=1: 26.23 25.16
|
||||
length=2176, align1=0, align2=0: 23.18 22.52
|
||||
length=2176, align1=2, align2=0: 25.45 22.52
|
||||
length=2176, align1=0, align2=2: 27.14 27.82
|
||||
length=2176, align1=2, align2=2: 22.73 25.56
|
||||
length=2240, align1=0, align2=0: 24.62 24.25
|
||||
length=2240, align1=3, align2=0: 29.77 27.15
|
||||
length=2240, align1=0, align2=3: 35.55 29.93
|
||||
length=2240, align1=3, align2=3: 34.49 25.15
|
||||
length=2304, align1=0, align2=0: 34.75 26.64
|
||||
length=2304, align1=4, align2=0: 32.09 22.63
|
||||
length=2304, align1=0, align2=4: 28.43 31.24
|
||||
|
||||
Use REP MOVSB for data size > 2112 bytes in memcpy on processors with
|
||||
fast short REP MOVSB (FSRM).
|
||||
|
||||
* sysdeps/x86/dl-cacheinfo.h (dl_init_cacheinfo): Set
|
||||
rep_movsb_threshold to 2112 on processors with fast short REP
|
||||
MOVSB (FSRM).
|
||||
---
|
||||
sysdeps/x86/cacheinfo.h | 6 ++++++
|
||||
1 file changed, 6 insertions(+)
|
||||
|
||||
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
||||
index f72f634a..cc3941d3 100644
|
||||
--- a/sysdeps/x86/cacheinfo.h
|
||||
+++ b/sysdeps/x86/cacheinfo.h
|
||||
@@ -430,6 +430,12 @@ init_cacheinfo (void)
|
||||
rep_movsb_threshold = 2048 * (16 / 16);
|
||||
minimum_rep_movsb_threshold = 16 * 8;
|
||||
}
|
||||
+
|
||||
+ /* NB: The default REP MOVSB threshold is 2112 on processors with fast
|
||||
+ short REP MOVSB (FSRM). */
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, FSRM))
|
||||
+ rep_movsb_threshold = 2112;
|
||||
+
|
||||
if (cpu_features->rep_movsb_threshold > minimum_rep_movsb_threshold)
|
||||
__x86_rep_movsb_threshold = cpu_features->rep_movsb_threshold;
|
||||
else
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,136 +0,0 @@
|
||||
From 475b63702ef38b69558fc3d31a0b66776a70f1d3 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 1 Nov 2021 00:49:52 -0500
|
||||
Subject: [PATCH] x86: Double size of ERMS rep_movsb_threshold in
|
||||
dl-cacheinfo.h
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
This patch doubles the rep_movsb_threshold when using ERMS. Based on
|
||||
benchmarks the vector copy loop, especially now that it handles 4k
|
||||
aliasing, is better for these medium ranges.
|
||||
|
||||
On Skylake with ERMS:
|
||||
|
||||
Size, Align1, Align2, dst>src,(rep movsb) / (vec copy)
|
||||
4096, 0, 0, 0, 0.975
|
||||
4096, 0, 0, 1, 0.953
|
||||
4096, 12, 0, 0, 0.969
|
||||
4096, 12, 0, 1, 0.872
|
||||
4096, 44, 0, 0, 0.979
|
||||
4096, 44, 0, 1, 0.83
|
||||
4096, 0, 12, 0, 1.006
|
||||
4096, 0, 12, 1, 0.989
|
||||
4096, 0, 44, 0, 0.739
|
||||
4096, 0, 44, 1, 0.942
|
||||
4096, 12, 12, 0, 1.009
|
||||
4096, 12, 12, 1, 0.973
|
||||
4096, 44, 44, 0, 0.791
|
||||
4096, 44, 44, 1, 0.961
|
||||
4096, 2048, 0, 0, 0.978
|
||||
4096, 2048, 0, 1, 0.951
|
||||
4096, 2060, 0, 0, 0.986
|
||||
4096, 2060, 0, 1, 0.963
|
||||
4096, 2048, 12, 0, 0.971
|
||||
4096, 2048, 12, 1, 0.941
|
||||
4096, 2060, 12, 0, 0.977
|
||||
4096, 2060, 12, 1, 0.949
|
||||
8192, 0, 0, 0, 0.85
|
||||
8192, 0, 0, 1, 0.845
|
||||
8192, 13, 0, 0, 0.937
|
||||
8192, 13, 0, 1, 0.939
|
||||
8192, 45, 0, 0, 0.932
|
||||
8192, 45, 0, 1, 0.927
|
||||
8192, 0, 13, 0, 0.621
|
||||
8192, 0, 13, 1, 0.62
|
||||
8192, 0, 45, 0, 0.53
|
||||
8192, 0, 45, 1, 0.516
|
||||
8192, 13, 13, 0, 0.664
|
||||
8192, 13, 13, 1, 0.659
|
||||
8192, 45, 45, 0, 0.593
|
||||
8192, 45, 45, 1, 0.575
|
||||
8192, 2048, 0, 0, 0.854
|
||||
8192, 2048, 0, 1, 0.834
|
||||
8192, 2061, 0, 0, 0.863
|
||||
8192, 2061, 0, 1, 0.857
|
||||
8192, 2048, 13, 0, 0.63
|
||||
8192, 2048, 13, 1, 0.629
|
||||
8192, 2061, 13, 0, 0.627
|
||||
8192, 2061, 13, 1, 0.62
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86/cacheinfo.h | 8 +++++---
|
||||
sysdeps/x86/dl-tunables.list | 26 +++++++++++++++-----------
|
||||
2 files changed, 20 insertions(+), 14 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/cacheinfo.h b/sysdeps/x86/cacheinfo.h
|
||||
index cc3941d3..ac025e08 100644
|
||||
--- a/sysdeps/x86/cacheinfo.h
|
||||
+++ b/sysdeps/x86/cacheinfo.h
|
||||
@@ -411,18 +411,20 @@ init_cacheinfo (void)
|
||||
|
||||
/* NB: The REP MOVSB threshold must be greater than VEC_SIZE * 8. */
|
||||
unsigned int minimum_rep_movsb_threshold;
|
||||
- /* NB: The default REP MOVSB threshold is 2048 * (VEC_SIZE / 16). */
|
||||
+ /* NB: The default REP MOVSB threshold is 4096 * (VEC_SIZE / 16) for
|
||||
+ VEC_SIZE == 64 or 32. For VEC_SIZE == 16, the default REP MOVSB
|
||||
+ threshold is 2048 * (VEC_SIZE / 16). */
|
||||
unsigned int rep_movsb_threshold;
|
||||
if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
&& !CPU_FEATURE_PREFERRED_P (cpu_features, Prefer_No_AVX512))
|
||||
{
|
||||
- rep_movsb_threshold = 2048 * (64 / 16);
|
||||
+ rep_movsb_threshold = 4096 * (64 / 16);
|
||||
minimum_rep_movsb_threshold = 64 * 8;
|
||||
}
|
||||
else if (CPU_FEATURE_PREFERRED_P (cpu_features,
|
||||
AVX_Fast_Unaligned_Load))
|
||||
{
|
||||
- rep_movsb_threshold = 2048 * (32 / 16);
|
||||
+ rep_movsb_threshold = 4096 * (32 / 16);
|
||||
minimum_rep_movsb_threshold = 32 * 8;
|
||||
}
|
||||
else
|
||||
diff --git a/sysdeps/x86/dl-tunables.list b/sysdeps/x86/dl-tunables.list
|
||||
index 89bf2966..56c6834a 100644
|
||||
--- a/sysdeps/x86/dl-tunables.list
|
||||
+++ b/sysdeps/x86/dl-tunables.list
|
||||
@@ -32,17 +32,21 @@ glibc {
|
||||
}
|
||||
x86_rep_movsb_threshold {
|
||||
type: SIZE_T
|
||||
- # Since there is overhead to set up REP MOVSB operation, REP MOVSB
|
||||
- # isn't faster on short data. The memcpy micro benchmark in glibc
|
||||
- # shows that 2KB is the approximate value above which REP MOVSB
|
||||
- # becomes faster than SSE2 optimization on processors with Enhanced
|
||||
- # REP MOVSB. Since larger register size can move more data with a
|
||||
- # single load and store, the threshold is higher with larger register
|
||||
- # size. Note: Since the REP MOVSB threshold must be greater than 8
|
||||
- # times of vector size and the default value is 2048 * (vector size
|
||||
- # / 16), the default value and the minimum value must be updated at
|
||||
- # run-time. NB: Don't set the default value since we can't tell if
|
||||
- # the tunable value is set by user or not [BZ #27069].
|
||||
+ # Since there is overhead to set up REP MOVSB operation, REP
|
||||
+ # MOVSB isn't faster on short data. The memcpy micro benchmark
|
||||
+ # in glibc shows that 2KB is the approximate value above which
|
||||
+ # REP MOVSB becomes faster than SSE2 optimization on processors
|
||||
+ # with Enhanced REP MOVSB. Since larger register size can move
|
||||
+ # more data with a single load and store, the threshold is
|
||||
+ # higher with larger register size. Micro benchmarks show AVX
|
||||
+ # REP MOVSB becomes faster apprximately at 8KB. The AVX512
|
||||
+ # threshold is extrapolated to 16KB. For machines with FSRM the
|
||||
+ # threshold is universally set at 2112 bytes. Note: Since the
|
||||
+ # REP MOVSB threshold must be greater than 8 times of vector
|
||||
+ # size and the default value is 4096 * (vector size / 16), the
|
||||
+ # default value and the minimum value must be updated at
|
||||
+ # run-time. NB: Don't set the default value since we can't tell
|
||||
+ # if the tunable value is set by user or not [BZ #27069].
|
||||
minval: 1
|
||||
}
|
||||
x86_rep_stosb_threshold {
|
||||
--
|
||||
GitLab
|
||||
|
File diff suppressed because it is too large
Load Diff
@ -1,39 +0,0 @@
|
||||
From 0b82747dc48d5bf0871bdc6da8cb6eec1256355f Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Thu, 11 Nov 2021 06:31:51 -0800
|
||||
Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_lock_full [BZ
|
||||
#28537]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Replace boolean CAS with value CAS to avoid the extra load.
|
||||
|
||||
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
---
|
||||
nptl/pthread_mutex_lock.c | 10 +++++-----
|
||||
1 file changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index 29cc143e..60ada70d 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -292,12 +292,12 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
|
||||
meantime. */
|
||||
if ((oldval & FUTEX_WAITERS) == 0)
|
||||
{
|
||||
- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
|
||||
- oldval | FUTEX_WAITERS,
|
||||
- oldval)
|
||||
- != 0)
|
||||
+ int val;
|
||||
+ if ((val = atomic_compare_and_exchange_val_acq
|
||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
||||
+ oldval)) != oldval)
|
||||
{
|
||||
- oldval = mutex->__data.__lock;
|
||||
+ oldval = val;
|
||||
continue;
|
||||
}
|
||||
oldval |= FUTEX_WAITERS;
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,39 +0,0 @@
|
||||
From 49302b8fdf9103b6fc0a398678668a22fa19574c Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Thu, 11 Nov 2021 06:54:01 -0800
|
||||
Subject: [PATCH] Avoid extra load with CAS in __pthread_mutex_clocklock_common
|
||||
[BZ #28537]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Replace boolean CAS with value CAS to avoid the extra load.
|
||||
|
||||
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
---
|
||||
nptl/pthread_mutex_timedlock.c | 10 +++++-----
|
||||
1 file changed, 5 insertions(+), 5 deletions(-)
|
||||
|
||||
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
|
||||
index 888c12fe..c4627ef6 100644
|
||||
--- a/nptl/pthread_mutex_timedlock.c
|
||||
+++ b/nptl/pthread_mutex_timedlock.c
|
||||
@@ -269,12 +269,12 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
|
||||
meantime. */
|
||||
if ((oldval & FUTEX_WAITERS) == 0)
|
||||
{
|
||||
- if (atomic_compare_and_exchange_bool_acq (&mutex->__data.__lock,
|
||||
- oldval | FUTEX_WAITERS,
|
||||
- oldval)
|
||||
- != 0)
|
||||
+ int val;
|
||||
+ if ((val = atomic_compare_and_exchange_val_acq
|
||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
||||
+ oldval)) != oldval)
|
||||
{
|
||||
- oldval = mutex->__data.__lock;
|
||||
+ oldval = val;
|
||||
continue;
|
||||
}
|
||||
oldval |= FUTEX_WAITERS;
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,51 +0,0 @@
|
||||
From d672a98a1af106bd68deb15576710cd61363f7a6 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Tue, 2 Nov 2021 18:33:07 -0700
|
||||
Subject: [PATCH] Add LLL_MUTEX_READ_LOCK [BZ #28537]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
CAS instruction is expensive. From the x86 CPU's point of view, getting
|
||||
a cache line for writing is more expensive than reading. See Appendix
|
||||
A.2 Spinlock in:
|
||||
|
||||
https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/xeon-lock-scaling-analysis-paper.pdf
|
||||
|
||||
The full compare and swap will grab the cache line exclusive and cause
|
||||
excessive cache line bouncing.
|
||||
|
||||
Add LLL_MUTEX_READ_LOCK to do an atomic load and skip CAS in spinlock
|
||||
loop if compare may fail to reduce cache line bouncing on contended locks.
|
||||
|
||||
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
||||
---
|
||||
nptl/pthread_mutex_lock.c | 7 +++++++
|
||||
1 file changed, 7 insertions(+)
|
||||
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index 60ada70d..eb4d8baa 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -56,6 +56,11 @@
|
||||
#define FORCE_ELISION(m, s)
|
||||
#endif
|
||||
|
||||
+#ifndef LLL_MUTEX_READ_LOCK
|
||||
+# define LLL_MUTEX_READ_LOCK(mutex) \
|
||||
+ atomic_load_relaxed (&(mutex)->__data.__lock)
|
||||
+#endif
|
||||
+
|
||||
static int __pthread_mutex_lock_full (pthread_mutex_t *mutex)
|
||||
__attribute_noinline__;
|
||||
|
||||
@@ -136,6 +141,8 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
||||
break;
|
||||
}
|
||||
atomic_spin_nop ();
|
||||
+ if (LLL_MUTEX_READ_LOCK (mutex) != 0)
|
||||
+ continue;
|
||||
}
|
||||
while (LLL_MUTEX_TRYLOCK (mutex) != 0);
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,71 +0,0 @@
|
||||
From 120ac6d238825452e8024e2f627da33b2508dfd3 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 12 Nov 2021 11:47:42 -0800
|
||||
Subject: [PATCH] Move assignment out of the CAS condition
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Update
|
||||
|
||||
commit 49302b8fdf9103b6fc0a398678668a22fa19574c
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Thu Nov 11 06:54:01 2021 -0800
|
||||
|
||||
Avoid extra load with CAS in __pthread_mutex_clocklock_common [BZ #28537]
|
||||
|
||||
Replace boolean CAS with value CAS to avoid the extra load.
|
||||
|
||||
and
|
||||
|
||||
commit 0b82747dc48d5bf0871bdc6da8cb6eec1256355f
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Thu Nov 11 06:31:51 2021 -0800
|
||||
|
||||
Avoid extra load with CAS in __pthread_mutex_lock_full [BZ #28537]
|
||||
|
||||
Replace boolean CAS with value CAS to avoid the extra load.
|
||||
|
||||
by moving assignment out of the CAS condition.
|
||||
---
|
||||
nptl/pthread_mutex_lock.c | 7 +++----
|
||||
nptl/pthread_mutex_timedlock.c | 7 +++----
|
||||
2 files changed, 6 insertions(+), 8 deletions(-)
|
||||
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index eb4d8baa..a633d95e 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -299,10 +299,9 @@ __pthread_mutex_lock_full (pthread_mutex_t *mutex)
|
||||
meantime. */
|
||||
if ((oldval & FUTEX_WAITERS) == 0)
|
||||
{
|
||||
- int val;
|
||||
- if ((val = atomic_compare_and_exchange_val_acq
|
||||
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
||||
- oldval)) != oldval)
|
||||
+ int val = atomic_compare_and_exchange_val_acq
|
||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
|
||||
+ if (val != oldval)
|
||||
{
|
||||
oldval = val;
|
||||
continue;
|
||||
diff --git a/nptl/pthread_mutex_timedlock.c b/nptl/pthread_mutex_timedlock.c
|
||||
index c4627ef6..a76c30b7 100644
|
||||
--- a/nptl/pthread_mutex_timedlock.c
|
||||
+++ b/nptl/pthread_mutex_timedlock.c
|
||||
@@ -269,10 +269,9 @@ __pthread_mutex_timedlock (pthread_mutex_t *mutex,
|
||||
meantime. */
|
||||
if ((oldval & FUTEX_WAITERS) == 0)
|
||||
{
|
||||
- int val;
|
||||
- if ((val = atomic_compare_and_exchange_val_acq
|
||||
- (&mutex->__data.__lock, oldval | FUTEX_WAITERS,
|
||||
- oldval)) != oldval)
|
||||
+ int val = atomic_compare_and_exchange_val_acq
|
||||
+ (&mutex->__data.__lock, oldval | FUTEX_WAITERS, oldval);
|
||||
+ if (val != oldval)
|
||||
{
|
||||
oldval = val;
|
||||
continue;
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,60 +0,0 @@
|
||||
From 4df1fa6ddc8925a75f3da644d5da3bb16eb33f02 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 3 Dec 2021 15:29:25 -0800
|
||||
Subject: [PATCH] x86-64: Use notl in EVEX strcmp [BZ #28646]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Must use notl %edi here as lower bits are for CHAR comparisons
|
||||
potentially out of range thus can be 0 without indicating mismatch.
|
||||
This fixes BZ #28646.
|
||||
|
||||
Co-Authored-By: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 14 ++++++++------
|
||||
1 file changed, 8 insertions(+), 6 deletions(-)
|
||||
|
||||
Conflicts:
|
||||
string/test-strcmp.c
|
||||
(new check omitted)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index 82f12ac8..6f5c4bf9 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -656,12 +656,13 @@ L(loop_cross_page):
|
||||
in YMM3 and 32 bytes at VEC_SIZE(%rdx, %r10). */
|
||||
VPCMP $0, VEC_SIZE(%rdx, %r10), %YMM3, %k3{%k4}
|
||||
kmovd %k3, %edi
|
||||
+ /* Must use notl %edi here as lower bits are for CHAR
|
||||
+ comparisons potentially out of range thus can be 0 without
|
||||
+ indicating mismatch. */
|
||||
+ notl %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* Don't use subl since it is the upper 8 bits of EDI below. */
|
||||
- notl %edi
|
||||
andl $0xff, %edi
|
||||
-# else
|
||||
- incl %edi
|
||||
# endif
|
||||
|
||||
# ifdef USE_AS_WCSCMP
|
||||
@@ -743,12 +744,13 @@ L(loop_cross_page_2_vec):
|
||||
in YMM1 and 32 bytes at (VEC_SIZE * 3)(%rdx, %r10). */
|
||||
VPCMP $0, (VEC_SIZE * 3)(%rdx, %r10), %YMM1, %k3{%k4}
|
||||
kmovd %k3, %edi
|
||||
+ /* Must use notl %edi here as lower bits are for CHAR
|
||||
+ comparisons potentially out of range thus can be 0 without
|
||||
+ indicating mismatch. */
|
||||
+ notl %edi
|
||||
# ifdef USE_AS_WCSCMP
|
||||
/* Don't use subl since it is the upper 8 bits of EDI below. */
|
||||
- notl %edi
|
||||
andl $0xff, %edi
|
||||
-# else
|
||||
- incl %edi
|
||||
# endif
|
||||
|
||||
# ifdef USE_AS_WCSCMP
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,35 +0,0 @@
|
||||
From ceeffe968c01b1202e482f4855cb6baf5c6cb713 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 6 Dec 2021 07:14:12 -0800
|
||||
Subject: [PATCH] x86: Don't set Prefer_No_AVX512 for processors with AVX512
|
||||
and AVX-VNNI
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Don't set Prefer_No_AVX512 on processors with AVX512 and AVX-VNNI since
|
||||
they won't lower CPU frequency when ZMM load and store instructions are
|
||||
used.
|
||||
---
|
||||
sysdeps/x86/cpu-features.c | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
|
||||
index 956bfb4f..5ff2baa0 100644
|
||||
--- a/sysdeps/x86/cpu-features.c
|
||||
+++ b/sysdeps/x86/cpu-features.c
|
||||
@@ -525,8 +525,11 @@ init_cpu_features (struct cpu_features *cpu_features)
|
||||
|= bit_arch_Prefer_No_VZEROUPPER;
|
||||
else
|
||||
{
|
||||
- cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
- |= bit_arch_Prefer_No_AVX512;
|
||||
+ /* Processors with AVX512 and AVX-VNNI won't lower CPU frequency
|
||||
+ when ZMM load and store instructions are used. */
|
||||
+ if (!CPU_FEATURES_CPU_P (cpu_features, AVX_VNNI))
|
||||
+ cpu_features->preferred[index_arch_Prefer_No_AVX512]
|
||||
+ |= bit_arch_Prefer_No_AVX512;
|
||||
|
||||
/* Avoid RTM abort triggered by VZEROUPPER inside a
|
||||
transactionally executing RTM region. */
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,153 +0,0 @@
|
||||
From c7c54f65b080affb87a1513dee449c8ad6143c8b Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:35:18 -0800
|
||||
Subject: [PATCH] x86-64 strncpy: Properly handle the length parameter [BZ#
|
||||
24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes strncpy for x32. Tested on x86-64 and x32. On x86-64,
|
||||
libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/strcpy-avx2.S: Use RDX_LP for length.
|
||||
* sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S: Likewise.
|
||||
* sysdeps/x86_64/multiarch/strcpy-ssse3.S: Likewise.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strncpy.
|
||||
* sysdeps/x86_64/x32/tst-size_t-strncpy.c: New file.
|
||||
---
|
||||
.../x86_64/multiarch/strcpy-sse2-unaligned.S | 4 +-
|
||||
sysdeps/x86_64/multiarch/strcpy-ssse3.S | 6 +-
|
||||
sysdeps/x86_64/x32/Makefile | 2 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-strncpy.c | 58 +++++++++++++++++++
|
||||
4 files changed, 64 insertions(+), 6 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
sysdeps/x86_64/multiarch/strcpy-avx2.S
|
||||
(skipped, only needed for x32 arch)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
||||
index 72bf7e85..50aca22d 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcpy-sse2-unaligned.S
|
||||
@@ -40,8 +40,8 @@
|
||||
.text
|
||||
ENTRY (STRCPY)
|
||||
# ifdef USE_AS_STRNCPY
|
||||
- mov %rdx, %r8
|
||||
- test %r8, %r8
|
||||
+ mov %RDX_LP, %R8_LP
|
||||
+ test %R8_LP, %R8_LP
|
||||
jz L(ExitZero)
|
||||
# endif
|
||||
mov %rsi, %rcx
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcpy-ssse3.S b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
||||
index 9858d0c4..0a62814a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcpy-ssse3.S
|
||||
@@ -31,13 +31,13 @@ ENTRY (STRCPY)
|
||||
|
||||
mov %rsi, %rcx
|
||||
# ifdef USE_AS_STRNCPY
|
||||
- mov %rdx, %r8
|
||||
+ mov %RDX_LP, %R8_LP
|
||||
# endif
|
||||
mov %rdi, %rdx
|
||||
# ifdef USE_AS_STRNCPY
|
||||
- test %r8, %r8
|
||||
+ test %R8_LP, %R8_LP
|
||||
jz L(Exit0)
|
||||
- cmp $8, %r8
|
||||
+ cmp $8, %R8_LP
|
||||
jbe L(StrncpyExit8Bytes)
|
||||
# endif
|
||||
cmpb $0, (%rcx)
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index db302839..2a9e20a9 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -8,7 +8,7 @@ endif
|
||||
ifeq ($(subdir),string)
|
||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
||||
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
||||
- tst-size_t-strncmp
|
||||
+ tst-size_t-strncmp tst-size_t-strncpy
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strncpy.c b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
||||
new file mode 100644
|
||||
index 00000000..4dec71e6
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strncpy.c
|
||||
@@ -0,0 +1,58 @@
|
||||
+/* Test strncpy with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define TEST_NAME "strncpy"
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+IMPL (strncpy, 1)
|
||||
+
|
||||
+typedef char *(*proto_t) (char *, const char*, size_t);
|
||||
+
|
||||
+static void *
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_strncpy (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&b, a.p, b.p, a.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ parameter_t dest = { { page_size }, buf1 };
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ do_strncpy (dest, src);
|
||||
+ int res = strncmp (dest.p, src.p, dest.len);
|
||||
+ if (res)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: %i != 0",
|
||||
+ impl->name, res);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,389 +0,0 @@
|
||||
From abddd61de090ae84e380aff68a98bd94ef704667 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 24 Dec 2021 18:54:41 -0600
|
||||
Subject: [PATCH] x86: Optimize L(less_vec) case in memcmp-evex-movbe.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
Optimizations are twofold.
|
||||
|
||||
1) Replace page cross and 0/1 checks with masked load instructions in
|
||||
L(less_vec). In applications this reduces branch-misses in the
|
||||
hot [0, 32] case.
|
||||
2) Change controlflow so that L(less_vec) case gets the fall through.
|
||||
|
||||
Change 2) helps copies in the [0, 32] size range but comes at the cost
|
||||
of copies in the [33, 64] size range. From profiles of GCC and
|
||||
Python3, 94%+ and 99%+ of calls are in the [0, 32] range so this
|
||||
appears to be the right tradeoff.
|
||||
|
||||
Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memcmp-evex-movbe.S | 249 +++++--------------
|
||||
1 file changed, 56 insertions(+), 193 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
index 640f6757..d2899e7c 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memcmp-evex-movbe.S
|
||||
@@ -62,15 +62,18 @@ Latency:
|
||||
# define VMOVU vmovdqu64
|
||||
|
||||
# ifdef USE_AS_WMEMCMP
|
||||
+# define VMOVU_MASK vmovdqu32
|
||||
# define CHAR_SIZE 4
|
||||
# define VPCMP vpcmpd
|
||||
# define VPTEST vptestmd
|
||||
# else
|
||||
+# define VMOVU_MASK vmovdqu8
|
||||
# define CHAR_SIZE 1
|
||||
# define VPCMP vpcmpub
|
||||
# define VPTEST vptestmb
|
||||
# endif
|
||||
|
||||
+
|
||||
# define VEC_SIZE 32
|
||||
# define PAGE_SIZE 4096
|
||||
# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
||||
@@ -102,12 +105,48 @@ ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
movl %edx, %edx
|
||||
# endif
|
||||
cmp $CHAR_PER_VEC, %RDX_LP
|
||||
- jb L(less_vec)
|
||||
+ /* Fall through for [0, VEC_SIZE] as its the hottest. */
|
||||
+ ja L(more_1x_vec)
|
||||
+
|
||||
+ /* Create mask for CHAR's we want to compare. This allows us to
|
||||
+ avoid having to include page cross logic. */
|
||||
+ movl $-1, %ecx
|
||||
+ bzhil %edx, %ecx, %ecx
|
||||
+ kmovd %ecx, %k2
|
||||
+
|
||||
+ /* Safe to load full ymm with mask. */
|
||||
+ VMOVU_MASK (%rsi), %YMM2{%k2}
|
||||
+ VPCMP $4,(%rdi), %YMM2, %k1{%k2}
|
||||
+ kmovd %k1, %eax
|
||||
+ testl %eax, %eax
|
||||
+ jnz L(return_vec_0)
|
||||
+ ret
|
||||
|
||||
+ .p2align 4
|
||||
+L(return_vec_0):
|
||||
+ tzcntl %eax, %eax
|
||||
+# ifdef USE_AS_WMEMCMP
|
||||
+ movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
+ xorl %edx, %edx
|
||||
+ cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
+ /* NB: no partial register stall here because xorl zero idiom
|
||||
+ above. */
|
||||
+ setg %dl
|
||||
+ leal -1(%rdx, %rdx), %eax
|
||||
+# else
|
||||
+ movzbl (%rsi, %rax), %ecx
|
||||
+ movzbl (%rdi, %rax), %eax
|
||||
+ subl %ecx, %eax
|
||||
+# endif
|
||||
+ ret
|
||||
+
|
||||
+
|
||||
+ .p2align 4
|
||||
+L(more_1x_vec):
|
||||
/* From VEC to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
VMOVU (%rsi), %YMM1
|
||||
/* Use compare not equals to directly check for mismatch. */
|
||||
- VPCMP $4, (%rdi), %YMM1, %k1
|
||||
+ VPCMP $4,(%rdi), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
/* NB: eax must be destination register if going to
|
||||
L(return_vec_[0,2]). For L(return_vec_3) destination register
|
||||
@@ -131,13 +170,13 @@ ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
|
||||
/* Check third and fourth VEC no matter what. */
|
||||
VMOVU (VEC_SIZE * 2)(%rsi), %YMM3
|
||||
- VPCMP $4, (VEC_SIZE * 2)(%rdi), %YMM3, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 2)(%rdi), %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(return_vec_2)
|
||||
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
- VPCMP $4, (VEC_SIZE * 3)(%rdi), %YMM4, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 3)(%rdi), %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
testl %ecx, %ecx
|
||||
jnz L(return_vec_3)
|
||||
@@ -169,7 +208,7 @@ ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
VMOVU (VEC_SIZE * 3)(%rsi), %YMM4
|
||||
/* Ternary logic to xor (VEC_SIZE * 3)(%rdi) with YMM4 while
|
||||
oring with YMM1. Result is stored in YMM4. */
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
|
||||
/* Or together YMM2, YMM3, and YMM4 into YMM4. */
|
||||
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
@@ -184,7 +223,8 @@ ENTRY_P2ALIGN (MEMCMP, 6)
|
||||
/* NB: eax must be zero to reach here. */
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
+
|
||||
+ .p2align 4,, 8
|
||||
L(8x_end_return_vec_0_1_2_3):
|
||||
movq %rdx, %rdi
|
||||
L(8x_return_vec_0_1_2_3):
|
||||
@@ -222,23 +262,6 @@ L(return_vec_3):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- .p2align 4
|
||||
-L(return_vec_0):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
- /* NB: no partial register stall here because xorl zero idiom
|
||||
- above. */
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rsi, %rax), %ecx
|
||||
- movzbl (%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
|
||||
.p2align 4
|
||||
L(return_vec_1):
|
||||
@@ -297,7 +320,7 @@ L(loop_4x_vec):
|
||||
VMOVU (VEC_SIZE * 2)(%rsi, %rdi), %YMM3
|
||||
vpxorq (VEC_SIZE * 2)(%rdi), %YMM3, %YMM3
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdi), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdi), %YMM1, %YMM4
|
||||
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
@@ -324,7 +347,7 @@ L(loop_4x_vec):
|
||||
VMOVU VEC_SIZE(%rsi, %rdx), %YMM2
|
||||
vpxorq VEC_SIZE(%rdx), %YMM2, %YMM2
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM4
|
||||
- vpternlogd $0xde, (VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
+ vpternlogd $0xde,(VEC_SIZE * 3)(%rdx), %YMM1, %YMM4
|
||||
vpternlogd $0xfe, %YMM2, %YMM3, %YMM4
|
||||
VPTEST %YMM4, %YMM4, %k1
|
||||
kmovd %k1, %ecx
|
||||
@@ -336,14 +359,14 @@ L(loop_4x_vec):
|
||||
/* Only entry is from L(more_8x_vec). */
|
||||
.p2align 4,, 10
|
||||
L(8x_last_2x_vec):
|
||||
- VPCMP $4, (VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 2)(%rdx), %YMM3, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(8x_return_vec_2)
|
||||
/* Naturally aligned to 16 bytes. */
|
||||
L(8x_last_1x_vec):
|
||||
VMOVU (VEC_SIZE * 3)(%rsi, %rdx), %YMM1
|
||||
- VPCMP $4, (VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
||||
+ VPCMP $4,(VEC_SIZE * 3)(%rdx), %YMM1, %k1
|
||||
kmovd %k1, %eax
|
||||
testl %eax, %eax
|
||||
jnz L(8x_return_vec_3)
|
||||
@@ -392,7 +415,9 @@ L(last_1x_vec):
|
||||
jnz L(return_vec_0_end)
|
||||
ret
|
||||
|
||||
- .p2align 4,, 10
|
||||
+
|
||||
+ /* Don't align. Takes 2-fetch blocks either way and aligning
|
||||
+ will cause code to spill into another cacheline. */
|
||||
L(return_vec_1_end):
|
||||
/* Use bsf to save code size. This is necessary to have
|
||||
L(one_or_less) fit in aligning bytes between. */
|
||||
@@ -411,31 +436,8 @@ L(return_vec_1_end):
|
||||
# endif
|
||||
ret
|
||||
|
||||
- /* NB: L(one_or_less) fits in alignment padding between
|
||||
- L(return_vec_1_end) and L(return_vec_0_end). */
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movl (%rdi), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi), %ecx
|
||||
- je L(zero)
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
- ret
|
||||
-# else
|
||||
-L(one_or_less):
|
||||
- jb L(zero)
|
||||
- movzbl (%rsi), %ecx
|
||||
- movzbl (%rdi), %eax
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-# endif
|
||||
-L(zero):
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
+ /* Don't align. Takes 2-fetch blocks either way and aligning
|
||||
+ will cause code to spill into another cacheline. */
|
||||
L(return_vec_0_end):
|
||||
tzcntl %eax, %eax
|
||||
addl %edx, %eax
|
||||
@@ -451,146 +453,7 @@ L(return_vec_0_end):
|
||||
subl %ecx, %eax
|
||||
# endif
|
||||
ret
|
||||
+ /* 1-byte until next cache line. */
|
||||
|
||||
- .p2align 4
|
||||
-L(less_vec):
|
||||
- /* Check if one or less CHAR. This is necessary for size == 0
|
||||
- but is also faster for size == CHAR_SIZE. */
|
||||
- cmpl $1, %edx
|
||||
- jbe L(one_or_less)
|
||||
-
|
||||
- /* Check if loading one VEC from either s1 or s2 could cause a
|
||||
- page cross. This can have false positives but is by far the
|
||||
- fastest method. */
|
||||
- movl %edi, %eax
|
||||
- orl %esi, %eax
|
||||
- andl $(PAGE_SIZE - 1), %eax
|
||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
||||
- jg L(page_cross_less_vec)
|
||||
-
|
||||
- /* No page cross possible. */
|
||||
- VMOVU (%rsi), %YMM2
|
||||
- VPCMP $4, (%rdi), %YMM2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- /* Check if any matches where in bounds. Intentionally not
|
||||
- storing result in eax to limit dependency chain if it goes to
|
||||
- L(return_vec_0_lv). */
|
||||
- bzhil %edx, %eax, %edx
|
||||
- jnz L(return_vec_0_lv)
|
||||
- xorl %eax, %eax
|
||||
- ret
|
||||
-
|
||||
- /* Essentially duplicate of L(return_vec_0). Ends up not costing
|
||||
- any code as shrinks L(less_vec) by allowing 2-byte encoding of
|
||||
- the jump and ends up fitting in aligning bytes. As well fits on
|
||||
- same cache line as L(less_vec) so also saves a line from having
|
||||
- to be fetched on cold calls to memcmp. */
|
||||
- .p2align 4,, 4
|
||||
-L(return_vec_0_lv):
|
||||
- tzcntl %eax, %eax
|
||||
-# ifdef USE_AS_WMEMCMP
|
||||
- movl (%rdi, %rax, CHAR_SIZE), %ecx
|
||||
- xorl %edx, %edx
|
||||
- cmpl (%rsi, %rax, CHAR_SIZE), %ecx
|
||||
- /* NB: no partial register stall here because xorl zero idiom
|
||||
- above. */
|
||||
- setg %dl
|
||||
- leal -1(%rdx, %rdx), %eax
|
||||
-# else
|
||||
- movzbl (%rsi, %rax), %ecx
|
||||
- movzbl (%rdi, %rax), %eax
|
||||
- subl %ecx, %eax
|
||||
-# endif
|
||||
- ret
|
||||
-
|
||||
- .p2align 4
|
||||
-L(page_cross_less_vec):
|
||||
- /* if USE_AS_WMEMCMP it can only be 0, 4, 8, 12, 16, 20, 24, 28
|
||||
- bytes. */
|
||||
- cmpl $(16 / CHAR_SIZE), %edx
|
||||
- jae L(between_16_31)
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
- cmpl $8, %edx
|
||||
- jae L(between_8_15)
|
||||
- cmpl $4, %edx
|
||||
- jb L(between_2_3)
|
||||
-
|
||||
- /* Load as big endian with overlapping movbe to avoid branches.
|
||||
- */
|
||||
- movbe (%rdi), %eax
|
||||
- movbe (%rsi), %ecx
|
||||
- shlq $32, %rax
|
||||
- shlq $32, %rcx
|
||||
- movbe -4(%rdi, %rdx), %edi
|
||||
- movbe -4(%rsi, %rdx), %esi
|
||||
- orq %rdi, %rax
|
||||
- orq %rsi, %rcx
|
||||
- subq %rcx, %rax
|
||||
- /* edx is guranteed to be positive int32 in range [4, 7]. */
|
||||
- cmovne %edx, %eax
|
||||
- /* ecx is -1 if rcx > rax. Otherwise 0. */
|
||||
- sbbl %ecx, %ecx
|
||||
- /* If rcx > rax, then ecx is 0 and eax is positive. If rcx ==
|
||||
- rax then eax and ecx are zero. If rax < rax then ecx is -1 so
|
||||
- eax doesn't matter. */
|
||||
- orl %ecx, %eax
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(between_8_15):
|
||||
-# endif
|
||||
- /* If USE_AS_WMEMCMP fall through into 8-15 byte case. */
|
||||
- vmovq (%rdi), %xmm1
|
||||
- vmovq (%rsi), %xmm2
|
||||
- VPCMP $4, %xmm1, %xmm2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_lv)
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- vmovq -8(%rdi, %rdx, CHAR_SIZE), %xmm1
|
||||
- vmovq -8(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
- VPCMP $4, %xmm1, %xmm2, %k1
|
||||
- addl $(CHAR_PER_VEC - (8 / CHAR_SIZE)), %edx
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_end)
|
||||
- ret
|
||||
-
|
||||
- .p2align 4,, 8
|
||||
-L(between_16_31):
|
||||
- /* From 16 to 31 bytes. No branch when size == 16. */
|
||||
-
|
||||
- /* Use movups to save code size. */
|
||||
- vmovdqu (%rsi), %xmm2
|
||||
- VPCMP $4, (%rdi), %xmm2, %k1
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_lv)
|
||||
- /* Use overlapping loads to avoid branches. */
|
||||
- vmovdqu -16(%rsi, %rdx, CHAR_SIZE), %xmm2
|
||||
- VPCMP $4, -16(%rdi, %rdx, CHAR_SIZE), %xmm2, %k1
|
||||
- addl $(CHAR_PER_VEC - (16 / CHAR_SIZE)), %edx
|
||||
- kmovd %k1, %eax
|
||||
- testl %eax, %eax
|
||||
- jnz L(return_vec_0_end)
|
||||
- ret
|
||||
-
|
||||
-# ifndef USE_AS_WMEMCMP
|
||||
-L(between_2_3):
|
||||
- /* Load as big endian to avoid branches. */
|
||||
- movzwl (%rdi), %eax
|
||||
- movzwl (%rsi), %ecx
|
||||
- shll $8, %eax
|
||||
- shll $8, %ecx
|
||||
- bswap %eax
|
||||
- bswap %ecx
|
||||
- movzbl -1(%rdi, %rdx), %edi
|
||||
- movzbl -1(%rsi, %rdx), %esi
|
||||
- orl %edi, %eax
|
||||
- orl %esi, %ecx
|
||||
- /* Subtraction is okay because the upper 8 bits are zero. */
|
||||
- subl %ecx, %eax
|
||||
- ret
|
||||
-# endif
|
||||
END (MEMCMP)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,43 +0,0 @@
|
||||
From 6b8dbbd03ac88f169b65b5c7d7278576a11d2e44 Mon Sep 17 00:00:00 2001
|
||||
From: Jangwoong Kim <6812skiii@gmail.com>
|
||||
Date: Tue, 14 Dec 2021 21:30:51 +0900
|
||||
Subject: [PATCH] nptl: Effectively skip CAS in spinlock loop
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
The commit:
|
||||
"Add LLL_MUTEX_READ_LOCK [BZ #28537]"
|
||||
SHA1: d672a98a1af106bd68deb15576710cd61363f7a6
|
||||
|
||||
introduced LLL_MUTEX_READ_LOCK, to skip CAS in spinlock loop
|
||||
if atomic load fails. But, "continue" inside of do-while loop
|
||||
does not skip the evaluation of escape expression, thus CAS
|
||||
is not skipped.
|
||||
|
||||
Replace do-while with while and skip LLL_MUTEX_TRYLOCK if
|
||||
LLL_MUTEX_READ_LOCK fails.
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
nptl/pthread_mutex_lock.c | 5 ++---
|
||||
1 file changed, 2 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/nptl/pthread_mutex_lock.c b/nptl/pthread_mutex_lock.c
|
||||
index a633d95e..d96a9933 100644
|
||||
--- a/nptl/pthread_mutex_lock.c
|
||||
+++ b/nptl/pthread_mutex_lock.c
|
||||
@@ -141,10 +141,9 @@ __pthread_mutex_lock (pthread_mutex_t *mutex)
|
||||
break;
|
||||
}
|
||||
atomic_spin_nop ();
|
||||
- if (LLL_MUTEX_READ_LOCK (mutex) != 0)
|
||||
- continue;
|
||||
}
|
||||
- while (LLL_MUTEX_TRYLOCK (mutex) != 0);
|
||||
+ while (LLL_MUTEX_READ_LOCK (mutex) != 0
|
||||
+ || LLL_MUTEX_TRYLOCK (mutex) != 0);
|
||||
|
||||
mutex->__data.__spins += (cnt - mutex->__data.__spins) / 8;
|
||||
}
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,146 +0,0 @@
|
||||
From 7835d611af0854e69a0c71e3806f8fe379282d6f Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 18 Feb 2022 14:19:15 -0600
|
||||
Subject: [PATCH] x86: Test wcscmp RTM in the wcsncmp overflow case [BZ #28896]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
In the overflow fallback strncmp-avx2-rtm and wcsncmp-avx2-rtm would
|
||||
call strcmp-avx2 and wcscmp-avx2 respectively. This would have
|
||||
no checks around vzeroupper and would trigger spurious
|
||||
aborts. This commit fixes that.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass on
|
||||
AVX2 machines with and without RTM.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86/Makefile | 5 ++++-
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 32 +++++++++++++++++++++++---------
|
||||
sysdeps/x86/tst-wcsncmp-rtm.c | 21 +++++++++++++++++++++
|
||||
3 files changed, 48 insertions(+), 10 deletions(-)
|
||||
create mode 100644 sysdeps/x86/tst-wcsncmp-rtm.c
|
||||
|
||||
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
|
||||
index 2d814915..c2111f49 100644
|
||||
--- a/sysdeps/x86/Makefile
|
||||
+++ b/sysdeps/x86/Makefile
|
||||
@@ -28,7 +28,9 @@ tests += \
|
||||
tst-strcpy-rtm \
|
||||
tst-strlen-rtm \
|
||||
tst-strncmp-rtm \
|
||||
- tst-strrchr-rtm
|
||||
+ tst-strrchr-rtm \
|
||||
+ tst-wcsncmp-rtm \
|
||||
+# tests
|
||||
|
||||
CFLAGS-tst-memchr-rtm.c += -mrtm
|
||||
CFLAGS-tst-memcmp-rtm.c += -mrtm
|
||||
@@ -40,6 +42,7 @@ CFLAGS-tst-strcpy-rtm.c += -mrtm
|
||||
CFLAGS-tst-strlen-rtm.c += -mrtm
|
||||
CFLAGS-tst-strncmp-rtm.c += -mrtm -Wno-error
|
||||
CFLAGS-tst-strrchr-rtm.c += -mrtm
|
||||
+CFLAGS-tst-wcsncmp-rtm.c += -mrtm -Wno-error
|
||||
endif
|
||||
|
||||
ifneq ($(enable-cet),no)
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
index 4d0004b5..4e9f094f 100644
|
||||
--- a/sysdeps/x86/tst-strncmp-rtm.c
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -19,18 +19,32 @@
|
||||
#include <stdint.h>
|
||||
#include <tst-string-rtm.h>
|
||||
|
||||
+#ifdef WIDE
|
||||
+# define CHAR wchar_t
|
||||
+# define MEMSET wmemset
|
||||
+# define STRNCMP wcsncmp
|
||||
+# define TEST_NAME wcsncmp
|
||||
+#else /* !WIDE */
|
||||
+# define CHAR char
|
||||
+# define MEMSET memset
|
||||
+# define STRNCMP strncmp
|
||||
+# define TEST_NAME strncmp
|
||||
+#endif /* !WIDE */
|
||||
+
|
||||
+
|
||||
+
|
||||
#define LOOP 3000
|
||||
#define STRING_SIZE 1024
|
||||
-char string1[STRING_SIZE];
|
||||
-char string2[STRING_SIZE];
|
||||
+CHAR string1[STRING_SIZE];
|
||||
+CHAR string2[STRING_SIZE];
|
||||
|
||||
__attribute__ ((noinline, noclone))
|
||||
static int
|
||||
prepare (void)
|
||||
{
|
||||
- memset (string1, 'a', STRING_SIZE - 1);
|
||||
- memset (string2, 'a', STRING_SIZE - 1);
|
||||
- if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ MEMSET (string1, 'a', STRING_SIZE - 1);
|
||||
+ MEMSET (string2, 'a', STRING_SIZE - 1);
|
||||
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
|
||||
return EXIT_SUCCESS;
|
||||
else
|
||||
return EXIT_FAILURE;
|
||||
@@ -40,7 +54,7 @@ __attribute__ ((noinline, noclone))
|
||||
static int
|
||||
function (void)
|
||||
{
|
||||
- if (strncmp (string1, string2, STRING_SIZE) == 0)
|
||||
+ if (STRNCMP (string1, string2, STRING_SIZE) == 0)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
@@ -50,7 +64,7 @@ __attribute__ ((noinline, noclone))
|
||||
static int
|
||||
function_overflow (void)
|
||||
{
|
||||
- if (strncmp (string1, string2, SIZE_MAX) == 0)
|
||||
+ if (STRNCMP (string1, string2, SIZE_MAX) == 0)
|
||||
return 0;
|
||||
else
|
||||
return 1;
|
||||
@@ -59,9 +73,9 @@ function_overflow (void)
|
||||
static int
|
||||
do_test (void)
|
||||
{
|
||||
- int status = do_test_1 ("strncmp", LOOP, prepare, function);
|
||||
+ int status = do_test_1 (TEST_NAME, LOOP, prepare, function);
|
||||
if (status != EXIT_SUCCESS)
|
||||
return status;
|
||||
- status = do_test_1 ("strncmp", LOOP, prepare, function_overflow);
|
||||
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
|
||||
return status;
|
||||
}
|
||||
diff --git a/sysdeps/x86/tst-wcsncmp-rtm.c b/sysdeps/x86/tst-wcsncmp-rtm.c
|
||||
new file mode 100644
|
||||
index 00000000..bad3b863
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86/tst-wcsncmp-rtm.c
|
||||
@@ -0,0 +1,21 @@
|
||||
+/* Test case for wcsncmp inside a transactionally executing RTM region.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include <wchar.h>
|
||||
+#include "tst-strncmp-rtm.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,37 +0,0 @@
|
||||
From b98d0bbf747f39770e0caba7e984ce9f8f900330 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Fri, 18 Feb 2022 17:00:25 -0600
|
||||
Subject: [PATCH] x86: Fix TEST_NAME to make it a string in tst-strncmp-rtm.c
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Previously TEST_NAME was passing a function pointer. This didn't fail
|
||||
because of the -Wno-error flag (to allow for overflow sizes passed
|
||||
to strncmp/wcsncmp).
|
||||
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 4 ++--
|
||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
index 4e9f094f..aef9866c 100644
|
||||
--- a/sysdeps/x86/tst-strncmp-rtm.c
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -23,12 +23,12 @@
|
||||
# define CHAR wchar_t
|
||||
# define MEMSET wmemset
|
||||
# define STRNCMP wcsncmp
|
||||
-# define TEST_NAME wcsncmp
|
||||
+# define TEST_NAME "wcsncmp"
|
||||
#else /* !WIDE */
|
||||
# define CHAR char
|
||||
# define MEMSET memset
|
||||
# define STRNCMP strncmp
|
||||
-# define TEST_NAME strncmp
|
||||
+# define TEST_NAME "strncmp"
|
||||
#endif /* !WIDE */
|
||||
|
||||
|
||||
--
|
||||
GitLab
|
||||
|
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@ -1,33 +0,0 @@
|
||||
From c15efd011cea3d8f0494269eb539583215a1feed Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 4 Feb 2022 11:09:10 -0800
|
||||
Subject: [PATCH] x86-64: Fix strcmp-avx2.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
|
||||
|
||||
commit b77b06e0e296f1a2276c27a67e1d44f2cfa38d45
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Jan 10 15:35:38 2022 -0600
|
||||
|
||||
x86: Optimize strcmp-avx2.S
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 554ffe4c..04675aa4 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -106,7 +106,7 @@ ENTRY(STRCMP)
|
||||
# ifdef USE_AS_STRNCMP
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
- movl %edx, %rdx
|
||||
+ movl %edx, %edx
|
||||
# endif
|
||||
cmp $1, %RDX_LP
|
||||
/* Signed comparison intentional. We use this branch to also
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,33 +0,0 @@
|
||||
From 0e0199a9e02ebe42e2b36958964d63f03573c382 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Fri, 4 Feb 2022 11:11:08 -0800
|
||||
Subject: [PATCH] x86-64: Fix strcmp-evex.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Change "movl %edx, %rdx" to "movl %edx, %edx" in:
|
||||
|
||||
commit 8418eb3ff4b781d31c4ed5dc6c0bd7356bc45db9
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon Jan 10 15:35:39 2022 -0600
|
||||
|
||||
x86: Optimize strcmp-evex.S
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index 99d8409a..ed56af8e 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -116,7 +116,7 @@ ENTRY(STRCMP)
|
||||
# ifdef USE_AS_STRNCMP
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
- movl %edx, %rdx
|
||||
+ movl %edx, %edx
|
||||
# endif
|
||||
cmp $1, %RDX_LP
|
||||
/* Signed comparison intentional. We use this branch to also
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,459 +0,0 @@
|
||||
From b62ace2740a106222e124cc86956448fa07abf4d Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun, 6 Feb 2022 00:54:18 -0600
|
||||
Subject: [PATCH] x86: Improve vec generation in memset-vec-unaligned-erms.S
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
No bug.
|
||||
|
||||
Split vec generation into multiple steps. This allows the
|
||||
broadcast in AVX2 to use 'xmm' registers for the L(less_vec)
|
||||
case. This saves an expensive lane-cross instruction and removes
|
||||
the need for 'vzeroupper'.
|
||||
|
||||
For SSE2 replace 2x 'punpck' instructions with zero-idiom 'pxor' for
|
||||
byte broadcast.
|
||||
|
||||
Results for memset-avx2 small (geomean of N = 20 benchset runs).
|
||||
|
||||
size, New Time, Old Time, New / Old
|
||||
0, 4.100, 3.831, 0.934
|
||||
1, 5.074, 4.399, 0.867
|
||||
2, 4.433, 4.411, 0.995
|
||||
4, 4.487, 4.415, 0.984
|
||||
8, 4.454, 4.396, 0.987
|
||||
16, 4.502, 4.443, 0.987
|
||||
|
||||
All relevant string/wcsmbs tests are passing.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86_64/memset.S | 21 ++-
|
||||
.../multiarch/memset-avx2-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 18 +-
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 164 +++++++++++-------
|
||||
5 files changed, 152 insertions(+), 87 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index 8672b030..27debd2b 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -28,17 +28,22 @@
|
||||
#define VMOVU movups
|
||||
#define VMOVA movaps
|
||||
|
||||
-#define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- punpcklbw %xmm0, %xmm0; \
|
||||
- punpcklwd %xmm0, %xmm0; \
|
||||
- pshufd $0, %xmm0, %xmm0
|
||||
+ pxor %xmm1, %xmm1; \
|
||||
+ pshufb %xmm1, %xmm0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-#define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- pshufd $0, %xmm0, %xmm0
|
||||
+ pshufd $0, %xmm0, %xmm0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
#define SECTION(p) p
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
index 1af668af..c0bf2875 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
@@ -10,15 +10,18 @@
|
||||
# define VMOVU vmovdqu
|
||||
# define VMOVA vmovdqa
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
vmovd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb %xmm0, %ymm0
|
||||
+ movq r, %rax;
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- vmovd d, %xmm0; \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd %xmm0, %ymm0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastb %xmm0, %ymm0
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW() vpbroadcastb %xmm0, %xmm0
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH() vpbroadcastd %xmm0, %ymm0
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW() vpbroadcastd %xmm0, %xmm0
|
||||
|
||||
# ifndef SECTION
|
||||
# define SECTION(p) p##.avx
|
||||
@@ -30,5 +33,6 @@
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
||||
# endif
|
||||
|
||||
+# define USE_XMM_LESS_VEC
|
||||
# include "memset-vec-unaligned-erms.S"
|
||||
#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index f14d6f84..5241216a 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -15,13 +15,19 @@
|
||||
|
||||
# define VZEROUPPER
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb d, %VEC0
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastb d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd d, %VEC0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastd d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
# define SECTION(p) p##.evex512
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx512_##s
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index 64b09e77..63700215 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -15,13 +15,19 @@
|
||||
|
||||
# define VZEROUPPER
|
||||
|
||||
-# define MEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastb d, %VEC0
|
||||
+# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastb d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
|
||||
-# define WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN(d, r) \
|
||||
- movq r, %rax; \
|
||||
- vpbroadcastd d, %VEC0
|
||||
+# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
+ vpbroadcastd d, %VEC0; \
|
||||
+ movq r, %rax
|
||||
+
|
||||
+# define MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define MEMSET_VDUP_TO_VEC0_LOW()
|
||||
+
|
||||
+# define WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+# define WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
|
||||
# define SECTION(p) p##.evex
|
||||
# define MEMSET_SYMBOL(p,s) p##_evex_##s
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index f08b7323..a67f9833 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -58,8 +58,10 @@
|
||||
#ifndef MOVQ
|
||||
# if VEC_SIZE > 16
|
||||
# define MOVQ vmovq
|
||||
+# define MOVD vmovd
|
||||
# else
|
||||
# define MOVQ movq
|
||||
+# define MOVD movd
|
||||
# endif
|
||||
#endif
|
||||
|
||||
@@ -72,9 +74,17 @@
|
||||
#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
# define END_REG rcx
|
||||
# define LOOP_REG rdi
|
||||
+# define LESS_VEC_REG rax
|
||||
#else
|
||||
# define END_REG rdi
|
||||
# define LOOP_REG rdx
|
||||
+# define LESS_VEC_REG rdi
|
||||
+#endif
|
||||
+
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+# define XMM_SMALL 1
|
||||
+#else
|
||||
+# define XMM_SMALL 0
|
||||
#endif
|
||||
|
||||
#define PAGE_SIZE 4096
|
||||
@@ -110,8 +120,12 @@ END_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||||
|
||||
ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
shl $2, %RDX_LP
|
||||
- WMEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
- jmp L(entry_from_bzero)
|
||||
+ WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
+ cmpq $VEC_SIZE, %rdx
|
||||
+ jb L(less_vec_no_vdup)
|
||||
+ WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+ jmp L(entry_from_wmemset)
|
||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
#endif
|
||||
|
||||
@@ -123,7 +137,7 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||||
#endif
|
||||
|
||||
ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
@@ -131,6 +145,8 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
L(entry_from_bzero):
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
jb L(less_vec)
|
||||
+ MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
+L(entry_from_wmemset):
|
||||
cmpq $(VEC_SIZE * 2), %rdx
|
||||
ja L(more_2x_vec)
|
||||
/* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
@@ -179,27 +195,27 @@ END_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
# endif
|
||||
|
||||
ENTRY_P2ALIGN (MEMSET_SYMBOL (__memset, unaligned_erms), 6)
|
||||
- MEMSET_VDUP_TO_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
+ MEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
# ifdef __ILP32__
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
# endif
|
||||
cmp $VEC_SIZE, %RDX_LP
|
||||
jb L(less_vec)
|
||||
+ MEMSET_VDUP_TO_VEC0_HIGH ()
|
||||
cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
ja L(stosb_more_2x_vec)
|
||||
- /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE.
|
||||
- */
|
||||
- VMOVU %VEC(0), (%rax)
|
||||
- VMOVU %VEC(0), -VEC_SIZE(%rax, %rdx)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
|
||||
- .p2align 4,, 10
|
||||
+ .p2align 4,, 4
|
||||
L(last_2x_vec):
|
||||
#ifdef USE_LESS_VEC_MASK_STORE
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 2 + LOOP_4X_OFFSET)(%rcx)
|
||||
- VMOVU %VEC(0), (VEC_SIZE * 3 + LOOP_4X_OFFSET)(%rcx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi, %rdx)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
#else
|
||||
VMOVU %VEC(0), (VEC_SIZE * -2)(%rdi)
|
||||
VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi)
|
||||
@@ -212,6 +228,7 @@ L(last_2x_vec):
|
||||
#ifdef USE_LESS_VEC_MASK_STORE
|
||||
.p2align 4,, 10
|
||||
L(less_vec):
|
||||
+L(less_vec_no_vdup):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
@@ -262,28 +279,18 @@ L(stosb_more_2x_vec):
|
||||
/* Fallthrough goes to L(loop_4x_vec). Tests for memset (2x, 4x]
|
||||
and (4x, 8x] jump to target. */
|
||||
L(more_2x_vec):
|
||||
-
|
||||
- /* Two different methods of setting up pointers / compare. The
|
||||
- two methods are based on the fact that EVEX/AVX512 mov
|
||||
- instructions take more bytes then AVX2/SSE2 mov instructions. As
|
||||
- well that EVEX/AVX512 machines also have fast LEA_BID. Both
|
||||
- setup and END_REG to avoid complex address mode. For EVEX/AVX512
|
||||
- this saves code size and keeps a few targets in one fetch block.
|
||||
- For AVX2/SSE2 this helps prevent AGU bottlenecks. */
|
||||
-#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
- /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 +
|
||||
- LOOP_4X_OFFSET) with LEA_BID. */
|
||||
-
|
||||
- /* END_REG is rcx for EVEX/AVX512. */
|
||||
- leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
-#endif
|
||||
-
|
||||
- /* Stores to first 2x VEC before cmp as any path forward will
|
||||
- require it. */
|
||||
- VMOVU %VEC(0), (%rax)
|
||||
- VMOVU %VEC(0), VEC_SIZE(%rax)
|
||||
+ /* Store next 2x vec regardless. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * 1)(%rdi)
|
||||
|
||||
|
||||
+ /* Two different methods of setting up pointers / compare. The two
|
||||
+ methods are based on the fact that EVEX/AVX512 mov instructions take
|
||||
+ more bytes then AVX2/SSE2 mov instructions. As well that EVEX/AVX512
|
||||
+ machines also have fast LEA_BID. Both setup and END_REG to avoid complex
|
||||
+ address mode. For EVEX/AVX512 this saves code size and keeps a few
|
||||
+ targets in one fetch block. For AVX2/SSE2 this helps prevent AGU
|
||||
+ bottlenecks. */
|
||||
#if !(defined USE_WITH_EVEX || defined USE_WITH_AVX512)
|
||||
/* If AVX2/SSE2 compute END_REG (rdi) with ALU. */
|
||||
addq %rdx, %END_REG
|
||||
@@ -292,6 +299,15 @@ L(more_2x_vec):
|
||||
cmpq $(VEC_SIZE * 4), %rdx
|
||||
jbe L(last_2x_vec)
|
||||
|
||||
+
|
||||
+#if defined USE_WITH_EVEX || defined USE_WITH_AVX512
|
||||
+ /* If EVEX/AVX512 compute END_REG - (VEC_SIZE * 4 + LOOP_4X_OFFSET) with
|
||||
+ LEA_BID. */
|
||||
+
|
||||
+ /* END_REG is rcx for EVEX/AVX512. */
|
||||
+ leaq -(VEC_SIZE * 4 + LOOP_4X_OFFSET)(%rdi, %rdx), %END_REG
|
||||
+#endif
|
||||
+
|
||||
/* Store next 2x vec regardless. */
|
||||
VMOVU %VEC(0), (VEC_SIZE * 2)(%rax)
|
||||
VMOVU %VEC(0), (VEC_SIZE * 3)(%rax)
|
||||
@@ -355,65 +371,93 @@ L(stosb_local):
|
||||
/* Define L(less_vec) only if not otherwise defined. */
|
||||
.p2align 4
|
||||
L(less_vec):
|
||||
+ /* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
|
||||
+ xmm). This is only does anything for AVX2. */
|
||||
+ MEMSET_VDUP_TO_VEC0_LOW ()
|
||||
+L(less_vec_no_vdup):
|
||||
#endif
|
||||
L(cross_page):
|
||||
#if VEC_SIZE > 32
|
||||
cmpl $32, %edx
|
||||
- jae L(between_32_63)
|
||||
+ jge L(between_32_63)
|
||||
#endif
|
||||
#if VEC_SIZE > 16
|
||||
cmpl $16, %edx
|
||||
- jae L(between_16_31)
|
||||
+ jge L(between_16_31)
|
||||
+#endif
|
||||
+#ifndef USE_XMM_LESS_VEC
|
||||
+ MOVQ %XMM0, %rcx
|
||||
#endif
|
||||
- MOVQ %XMM0, %rdi
|
||||
cmpl $8, %edx
|
||||
- jae L(between_8_15)
|
||||
+ jge L(between_8_15)
|
||||
cmpl $4, %edx
|
||||
- jae L(between_4_7)
|
||||
+ jge L(between_4_7)
|
||||
cmpl $1, %edx
|
||||
- ja L(between_2_3)
|
||||
- jb L(return)
|
||||
- movb %sil, (%rax)
|
||||
- VZEROUPPER_RETURN
|
||||
+ jg L(between_2_3)
|
||||
+ jl L(between_0_0)
|
||||
+ movb %sil, (%LESS_VEC_REG)
|
||||
+L(between_0_0):
|
||||
+ ret
|
||||
|
||||
- /* Align small targets only if not doing so would cross a fetch
|
||||
- line. */
|
||||
+ /* Align small targets only if not doing so would cross a fetch line.
|
||||
+ */
|
||||
#if VEC_SIZE > 32
|
||||
.p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
/* From 32 to 63. No branch when size == 32. */
|
||||
L(between_32_63):
|
||||
- VMOVU %YMM0, (%rax)
|
||||
- VMOVU %YMM0, -32(%rax, %rdx)
|
||||
+ VMOVU %YMM0, (%LESS_VEC_REG)
|
||||
+ VMOVU %YMM0, -32(%LESS_VEC_REG, %rdx)
|
||||
VZEROUPPER_RETURN
|
||||
#endif
|
||||
|
||||
#if VEC_SIZE >= 32
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, RET_SIZE)
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(MOV_SIZE, 1)
|
||||
L(between_16_31):
|
||||
/* From 16 to 31. No branch when size == 16. */
|
||||
- VMOVU %XMM0, (%rax)
|
||||
- VMOVU %XMM0, -16(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+ VMOVU %XMM0, (%LESS_VEC_REG)
|
||||
+ VMOVU %XMM0, -16(%LESS_VEC_REG, %rdx)
|
||||
+ ret
|
||||
#endif
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
+ /* Move size is 3 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||||
+ */
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(3 + XMM_SMALL, 1)
|
||||
L(between_8_15):
|
||||
/* From 8 to 15. No branch when size == 8. */
|
||||
- movq %rdi, (%rax)
|
||||
- movq %rdi, -8(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ MOVQ %XMM0, (%rdi)
|
||||
+ MOVQ %XMM0, -8(%rdi, %rdx)
|
||||
+#else
|
||||
+ movq %rcx, (%LESS_VEC_REG)
|
||||
+ movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(2, RET_SIZE)
|
||||
+ /* Move size is 2 for SSE2, EVEX, and AVX512. Move size is 4 for AVX2.
|
||||
+ */
|
||||
+ .p2align 4,, SMALL_MEMSET_ALIGN(2 << XMM_SMALL, 1)
|
||||
L(between_4_7):
|
||||
/* From 4 to 7. No branch when size == 4. */
|
||||
- movl %edi, (%rax)
|
||||
- movl %edi, -4(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ MOVD %XMM0, (%rdi)
|
||||
+ MOVD %XMM0, -4(%rdi, %rdx)
|
||||
+#else
|
||||
+ movl %ecx, (%LESS_VEC_REG)
|
||||
+ movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
|
||||
- .p2align 4,, SMALL_MEMSET_ALIGN(3, RET_SIZE)
|
||||
+ /* 4 * XMM_SMALL for the third mov for AVX2. */
|
||||
+ .p2align 4,, 4 * XMM_SMALL + SMALL_MEMSET_ALIGN(3, 1)
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
- movw %di, (%rax)
|
||||
- movb %dil, -1(%rax, %rdx)
|
||||
- VZEROUPPER_RETURN
|
||||
+#ifdef USE_XMM_LESS_VEC
|
||||
+ movb %sil, (%rdi)
|
||||
+ movb %sil, 1(%rdi)
|
||||
+ movb %sil, -1(%rdi, %rdx)
|
||||
+#else
|
||||
+ movw %cx, (%LESS_VEC_REG)
|
||||
+ movb %sil, -1(%LESS_VEC_REG, %rdx)
|
||||
+#endif
|
||||
+ ret
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,40 +0,0 @@
|
||||
From 1b0c60f95bbe2eded80b2bb5be75c0e45b11cde1 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Mon, 7 Feb 2022 00:32:23 -0600
|
||||
Subject: [PATCH] x86: Remove SSSE3 instruction for broadcast in memset.S (SSE2
|
||||
Only)
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
commit b62ace2740a106222e124cc86956448fa07abf4d
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun Feb 6 00:54:18 2022 -0600
|
||||
|
||||
x86: Improve vec generation in memset-vec-unaligned-erms.S
|
||||
|
||||
Revert usage of 'pshufb' in broadcast logic as it is an SSSE3
|
||||
instruction and memset.S is restricted to only SSE2 instructions.
|
||||
---
|
||||
sysdeps/x86_64/memset.S | 7 ++++---
|
||||
1 file changed, 4 insertions(+), 3 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index 27debd2b..4cb4aa71 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -30,9 +30,10 @@
|
||||
|
||||
# define MEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
- pxor %xmm1, %xmm1; \
|
||||
- pshufb %xmm1, %xmm0; \
|
||||
- movq r, %rax
|
||||
+ movq r, %rax; \
|
||||
+ punpcklbw %xmm0, %xmm0; \
|
||||
+ punpcklwd %xmm0, %xmm0; \
|
||||
+ pshufd $0, %xmm0, %xmm0
|
||||
|
||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,218 +0,0 @@
|
||||
From 5165de69c0908e28a380cbd4bb054e55ea4abc95 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 21 Jan 2019 11:36:36 -0800
|
||||
Subject: [PATCH] x86-64 strnlen/wcsnlen: Properly handle the length parameter
|
||||
[BZ# 24097]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
On x32, the size_t parameter may be passed in the lower 32 bits of a
|
||||
64-bit register with the non-zero upper 32 bits. The string/memory
|
||||
functions written in assembly can only use the lower 32 bits of a
|
||||
64-bit register as length or must clear the upper 32 bits before using
|
||||
the full 64-bit register for length.
|
||||
|
||||
This patch fixes strnlen/wcsnlen for x32. Tested on x86-64 and x32. On
|
||||
x86-64, libc.so is the same with and without the fix.
|
||||
|
||||
[BZ# 24097]
|
||||
CVE-2019-6488
|
||||
* sysdeps/x86_64/multiarch/strlen-avx2.S: Use RSI_LP for length.
|
||||
Clear the upper 32 bits of RSI register.
|
||||
* sysdeps/x86_64/strlen.S: Use RSI_LP for length.
|
||||
* sysdeps/x86_64/x32/Makefile (tests): Add tst-size_t-strnlen
|
||||
and tst-size_t-wcsnlen.
|
||||
* sysdeps/x86_64/x32/tst-size_t-strnlen.c: New file.
|
||||
* sysdeps/x86_64/x32/tst-size_t-wcsnlen.c: Likewise.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/strlen-avx2.S | 9 ++--
|
||||
sysdeps/x86_64/strlen.S | 12 ++---
|
||||
sysdeps/x86_64/x32/Makefile | 4 +-
|
||||
sysdeps/x86_64/x32/tst-size_t-strnlen.c | 72 +++++++++++++++++++++++++
|
||||
sysdeps/x86_64/x32/tst-size_t-wcsnlen.c | 20 +++++++
|
||||
5 files changed, 106 insertions(+), 11 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
||||
create mode 100644 sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
||||
|
||||
Conflicts:
|
||||
ChangeLog
|
||||
(removed)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/strlen-avx2.S b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
index fb2418cd..645e0446 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strlen-avx2.S
|
||||
@@ -42,12 +42,15 @@
|
||||
ENTRY (STRLEN)
|
||||
# ifdef USE_AS_STRNLEN
|
||||
/* Check for zero length. */
|
||||
- testq %rsi, %rsi
|
||||
+ test %RSI_LP, %RSI_LP
|
||||
jz L(zero)
|
||||
# ifdef USE_AS_WCSLEN
|
||||
- shl $2, %rsi
|
||||
+ shl $2, %RSI_LP
|
||||
+# elif defined __ILP32__
|
||||
+ /* Clear the upper 32 bits. */
|
||||
+ movl %esi, %esi
|
||||
# endif
|
||||
- movq %rsi, %r8
|
||||
+ mov %RSI_LP, %R8_LP
|
||||
# endif
|
||||
movl %edi, %ecx
|
||||
movq %rdi, %rdx
|
||||
diff --git a/sysdeps/x86_64/strlen.S b/sysdeps/x86_64/strlen.S
|
||||
index 01cb5fa8..f845f3d4 100644
|
||||
--- a/sysdeps/x86_64/strlen.S
|
||||
+++ b/sysdeps/x86_64/strlen.S
|
||||
@@ -59,21 +59,21 @@ ENTRY(strlen)
|
||||
|
||||
#ifdef AS_STRNLEN
|
||||
/* Do not read anything when n==0. */
|
||||
- test %rsi, %rsi
|
||||
+ test %RSI_LP, %RSI_LP
|
||||
jne L(n_nonzero)
|
||||
xor %rax, %rax
|
||||
ret
|
||||
L(n_nonzero):
|
||||
# ifdef AS_WCSLEN
|
||||
- shlq $2, %rsi
|
||||
+ shl $2, %RSI_LP
|
||||
# endif
|
||||
|
||||
/* Initialize long lived registers. */
|
||||
|
||||
- add %rdi, %rsi
|
||||
- mov %rsi, %r10
|
||||
- and $-64, %r10
|
||||
- mov %rsi, %r11
|
||||
+ add %RDI_LP, %RSI_LP
|
||||
+ mov %RSI_LP, %R10_LP
|
||||
+ and $-64, %R10_LP
|
||||
+ mov %RSI_LP, %R11_LP
|
||||
#endif
|
||||
|
||||
pxor %xmm0, %xmm0
|
||||
diff --git a/sysdeps/x86_64/x32/Makefile b/sysdeps/x86_64/x32/Makefile
|
||||
index 2a9e20a9..1557724b 100644
|
||||
--- a/sysdeps/x86_64/x32/Makefile
|
||||
+++ b/sysdeps/x86_64/x32/Makefile
|
||||
@@ -8,10 +8,10 @@ endif
|
||||
ifeq ($(subdir),string)
|
||||
tests += tst-size_t-memchr tst-size_t-memcmp tst-size_t-memcpy \
|
||||
tst-size_t-memrchr tst-size_t-memset tst-size_t-strncasecmp \
|
||||
- tst-size_t-strncmp tst-size_t-strncpy
|
||||
+ tst-size_t-strncmp tst-size_t-strncpy tst-size_t-strnlen
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
tests += tst-size_t-wmemchr tst-size_t-wmemcmp tst-size_t-wmemset \
|
||||
- tst-size_t-wcsncmp
|
||||
+ tst-size_t-wcsncmp tst-size_t-wcsnlen
|
||||
endif
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-strnlen.c b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
||||
new file mode 100644
|
||||
index 00000000..690a4a8a
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-strnlen.c
|
||||
@@ -0,0 +1,72 @@
|
||||
+/* Test strnlen with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# define TEST_NAME "wcsnlen"
|
||||
+#else
|
||||
+# define TEST_NAME "strnlen"
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+#include "test-size_t.h"
|
||||
+
|
||||
+#ifdef WIDE
|
||||
+# include <wchar.h>
|
||||
+# define STRNLEN wcsnlen
|
||||
+# define CHAR wchar_t
|
||||
+#else
|
||||
+# define STRNLEN strnlen
|
||||
+# define CHAR char
|
||||
+#endif /* WIDE */
|
||||
+
|
||||
+IMPL (STRNLEN, 1)
|
||||
+
|
||||
+typedef size_t (*proto_t) (const CHAR *, size_t);
|
||||
+
|
||||
+static size_t
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+do_strnlen (parameter_t a, parameter_t b)
|
||||
+{
|
||||
+ return CALL (&a, a.p, b.len);
|
||||
+}
|
||||
+
|
||||
+static int
|
||||
+test_main (void)
|
||||
+{
|
||||
+ test_init ();
|
||||
+
|
||||
+ size_t size = page_size / sizeof (CHAR);
|
||||
+ parameter_t src = { { 0 }, buf2 };
|
||||
+ parameter_t c = { { size }, (void *) (uintptr_t) 'a' };
|
||||
+
|
||||
+ int ret = 0;
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ {
|
||||
+ src.fn = impl->fn;
|
||||
+ size_t res = do_strnlen (src, c);
|
||||
+ if (res != size)
|
||||
+ {
|
||||
+ error (0, 0, "Wrong result in function %s: 0x%x != 0x%x",
|
||||
+ impl->name, res, size);
|
||||
+ ret = 1;
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ return ret ? EXIT_FAILURE : EXIT_SUCCESS;
|
||||
+}
|
||||
+
|
||||
+#include <support/test-driver.c>
|
||||
diff --git a/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
||||
new file mode 100644
|
||||
index 00000000..093b4bbe
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/x32/tst-size_t-wcsnlen.c
|
||||
@@ -0,0 +1,20 @@
|
||||
+/* Test wcsnlen with size_t in the lower 32 bits of 64-bit register.
|
||||
+ Copyright (C) 2019 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <http://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+#define WIDE 1
|
||||
+#include "tst-size_t-strnlen.c"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,753 +0,0 @@
|
||||
From 3d9f171bfb5325bd5f427e9fc386453358c6e840 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Mon, 7 Feb 2022 05:55:15 -0800
|
||||
Subject: [PATCH] x86-64: Optimize bzero
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
memset with zero as the value to set is by far the majority value (99%+
|
||||
for Python3 and GCC).
|
||||
|
||||
bzero can be slightly more optimized for this case by using a zero-idiom
|
||||
xor for broadcasting the set value to a register (vector or GPR).
|
||||
|
||||
Co-developed-by: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
---
|
||||
sysdeps/generic/ifunc-init.h | 5 +-
|
||||
sysdeps/x86_64/memset.S | 8 +
|
||||
sysdeps/x86_64/multiarch/Makefile | 205 +++++++++++-------
|
||||
sysdeps/x86_64/multiarch/bzero.c | 106 +++++++++
|
||||
sysdeps/x86_64/multiarch/ifunc-impl-list.c | 42 ++++
|
||||
.../memset-avx2-unaligned-erms-rtm.S | 1 +
|
||||
.../multiarch/memset-avx2-unaligned-erms.S | 6 +
|
||||
.../multiarch/memset-avx512-unaligned-erms.S | 3 +
|
||||
.../multiarch/memset-evex-unaligned-erms.S | 3 +
|
||||
.../multiarch/memset-sse2-unaligned-erms.S | 1 +
|
||||
.../multiarch/memset-vec-unaligned-erms.S | 110 +++++++---
|
||||
11 files changed, 384 insertions(+), 106 deletions(-)
|
||||
create mode 100644 sysdeps/x86_64/multiarch/bzero.c
|
||||
|
||||
Conflicts:
|
||||
sysdeps/generic/ifunc-init.h
|
||||
(needs macros from cf4fd28ea453d1a9cec93939bc88b58ccef5437a (memcmpeq))
|
||||
sysdeps/x86_64/multiarch/Makefile
|
||||
(file ordering)
|
||||
|
||||
diff --git a/sysdeps/generic/ifunc-init.h b/sysdeps/generic/ifunc-init.h
|
||||
index 241e4161..f7a72375 100644
|
||||
--- a/sysdeps/generic/ifunc-init.h
|
||||
+++ b/sysdeps/generic/ifunc-init.h
|
||||
@@ -50,5 +50,8 @@
|
||||
'__<symbol>_<variant>' as the optimized implementation and
|
||||
'<symbol>_ifunc_selector' as the IFUNC selector. */
|
||||
#define REDIRECT_NAME EVALUATOR1 (__redirect, SYMBOL_NAME)
|
||||
-#define OPTIMIZE(name) EVALUATOR2 (SYMBOL_NAME, name)
|
||||
+#define OPTIMIZE1(name) EVALUATOR1 (SYMBOL_NAME, name)
|
||||
+#define OPTIMIZE2(name) EVALUATOR2 (SYMBOL_NAME, name)
|
||||
+/* Default is to use OPTIMIZE2. */
|
||||
+#define OPTIMIZE(name) OPTIMIZE2(name)
|
||||
#define IFUNC_SELECTOR EVALUATOR1 (SYMBOL_NAME, ifunc_selector)
|
||||
diff --git a/sysdeps/x86_64/memset.S b/sysdeps/x86_64/memset.S
|
||||
index 4cb4aa71..a1353f89 100644
|
||||
--- a/sysdeps/x86_64/memset.S
|
||||
+++ b/sysdeps/x86_64/memset.S
|
||||
@@ -35,6 +35,9 @@
|
||||
punpcklwd %xmm0, %xmm0; \
|
||||
pshufd $0, %xmm0, %xmm0
|
||||
|
||||
+# define BZERO_ZERO_VEC0() \
|
||||
+ pxor %xmm0, %xmm0
|
||||
+
|
||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
movd d, %xmm0; \
|
||||
pshufd $0, %xmm0, %xmm0; \
|
||||
@@ -53,6 +56,10 @@
|
||||
# define MEMSET_SYMBOL(p,s) memset
|
||||
#endif
|
||||
|
||||
+#ifndef BZERO_SYMBOL
|
||||
+# define BZERO_SYMBOL(p,s) __bzero
|
||||
+#endif
|
||||
+
|
||||
#ifndef WMEMSET_SYMBOL
|
||||
# define WMEMSET_CHK_SYMBOL(p,s) p
|
||||
# define WMEMSET_SYMBOL(p,s) __wmemset
|
||||
@@ -63,6 +70,7 @@
|
||||
libc_hidden_builtin_def (memset)
|
||||
|
||||
#if IS_IN (libc)
|
||||
+weak_alias (__bzero, bzero)
|
||||
libc_hidden_def (__wmemset)
|
||||
weak_alias (__wmemset, wmemset)
|
||||
libc_hidden_weak (wmemset)
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 26be4095..37d8d6f0 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -1,85 +1,130 @@
|
||||
ifeq ($(subdir),string)
|
||||
|
||||
-sysdep_routines += strncat-c stpncpy-c strncpy-c \
|
||||
- strcmp-sse2 strcmp-sse2-unaligned strcmp-ssse3 \
|
||||
- strcmp-sse4_2 strcmp-avx2 \
|
||||
- strncmp-sse2 strncmp-ssse3 strncmp-sse4_2 strncmp-avx2 \
|
||||
- memchr-sse2 rawmemchr-sse2 memchr-avx2 rawmemchr-avx2 \
|
||||
- memrchr-sse2 memrchr-avx2 \
|
||||
- memcmp-sse2 \
|
||||
- memcmp-avx2-movbe \
|
||||
- memcmp-sse4 memcpy-ssse3 \
|
||||
- memmove-ssse3 \
|
||||
- memcpy-ssse3-back \
|
||||
- memmove-ssse3-back \
|
||||
- memmove-avx512-no-vzeroupper \
|
||||
- strcasecmp_l-sse2 strcasecmp_l-ssse3 \
|
||||
- strcasecmp_l-sse4_2 strcasecmp_l-avx \
|
||||
- strncase_l-sse2 strncase_l-ssse3 \
|
||||
- strncase_l-sse4_2 strncase_l-avx \
|
||||
- strchr-sse2 strchrnul-sse2 strchr-avx2 strchrnul-avx2 \
|
||||
- strrchr-sse2 strrchr-avx2 \
|
||||
- strlen-sse2 strnlen-sse2 strlen-avx2 strnlen-avx2 \
|
||||
- strcat-avx2 strncat-avx2 \
|
||||
- strcat-ssse3 strncat-ssse3\
|
||||
- strcpy-avx2 strncpy-avx2 \
|
||||
- strcpy-sse2 stpcpy-sse2 \
|
||||
- strcpy-ssse3 strncpy-ssse3 stpcpy-ssse3 stpncpy-ssse3 \
|
||||
- strcpy-sse2-unaligned strncpy-sse2-unaligned \
|
||||
- stpcpy-sse2-unaligned stpncpy-sse2-unaligned \
|
||||
- stpcpy-avx2 stpncpy-avx2 \
|
||||
- strcat-sse2 \
|
||||
- strcat-sse2-unaligned strncat-sse2-unaligned \
|
||||
- strchr-sse2-no-bsf memcmp-ssse3 strstr-sse2-unaligned \
|
||||
- strcspn-sse2 strpbrk-sse2 strspn-sse2 \
|
||||
- strcspn-c strpbrk-c strspn-c varshift \
|
||||
- memset-avx512-no-vzeroupper \
|
||||
- memmove-sse2-unaligned-erms \
|
||||
- memmove-avx-unaligned-erms \
|
||||
- memmove-avx512-unaligned-erms \
|
||||
- memset-sse2-unaligned-erms \
|
||||
- memset-avx2-unaligned-erms \
|
||||
- memset-avx512-unaligned-erms \
|
||||
- memchr-avx2-rtm \
|
||||
- memcmp-avx2-movbe-rtm \
|
||||
- memmove-avx-unaligned-erms-rtm \
|
||||
- memrchr-avx2-rtm \
|
||||
- memset-avx2-unaligned-erms-rtm \
|
||||
- rawmemchr-avx2-rtm \
|
||||
- strchr-avx2-rtm \
|
||||
- strcmp-avx2-rtm \
|
||||
- strchrnul-avx2-rtm \
|
||||
- stpcpy-avx2-rtm \
|
||||
- stpncpy-avx2-rtm \
|
||||
- strcat-avx2-rtm \
|
||||
- strcpy-avx2-rtm \
|
||||
- strlen-avx2-rtm \
|
||||
- strncat-avx2-rtm \
|
||||
- strncmp-avx2-rtm \
|
||||
- strncpy-avx2-rtm \
|
||||
- strnlen-avx2-rtm \
|
||||
- strrchr-avx2-rtm \
|
||||
- memchr-evex \
|
||||
- memcmp-evex-movbe \
|
||||
- memmove-evex-unaligned-erms \
|
||||
- memrchr-evex \
|
||||
- memset-evex-unaligned-erms \
|
||||
- rawmemchr-evex \
|
||||
- stpcpy-evex \
|
||||
- stpncpy-evex \
|
||||
- strcat-evex \
|
||||
- strchr-evex \
|
||||
- strchrnul-evex \
|
||||
- strcmp-evex \
|
||||
- strcpy-evex \
|
||||
- strlen-evex \
|
||||
- strncat-evex \
|
||||
- strncmp-evex \
|
||||
- strncpy-evex \
|
||||
- strnlen-evex \
|
||||
- strrchr-evex \
|
||||
- memchr-evex-rtm \
|
||||
- rawmemchr-evex-rtm
|
||||
+sysdep_routines += \
|
||||
+ bzero \
|
||||
+ memchr-avx2 \
|
||||
+ memchr-avx2-rtm \
|
||||
+ memchr-evex \
|
||||
+ memchr-evex-rtm \
|
||||
+ memchr-sse2 \
|
||||
+ memcmp-avx2-movbe \
|
||||
+ memcmp-avx2-movbe-rtm \
|
||||
+ memcmp-evex-movbe \
|
||||
+ memcmp-sse2 \
|
||||
+ memcmp-sse4 \
|
||||
+ memcmp-ssse3 \
|
||||
+ memcpy-ssse3 \
|
||||
+ memcpy-ssse3-back \
|
||||
+ memmove-avx-unaligned-erms \
|
||||
+ memmove-avx-unaligned-erms-rtm \
|
||||
+ memmove-avx512-no-vzeroupper \
|
||||
+ memmove-avx512-unaligned-erms \
|
||||
+ memmove-evex-unaligned-erms \
|
||||
+ memmove-sse2-unaligned-erms \
|
||||
+ memmove-ssse3 \
|
||||
+ memmove-ssse3-back \
|
||||
+ memrchr-avx2 \
|
||||
+ memrchr-avx2-rtm \
|
||||
+ memrchr-evex \
|
||||
+ memrchr-sse2 \
|
||||
+ memset-avx2-unaligned-erms \
|
||||
+ memset-avx2-unaligned-erms-rtm \
|
||||
+ memset-avx512-no-vzeroupper \
|
||||
+ memset-avx512-unaligned-erms \
|
||||
+ memset-evex-unaligned-erms \
|
||||
+ memset-sse2-unaligned-erms \
|
||||
+ rawmemchr-avx2 \
|
||||
+ rawmemchr-avx2-rtm \
|
||||
+ rawmemchr-evex \
|
||||
+ rawmemchr-evex-rtm \
|
||||
+ rawmemchr-sse2 \
|
||||
+ stpcpy-avx2 \
|
||||
+ stpcpy-avx2-rtm \
|
||||
+ stpcpy-evex \
|
||||
+ stpcpy-sse2 \
|
||||
+ stpcpy-sse2-unaligned \
|
||||
+ stpcpy-ssse3 \
|
||||
+ stpncpy-avx2 \
|
||||
+ stpncpy-avx2-rtm \
|
||||
+ stpncpy-c \
|
||||
+ stpncpy-evex \
|
||||
+ stpncpy-sse2-unaligned \
|
||||
+ stpncpy-ssse3 \
|
||||
+ strcasecmp_l-avx \
|
||||
+ strcasecmp_l-sse2 \
|
||||
+ strcasecmp_l-sse4_2 \
|
||||
+ strcasecmp_l-ssse3 \
|
||||
+ strcat-avx2 \
|
||||
+ strcat-avx2-rtm \
|
||||
+ strcat-evex \
|
||||
+ strcat-sse2 \
|
||||
+ strcat-sse2-unaligned \
|
||||
+ strcat-ssse3 \
|
||||
+ strchr-avx2 \
|
||||
+ strchr-avx2-rtm \
|
||||
+ strchr-evex \
|
||||
+ strchr-sse2 \
|
||||
+ strchr-sse2-no-bsf \
|
||||
+ strchrnul-avx2 \
|
||||
+ strchrnul-avx2-rtm \
|
||||
+ strchrnul-evex \
|
||||
+ strchrnul-sse2 \
|
||||
+ strcmp-avx2 \
|
||||
+ strcmp-avx2-rtm \
|
||||
+ strcmp-evex \
|
||||
+ strcmp-sse2 \
|
||||
+ strcmp-sse2-unaligned \
|
||||
+ strcmp-sse4_2 \
|
||||
+ strcmp-ssse3 \
|
||||
+ strcpy-avx2 \
|
||||
+ strcpy-avx2-rtm \
|
||||
+ strcpy-evex \
|
||||
+ strcpy-sse2 \
|
||||
+ strcpy-sse2-unaligned \
|
||||
+ strcpy-ssse3 \
|
||||
+ strcspn-c \
|
||||
+ strcspn-sse2 \
|
||||
+ strlen-avx2 \
|
||||
+ strlen-avx2-rtm \
|
||||
+ strlen-evex \
|
||||
+ strlen-sse2 \
|
||||
+ strncase_l-avx \
|
||||
+ strncase_l-sse2 \
|
||||
+ strncase_l-sse4_2 \
|
||||
+ strncase_l-ssse3 \
|
||||
+ strncat-avx2 \
|
||||
+ strncat-avx2-rtm \
|
||||
+ strncat-c \
|
||||
+ strncat-evex \
|
||||
+ strncat-sse2-unaligned \
|
||||
+ strncat-ssse3 \
|
||||
+ strncmp-avx2 \
|
||||
+ strncmp-avx2-rtm \
|
||||
+ strncmp-evex \
|
||||
+ strncmp-sse2 \
|
||||
+ strncmp-sse4_2 \
|
||||
+ strncmp-ssse3 \
|
||||
+ strncpy-avx2 \
|
||||
+ strncpy-avx2-rtm \
|
||||
+ strncpy-c \
|
||||
+ strncpy-evex \
|
||||
+ strncpy-sse2-unaligned \
|
||||
+ strncpy-ssse3 \
|
||||
+ strnlen-avx2 \
|
||||
+ strnlen-avx2-rtm \
|
||||
+ strnlen-evex \
|
||||
+ strnlen-sse2 \
|
||||
+ strpbrk-c \
|
||||
+ strpbrk-sse2 \
|
||||
+ strrchr-avx2 \
|
||||
+ strrchr-avx2-rtm \
|
||||
+ strrchr-evex \
|
||||
+ strrchr-sse2 \
|
||||
+ strspn-c \
|
||||
+ strspn-sse2 \
|
||||
+ strstr-sse2-unaligned \
|
||||
+ varshift \
|
||||
+# sysdep_routines
|
||||
CFLAGS-varshift.c += -msse4
|
||||
CFLAGS-strcspn-c.c += -msse4
|
||||
CFLAGS-strpbrk-c.c += -msse4
|
||||
diff --git a/sysdeps/x86_64/multiarch/bzero.c b/sysdeps/x86_64/multiarch/bzero.c
|
||||
new file mode 100644
|
||||
index 00000000..58a14b2c
|
||||
--- /dev/null
|
||||
+++ b/sysdeps/x86_64/multiarch/bzero.c
|
||||
@@ -0,0 +1,106 @@
|
||||
+/* Multiple versions of bzero.
|
||||
+ All versions must be listed in ifunc-impl-list.c.
|
||||
+ Copyright (C) 2022 Free Software Foundation, Inc.
|
||||
+ This file is part of the GNU C Library.
|
||||
+
|
||||
+ The GNU C Library is free software; you can redistribute it and/or
|
||||
+ modify it under the terms of the GNU Lesser General Public
|
||||
+ License as published by the Free Software Foundation; either
|
||||
+ version 2.1 of the License, or (at your option) any later version.
|
||||
+
|
||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
+ Lesser General Public License for more details.
|
||||
+
|
||||
+ You should have received a copy of the GNU Lesser General Public
|
||||
+ License along with the GNU C Library; if not, see
|
||||
+ <https://www.gnu.org/licenses/>. */
|
||||
+
|
||||
+/* Define multiple versions only for the definition in libc. */
|
||||
+#if IS_IN (libc)
|
||||
+# define __bzero __redirect___bzero
|
||||
+# include <string.h>
|
||||
+# undef __bzero
|
||||
+
|
||||
+# define SYMBOL_NAME __bzero
|
||||
+# include <init-arch.h>
|
||||
+
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (sse2_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned) attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_rtm)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx2_unaligned_erms_rtm)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (evex_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned)
|
||||
+ attribute_hidden;
|
||||
+extern __typeof (REDIRECT_NAME) OPTIMIZE1 (avx512_unaligned_erms)
|
||||
+ attribute_hidden;
|
||||
+
|
||||
+static inline void *
|
||||
+IFUNC_SELECTOR (void)
|
||||
+{
|
||||
+ const struct cpu_features* cpu_features = __get_cpu_features ();
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512F)
|
||||
+ && !CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_AVX512))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE1 (avx512_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE1 (avx512_unaligned);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX2))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE_P (cpu_features, BMI2))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE1 (evex_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE1 (evex_unaligned);
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, RTM))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE1 (avx2_unaligned_erms_rtm);
|
||||
+
|
||||
+ return OPTIMIZE1 (avx2_unaligned_rtm);
|
||||
+ }
|
||||
+
|
||||
+ if (!CPU_FEATURES_ARCH_P (cpu_features, Prefer_No_VZEROUPPER))
|
||||
+ {
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE1 (avx2_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE1 (avx2_unaligned);
|
||||
+ }
|
||||
+ }
|
||||
+
|
||||
+ if (CPU_FEATURE_USABLE_P (cpu_features, ERMS))
|
||||
+ return OPTIMIZE1 (sse2_unaligned_erms);
|
||||
+
|
||||
+ return OPTIMIZE1 (sse2_unaligned);
|
||||
+}
|
||||
+
|
||||
+libc_ifunc_redirected (__redirect___bzero, __bzero, IFUNC_SELECTOR ());
|
||||
+
|
||||
+weak_alias (__bzero, bzero)
|
||||
+#endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
index 8be0d78a..c963d391 100644
|
||||
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
|
||||
@@ -282,6 +282,48 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||
__memset_avx512_no_vzeroupper)
|
||||
)
|
||||
|
||||
+ /* Support sysdeps/x86_64/multiarch/bzero.c. */
|
||||
+ IFUNC_IMPL (i, name, bzero,
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero, 1,
|
||||
+ __bzero_sse2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero, 1,
|
||||
+ __bzero_sse2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ CPU_FEATURE_USABLE (AVX2),
|
||||
+ __bzero_avx2_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ CPU_FEATURE_USABLE (AVX2),
|
||||
+ __bzero_avx2_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (RTM)),
|
||||
+ __bzero_avx2_unaligned_rtm)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX2)
|
||||
+ && CPU_FEATURE_USABLE (RTM)),
|
||||
+ __bzero_avx2_unaligned_erms_rtm)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __bzero_evex_unaligned)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __bzero_evex_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __bzero_avx512_unaligned_erms)
|
||||
+ IFUNC_IMPL_ADD (array, i, bzero,
|
||||
+ (CPU_FEATURE_USABLE (AVX512VL)
|
||||
+ && CPU_FEATURE_USABLE (AVX512BW)
|
||||
+ && CPU_FEATURE_USABLE (BMI2)),
|
||||
+ __bzero_avx512_unaligned)
|
||||
+ )
|
||||
+
|
||||
/* Support sysdeps/x86_64/multiarch/rawmemchr.c. */
|
||||
IFUNC_IMPL (i, name, rawmemchr,
|
||||
IFUNC_IMPL_ADD (array, i, rawmemchr,
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
|
||||
index 8ac3e479..5a5ee6f6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms-rtm.S
|
||||
@@ -5,6 +5,7 @@
|
||||
|
||||
#define SECTION(p) p##.avx.rtm
|
||||
#define MEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
|
||||
+#define BZERO_SYMBOL(p,s) p##_avx2_##s##_rtm
|
||||
#define WMEMSET_SYMBOL(p,s) p##_avx2_##s##_rtm
|
||||
|
||||
#include "memset-avx2-unaligned-erms.S"
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
index c0bf2875..a093a283 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx2-unaligned-erms.S
|
||||
@@ -14,6 +14,9 @@
|
||||
vmovd d, %xmm0; \
|
||||
movq r, %rax;
|
||||
|
||||
+# define BZERO_ZERO_VEC0() \
|
||||
+ vpxor %xmm0, %xmm0, %xmm0
|
||||
+
|
||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
MEMSET_SET_VEC0_AND_SET_RETURN(d, r)
|
||||
|
||||
@@ -29,6 +32,9 @@
|
||||
# ifndef MEMSET_SYMBOL
|
||||
# define MEMSET_SYMBOL(p,s) p##_avx2_##s
|
||||
# endif
|
||||
+# ifndef BZERO_SYMBOL
|
||||
+# define BZERO_SYMBOL(p,s) p##_avx2_##s
|
||||
+# endif
|
||||
# ifndef WMEMSET_SYMBOL
|
||||
# define WMEMSET_SYMBOL(p,s) p##_avx2_##s
|
||||
# endif
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
index 5241216a..727c9213 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-avx512-unaligned-erms.S
|
||||
@@ -19,6 +19,9 @@
|
||||
vpbroadcastb d, %VEC0; \
|
||||
movq r, %rax
|
||||
|
||||
+# define BZERO_ZERO_VEC0() \
|
||||
+ vpxorq %XMM0, %XMM0, %XMM0
|
||||
+
|
||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
vpbroadcastd d, %VEC0; \
|
||||
movq r, %rax
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
index 63700215..5d8fa78f 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-evex-unaligned-erms.S
|
||||
@@ -19,6 +19,9 @@
|
||||
vpbroadcastb d, %VEC0; \
|
||||
movq r, %rax
|
||||
|
||||
+# define BZERO_ZERO_VEC0() \
|
||||
+ vpxorq %XMM0, %XMM0, %XMM0
|
||||
+
|
||||
# define WMEMSET_SET_VEC0_AND_SET_RETURN(d, r) \
|
||||
vpbroadcastd d, %VEC0; \
|
||||
movq r, %rax
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
index 56b81f5c..8f579ad6 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
@@ -22,6 +22,7 @@
|
||||
|
||||
#if IS_IN (libc)
|
||||
# define MEMSET_SYMBOL(p,s) p##_sse2_##s
|
||||
+# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
|
||||
# define WMEMSET_SYMBOL(p,s) p##_sse2_##s
|
||||
|
||||
# ifdef SHARED
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index a67f9833..06f5f5d7 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -26,6 +26,10 @@
|
||||
|
||||
#include <sysdep.h>
|
||||
|
||||
+#ifndef BZERO_SYMBOL
|
||||
+# define BZERO_SYMBOL(p,s) MEMSET_SYMBOL (p, s)
|
||||
+#endif
|
||||
+
|
||||
#ifndef MEMSET_CHK_SYMBOL
|
||||
# define MEMSET_CHK_SYMBOL(p,s) MEMSET_SYMBOL(p, s)
|
||||
#endif
|
||||
@@ -87,6 +91,18 @@
|
||||
# define XMM_SMALL 0
|
||||
#endif
|
||||
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+# define SET_REG64 rcx
|
||||
+# define SET_REG32 ecx
|
||||
+# define SET_REG16 cx
|
||||
+# define SET_REG8 cl
|
||||
+#else
|
||||
+# define SET_REG64 rsi
|
||||
+# define SET_REG32 esi
|
||||
+# define SET_REG16 si
|
||||
+# define SET_REG8 sil
|
||||
+#endif
|
||||
+
|
||||
#define PAGE_SIZE 4096
|
||||
|
||||
/* Macro to calculate size of small memset block for aligning
|
||||
@@ -96,18 +112,6 @@
|
||||
|
||||
#ifndef SECTION
|
||||
# error SECTION is not defined!
|
||||
-#endif
|
||||
-
|
||||
- .section SECTION(.text),"ax",@progbits
|
||||
-#if VEC_SIZE == 16 && IS_IN (libc)
|
||||
-ENTRY (__bzero)
|
||||
- mov %RDI_LP, %RAX_LP /* Set return value. */
|
||||
- mov %RSI_LP, %RDX_LP /* Set n. */
|
||||
- xorl %esi, %esi
|
||||
- pxor %XMM0, %XMM0
|
||||
- jmp L(entry_from_bzero)
|
||||
-END (__bzero)
|
||||
-weak_alias (__bzero, bzero)
|
||||
#endif
|
||||
|
||||
#if IS_IN (libc)
|
||||
@@ -123,12 +127,37 @@ ENTRY (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
WMEMSET_SET_VEC0_AND_SET_RETURN (%esi, %rdi)
|
||||
WMEMSET_VDUP_TO_VEC0_LOW()
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
- jb L(less_vec_no_vdup)
|
||||
+ jb L(less_vec_from_wmemset)
|
||||
WMEMSET_VDUP_TO_VEC0_HIGH()
|
||||
jmp L(entry_from_wmemset)
|
||||
END (WMEMSET_SYMBOL (__wmemset, unaligned))
|
||||
#endif
|
||||
|
||||
+ENTRY (BZERO_SYMBOL(__bzero, unaligned))
|
||||
+#if VEC_SIZE > 16
|
||||
+ BZERO_ZERO_VEC0 ()
|
||||
+#endif
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ mov %RSI_LP, %RDX_LP
|
||||
+#ifndef USE_LESS_VEC_MASK_STORE
|
||||
+ xorl %esi, %esi
|
||||
+#endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
+ jb L(less_vec_no_vdup)
|
||||
+#ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ xorl %esi, %esi
|
||||
+#endif
|
||||
+#if VEC_SIZE <= 16
|
||||
+ BZERO_ZERO_VEC0 ()
|
||||
+#endif
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
+ ja L(more_2x_vec)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
+ VZEROUPPER_RETURN
|
||||
+END (BZERO_SYMBOL(__bzero, unaligned))
|
||||
+
|
||||
#if defined SHARED && IS_IN (libc)
|
||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
@@ -142,7 +171,6 @@ ENTRY (MEMSET_SYMBOL (__memset, unaligned))
|
||||
/* Clear the upper 32 bits. */
|
||||
mov %edx, %edx
|
||||
# endif
|
||||
-L(entry_from_bzero):
|
||||
cmpq $VEC_SIZE, %rdx
|
||||
jb L(less_vec)
|
||||
MEMSET_VDUP_TO_VEC0_HIGH()
|
||||
@@ -187,6 +215,31 @@ END (__memset_erms)
|
||||
END (MEMSET_SYMBOL (__memset, erms))
|
||||
# endif
|
||||
|
||||
+ENTRY_P2ALIGN (BZERO_SYMBOL(__bzero, unaligned_erms), 6)
|
||||
+# if VEC_SIZE > 16
|
||||
+ BZERO_ZERO_VEC0 ()
|
||||
+# endif
|
||||
+ mov %RDI_LP, %RAX_LP
|
||||
+ mov %RSI_LP, %RDX_LP
|
||||
+# ifndef USE_LESS_VEC_MASK_STORE
|
||||
+ xorl %esi, %esi
|
||||
+# endif
|
||||
+ cmp $VEC_SIZE, %RDX_LP
|
||||
+ jb L(less_vec_no_vdup)
|
||||
+# ifdef USE_LESS_VEC_MASK_STORE
|
||||
+ xorl %esi, %esi
|
||||
+# endif
|
||||
+# if VEC_SIZE <= 16
|
||||
+ BZERO_ZERO_VEC0 ()
|
||||
+# endif
|
||||
+ cmp $(VEC_SIZE * 2), %RDX_LP
|
||||
+ ja L(stosb_more_2x_vec)
|
||||
+ /* From VEC and to 2 * VEC. No branch when size == VEC_SIZE. */
|
||||
+ VMOVU %VEC(0), (%rdi)
|
||||
+ VMOVU %VEC(0), (VEC_SIZE * -1)(%rdi, %rdx)
|
||||
+ VZEROUPPER_RETURN
|
||||
+END (BZERO_SYMBOL(__bzero, unaligned_erms))
|
||||
+
|
||||
# if defined SHARED && IS_IN (libc)
|
||||
ENTRY_CHK (MEMSET_CHK_SYMBOL (__memset_chk, unaligned_erms))
|
||||
cmp %RDX_LP, %RCX_LP
|
||||
@@ -229,6 +282,7 @@ L(last_2x_vec):
|
||||
.p2align 4,, 10
|
||||
L(less_vec):
|
||||
L(less_vec_no_vdup):
|
||||
+L(less_vec_from_wmemset):
|
||||
/* Less than 1 VEC. */
|
||||
# if VEC_SIZE != 16 && VEC_SIZE != 32 && VEC_SIZE != 64
|
||||
# error Unsupported VEC_SIZE!
|
||||
@@ -374,8 +428,11 @@ L(less_vec):
|
||||
/* Broadcast esi to partial register (i.e VEC_SIZE == 32 broadcast to
|
||||
xmm). This is only does anything for AVX2. */
|
||||
MEMSET_VDUP_TO_VEC0_LOW ()
|
||||
+L(less_vec_from_wmemset):
|
||||
+#if VEC_SIZE > 16
|
||||
L(less_vec_no_vdup):
|
||||
#endif
|
||||
+#endif
|
||||
L(cross_page):
|
||||
#if VEC_SIZE > 32
|
||||
cmpl $32, %edx
|
||||
@@ -386,7 +443,10 @@ L(cross_page):
|
||||
jge L(between_16_31)
|
||||
#endif
|
||||
#ifndef USE_XMM_LESS_VEC
|
||||
- MOVQ %XMM0, %rcx
|
||||
+ MOVQ %XMM0, %SET_REG64
|
||||
+#endif
|
||||
+#if VEC_SIZE <= 16
|
||||
+L(less_vec_no_vdup):
|
||||
#endif
|
||||
cmpl $8, %edx
|
||||
jge L(between_8_15)
|
||||
@@ -395,7 +455,7 @@ L(cross_page):
|
||||
cmpl $1, %edx
|
||||
jg L(between_2_3)
|
||||
jl L(between_0_0)
|
||||
- movb %sil, (%LESS_VEC_REG)
|
||||
+ movb %SET_REG8, (%LESS_VEC_REG)
|
||||
L(between_0_0):
|
||||
ret
|
||||
|
||||
@@ -428,8 +488,8 @@ L(between_8_15):
|
||||
MOVQ %XMM0, (%rdi)
|
||||
MOVQ %XMM0, -8(%rdi, %rdx)
|
||||
#else
|
||||
- movq %rcx, (%LESS_VEC_REG)
|
||||
- movq %rcx, -8(%LESS_VEC_REG, %rdx)
|
||||
+ movq %SET_REG64, (%LESS_VEC_REG)
|
||||
+ movq %SET_REG64, -8(%LESS_VEC_REG, %rdx)
|
||||
#endif
|
||||
ret
|
||||
|
||||
@@ -442,8 +502,8 @@ L(between_4_7):
|
||||
MOVD %XMM0, (%rdi)
|
||||
MOVD %XMM0, -4(%rdi, %rdx)
|
||||
#else
|
||||
- movl %ecx, (%LESS_VEC_REG)
|
||||
- movl %ecx, -4(%LESS_VEC_REG, %rdx)
|
||||
+ movl %SET_REG32, (%LESS_VEC_REG)
|
||||
+ movl %SET_REG32, -4(%LESS_VEC_REG, %rdx)
|
||||
#endif
|
||||
ret
|
||||
|
||||
@@ -452,12 +512,12 @@ L(between_4_7):
|
||||
L(between_2_3):
|
||||
/* From 2 to 3. No branch when size == 2. */
|
||||
#ifdef USE_XMM_LESS_VEC
|
||||
- movb %sil, (%rdi)
|
||||
- movb %sil, 1(%rdi)
|
||||
- movb %sil, -1(%rdi, %rdx)
|
||||
+ movb %SET_REG8, (%rdi)
|
||||
+ movb %SET_REG8, 1(%rdi)
|
||||
+ movb %SET_REG8, -1(%rdi, %rdx)
|
||||
#else
|
||||
- movw %cx, (%LESS_VEC_REG)
|
||||
- movb %sil, -1(%LESS_VEC_REG, %rdx)
|
||||
+ movw %SET_REG16, (%LESS_VEC_REG)
|
||||
+ movb %SET_REG8, -1(%LESS_VEC_REG, %rdx)
|
||||
#endif
|
||||
ret
|
||||
END (MEMSET_SYMBOL (__memset, unaligned_erms))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,33 +0,0 @@
|
||||
From 7912236f4a597deb092650ca79f33504ddb4af28 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sat, 12 Feb 2022 00:45:00 -0600
|
||||
Subject: [PATCH] x86: Set .text section in memset-vec-unaligned-erms
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Mon Feb 7 05:55:15 2022 -0800
|
||||
|
||||
x86-64: Optimize bzero
|
||||
|
||||
removed setting the .text section for the code. This commit
|
||||
adds that back.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
index 06f5f5d7..4fb475c0 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-vec-unaligned-erms.S
|
||||
@@ -114,6 +114,7 @@
|
||||
# error SECTION is not defined!
|
||||
#endif
|
||||
|
||||
+ .section SECTION(.text), "ax", @progbits
|
||||
#if IS_IN (libc)
|
||||
# if defined SHARED
|
||||
ENTRY_CHK (WMEMSET_CHK_SYMBOL (__wmemset_chk, unaligned))
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,90 +0,0 @@
|
||||
From e108c02a5e23c8c88ce66d8705d4a24bb6b9a8bf Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Tue, 15 Feb 2022 20:27:21 -0600
|
||||
Subject: [PATCH] x86: Fix bug in strncmp-evex and strncmp-avx2 [BZ #28895]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Logic can read before the start of `s1` / `s2` if both `s1` and `s2`
|
||||
are near the start of a page. To avoid having the result contaminated by
|
||||
these comparisons the `strcmp` variants would mask off these
|
||||
comparisons. This was missing in the `strncmp` variants causing
|
||||
the bug. This commit adds the masking to `strncmp` so that out of
|
||||
range comparisons don't affect the result.
|
||||
|
||||
test-strcmp, test-strncmp, test-wcscmp, and test-wcsncmp all pass as
|
||||
well as a full xcheck on x86_64 linux.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
string/test-strncmp.c | 23 +++++++++++++++++++++++
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 1 +
|
||||
sysdeps/x86_64/multiarch/strcmp-evex.S | 1 +
|
||||
3 files changed, 25 insertions(+)
|
||||
|
||||
diff --git a/string/test-strncmp.c b/string/test-strncmp.c
|
||||
index 927a6daa..e61fffd9 100644
|
||||
--- a/string/test-strncmp.c
|
||||
+++ b/string/test-strncmp.c
|
||||
@@ -403,6 +403,28 @@ check2 (void)
|
||||
free (s2);
|
||||
}
|
||||
|
||||
+static void
|
||||
+check4 (void)
|
||||
+{
|
||||
+ /* To trigger bug 28895; We need 1) both s1 and s2 to be within 32 bytes of
|
||||
+ the end of the page. 2) For there to be no mismatch/null byte before the
|
||||
+ first page cross. 3) For length (`n`) to be large enough for one string to
|
||||
+ cross the page. And 4) for there to be either mismatch/null bytes before
|
||||
+ the start of the strings. */
|
||||
+
|
||||
+ size_t size = 10;
|
||||
+ size_t addr_mask = (getpagesize () - 1) ^ (sizeof (CHAR) - 1);
|
||||
+ CHAR *s1 = (CHAR *)(buf1 + (addr_mask & 0xffa));
|
||||
+ CHAR *s2 = (CHAR *)(buf2 + (addr_mask & 0xfed));
|
||||
+ int exp_result;
|
||||
+
|
||||
+ STRCPY (s1, L ("tst-tlsmod%"));
|
||||
+ STRCPY (s2, L ("tst-tls-manydynamic73mod"));
|
||||
+ exp_result = SIMPLE_STRNCMP (s1, s2, size);
|
||||
+ FOR_EACH_IMPL (impl, 0)
|
||||
+ check_result (impl, s1, s2, size, exp_result);
|
||||
+}
|
||||
+
|
||||
static void
|
||||
check3 (void)
|
||||
{
|
||||
@@ -445,6 +467,7 @@ test_main (void)
|
||||
check1 ();
|
||||
check2 ();
|
||||
check3 ();
|
||||
+ check4 ();
|
||||
|
||||
printf ("%23s", "");
|
||||
FOR_EACH_IMPL (impl, 0)
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 04675aa4..179cc0e3 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -661,6 +661,7 @@ L(ret8):
|
||||
# ifdef USE_AS_STRNCMP
|
||||
.p2align 4,, 10
|
||||
L(return_page_cross_end_check):
|
||||
+ andl %r10d, %ecx
|
||||
tzcntl %ecx, %ecx
|
||||
leal -VEC_SIZE(%rax, %rcx), %ecx
|
||||
cmpl %ecx, %edx
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-evex.S b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
index ed56af8e..0dfa62bd 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-evex.S
|
||||
@@ -689,6 +689,7 @@ L(ret8):
|
||||
# ifdef USE_AS_STRNCMP
|
||||
.p2align 4,, 10
|
||||
L(return_page_cross_end_check):
|
||||
+ andl %r10d, %ecx
|
||||
tzcntl %ecx, %ecx
|
||||
leal -VEC_SIZE(%rax, %rcx, SIZE_OF_CHAR), %ecx
|
||||
# ifdef USE_AS_WCSCMP
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,77 +0,0 @@
|
||||
From 9fef7039a7d04947bc89296ee0d187bc8d89b772 Mon Sep 17 00:00:00 2001
|
||||
From: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Thu, 24 Mar 2022 15:50:33 -0500
|
||||
Subject: [PATCH] x86: Fix fallback for wcsncmp_avx2 in strcmp-avx2.S [BZ
|
||||
#28896]
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Overflow case for __wcsncmp_avx2_rtm should be __wcscmp_avx2_rtm not
|
||||
__wcscmp_avx2.
|
||||
|
||||
commit ddf0992cf57a93200e0c782e2a94d0733a5a0b87
|
||||
Author: Noah Goldstein <goldstein.w.n@gmail.com>
|
||||
Date: Sun Jan 9 16:02:21 2022 -0600
|
||||
|
||||
x86: Fix __wcsncmp_avx2 in strcmp-avx2.S [BZ# 28755]
|
||||
|
||||
Set the wrong fallback function for `__wcsncmp_avx2_rtm`. It was set
|
||||
to fallback on to `__wcscmp_avx2` instead of `__wcscmp_avx2_rtm` which
|
||||
can cause spurious aborts.
|
||||
|
||||
This change will need to be backported.
|
||||
|
||||
All string/memory tests pass.
|
||||
Reviewed-by: H.J. Lu <hjl.tools@gmail.com>
|
||||
---
|
||||
sysdeps/x86/tst-strncmp-rtm.c | 15 +++++++++++++++
|
||||
sysdeps/x86_64/multiarch/strcmp-avx2.S | 2 +-
|
||||
2 files changed, 16 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86/tst-strncmp-rtm.c b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
index aef9866c..ba6543be 100644
|
||||
--- a/sysdeps/x86/tst-strncmp-rtm.c
|
||||
+++ b/sysdeps/x86/tst-strncmp-rtm.c
|
||||
@@ -70,6 +70,16 @@ function_overflow (void)
|
||||
return 1;
|
||||
}
|
||||
|
||||
+__attribute__ ((noinline, noclone))
|
||||
+static int
|
||||
+function_overflow2 (void)
|
||||
+{
|
||||
+ if (STRNCMP (string1, string2, SIZE_MAX >> 4) == 0)
|
||||
+ return 0;
|
||||
+ else
|
||||
+ return 1;
|
||||
+}
|
||||
+
|
||||
static int
|
||||
do_test (void)
|
||||
{
|
||||
@@ -77,5 +87,10 @@ do_test (void)
|
||||
if (status != EXIT_SUCCESS)
|
||||
return status;
|
||||
status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow);
|
||||
+ if (status != EXIT_SUCCESS)
|
||||
+ return status;
|
||||
+ status = do_test_1 (TEST_NAME, LOOP, prepare, function_overflow2);
|
||||
+ if (status != EXIT_SUCCESS)
|
||||
+ return status;
|
||||
return status;
|
||||
}
|
||||
diff --git a/sysdeps/x86_64/multiarch/strcmp-avx2.S b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
index 179cc0e3..782f9472 100644
|
||||
--- a/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
+++ b/sysdeps/x86_64/multiarch/strcmp-avx2.S
|
||||
@@ -122,7 +122,7 @@ ENTRY(STRCMP)
|
||||
are cases where length is large enough that it can never be a
|
||||
bound on valid memory so just use wcscmp. */
|
||||
shrq $56, %rcx
|
||||
- jnz __wcscmp_avx2
|
||||
+ jnz OVERFLOW_STRCMP
|
||||
|
||||
leaq (, %rdx, 4), %rdx
|
||||
# endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,27 +0,0 @@
|
||||
From 1283948f236f209b7d3f44b69a42b96806fa6da0 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sat, 5 Feb 2022 11:06:01 -0800
|
||||
Subject: [PATCH] x86: Improve L to support L(XXX_SYMBOL (YYY, ZZZ))
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
---
|
||||
sysdeps/x86/sysdep.h | 3 ++-
|
||||
1 file changed, 2 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
|
||||
index a70bb3a2..49b0efe2 100644
|
||||
--- a/sysdeps/x86/sysdep.h
|
||||
+++ b/sysdeps/x86/sysdep.h
|
||||
@@ -111,7 +111,8 @@ enum cf_protection_level
|
||||
/* Local label name for asm code. */
|
||||
#ifndef L
|
||||
/* ELF-like local names start with `.L'. */
|
||||
-# define L(name) .L##name
|
||||
+# define LOCAL_LABEL(name) .L##name
|
||||
+# define L(name) LOCAL_LABEL(name)
|
||||
#endif
|
||||
|
||||
#define atom_text_section .section ".text.atom", "ax"
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,108 +0,0 @@
|
||||
From c328d0152d4b14cca58407ec68143894c8863004 Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Sat, 5 Feb 2022 11:52:33 -0800
|
||||
Subject: [PATCH] x86_64/multiarch: Sort sysdep_routines and put one entry per
|
||||
line
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
Conflicts:
|
||||
sysdeps/x86_64/multiarch/Makefile
|
||||
(test order changed)
|
||||
|
||||
---
|
||||
sysdeps/x86_64/multiarch/Makefile | 78 +++++++++++++++++++------------
|
||||
1 file changed, 48 insertions(+), 30 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/Makefile b/sysdeps/x86_64/multiarch/Makefile
|
||||
index 37d8d6f0..8c9e7812 100644
|
||||
--- a/sysdeps/x86_64/multiarch/Makefile
|
||||
+++ b/sysdeps/x86_64/multiarch/Makefile
|
||||
@@ -132,37 +132,55 @@ CFLAGS-strspn-c.c += -msse4
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),wcsmbs)
|
||||
-sysdep_routines += wmemcmp-sse4 wmemcmp-ssse3 wmemcmp-c \
|
||||
- wmemcmp-avx2-movbe \
|
||||
- wmemchr-sse2 wmemchr-avx2 \
|
||||
- wcscmp-sse2 wcscmp-avx2 \
|
||||
- wcsncmp-sse2 wcsncmp-avx2 \
|
||||
- wcscpy-ssse3 wcscpy-c \
|
||||
- wcschr-sse2 wcschr-avx2 \
|
||||
- wcsrchr-sse2 wcsrchr-avx2 \
|
||||
- wcslen-sse2 wcslen-sse4_1 wcslen-avx2 \
|
||||
- wcsnlen-c wcsnlen-sse4_1 wcsnlen-avx2 \
|
||||
- wcschr-avx2-rtm \
|
||||
- wcscmp-avx2-rtm \
|
||||
- wcslen-avx2-rtm \
|
||||
- wcsncmp-avx2-rtm \
|
||||
- wcsnlen-avx2-rtm \
|
||||
- wcsrchr-avx2-rtm \
|
||||
- wmemchr-avx2-rtm \
|
||||
- wmemcmp-avx2-movbe-rtm \
|
||||
- wcschr-evex \
|
||||
- wcscmp-evex \
|
||||
- wcslen-evex \
|
||||
- wcsncmp-evex \
|
||||
- wcsnlen-evex \
|
||||
- wcsrchr-evex \
|
||||
- wmemchr-evex \
|
||||
- wmemcmp-evex-movbe \
|
||||
- wmemchr-evex-rtm
|
||||
+sysdep_routines += \
|
||||
+ wcschr-avx2 \
|
||||
+ wcschr-avx2-rtm \
|
||||
+ wcschr-evex \
|
||||
+ wcschr-sse2 \
|
||||
+ wcscmp-avx2 \
|
||||
+ wcscmp-avx2-rtm \
|
||||
+ wcscmp-evex \
|
||||
+ wcscmp-sse2 \
|
||||
+ wcscpy-c \
|
||||
+ wcscpy-ssse3 \
|
||||
+ wcslen-avx2 \
|
||||
+ wcslen-avx2-rtm \
|
||||
+ wcslen-evex \
|
||||
+ wcslen-sse2 \
|
||||
+ wcslen-sse4_1 \
|
||||
+ wcsncmp-avx2 \
|
||||
+ wcsncmp-avx2-rtm \
|
||||
+ wcsncmp-evex \
|
||||
+ wcsncmp-sse2 \
|
||||
+ wcsnlen-avx2 \
|
||||
+ wcsnlen-avx2-rtm \
|
||||
+ wcsnlen-c \
|
||||
+ wcsnlen-evex \
|
||||
+ wcsnlen-sse4_1 \
|
||||
+ wcsrchr-avx2 \
|
||||
+ wcsrchr-avx2-rtm \
|
||||
+ wcsrchr-evex \
|
||||
+ wcsrchr-sse2 \
|
||||
+ wmemchr-avx2 \
|
||||
+ wmemchr-avx2-rtm \
|
||||
+ wmemchr-evex \
|
||||
+ wmemchr-evex-rtm \
|
||||
+ wmemchr-sse2 \
|
||||
+ wmemcmp-avx2-movbe \
|
||||
+ wmemcmp-avx2-movbe-rtm \
|
||||
+ wmemcmp-c \
|
||||
+ wmemcmp-evex-movbe \
|
||||
+ wmemcmp-sse4 \
|
||||
+ wmemcmp-ssse3 \
|
||||
+# sysdep_routines
|
||||
endif
|
||||
|
||||
ifeq ($(subdir),debug)
|
||||
-sysdep_routines += memcpy_chk-nonshared mempcpy_chk-nonshared \
|
||||
- memmove_chk-nonshared memset_chk-nonshared \
|
||||
- wmemset_chk-nonshared
|
||||
+sysdep_routines += \
|
||||
+ memcpy_chk-nonshared \
|
||||
+ memmove_chk-nonshared \
|
||||
+ mempcpy_chk-nonshared \
|
||||
+ memset_chk-nonshared \
|
||||
+ wmemset_chk-nonshared \
|
||||
+# sysdep_routines
|
||||
endif
|
||||
--
|
||||
GitLab
|
||||
|
@ -1,36 +0,0 @@
|
||||
From 0fb8800029d230b3711bf722b2a47db92d0e273f Mon Sep 17 00:00:00 2001
|
||||
From: "H.J. Lu" <hjl.tools@gmail.com>
|
||||
Date: Thu, 10 Feb 2022 11:52:50 -0800
|
||||
Subject: [PATCH] x86-64: Remove bzero weak alias in SSE2 memset
|
||||
Content-type: text/plain; charset=UTF-8
|
||||
|
||||
commit 3d9f171bfb5325bd5f427e9fc386453358c6e840
|
||||
Author: H.J. Lu <hjl.tools@gmail.com>
|
||||
Date: Mon Feb 7 05:55:15 2022 -0800
|
||||
|
||||
x86-64: Optimize bzero
|
||||
|
||||
added the optimized bzero. Remove bzero weak alias in SSE2 memset to
|
||||
avoid undefined __bzero in memset-sse2-unaligned-erms.
|
||||
---
|
||||
sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S | 4 +---
|
||||
1 file changed, 1 insertion(+), 3 deletions(-)
|
||||
|
||||
diff --git a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
index 8f579ad6..af51362b 100644
|
||||
--- a/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
+++ b/sysdeps/x86_64/multiarch/memset-sse2-unaligned-erms.S
|
||||
@@ -31,9 +31,7 @@
|
||||
# endif
|
||||
|
||||
# undef weak_alias
|
||||
-# define weak_alias(original, alias) \
|
||||
- .weak bzero; bzero = __bzero
|
||||
-
|
||||
+# define weak_alias(original, alias)
|
||||
# undef strong_alias
|
||||
# define strong_alias(ignored1, ignored2)
|
||||
#endif
|
||||
--
|
||||
GitLab
|
||||
|
Some files were not shown because too many files have changed in this diff Show More
Loading…
Reference in New Issue
Block a user