forked from rpms/glibc
Auto-sync with upstream branch master
Upstream commit: 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751 - Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure. - Drop glibc-rh2244688.patch revert. Fix applied upstream. - Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch. All applied upstream. - posix: Revert the removal of the crypt prototype from <unistd.h> - elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure - elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static - elf: Remove any_debug from dl_main_state - elf: Remove LD_PROFILE for static binaries - elf: Ignore LD_PROFILE for setuid binaries - s390: Use dl-symbol-redir-ifunc.h on cpu-tunables - x86: Use dl-symbol-redir-ifunc.h on cpu-tunables - elf: Emit warning if tunable is ill-formatted - elf: Fix _dl_debug_vdprintf to work before self-relocation - elf: Do not parse ill-formatted strings - elf: Do not process invalid tunable format - elf: Add all malloc tunable to unsecvars - elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries - elf: Add GLIBC_TUNABLES to unsecvars - elf: Remove /etc/suid-debug support - stdlib: The qsort implementation needs to use heapsort in more cases - stdlib: Handle various corner cases in the fallback heapsort for qsort - stdlib: Avoid another self-comparison in qsort - hurd: fix restarting reauth_dtable on signal - hurd: Prevent the final file_exec_paths call from signals - manual: Fix termios.c example. (Bug 31078) - aarch64: Add vector implementations of expm1 routines - linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401) - intl: Add test case for bug 16621 - resolv: free only initialized items from gai pool - ldconfig: Fixes for skipping temporary files. - nptl: Link tst-execstack-threads-mod.so with -z execstack - nptl: Rename tst-execstack to tst-execstack-threads - localedata: Convert oc_FR locale to UTF-8 - localedata: Add information for Occitan - elf: Fix force_first handling in dlclose (bug 30981) - elf: Handle non-directory name in search path (BZ 31035) - New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday - x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S - posix: Check pidfd_spawn with tst-spawn7-pid - y2038: Fix support for 64-bit time on legacy ABIs - AArch64: Remove Falkor memcpy - AArch64: Add memset_zva64 - AArch64: Cleanup emag memset - test: Run the tst-tls-allocation-failure-static-patched with test-wrapper. - aarch64: Add vector implementations of log1p routines - aarch64: Add vector implementations of atan2 routines - aarch64: Add vector implementations of atan routines - aarch64: Add vector implementations of acos routines - aarch64: Add vector implementations of asin routines
This commit is contained in:
parent
9e361d1258
commit
f94f56a23b
250
glibc-benchtests-aarch64.patch
Normal file
250
glibc-benchtests-aarch64.patch
Normal file
@ -0,0 +1,250 @@
|
|||||||
|
Author: Joe Ramsay <Joe.Ramsay@arm.com>
|
||||||
|
Date: Tue Nov 21 14:39:39 2023 +0000
|
||||||
|
|
||||||
|
aarch64: Fix libmvec benchmarks
|
||||||
|
|
||||||
|
These were broken by the new atan2 functions, as they were only
|
||||||
|
set up for univariate functions. Arity is now detected from the
|
||||||
|
input file - this revealed a mistake that the double-precision
|
||||||
|
inputs were being used for both single- and double-precision
|
||||||
|
routines, which is now remedied.
|
||||||
|
|
||||||
|
diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
|
||||||
|
index 3e124c781065fea9..3661a24044cc9770 100644
|
||||||
|
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
|
||||||
|
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py
|
||||||
|
@@ -22,40 +22,49 @@ TEMPLATE = """
|
||||||
|
#include <math.h>
|
||||||
|
#include <arm_neon.h>
|
||||||
|
|
||||||
|
-#define STRIDE {stride}
|
||||||
|
+#define STRIDE {rowlen}
|
||||||
|
|
||||||
|
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\
|
||||||
|
- {rtype} mx0 = {fname}(vld1q_f{prec_short} (variants[v].in[i].arg0)); \\
|
||||||
|
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\
|
||||||
|
+ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE])); \\
|
||||||
|
mx0; }}))
|
||||||
|
|
||||||
|
-struct args
|
||||||
|
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\
|
||||||
|
+ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE]), \\
|
||||||
|
+ vld1q_f{prec_short} (&variants[v].in->arg1[i * STRIDE])); \\
|
||||||
|
+ mx0; }}))
|
||||||
|
+
|
||||||
|
+struct args_1
|
||||||
|
+{{
|
||||||
|
+ {stype} arg0[{nelems}];
|
||||||
|
+}};
|
||||||
|
+
|
||||||
|
+struct args_2
|
||||||
|
{{
|
||||||
|
- {stype} arg0[STRIDE];
|
||||||
|
- double timing;
|
||||||
|
+ {stype} arg0[{nelems}];
|
||||||
|
+ {stype} arg1[{nelems}];
|
||||||
|
}};
|
||||||
|
|
||||||
|
struct _variants
|
||||||
|
{{
|
||||||
|
const char *name;
|
||||||
|
- int count;
|
||||||
|
- const struct args *in;
|
||||||
|
+ const struct args_{arity} *in;
|
||||||
|
}};
|
||||||
|
|
||||||
|
-static const struct args in0[{rowcount}] = {{
|
||||||
|
+static const struct args_{arity} in0 = {{
|
||||||
|
{in_data}
|
||||||
|
}};
|
||||||
|
|
||||||
|
static const struct _variants variants[1] = {{
|
||||||
|
- {{"", {rowcount}, in0}},
|
||||||
|
+ {{"", &in0}},
|
||||||
|
}};
|
||||||
|
|
||||||
|
#define NUM_VARIANTS 1
|
||||||
|
-#define NUM_SAMPLES(i) (variants[i].count)
|
||||||
|
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
|
||||||
|
#define VARIANT(i) (variants[i].name)
|
||||||
|
|
||||||
|
static {rtype} volatile ret;
|
||||||
|
|
||||||
|
-#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC(i, j); }})
|
||||||
|
+#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC_{arity}(i, j); }})
|
||||||
|
#define FUNCNAME "{fname}"
|
||||||
|
#include <bench-libmvec-skeleton.c>
|
||||||
|
"""
|
||||||
|
@@ -63,27 +72,34 @@ static {rtype} volatile ret;
|
||||||
|
def main(name):
|
||||||
|
_, prec, _, func = name.split("-")
|
||||||
|
scalar_to_advsimd_type = {"double": "float64x2_t", "float": "float32x4_t"}
|
||||||
|
-
|
||||||
|
- stride = {"double": 2, "float": 4}[prec]
|
||||||
|
+ rowlen = {"double": 2, "float": 4}[prec]
|
||||||
|
rtype = scalar_to_advsimd_type[prec]
|
||||||
|
atype = scalar_to_advsimd_type[prec]
|
||||||
|
- fname = f"_ZGVnN{stride}v_{func}{'f' if prec == 'float' else ''}"
|
||||||
|
prec_short = {"double": 64, "float": 32}[prec]
|
||||||
|
-
|
||||||
|
- with open(f"../benchtests/libmvec/{func}-inputs") as f:
|
||||||
|
- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
|
||||||
|
- in_vals = [in_vals[i:i+stride] for i in range(0, len(in_vals), stride)]
|
||||||
|
- rowcount= len(in_vals)
|
||||||
|
- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
|
||||||
|
-
|
||||||
|
- print(TEMPLATE.format(stride=stride,
|
||||||
|
+ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
|
||||||
|
+
|
||||||
|
+ with open(f"../benchtests/libmvec/{input_filename}") as f:
|
||||||
|
+ input_file = f.readlines()
|
||||||
|
+ in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
|
||||||
|
+ # Split in case of multivariate signature
|
||||||
|
+ in_vals = (l.split(", ") for l in in_vals)
|
||||||
|
+ # Transpose
|
||||||
|
+ in_vals = list(zip(*in_vals))
|
||||||
|
+ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
|
||||||
|
+ for col in in_vals)
|
||||||
|
+
|
||||||
|
+ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
|
||||||
|
+ fname = f"_ZGVnN{rowlen}{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
|
||||||
|
+
|
||||||
|
+ print(TEMPLATE.format(rowlen=rowlen,
|
||||||
|
rtype=rtype,
|
||||||
|
atype=atype,
|
||||||
|
fname=fname,
|
||||||
|
prec_short=prec_short,
|
||||||
|
in_data=in_data,
|
||||||
|
- rowcount=rowcount,
|
||||||
|
- stype=prec))
|
||||||
|
+ stype=prec,
|
||||||
|
+ arity=arity,
|
||||||
|
+ nelems=len(in_vals[0])))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
|
||||||
|
index 66f2c8e0f465f9ce..5d9332be9c5a536a 100755
|
||||||
|
--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
|
||||||
|
+++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py
|
||||||
|
@@ -22,46 +22,55 @@ TEMPLATE = """
|
||||||
|
#include <math.h>
|
||||||
|
#include <arm_sve.h>
|
||||||
|
|
||||||
|
-#define MAX_STRIDE {max_stride}
|
||||||
|
#define STRIDE {stride}
|
||||||
|
#define PTRUE svptrue_b{prec_short}
|
||||||
|
#define SV_LOAD svld1_f{prec_short}
|
||||||
|
#define SV_STORE svst1_f{prec_short}
|
||||||
|
#define REQUIRE_SVE
|
||||||
|
|
||||||
|
-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\
|
||||||
|
- {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), variants[v].in[i].arg0), PTRUE()); \\
|
||||||
|
+#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\
|
||||||
|
+ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), PTRUE()); \\
|
||||||
|
mx0; }}))
|
||||||
|
|
||||||
|
-struct args
|
||||||
|
+#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\
|
||||||
|
+ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), \\
|
||||||
|
+ SV_LOAD (PTRUE(), &variants[v].in->arg1[i * STRIDE]), \\
|
||||||
|
+ PTRUE()); \\
|
||||||
|
+ mx0; }}))
|
||||||
|
+
|
||||||
|
+struct args_1
|
||||||
|
{{
|
||||||
|
- {stype} arg0[MAX_STRIDE];
|
||||||
|
- double timing;
|
||||||
|
+ {stype} arg0[{nelems}];
|
||||||
|
+}};
|
||||||
|
+
|
||||||
|
+struct args_2
|
||||||
|
+{{
|
||||||
|
+ {stype} arg0[{nelems}];
|
||||||
|
+ {stype} arg1[{nelems}];
|
||||||
|
}};
|
||||||
|
|
||||||
|
struct _variants
|
||||||
|
{{
|
||||||
|
const char *name;
|
||||||
|
- int count;
|
||||||
|
- const struct args *in;
|
||||||
|
+ const struct args_{arity} *in;
|
||||||
|
}};
|
||||||
|
|
||||||
|
-static const struct args in0[{rowcount}] = {{
|
||||||
|
+static const struct args_{arity} in0 = {{
|
||||||
|
{in_data}
|
||||||
|
}};
|
||||||
|
|
||||||
|
static const struct _variants variants[1] = {{
|
||||||
|
- {{"", {rowcount}, in0}},
|
||||||
|
+ {{"", &in0}},
|
||||||
|
}};
|
||||||
|
|
||||||
|
#define NUM_VARIANTS 1
|
||||||
|
-#define NUM_SAMPLES(i) (variants[i].count)
|
||||||
|
+#define NUM_SAMPLES(i) ({nelems} / STRIDE)
|
||||||
|
#define VARIANT(i) (variants[i].name)
|
||||||
|
|
||||||
|
// Cannot pass volatile pointer to svst1. This still does not appear to get optimised out.
|
||||||
|
-static {stype} /*volatile*/ ret[MAX_STRIDE];
|
||||||
|
+static {stype} /*volatile*/ ret[{rowlen}];
|
||||||
|
|
||||||
|
-#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC(i, j)); }})
|
||||||
|
+#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC_{arity}(i, j)); }})
|
||||||
|
#define FUNCNAME "{fname}"
|
||||||
|
#include <bench-libmvec-skeleton.c>
|
||||||
|
"""
|
||||||
|
@@ -69,23 +78,29 @@ static {stype} /*volatile*/ ret[MAX_STRIDE];
|
||||||
|
def main(name):
|
||||||
|
_, prec, _, func = name.split("-")
|
||||||
|
scalar_to_sve_type = {"double": "svfloat64_t", "float": "svfloat32_t"}
|
||||||
|
-
|
||||||
|
stride = {"double": "svcntd()", "float": "svcntw()"}[prec]
|
||||||
|
rtype = scalar_to_sve_type[prec]
|
||||||
|
atype = scalar_to_sve_type[prec]
|
||||||
|
- fname = f"_ZGVsMxv_{func}{'f' if prec == 'float' else ''}"
|
||||||
|
prec_short = {"double": 64, "float": 32}[prec]
|
||||||
|
# Max SVE vector length is 2048 bits. To ensure benchmarks are
|
||||||
|
# vector-length-agnostic, but still use as wide vectors as
|
||||||
|
# possible on any given target, divide input data into 2048-bit
|
||||||
|
# rows, then load/store as many elements as the target will allow.
|
||||||
|
- max_stride = 2048 // prec_short
|
||||||
|
-
|
||||||
|
- with open(f"../benchtests/libmvec/{func}-inputs") as f:
|
||||||
|
- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")]
|
||||||
|
- in_vals = [in_vals[i:i+max_stride] for i in range(0, len(in_vals), max_stride)]
|
||||||
|
- rowcount= len(in_vals)
|
||||||
|
- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals)
|
||||||
|
+ rowlen = {"double": 32, "float": 64}[prec]
|
||||||
|
+ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec]
|
||||||
|
+
|
||||||
|
+ with open(f"../benchtests/libmvec/{input_filename}") as f:
|
||||||
|
+ input_file = f.readlines()
|
||||||
|
+ in_vals = (l.strip() for l in input_file if l and not l.startswith("#"))
|
||||||
|
+ # Split in case of multivariate signature
|
||||||
|
+ in_vals = (l.split(", ") for l in in_vals)
|
||||||
|
+ # Transpose
|
||||||
|
+ in_vals = list(zip(*in_vals))
|
||||||
|
+ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}")
|
||||||
|
+ for col in in_vals)
|
||||||
|
+
|
||||||
|
+ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec)
|
||||||
|
+ fname = f"_ZGVsMx{'v' * arity}_{func}{'f' if prec == 'float' else ''}"
|
||||||
|
|
||||||
|
print(TEMPLATE.format(stride=stride,
|
||||||
|
rtype=rtype,
|
||||||
|
@@ -93,9 +108,10 @@ def main(name):
|
||||||
|
fname=fname,
|
||||||
|
prec_short=prec_short,
|
||||||
|
in_data=in_data,
|
||||||
|
- rowcount=rowcount,
|
||||||
|
stype=prec,
|
||||||
|
- max_stride=max_stride))
|
||||||
|
+ rowlen=rowlen,
|
||||||
|
+ arity=arity,
|
||||||
|
+ nelems=len(in_vals[0])))
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
@ -1,967 +0,0 @@
|
|||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Wed Oct 18 11:12:29 2023 +0200
|
|
||||||
|
|
||||||
Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10"
|
|
||||||
|
|
||||||
This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de.
|
|
||||||
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
|
|
||||||
index cd6a0a870a02b9bd..58b2853ab69265e8 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S
|
|
||||||
@@ -1,4 +1,4 @@
|
|
||||||
-/* Implementation for strrchr using evex256 and evex512.
|
|
||||||
+/* Placeholder function, not used by any processor at the moment.
|
|
||||||
Copyright (C) 2022-2023 Free Software Foundation, Inc.
|
|
||||||
This file is part of the GNU C Library.
|
|
||||||
|
|
||||||
@@ -16,6 +16,8 @@
|
|
||||||
License along with the GNU C Library; if not, see
|
|
||||||
<https://www.gnu.org/licenses/>. */
|
|
||||||
|
|
||||||
+/* UNUSED. Exists purely as reference implementation. */
|
|
||||||
+
|
|
||||||
#include <isa-level.h>
|
|
||||||
|
|
||||||
#if ISA_SHOULD_BUILD (4)
|
|
||||||
@@ -23,351 +25,240 @@
|
|
||||||
# include <sysdep.h>
|
|
||||||
|
|
||||||
# ifdef USE_AS_WCSRCHR
|
|
||||||
-# if VEC_SIZE == 64
|
|
||||||
-# define RCX_M cx
|
|
||||||
-# define KORTEST_M kortestw
|
|
||||||
-# else
|
|
||||||
-# define RCX_M cl
|
|
||||||
-# define KORTEST_M kortestb
|
|
||||||
-# endif
|
|
||||||
-
|
|
||||||
-# define SHIFT_REG VRCX
|
|
||||||
# define CHAR_SIZE 4
|
|
||||||
-# define VPCMP vpcmpd
|
|
||||||
-# define VPMIN vpminud
|
|
||||||
-# define VPCOMPRESS vpcompressd
|
|
||||||
-# define VPTESTN vptestnmd
|
|
||||||
-# define VPTEST vptestmd
|
|
||||||
-# define VPBROADCAST vpbroadcastd
|
|
||||||
+# define VPBROADCAST vpbroadcastd
|
|
||||||
# define VPCMPEQ vpcmpeqd
|
|
||||||
-
|
|
||||||
+# define VPMINU vpminud
|
|
||||||
+# define VPTESTN vptestnmd
|
|
||||||
# else
|
|
||||||
-# define SHIFT_REG VRDI
|
|
||||||
# define CHAR_SIZE 1
|
|
||||||
-# define VPCMP vpcmpb
|
|
||||||
-# define VPMIN vpminub
|
|
||||||
-# define VPCOMPRESS vpcompressb
|
|
||||||
-# define VPTESTN vptestnmb
|
|
||||||
-# define VPTEST vptestmb
|
|
||||||
-# define VPBROADCAST vpbroadcastb
|
|
||||||
+# define VPBROADCAST vpbroadcastb
|
|
||||||
# define VPCMPEQ vpcmpeqb
|
|
||||||
-
|
|
||||||
-# define RCX_M VRCX
|
|
||||||
-# define KORTEST_M KORTEST
|
|
||||||
+# define VPMINU vpminub
|
|
||||||
+# define VPTESTN vptestnmb
|
|
||||||
# endif
|
|
||||||
|
|
||||||
-# define VMATCH VMM(0)
|
|
||||||
-# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
# define PAGE_SIZE 4096
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
|
|
||||||
.section SECTION(.text), "ax", @progbits
|
|
||||||
- /* Aligning entry point to 64 byte, provides better performance for
|
|
||||||
- one vector length string. */
|
|
||||||
-ENTRY_P2ALIGN(STRRCHR, 6)
|
|
||||||
- movl %edi, %eax
|
|
||||||
- /* Broadcast CHAR to VMATCH. */
|
|
||||||
- VPBROADCAST %esi, %VMATCH
|
|
||||||
+/* Aligning entry point to 64 byte, provides better performance for
|
|
||||||
+ one vector length string. */
|
|
||||||
+ENTRY_P2ALIGN (STRRCHR, 6)
|
|
||||||
|
|
||||||
- andl $(PAGE_SIZE - 1), %eax
|
|
||||||
- cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
- jg L(cross_page_boundary)
|
|
||||||
+ /* Broadcast CHAR to VMM(0). */
|
|
||||||
+ VPBROADCAST %esi, %VMM(0)
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ sall $20, %eax
|
|
||||||
+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax
|
|
||||||
+ ja L(page_cross)
|
|
||||||
|
|
||||||
+L(page_cross_continue):
|
|
||||||
+ /* Compare [w]char for null, mask bit will be set for match. */
|
|
||||||
VMOVU (%rdi), %VMM(1)
|
|
||||||
- /* k0 has a 1 for each zero CHAR in YMM1. */
|
|
||||||
- VPTESTN %VMM(1), %VMM(1), %k0
|
|
||||||
- KMOV %k0, %VGPR(rsi)
|
|
||||||
- test %VGPR(rsi), %VGPR(rsi)
|
|
||||||
- jz L(aligned_more)
|
|
||||||
- /* fallthrough: zero CHAR in first VEC. */
|
|
||||||
-L(page_cross_return):
|
|
||||||
- /* K1 has a 1 for each search CHAR match in VEC(1). */
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(1), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- /* Build mask up until first zero CHAR (used to mask of
|
|
||||||
- potential search CHAR matches past the end of the string). */
|
|
||||||
- blsmsk %VGPR(rsi), %VGPR(rsi)
|
|
||||||
- /* Use `and` here to remove any out of bounds matches so we can
|
|
||||||
- do a reverse scan on `rax` to find the last match. */
|
|
||||||
- and %VGPR(rsi), %VGPR(rax)
|
|
||||||
- jz L(ret0)
|
|
||||||
- /* Get last match. */
|
|
||||||
- bsr %VGPR(rax), %VGPR(rax)
|
|
||||||
+
|
|
||||||
+ VPTESTN %VMM(1), %VMM(1), %k1
|
|
||||||
+ KMOV %k1, %VRCX
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jz L(align_more)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %VMM(1), %VMM(0), %k0
|
|
||||||
+ KMOV %k0, %VRAX
|
|
||||||
+ BLSMSK %VRCX, %VRCX
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(ret)
|
|
||||||
+
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
# ifdef USE_AS_WCSRCHR
|
|
||||||
leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- addq %rdi, %rax
|
|
||||||
+ add %rdi, %rax
|
|
||||||
# endif
|
|
||||||
-L(ret0):
|
|
||||||
+L(ret):
|
|
||||||
ret
|
|
||||||
|
|
||||||
- /* Returns for first vec x1/x2/x3 have hard coded backward
|
|
||||||
- search path for earlier matches. */
|
|
||||||
- .p2align 4,, 6
|
|
||||||
-L(first_vec_x1):
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(2), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- blsmsk %VGPR(rcx), %VGPR(rcx)
|
|
||||||
- /* eax non-zero if search CHAR in range. */
|
|
||||||
- and %VGPR(rcx), %VGPR(rax)
|
|
||||||
- jnz L(first_vec_x1_return)
|
|
||||||
-
|
|
||||||
- /* fallthrough: no match in YMM2 then need to check for earlier
|
|
||||||
- matches (in YMM1). */
|
|
||||||
- .p2align 4,, 4
|
|
||||||
-L(first_vec_x0_test):
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(1), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- test %VGPR(rax), %VGPR(rax)
|
|
||||||
- jz L(ret1)
|
|
||||||
- bsr %VGPR(rax), %VGPR(rax)
|
|
||||||
+L(vector_x2_end):
|
|
||||||
+ VPCMPEQ %VMM(2), %VMM(0), %k2
|
|
||||||
+ KMOV %k2, %VRAX
|
|
||||||
+ BLSMSK %VRCX, %VRCX
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(vector_x1_ret)
|
|
||||||
+
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ /* Check the first vector at very last to look for match. */
|
|
||||||
+L(vector_x1_ret):
|
|
||||||
+ VPCMPEQ %VMM(1), %VMM(0), %k2
|
|
||||||
+ KMOV %k2, %VRAX
|
|
||||||
+ test %VRAX, %VRAX
|
|
||||||
+ jz L(ret)
|
|
||||||
+
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
# ifdef USE_AS_WCSRCHR
|
|
||||||
leaq (%rsi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- addq %rsi, %rax
|
|
||||||
+ add %rsi, %rax
|
|
||||||
# endif
|
|
||||||
-L(ret1):
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4,, 10
|
|
||||||
-L(first_vec_x3):
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(4), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- blsmsk %VGPR(rcx), %VGPR(rcx)
|
|
||||||
- /* If no search CHAR match in range check YMM1/YMM2/YMM3. */
|
|
||||||
- and %VGPR(rcx), %VGPR(rax)
|
|
||||||
- jz L(first_vec_x1_or_x2)
|
|
||||||
- bsr %VGPR(rax), %VGPR(rax)
|
|
||||||
- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
- ret
|
|
||||||
- .p2align 4,, 4
|
|
||||||
-
|
|
||||||
-L(first_vec_x2):
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(3), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- blsmsk %VGPR(rcx), %VGPR(rcx)
|
|
||||||
- /* Check YMM3 for last match first. If no match try YMM2/YMM1. */
|
|
||||||
- and %VGPR(rcx), %VGPR(rax)
|
|
||||||
- jz L(first_vec_x0_x1_test)
|
|
||||||
- bsr %VGPR(rax), %VGPR(rax)
|
|
||||||
- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4,, 6
|
|
||||||
-L(first_vec_x0_x1_test):
|
|
||||||
- VPCMPEQ %VMATCH, %VMM(2), %k1
|
|
||||||
- KMOV %k1, %VGPR(rax)
|
|
||||||
- /* Check YMM2 for last match first. If no match try YMM1. */
|
|
||||||
- test %VGPR(rax), %VGPR(rax)
|
|
||||||
- jz L(first_vec_x0_test)
|
|
||||||
- .p2align 4,, 4
|
|
||||||
-L(first_vec_x1_return):
|
|
||||||
- bsr %VGPR(rax), %VGPR(rax)
|
|
||||||
- leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
- ret
|
|
||||||
-
|
|
||||||
- .p2align 4,, 12
|
|
||||||
-L(aligned_more):
|
|
||||||
-L(page_cross_continue):
|
|
||||||
- /* Need to keep original pointer incase VEC(1) has last match. */
|
|
||||||
+L(align_more):
|
|
||||||
+ /* Zero r8 to store match result. */
|
|
||||||
+ xorl %r8d, %r8d
|
|
||||||
+ /* Save pointer of first vector, in case if no match found. */
|
|
||||||
movq %rdi, %rsi
|
|
||||||
+ /* Align pointer to vector size. */
|
|
||||||
andq $-VEC_SIZE, %rdi
|
|
||||||
-
|
|
||||||
- VMOVU VEC_SIZE(%rdi), %VMM(2)
|
|
||||||
+ /* Loop unroll for 2 vector loop. */
|
|
||||||
+ VMOVA (VEC_SIZE)(%rdi), %VMM(2)
|
|
||||||
VPTESTN %VMM(2), %VMM(2), %k0
|
|
||||||
KMOV %k0, %VRCX
|
|
||||||
- movq %rdi, %r8
|
|
||||||
test %VRCX, %VRCX
|
|
||||||
- jnz L(first_vec_x1)
|
|
||||||
-
|
|
||||||
- VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
|
|
||||||
- VPTESTN %VMM(3), %VMM(3), %k0
|
|
||||||
- KMOV %k0, %VRCX
|
|
||||||
-
|
|
||||||
- test %VRCX, %VRCX
|
|
||||||
- jnz L(first_vec_x2)
|
|
||||||
-
|
|
||||||
- VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
|
|
||||||
- VPTESTN %VMM(4), %VMM(4), %k0
|
|
||||||
- KMOV %k0, %VRCX
|
|
||||||
-
|
|
||||||
- /* Intentionally use 64-bit here. EVEX256 version needs 1-byte
|
|
||||||
- padding for efficient nop before loop alignment. */
|
|
||||||
- test %rcx, %rcx
|
|
||||||
- jnz L(first_vec_x3)
|
|
||||||
+ jnz L(vector_x2_end)
|
|
||||||
|
|
||||||
+ /* Save pointer of second vector, in case if no match
|
|
||||||
+ found. */
|
|
||||||
+ movq %rdi, %r9
|
|
||||||
+ /* Align address to VEC_SIZE * 2 for loop. */
|
|
||||||
andq $-(VEC_SIZE * 2), %rdi
|
|
||||||
- .p2align 4
|
|
||||||
-L(first_aligned_loop):
|
|
||||||
- /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
|
|
||||||
- gurantee they don't store a match. */
|
|
||||||
- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
|
|
||||||
- VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
|
|
||||||
-
|
|
||||||
- VPCMP $4, %VMM(5), %VMATCH, %k2
|
|
||||||
- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}
|
|
||||||
-
|
|
||||||
- VPMIN %VMM(5), %VMM(6), %VMM(7)
|
|
||||||
|
|
||||||
- VPTEST %VMM(7), %VMM(7), %k1{%k3}
|
|
||||||
- subq $(VEC_SIZE * -2), %rdi
|
|
||||||
- KORTEST_M %k1, %k1
|
|
||||||
- jc L(first_aligned_loop)
|
|
||||||
+ .p2align 4,,11
|
|
||||||
+L(loop):
|
|
||||||
+ /* 2 vector loop, as it provide better performance as compared
|
|
||||||
+ to 4 vector loop. */
|
|
||||||
+ VMOVA (VEC_SIZE * 2)(%rdi), %VMM(3)
|
|
||||||
+ VMOVA (VEC_SIZE * 3)(%rdi), %VMM(4)
|
|
||||||
+ VPCMPEQ %VMM(3), %VMM(0), %k1
|
|
||||||
+ VPCMPEQ %VMM(4), %VMM(0), %k2
|
|
||||||
+ VPMINU %VMM(3), %VMM(4), %VMM(5)
|
|
||||||
+ VPTESTN %VMM(5), %VMM(5), %k0
|
|
||||||
+ KOR %k1, %k2, %k3
|
|
||||||
+ subq $-(VEC_SIZE * 2), %rdi
|
|
||||||
+ /* If k0 and k3 zero, match and end of string not found. */
|
|
||||||
+ KORTEST %k0, %k3
|
|
||||||
+ jz L(loop)
|
|
||||||
+
|
|
||||||
+ /* If k0 is non zero, end of string found. */
|
|
||||||
+ KORTEST %k0, %k0
|
|
||||||
+ jnz L(endloop)
|
|
||||||
+
|
|
||||||
+ lea VEC_SIZE(%rdi), %r8
|
|
||||||
+ /* A match found, it need to be stored in r8 before loop
|
|
||||||
+ continue. */
|
|
||||||
+ /* Check second vector first. */
|
|
||||||
+ KMOV %k2, %VRDX
|
|
||||||
+ test %VRDX, %VRDX
|
|
||||||
+ jnz L(loop_vec_x2_match)
|
|
||||||
|
|
||||||
- VPTESTN %VMM(7), %VMM(7), %k1
|
|
||||||
KMOV %k1, %VRDX
|
|
||||||
- test %VRDX, %VRDX
|
|
||||||
- jz L(second_aligned_loop_prep)
|
|
||||||
+ /* Match is in first vector, rdi offset need to be subtracted
|
|
||||||
+ by VEC_SIZE. */
|
|
||||||
+ sub $VEC_SIZE, %r8
|
|
||||||
+
|
|
||||||
+ /* If second vector doesn't have match, first vector must
|
|
||||||
+ have match. */
|
|
||||||
+L(loop_vec_x2_match):
|
|
||||||
+ BSR %VRDX, %VRDX
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+ sal $2, %rdx
|
|
||||||
+# endif
|
|
||||||
+ add %rdx, %r8
|
|
||||||
+ jmp L(loop)
|
|
||||||
|
|
||||||
- KORTEST_M %k3, %k3
|
|
||||||
- jnc L(return_first_aligned_loop)
|
|
||||||
+L(endloop):
|
|
||||||
+ /* Check if string end in first loop vector. */
|
|
||||||
+ VPTESTN %VMM(3), %VMM(3), %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jnz L(loop_vector_x1_end)
|
|
||||||
|
|
||||||
- .p2align 4,, 6
|
|
||||||
-L(first_vec_x1_or_x2_or_x3):
|
|
||||||
- VPCMPEQ %VMM(4), %VMATCH, %k4
|
|
||||||
- KMOV %k4, %VRAX
|
|
||||||
+ /* Check if it has match in first loop vector. */
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
test %VRAX, %VRAX
|
|
||||||
- jz L(first_vec_x1_or_x2)
|
|
||||||
- bsr %VRAX, %VRAX
|
|
||||||
- leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
- ret
|
|
||||||
+ jz L(loop_vector_x2_end)
|
|
||||||
|
|
||||||
- .p2align 4,, 8
|
|
||||||
-L(return_first_aligned_loop):
|
|
||||||
- VPTESTN %VMM(5), %VMM(5), %k0
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %r8
|
|
||||||
+
|
|
||||||
+ /* String must end in second loop vector. */
|
|
||||||
+L(loop_vector_x2_end):
|
|
||||||
+ VPTESTN %VMM(4), %VMM(4), %k0
|
|
||||||
KMOV %k0, %VRCX
|
|
||||||
- blsmsk %VRCX, %VRCX
|
|
||||||
- jnc L(return_first_new_match_first)
|
|
||||||
- blsmsk %VRDX, %VRDX
|
|
||||||
- VPCMPEQ %VMM(6), %VMATCH, %k0
|
|
||||||
- KMOV %k0, %VRAX
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- and %VRDX, %VRAX
|
|
||||||
- jnz L(return_first_new_match_ret)
|
|
||||||
- subq $VEC_SIZE, %rdi
|
|
||||||
-L(return_first_new_match_first):
|
|
||||||
KMOV %k2, %VRAX
|
|
||||||
-# ifdef USE_AS_WCSRCHR
|
|
||||||
- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX
|
|
||||||
+ BLSMSK %VRCX, %VRCX
|
|
||||||
+ /* Check if it has match in second loop vector. */
|
|
||||||
and %VRCX, %VRAX
|
|
||||||
-# else
|
|
||||||
- andn %VRCX, %VRAX, %VRAX
|
|
||||||
-# endif
|
|
||||||
- jz L(first_vec_x1_or_x2_or_x3)
|
|
||||||
-L(return_first_new_match_ret):
|
|
||||||
- bsr %VRAX, %VRAX
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
- ret
|
|
||||||
+ jz L(check_last_match)
|
|
||||||
|
|
||||||
- .p2align 4,, 10
|
|
||||||
-L(first_vec_x1_or_x2):
|
|
||||||
- VPCMPEQ %VMM(3), %VMATCH, %k3
|
|
||||||
- KMOV %k3, %VRAX
|
|
||||||
- test %VRAX, %VRAX
|
|
||||||
- jz L(first_vec_x0_x1_test)
|
|
||||||
- bsr %VRAX, %VRAX
|
|
||||||
- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4
|
|
||||||
- /* We can throw away the work done for the first 4x checks here
|
|
||||||
- as we have a later match. This is the 'fast' path persay. */
|
|
||||||
-L(second_aligned_loop_prep):
|
|
||||||
-L(second_aligned_loop_set_furthest_match):
|
|
||||||
- movq %rdi, %rsi
|
|
||||||
- VMOVA %VMM(5), %VMM(7)
|
|
||||||
- VMOVA %VMM(6), %VMM(8)
|
|
||||||
- .p2align 4
|
|
||||||
-L(second_aligned_loop):
|
|
||||||
- VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
|
|
||||||
- VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
|
|
||||||
- VPCMP $4, %VMM(5), %VMATCH, %k2
|
|
||||||
- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2}
|
|
||||||
-
|
|
||||||
- VPMIN %VMM(5), %VMM(6), %VMM(4)
|
|
||||||
-
|
|
||||||
- VPTEST %VMM(4), %VMM(4), %k1{%k3}
|
|
||||||
- subq $(VEC_SIZE * -2), %rdi
|
|
||||||
- KMOV %k1, %VRCX
|
|
||||||
- inc %RCX_M
|
|
||||||
- jz L(second_aligned_loop)
|
|
||||||
- VPTESTN %VMM(4), %VMM(4), %k1
|
|
||||||
- KMOV %k1, %VRDX
|
|
||||||
- test %VRDX, %VRDX
|
|
||||||
- jz L(second_aligned_loop_set_furthest_match)
|
|
||||||
-
|
|
||||||
- KORTEST_M %k3, %k3
|
|
||||||
- jnc L(return_new_match)
|
|
||||||
- /* branch here because there is a significant advantage interms
|
|
||||||
- of output dependency chance in using edx. */
|
|
||||||
+ /* String end in first loop vector. */
|
|
||||||
+L(loop_vector_x1_end):
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ BLSMSK %VRCX, %VRCX
|
|
||||||
+ /* Check if it has match in second loop vector. */
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(check_last_match)
|
|
||||||
|
|
||||||
-L(return_old_match):
|
|
||||||
- VPCMPEQ %VMM(8), %VMATCH, %k0
|
|
||||||
- KMOV %k0, %VRCX
|
|
||||||
- bsr %VRCX, %VRCX
|
|
||||||
- jnz L(return_old_match_ret)
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
|
|
||||||
- VPCMPEQ %VMM(7), %VMATCH, %k0
|
|
||||||
- KMOV %k0, %VRCX
|
|
||||||
- bsr %VRCX, %VRCX
|
|
||||||
- subq $VEC_SIZE, %rsi
|
|
||||||
-L(return_old_match_ret):
|
|
||||||
- leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
|
|
||||||
+ /* No match in first and second loop vector. */
|
|
||||||
+L(check_last_match):
|
|
||||||
+ /* Check if any match recorded in r8. */
|
|
||||||
+ test %r8, %r8
|
|
||||||
+ jz L(vector_x2_ret)
|
|
||||||
+ movq %r8, %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
-L(return_new_match):
|
|
||||||
- VPTESTN %VMM(5), %VMM(5), %k0
|
|
||||||
- KMOV %k0, %VRCX
|
|
||||||
- blsmsk %VRCX, %VRCX
|
|
||||||
- jnc L(return_new_match_first)
|
|
||||||
- dec %VRDX
|
|
||||||
- VPCMPEQ %VMM(6), %VMATCH, %k0
|
|
||||||
- KMOV %k0, %VRAX
|
|
||||||
- addq $VEC_SIZE, %rdi
|
|
||||||
- and %VRDX, %VRAX
|
|
||||||
- jnz L(return_new_match_ret)
|
|
||||||
- subq $VEC_SIZE, %rdi
|
|
||||||
-L(return_new_match_first):
|
|
||||||
+ /* No match recorded in r8. Check the second saved vector
|
|
||||||
+ in beginning. */
|
|
||||||
+L(vector_x2_ret):
|
|
||||||
+ VPCMPEQ %VMM(2), %VMM(0), %k2
|
|
||||||
KMOV %k2, %VRAX
|
|
||||||
-# ifdef USE_AS_WCSRCHR
|
|
||||||
- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX
|
|
||||||
- and %VRCX, %VRAX
|
|
||||||
-# else
|
|
||||||
- andn %VRCX, %VRAX, %VRAX
|
|
||||||
-# endif
|
|
||||||
- jz L(return_old_match)
|
|
||||||
-L(return_new_match_ret):
|
|
||||||
- bsr %VRAX, %VRAX
|
|
||||||
- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ test %VRAX, %VRAX
|
|
||||||
+ jz L(vector_x1_ret)
|
|
||||||
+
|
|
||||||
+ /* Match found in the second saved vector. */
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax
|
|
||||||
ret
|
|
||||||
|
|
||||||
- .p2align 4,, 4
|
|
||||||
-L(cross_page_boundary):
|
|
||||||
- xorq %rdi, %rax
|
|
||||||
- mov $-1, %VRDX
|
|
||||||
- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6)
|
|
||||||
- VPTESTN %VMM(6), %VMM(6), %k0
|
|
||||||
- KMOV %k0, %VRSI
|
|
||||||
+L(page_cross):
|
|
||||||
+ mov %rdi, %rax
|
|
||||||
+ movl %edi, %ecx
|
|
||||||
|
|
||||||
# ifdef USE_AS_WCSRCHR
|
|
||||||
- movl %edi, %ecx
|
|
||||||
- and $(VEC_SIZE - 1), %ecx
|
|
||||||
- shrl $2, %ecx
|
|
||||||
+ /* Calculate number of compare result bits to be skipped for
|
|
||||||
+ wide string alignment adjustment. */
|
|
||||||
+ andl $(VEC_SIZE - 1), %ecx
|
|
||||||
+ sarl $2, %ecx
|
|
||||||
# endif
|
|
||||||
- shlx %SHIFT_REG, %VRDX, %VRDX
|
|
||||||
-
|
|
||||||
+ /* ecx contains number of w[char] to be skipped as a result
|
|
||||||
+ of address alignment. */
|
|
||||||
+ andq $-VEC_SIZE, %rax
|
|
||||||
+ VMOVA (%rax), %VMM(1)
|
|
||||||
+ VPTESTN %VMM(1), %VMM(1), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ SHR %cl, %VRAX
|
|
||||||
+ jz L(page_cross_continue)
|
|
||||||
+ VPCMPEQ %VMM(1), %VMM(0), %k0
|
|
||||||
+ KMOV %k0, %VRDX
|
|
||||||
+ SHR %cl, %VRDX
|
|
||||||
+ BLSMSK %VRAX, %VRAX
|
|
||||||
+ and %VRDX, %VRAX
|
|
||||||
+ jz L(ret)
|
|
||||||
+ BSR %VRAX, %VRAX
|
|
||||||
# ifdef USE_AS_WCSRCHR
|
|
||||||
- kmovw %edx, %k1
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
# else
|
|
||||||
- KMOV %VRDX, %k1
|
|
||||||
+ add %rdi, %rax
|
|
||||||
# endif
|
|
||||||
|
|
||||||
- VPCOMPRESS %VMM(6), %VMM(1){%k1}{z}
|
|
||||||
- /* We could technically just jmp back after the vpcompress but
|
|
||||||
- it doesn't save any 16-byte blocks. */
|
|
||||||
- shrx %SHIFT_REG, %VRSI, %VRSI
|
|
||||||
- test %VRSI, %VRSI
|
|
||||||
- jnz L(page_cross_return)
|
|
||||||
- jmp L(page_cross_continue)
|
|
||||||
- /* 1-byte from cache line. */
|
|
||||||
-END(STRRCHR)
|
|
||||||
+ ret
|
|
||||||
+END (STRRCHR)
|
|
||||||
#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
||||||
index 3bf6a5101422e4d1..85e3b0119f5dc923 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S
|
|
||||||
@@ -1,8 +1,394 @@
|
|
||||||
+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions.
|
|
||||||
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
|
|
||||||
+ This file is part of the GNU C Library.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is free software; you can redistribute it and/or
|
|
||||||
+ modify it under the terms of the GNU Lesser General Public
|
|
||||||
+ License as published by the Free Software Foundation; either
|
|
||||||
+ version 2.1 of the License, or (at your option) any later version.
|
|
||||||
+
|
|
||||||
+ The GNU C Library is distributed in the hope that it will be useful,
|
|
||||||
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
||||||
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
|
||||||
+ Lesser General Public License for more details.
|
|
||||||
+
|
|
||||||
+ You should have received a copy of the GNU Lesser General Public
|
|
||||||
+ License along with the GNU C Library; if not, see
|
|
||||||
+ <https://www.gnu.org/licenses/>. */
|
|
||||||
+
|
|
||||||
+#include <isa-level.h>
|
|
||||||
+
|
|
||||||
+#if ISA_SHOULD_BUILD (4)
|
|
||||||
+
|
|
||||||
+# include <sysdep.h>
|
|
||||||
+
|
|
||||||
# ifndef STRRCHR
|
|
||||||
# define STRRCHR __strrchr_evex
|
|
||||||
# endif
|
|
||||||
|
|
||||||
-#include "x86-evex256-vecs.h"
|
|
||||||
-#include "reg-macros.h"
|
|
||||||
+# include "x86-evex256-vecs.h"
|
|
||||||
+
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+# define SHIFT_REG rsi
|
|
||||||
+# define kunpck_2x kunpckbw
|
|
||||||
+# define kmov_2x kmovd
|
|
||||||
+# define maskz_2x ecx
|
|
||||||
+# define maskm_2x eax
|
|
||||||
+# define CHAR_SIZE 4
|
|
||||||
+# define VPMIN vpminud
|
|
||||||
+# define VPTESTN vptestnmd
|
|
||||||
+# define VPTEST vptestmd
|
|
||||||
+# define VPBROADCAST vpbroadcastd
|
|
||||||
+# define VPCMPEQ vpcmpeqd
|
|
||||||
+# define VPCMP vpcmpd
|
|
||||||
+
|
|
||||||
+# define USE_WIDE_CHAR
|
|
||||||
+# else
|
|
||||||
+# define SHIFT_REG rdi
|
|
||||||
+# define kunpck_2x kunpckdq
|
|
||||||
+# define kmov_2x kmovq
|
|
||||||
+# define maskz_2x rcx
|
|
||||||
+# define maskm_2x rax
|
|
||||||
+
|
|
||||||
+# define CHAR_SIZE 1
|
|
||||||
+# define VPMIN vpminub
|
|
||||||
+# define VPTESTN vptestnmb
|
|
||||||
+# define VPTEST vptestmb
|
|
||||||
+# define VPBROADCAST vpbroadcastb
|
|
||||||
+# define VPCMPEQ vpcmpeqb
|
|
||||||
+# define VPCMP vpcmpb
|
|
||||||
+# endif
|
|
||||||
+
|
|
||||||
+# include "reg-macros.h"
|
|
||||||
+
|
|
||||||
+# define VMATCH VMM(0)
|
|
||||||
+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE)
|
|
||||||
+# define PAGE_SIZE 4096
|
|
||||||
+
|
|
||||||
+ .section SECTION(.text), "ax", @progbits
|
|
||||||
+ENTRY_P2ALIGN(STRRCHR, 6)
|
|
||||||
+ movl %edi, %eax
|
|
||||||
+ /* Broadcast CHAR to VMATCH. */
|
|
||||||
+ VPBROADCAST %esi, %VMATCH
|
|
||||||
+
|
|
||||||
+ andl $(PAGE_SIZE - 1), %eax
|
|
||||||
+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax
|
|
||||||
+ jg L(cross_page_boundary)
|
|
||||||
+L(page_cross_continue):
|
|
||||||
+ VMOVU (%rdi), %VMM(1)
|
|
||||||
+ /* k0 has a 1 for each zero CHAR in VEC(1). */
|
|
||||||
+ VPTESTN %VMM(1), %VMM(1), %k0
|
|
||||||
+ KMOV %k0, %VRSI
|
|
||||||
+ test %VRSI, %VRSI
|
|
||||||
+ jz L(aligned_more)
|
|
||||||
+ /* fallthrough: zero CHAR in first VEC. */
|
|
||||||
+ /* K1 has a 1 for each search CHAR match in VEC(1). */
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(1), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ /* Build mask up until first zero CHAR (used to mask of
|
|
||||||
+ potential search CHAR matches past the end of the string).
|
|
||||||
+ */
|
|
||||||
+ blsmsk %VRSI, %VRSI
|
|
||||||
+ and %VRSI, %VRAX
|
|
||||||
+ jz L(ret0)
|
|
||||||
+ /* Get last match (the `and` removed any out of bounds matches).
|
|
||||||
+ */
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+# endif
|
|
||||||
+L(ret0):
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ /* Returns for first vec x1/x2/x3 have hard coded backward
|
|
||||||
+ search path for earlier matches. */
|
|
||||||
+ .p2align 4,, 6
|
|
||||||
+L(first_vec_x1):
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(2), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ blsmsk %VRCX, %VRCX
|
|
||||||
+ /* eax non-zero if search CHAR in range. */
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jnz L(first_vec_x1_return)
|
|
||||||
+
|
|
||||||
+ /* fallthrough: no match in VEC(2) then need to check for
|
|
||||||
+ earlier matches (in VEC(1)). */
|
|
||||||
+ .p2align 4,, 4
|
|
||||||
+L(first_vec_x0_test):
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(1), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ test %VRAX, %VRAX
|
|
||||||
+ jz L(ret1)
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+ leaq (%rsi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+ addq %rsi, %rax
|
|
||||||
+# endif
|
|
||||||
+L(ret1):
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 10
|
|
||||||
+L(first_vec_x1_or_x2):
|
|
||||||
+ VPCMPEQ %VMM(3), %VMATCH, %k3
|
|
||||||
+ VPCMPEQ %VMM(2), %VMATCH, %k2
|
|
||||||
+ /* K2 and K3 have 1 for any search CHAR match. Test if any
|
|
||||||
+ matches between either of them. Otherwise check VEC(1). */
|
|
||||||
+ KORTEST %k2, %k3
|
|
||||||
+ jz L(first_vec_x0_test)
|
|
||||||
+
|
|
||||||
+ /* Guaranteed that VEC(2) and VEC(3) are within range so merge
|
|
||||||
+ the two bitmasks then get last result. */
|
|
||||||
+ kunpck_2x %k2, %k3, %k3
|
|
||||||
+ kmov_2x %k3, %maskm_2x
|
|
||||||
+ bsr %maskm_2x, %maskm_2x
|
|
||||||
+ leaq (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 7
|
|
||||||
+L(first_vec_x3):
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(4), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ blsmsk %VRCX, %VRCX
|
|
||||||
+ /* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3).
|
|
||||||
+ */
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(first_vec_x1_or_x2)
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 6
|
|
||||||
+L(first_vec_x0_x1_test):
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(2), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ /* Check VEC(2) for last match first. If no match try VEC(1).
|
|
||||||
+ */
|
|
||||||
+ test %VRAX, %VRAX
|
|
||||||
+ jz L(first_vec_x0_test)
|
|
||||||
+ .p2align 4,, 4
|
|
||||||
+L(first_vec_x1_return):
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 10
|
|
||||||
+L(first_vec_x2):
|
|
||||||
+ VPCMPEQ %VMATCH, %VMM(3), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ blsmsk %VRCX, %VRCX
|
|
||||||
+ /* Check VEC(3) for last match first. If no match try
|
|
||||||
+ VEC(2)/VEC(1). */
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(first_vec_x0_x1_test)
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 12
|
|
||||||
+L(aligned_more):
|
|
||||||
+ /* Need to keep original pointer in case VEC(1) has last match.
|
|
||||||
+ */
|
|
||||||
+ movq %rdi, %rsi
|
|
||||||
+ andq $-VEC_SIZE, %rdi
|
|
||||||
+
|
|
||||||
+ VMOVU VEC_SIZE(%rdi), %VMM(2)
|
|
||||||
+ VPTESTN %VMM(2), %VMM(2), %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jnz L(first_vec_x1)
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3)
|
|
||||||
+ VPTESTN %VMM(3), %VMM(3), %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jnz L(first_vec_x2)
|
|
||||||
+
|
|
||||||
+ VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4)
|
|
||||||
+ VPTESTN %VMM(4), %VMM(4), %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+ movq %rdi, %r8
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jnz L(first_vec_x3)
|
|
||||||
+
|
|
||||||
+ andq $-(VEC_SIZE * 2), %rdi
|
|
||||||
+ .p2align 4,, 10
|
|
||||||
+L(first_aligned_loop):
|
|
||||||
+ /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can
|
|
||||||
+ guarantee they don't store a match. */
|
|
||||||
+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5)
|
|
||||||
+ VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %VMM(5), %VMATCH, %k2
|
|
||||||
+ vpxord %VMM(6), %VMATCH, %VMM(7)
|
|
||||||
+
|
|
||||||
+ VPMIN %VMM(5), %VMM(6), %VMM(8)
|
|
||||||
+ VPMIN %VMM(8), %VMM(7), %VMM(7)
|
|
||||||
+
|
|
||||||
+ VPTESTN %VMM(7), %VMM(7), %k1
|
|
||||||
+ subq $(VEC_SIZE * -2), %rdi
|
|
||||||
+ KORTEST %k1, %k2
|
|
||||||
+ jz L(first_aligned_loop)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %VMM(6), %VMATCH, %k3
|
|
||||||
+ VPTESTN %VMM(8), %VMM(8), %k1
|
|
||||||
+
|
|
||||||
+ /* If k1 is zero, then we found a CHAR match but no null-term.
|
|
||||||
+ We can now safely throw out VEC1-4. */
|
|
||||||
+ KTEST %k1, %k1
|
|
||||||
+ jz L(second_aligned_loop_prep)
|
|
||||||
+
|
|
||||||
+ KORTEST %k2, %k3
|
|
||||||
+ jnz L(return_first_aligned_loop)
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 6
|
|
||||||
+L(first_vec_x1_or_x2_or_x3):
|
|
||||||
+ VPCMPEQ %VMM(4), %VMATCH, %k4
|
|
||||||
+ KMOV %k4, %VRAX
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+ jz L(first_vec_x1_or_x2)
|
|
||||||
+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 8
|
|
||||||
+L(return_first_aligned_loop):
|
|
||||||
+ VPTESTN %VMM(5), %VMM(5), %k0
|
|
||||||
+
|
|
||||||
+ /* Combined results from VEC5/6. */
|
|
||||||
+ kunpck_2x %k0, %k1, %k0
|
|
||||||
+ kmov_2x %k0, %maskz_2x
|
|
||||||
+
|
|
||||||
+ blsmsk %maskz_2x, %maskz_2x
|
|
||||||
+ kunpck_2x %k2, %k3, %k3
|
|
||||||
+ kmov_2x %k3, %maskm_2x
|
|
||||||
+ and %maskz_2x, %maskm_2x
|
|
||||||
+ jz L(first_vec_x1_or_x2_or_x3)
|
|
||||||
+
|
|
||||||
+ bsr %maskm_2x, %maskm_2x
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4
|
|
||||||
+ /* We can throw away the work done for the first 4x checks here
|
|
||||||
+ as we have a later match. This is the 'fast' path persay.
|
|
||||||
+ */
|
|
||||||
+L(second_aligned_loop_prep):
|
|
||||||
+L(second_aligned_loop_set_furthest_match):
|
|
||||||
+ movq %rdi, %rsi
|
|
||||||
+ /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on
|
|
||||||
+ port0 and have noticeable overhead in the loop. */
|
|
||||||
+ VMOVA %VMM(5), %VMM(7)
|
|
||||||
+ VMOVA %VMM(6), %VMM(8)
|
|
||||||
+ .p2align 4
|
|
||||||
+L(second_aligned_loop):
|
|
||||||
+ VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5)
|
|
||||||
+ VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6)
|
|
||||||
+ VPCMPEQ %VMM(5), %VMATCH, %k2
|
|
||||||
+ vpxord %VMM(6), %VMATCH, %VMM(3)
|
|
||||||
+
|
|
||||||
+ VPMIN %VMM(5), %VMM(6), %VMM(4)
|
|
||||||
+ VPMIN %VMM(3), %VMM(4), %VMM(3)
|
|
||||||
+
|
|
||||||
+ VPTESTN %VMM(3), %VMM(3), %k1
|
|
||||||
+ subq $(VEC_SIZE * -2), %rdi
|
|
||||||
+ KORTEST %k1, %k2
|
|
||||||
+ jz L(second_aligned_loop)
|
|
||||||
+ VPCMPEQ %VMM(6), %VMATCH, %k3
|
|
||||||
+ VPTESTN %VMM(4), %VMM(4), %k1
|
|
||||||
+ KTEST %k1, %k1
|
|
||||||
+ jz L(second_aligned_loop_set_furthest_match)
|
|
||||||
+
|
|
||||||
+ /* branch here because we know we have a match in VEC7/8 but
|
|
||||||
+ might not in VEC5/6 so the latter is expected to be less
|
|
||||||
+ likely. */
|
|
||||||
+ KORTEST %k2, %k3
|
|
||||||
+ jnz L(return_new_match)
|
|
||||||
+
|
|
||||||
+L(return_old_match):
|
|
||||||
+ VPCMPEQ %VMM(8), %VMATCH, %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+ bsr %VRCX, %VRCX
|
|
||||||
+ jnz L(return_old_match_ret)
|
|
||||||
+
|
|
||||||
+ VPCMPEQ %VMM(7), %VMATCH, %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+ bsr %VRCX, %VRCX
|
|
||||||
+ subq $VEC_SIZE, %rsi
|
|
||||||
+L(return_old_match_ret):
|
|
||||||
+ leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+ .p2align 4,, 10
|
|
||||||
+L(return_new_match):
|
|
||||||
+ VPTESTN %VMM(5), %VMM(5), %k0
|
|
||||||
+
|
|
||||||
+ /* Combined results from VEC5/6. */
|
|
||||||
+ kunpck_2x %k0, %k1, %k0
|
|
||||||
+ kmov_2x %k0, %maskz_2x
|
|
||||||
+
|
|
||||||
+ blsmsk %maskz_2x, %maskz_2x
|
|
||||||
+ kunpck_2x %k2, %k3, %k3
|
|
||||||
+ kmov_2x %k3, %maskm_2x
|
|
||||||
+
|
|
||||||
+ /* Match at end was out-of-bounds so use last known match. */
|
|
||||||
+ and %maskz_2x, %maskm_2x
|
|
||||||
+ jz L(return_old_match)
|
|
||||||
+
|
|
||||||
+ bsr %maskm_2x, %maskm_2x
|
|
||||||
+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+ ret
|
|
||||||
+
|
|
||||||
+L(cross_page_boundary):
|
|
||||||
+ /* eax contains all the page offset bits of src (rdi). `xor rdi,
|
|
||||||
+ rax` sets pointer will all page offset bits cleared so
|
|
||||||
+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC
|
|
||||||
+ before page cross (guaranteed to be safe to read). Doing this
|
|
||||||
+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves
|
|
||||||
+ a bit of code size. */
|
|
||||||
+ xorq %rdi, %rax
|
|
||||||
+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1)
|
|
||||||
+ VPTESTN %VMM(1), %VMM(1), %k0
|
|
||||||
+ KMOV %k0, %VRCX
|
|
||||||
+
|
|
||||||
+ /* Shift out zero CHAR matches that are before the beginning of
|
|
||||||
+ src (rdi). */
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+ movl %edi, %esi
|
|
||||||
+ andl $(VEC_SIZE - 1), %esi
|
|
||||||
+ shrl $2, %esi
|
|
||||||
+# endif
|
|
||||||
+ shrx %VGPR(SHIFT_REG), %VRCX, %VRCX
|
|
||||||
+
|
|
||||||
+ test %VRCX, %VRCX
|
|
||||||
+ jz L(page_cross_continue)
|
|
||||||
|
|
||||||
-#include "strrchr-evex-base.S"
|
|
||||||
+ /* Found zero CHAR so need to test for search CHAR. */
|
|
||||||
+ VPCMP $0, %VMATCH, %VMM(1), %k1
|
|
||||||
+ KMOV %k1, %VRAX
|
|
||||||
+ /* Shift out search CHAR matches that are before the beginning of
|
|
||||||
+ src (rdi). */
|
|
||||||
+ shrx %VGPR(SHIFT_REG), %VRAX, %VRAX
|
|
||||||
+
|
|
||||||
+ /* Check if any search CHAR match in range. */
|
|
||||||
+ blsmsk %VRCX, %VRCX
|
|
||||||
+ and %VRCX, %VRAX
|
|
||||||
+ jz L(ret3)
|
|
||||||
+ bsr %VRAX, %VRAX
|
|
||||||
+# ifdef USE_AS_WCSRCHR
|
|
||||||
+ leaq (%rdi, %rax, CHAR_SIZE), %rax
|
|
||||||
+# else
|
|
||||||
+ addq %rdi, %rax
|
|
||||||
+# endif
|
|
||||||
+L(ret3):
|
|
||||||
+ ret
|
|
||||||
+END(STRRCHR)
|
|
||||||
+#endif
|
|
||||||
diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
||||||
index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644
|
|
||||||
--- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
||||||
+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S
|
|
||||||
@@ -4,5 +4,4 @@
|
|
||||||
|
|
||||||
#define STRRCHR WCSRCHR
|
|
||||||
#define USE_AS_WCSRCHR 1
|
|
||||||
-#define USE_WIDE_CHAR 1
|
|
||||||
#include "strrchr-evex.S"
|
|
@ -1,110 +0,0 @@
|
|||||||
commit ffe333d46c4ef7c5221cccad2743cac3e149e615
|
|
||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Tue Nov 7 11:23:15 2023 +0100
|
|
||||||
|
|
||||||
elf: Fix force_first handling in dlclose (bug 30785)
|
|
||||||
|
|
||||||
The force_first parameter was ineffective because the dlclose'd
|
|
||||||
object was not necessarily the first in the maps array. Also
|
|
||||||
enable force_first handling unconditionally, regardless of namespace.
|
|
||||||
The initial object in a namespace should be destructed first, too.
|
|
||||||
|
|
||||||
The _dl_sort_maps_dfs function had early returns for relocation
|
|
||||||
dependency processing which broke force_first handling, too, and
|
|
||||||
this is fixed in this change as well.
|
|
||||||
|
|
||||||
diff --git a/elf/dl-close.c b/elf/dl-close.c
|
|
||||||
index 1c7a861db140259d..a97a1efa45eb7409 100644
|
|
||||||
--- a/elf/dl-close.c
|
|
||||||
+++ b/elf/dl-close.c
|
|
||||||
@@ -153,6 +153,16 @@ _dl_close_worker (struct link_map *map, bool force)
|
|
||||||
}
|
|
||||||
assert (idx == nloaded);
|
|
||||||
|
|
||||||
+ /* Put the dlclose'd map first, so that its destructor runs first.
|
|
||||||
+ The map variable is NULL after a retry. */
|
|
||||||
+ if (map != NULL)
|
|
||||||
+ {
|
|
||||||
+ maps[map->l_idx] = maps[0];
|
|
||||||
+ maps[map->l_idx]->l_idx = map->l_idx;
|
|
||||||
+ maps[0] = map;
|
|
||||||
+ maps[0]->l_idx = 0;
|
|
||||||
+ }
|
|
||||||
+
|
|
||||||
/* Keep track of the lowest index link map we have covered already. */
|
|
||||||
int done_index = -1;
|
|
||||||
while (++done_index < nloaded)
|
|
||||||
@@ -226,9 +236,10 @@ _dl_close_worker (struct link_map *map, bool force)
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
- /* Sort the entries. We can skip looking for the binary itself which is
|
|
||||||
- at the front of the search list for the main namespace. */
|
|
||||||
- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true);
|
|
||||||
+ /* Sort the entries. Unless retrying, the maps[0] object (the
|
|
||||||
+ original argument to dlclose) needs to remain first, so that its
|
|
||||||
+ destructor runs first. */
|
|
||||||
+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true);
|
|
||||||
|
|
||||||
/* Call all termination functions at once. */
|
|
||||||
bool unload_any = false;
|
|
||||||
@@ -732,7 +743,11 @@ _dl_close_worker (struct link_map *map, bool force)
|
|
||||||
/* Recheck if we need to retry, release the lock. */
|
|
||||||
out:
|
|
||||||
if (dl_close_state == rerun)
|
|
||||||
- goto retry;
|
|
||||||
+ {
|
|
||||||
+ /* The map may have been deallocated. */
|
|
||||||
+ map = NULL;
|
|
||||||
+ goto retry;
|
|
||||||
+ }
|
|
||||||
|
|
||||||
dl_close_state = not_pending;
|
|
||||||
}
|
|
||||||
diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c
|
|
||||||
index 5616c8a6a33ebb1e..5c846c7c6f9d88bc 100644
|
|
||||||
--- a/elf/dl-sort-maps.c
|
|
||||||
+++ b/elf/dl-sort-maps.c
|
|
||||||
@@ -255,13 +255,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps,
|
|
||||||
The below memcpy is not needed in the do_reldeps case here,
|
|
||||||
since we wrote back to maps[] during DFS traversal. */
|
|
||||||
if (maps_head == maps)
|
|
||||||
- return;
|
|
||||||
+ break;
|
|
||||||
}
|
|
||||||
assert (maps_head == maps);
|
|
||||||
- return;
|
|
||||||
}
|
|
||||||
-
|
|
||||||
- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
|
||||||
+ else
|
|
||||||
+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps);
|
|
||||||
|
|
||||||
/* Skipping the first object at maps[0] is not valid in general,
|
|
||||||
since traversing along object dependency-links may "find" that
|
|
||||||
diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def
|
|
||||||
index 4bf9052db16fb352..cf6453e9eb85ac65 100644
|
|
||||||
--- a/elf/dso-sort-tests-1.def
|
|
||||||
+++ b/elf/dso-sort-tests-1.def
|
|
||||||
@@ -56,14 +56,16 @@ output: b>a>{}<a<b
|
|
||||||
# relocation(dynamic) dependencies. While this is technically unspecified, the
|
|
||||||
# presumed reasonable practical behavior is for the destructor order to respect
|
|
||||||
# the static DT_NEEDED links (here this means the a->b->c->d order).
|
|
||||||
-# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based
|
|
||||||
-# dynamic_sort=2 algorithm does, although it is still arguable whether going
|
|
||||||
-# beyond spec to do this is the right thing to do.
|
|
||||||
+# The older dynamic_sort=1 algorithm originally did not achieve this,
|
|
||||||
+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker,
|
|
||||||
+# effectively disabling proper force_first handling.
|
|
||||||
+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first
|
|
||||||
+# handling: the a object is simply moved to the front.
|
|
||||||
# The below expected outputs are what the two algorithms currently produce
|
|
||||||
# respectively, for regression testing purposes.
|
|
||||||
tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c
|
|
||||||
-output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<c<d<g<f<b<e];}
|
|
||||||
-output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<g<f<a<b<c<d<e];}
|
|
||||||
+output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<b<c<d<g<f<e];}
|
|
||||||
+output(glibc.rtld.dynamic_sort=2): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[<a<g<f<b<c<d<e];}
|
|
||||||
|
|
||||||
# Test that even in the presence of dependency loops involving dlopen'ed
|
|
||||||
# object, that object is initialized last (and not unloaded prematurely).
|
|
@ -1,22 +0,0 @@
|
|||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Wed Nov 15 06:31:40 2023 +0100
|
|
||||||
|
|
||||||
stdlib: Avoid another self-comparison in qsort
|
|
||||||
|
|
||||||
In the insertion phase, we could run off the start of the array if the
|
|
||||||
comparison function never runs zero. In that case, it never finds the
|
|
||||||
initial element that terminates the iteration.
|
|
||||||
|
|
||||||
diff --git a/stdlib/qsort.c b/stdlib/qsort.c
|
|
||||||
index ad110e8a892a66e1..1ea31ff4247da7a4 100644
|
|
||||||
--- a/stdlib/qsort.c
|
|
||||||
+++ b/stdlib/qsort.c
|
|
||||||
@@ -217,7 +217,7 @@ insertion_sort_qsort_partitions (void *const pbase, size_t total_elems,
|
|
||||||
while ((run_ptr += size) <= end_ptr)
|
|
||||||
{
|
|
||||||
tmp_ptr = run_ptr - size;
|
|
||||||
- while (cmp (run_ptr, tmp_ptr, arg) < 0)
|
|
||||||
+ while (tmp_ptr != base_ptr && cmp (run_ptr, tmp_ptr, arg) < 0)
|
|
||||||
tmp_ptr -= size;
|
|
||||||
|
|
||||||
tmp_ptr += size;
|
|
@ -1,60 +0,0 @@
|
|||||||
Author: Florian Weimer <fweimer@redhat.com>
|
|
||||||
Date: Mon Oct 23 12:53:16 2023 +0200
|
|
||||||
|
|
||||||
ldconfig: Fixes for skipping temporary files.
|
|
||||||
|
|
||||||
Arguments to a memchr call were swapped, causing incorrect skipping
|
|
||||||
of files.
|
|
||||||
|
|
||||||
Files related to dpkg have different names: they actually end in
|
|
||||||
.dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed.
|
|
||||||
|
|
||||||
Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip
|
|
||||||
temporary files created by package managers").
|
|
||||||
|
|
||||||
diff --git a/elf/ldconfig.c b/elf/ldconfig.c
|
|
||||||
index 02387a169c902f01..bccd386761d8cbb2 100644
|
|
||||||
--- a/elf/ldconfig.c
|
|
||||||
+++ b/elf/ldconfig.c
|
|
||||||
@@ -661,6 +661,17 @@ struct dlib_entry
|
|
||||||
struct dlib_entry *next;
|
|
||||||
};
|
|
||||||
|
|
||||||
+/* Return true if the N bytes at NAME end with with the characters in
|
|
||||||
+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.)
|
|
||||||
+ Expected to be called with a string literal for SUFFIX. */
|
|
||||||
+static inline bool
|
|
||||||
+endswithn (const char *name, size_t n, const char *suffix)
|
|
||||||
+{
|
|
||||||
+ return (n >= strlen (suffix)
|
|
||||||
+ && memcmp (name + n - strlen (suffix), suffix,
|
|
||||||
+ strlen (suffix)) == 0);
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
/* Skip some temporary DSO files. These files may be partially written
|
|
||||||
and lead to ldconfig crashes when examined. */
|
|
||||||
static bool
|
|
||||||
@@ -670,8 +681,7 @@ skip_dso_based_on_name (const char *name, size_t len)
|
|
||||||
names like these are never really DSOs we want to look at. */
|
|
||||||
if (len >= sizeof (".#prelink#") - 1)
|
|
||||||
{
|
|
||||||
- if (strcmp (name + len - sizeof (".#prelink#") + 1,
|
|
||||||
- ".#prelink#") == 0)
|
|
||||||
+ if (endswithn (name, len, ".#prelink#"))
|
|
||||||
return true;
|
|
||||||
if (len >= sizeof (".#prelink#.XXXXXX") - 1
|
|
||||||
&& memcmp (name + len - sizeof (".#prelink#.XXXXXX")
|
|
||||||
@@ -679,10 +689,11 @@ skip_dso_based_on_name (const char *name, size_t len)
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
/* Skip temporary files created by RPM. */
|
|
||||||
- if (memchr (name, len, ';') != NULL)
|
|
||||||
+ if (memchr (name, ';', len) != NULL)
|
|
||||||
return true;
|
|
||||||
/* Skip temporary files created by dpkg. */
|
|
||||||
- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0)
|
|
||||||
+ if (endswithn (name, len, ".dpkg-new")
|
|
||||||
+ || endswithn (name, len, ".dpkg-tmp"))
|
|
||||||
return true;
|
|
||||||
return false;
|
|
||||||
}
|
|
63
glibc.spec
63
glibc.spec
@ -1,4 +1,4 @@
|
|||||||
%global glibcsrcdir glibc-2.38.9000-244-gd1dcb565a1
|
%global glibcsrcdir glibc-2.38.9000-295-g5d7f1bce7d
|
||||||
%global glibcversion 2.38.9000
|
%global glibcversion 2.38.9000
|
||||||
# Pre-release tarballs are pulled in from git using a command that is
|
# Pre-release tarballs are pulled in from git using a command that is
|
||||||
# effectively:
|
# effectively:
|
||||||
@ -159,7 +159,7 @@ Version: %{glibcversion}
|
|||||||
# - It allows using the Release number without the %%dist tag in the dependency
|
# - It allows using the Release number without the %%dist tag in the dependency
|
||||||
# generator to make the generated requires interchangeable between Rawhide
|
# generator to make the generated requires interchangeable between Rawhide
|
||||||
# and ELN (.elnYY < .fcXX).
|
# and ELN (.elnYY < .fcXX).
|
||||||
%global baserelease 22
|
%global baserelease 23
|
||||||
Release: %{baserelease}%{?dist}
|
Release: %{baserelease}%{?dist}
|
||||||
|
|
||||||
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
|
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
|
||||||
@ -230,10 +230,7 @@ Patch9: glibc-rh827510.patch
|
|||||||
Patch13: glibc-fedora-localedata-rh61908.patch
|
Patch13: glibc-fedora-localedata-rh61908.patch
|
||||||
Patch17: glibc-cs-path.patch
|
Patch17: glibc-cs-path.patch
|
||||||
Patch23: glibc-python3.patch
|
Patch23: glibc-python3.patch
|
||||||
Patch24: glibc-rh2244688.patch
|
Patch24: glibc-benchtests-aarch64.patch
|
||||||
Patch25: glibc-rh2244992.patch
|
|
||||||
Patch26: glibc-rh2248915.patch
|
|
||||||
Patch27: glibc-rh2248502-3.patch
|
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Continued list of core "glibc" package information:
|
# Continued list of core "glibc" package information:
|
||||||
@ -2204,6 +2201,60 @@ update_gconv_modules_cache ()
|
|||||||
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Wed Nov 22 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-23
|
||||||
|
- Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure.
|
||||||
|
- Drop glibc-rh2244688.patch revert. Fix applied upstream.
|
||||||
|
- Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch.
|
||||||
|
All applied upstream.
|
||||||
|
- Auto-sync with upstream branch master,
|
||||||
|
commit 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751:
|
||||||
|
- posix: Revert the removal of the crypt prototype from <unistd.h>
|
||||||
|
- elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure
|
||||||
|
- elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static
|
||||||
|
- elf: Remove any_debug from dl_main_state
|
||||||
|
- elf: Remove LD_PROFILE for static binaries
|
||||||
|
- elf: Ignore LD_PROFILE for setuid binaries
|
||||||
|
- s390: Use dl-symbol-redir-ifunc.h on cpu-tunables
|
||||||
|
- x86: Use dl-symbol-redir-ifunc.h on cpu-tunables
|
||||||
|
- elf: Emit warning if tunable is ill-formatted
|
||||||
|
- elf: Fix _dl_debug_vdprintf to work before self-relocation
|
||||||
|
- elf: Do not parse ill-formatted strings
|
||||||
|
- elf: Do not process invalid tunable format
|
||||||
|
- elf: Add all malloc tunable to unsecvars
|
||||||
|
- elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries
|
||||||
|
- elf: Add GLIBC_TUNABLES to unsecvars
|
||||||
|
- elf: Remove /etc/suid-debug support
|
||||||
|
- stdlib: The qsort implementation needs to use heapsort in more cases
|
||||||
|
- stdlib: Handle various corner cases in the fallback heapsort for qsort
|
||||||
|
- stdlib: Avoid another self-comparison in qsort
|
||||||
|
- hurd: fix restarting reauth_dtable on signal
|
||||||
|
- hurd: Prevent the final file_exec_paths call from signals
|
||||||
|
- manual: Fix termios.c example. (Bug 31078)
|
||||||
|
- aarch64: Add vector implementations of expm1 routines
|
||||||
|
- linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401)
|
||||||
|
- intl: Add test case for bug 16621
|
||||||
|
- resolv: free only initialized items from gai pool
|
||||||
|
- ldconfig: Fixes for skipping temporary files.
|
||||||
|
- nptl: Link tst-execstack-threads-mod.so with -z execstack
|
||||||
|
- nptl: Rename tst-execstack to tst-execstack-threads
|
||||||
|
- localedata: Convert oc_FR locale to UTF-8
|
||||||
|
- localedata: Add information for Occitan
|
||||||
|
- elf: Fix force_first handling in dlclose (bug 30981)
|
||||||
|
- elf: Handle non-directory name in search path (BZ 31035)
|
||||||
|
- New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday
|
||||||
|
- x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S
|
||||||
|
- posix: Check pidfd_spawn with tst-spawn7-pid
|
||||||
|
- y2038: Fix support for 64-bit time on legacy ABIs
|
||||||
|
- AArch64: Remove Falkor memcpy
|
||||||
|
- AArch64: Add memset_zva64
|
||||||
|
- AArch64: Cleanup emag memset
|
||||||
|
- test: Run the tst-tls-allocation-failure-static-patched with test-wrapper.
|
||||||
|
- aarch64: Add vector implementations of log1p routines
|
||||||
|
- aarch64: Add vector implementations of atan2 routines
|
||||||
|
- aarch64: Add vector implementations of atan routines
|
||||||
|
- aarch64: Add vector implementations of acos routines
|
||||||
|
- aarch64: Add vector implementations of asin routines
|
||||||
|
|
||||||
* Wed Nov 15 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-22
|
* Wed Nov 15 2023 Florian Weimer <fweimer@redhat.com> - 2.38.9000-22
|
||||||
- Work around another self-comparison application issue in qsort (#2248502)
|
- Work around another self-comparison application issue in qsort (#2248502)
|
||||||
|
|
||||||
|
2
sources
2
sources
@ -1 +1 @@
|
|||||||
SHA512 (glibc-2.38.9000-244-gd1dcb565a1.tar.xz) = 74d23268abb91447d74906ae477f926baa2f54a95d87fc18d8972a976524987e98f6141f3767406aef6293d2ef7f7bc16cfedaacd2d33bca207253ec7073c915
|
SHA512 (glibc-2.38.9000-295-g5d7f1bce7d.tar.xz) = 7e531ed030bef65198d86d9974dc280ea2897d3d89c0ceecdd2667470ce4418c42f5a1633927786c8a9f4d7d967cec2a462e4a6918dfb92fc419e5ec9ecf28e3
|
||||||
|
Loading…
Reference in New Issue
Block a user