diff --git a/glibc-benchtests-aarch64.patch b/glibc-benchtests-aarch64.patch new file mode 100644 index 0000000..4266517 --- /dev/null +++ b/glibc-benchtests-aarch64.patch @@ -0,0 +1,250 @@ +Author: Joe Ramsay +Date: Tue Nov 21 14:39:39 2023 +0000 + + aarch64: Fix libmvec benchmarks + + These were broken by the new atan2 functions, as they were only + set up for univariate functions. Arity is now detected from the + input file - this revealed a mistake that the double-precision + inputs were being used for both single- and double-precision + routines, which is now remedied. + +diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py +index 3e124c781065fea9..3661a24044cc9770 100644 +--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py ++++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_advsimd.py +@@ -22,40 +22,49 @@ TEMPLATE = """ + #include + #include + +-#define STRIDE {stride} ++#define STRIDE {rowlen} + +-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\ +- {rtype} mx0 = {fname}(vld1q_f{prec_short} (variants[v].in[i].arg0)); \\ ++#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\ ++ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE])); \\ + mx0; }})) + +-struct args ++#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\ ++ {rtype} mx0 = {fname}(vld1q_f{prec_short} (&variants[v].in->arg0[i * STRIDE]), \\ ++ vld1q_f{prec_short} (&variants[v].in->arg1[i * STRIDE])); \\ ++ mx0; }})) ++ ++struct args_1 ++{{ ++ {stype} arg0[{nelems}]; ++}}; ++ ++struct args_2 + {{ +- {stype} arg0[STRIDE]; +- double timing; ++ {stype} arg0[{nelems}]; ++ {stype} arg1[{nelems}]; + }}; + + struct _variants + {{ + const char *name; +- int count; +- const struct args *in; ++ const struct args_{arity} *in; + }}; + +-static const struct args in0[{rowcount}] = {{ ++static const struct args_{arity} in0 = {{ + {in_data} + }}; + + static const struct _variants variants[1] = {{ +- {{"", {rowcount}, in0}}, ++ {{"", &in0}}, + }}; + + #define NUM_VARIANTS 1 +-#define NUM_SAMPLES(i) (variants[i].count) ++#define NUM_SAMPLES(i) ({nelems} / STRIDE) + #define VARIANT(i) (variants[i].name) + + static {rtype} volatile ret; + +-#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC(i, j); }}) ++#define BENCH_FUNC(i, j) ({{ ret = CALL_BENCH_FUNC_{arity}(i, j); }}) + #define FUNCNAME "{fname}" + #include + """ +@@ -63,27 +72,34 @@ static {rtype} volatile ret; + def main(name): + _, prec, _, func = name.split("-") + scalar_to_advsimd_type = {"double": "float64x2_t", "float": "float32x4_t"} +- +- stride = {"double": 2, "float": 4}[prec] ++ rowlen = {"double": 2, "float": 4}[prec] + rtype = scalar_to_advsimd_type[prec] + atype = scalar_to_advsimd_type[prec] +- fname = f"_ZGVnN{stride}v_{func}{'f' if prec == 'float' else ''}" + prec_short = {"double": 64, "float": 32}[prec] +- +- with open(f"../benchtests/libmvec/{func}-inputs") as f: +- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")] +- in_vals = [in_vals[i:i+stride] for i in range(0, len(in_vals), stride)] +- rowcount= len(in_vals) +- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals) +- +- print(TEMPLATE.format(stride=stride, ++ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec] ++ ++ with open(f"../benchtests/libmvec/{input_filename}") as f: ++ input_file = f.readlines() ++ in_vals = (l.strip() for l in input_file if l and not l.startswith("#")) ++ # Split in case of multivariate signature ++ in_vals = (l.split(", ") for l in in_vals) ++ # Transpose ++ in_vals = list(zip(*in_vals)) ++ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}") ++ for col in in_vals) ++ ++ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec) ++ fname = f"_ZGVnN{rowlen}{'v' * arity}_{func}{'f' if prec == 'float' else ''}" ++ ++ print(TEMPLATE.format(rowlen=rowlen, + rtype=rtype, + atype=atype, + fname=fname, + prec_short=prec_short, + in_data=in_data, +- rowcount=rowcount, +- stype=prec)) ++ stype=prec, ++ arity=arity, ++ nelems=len(in_vals[0]))) + + + if __name__ == "__main__": +diff --git a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py +index 66f2c8e0f465f9ce..5d9332be9c5a536a 100755 +--- a/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py ++++ b/sysdeps/aarch64/fpu/scripts/bench_libmvec_sve.py +@@ -22,46 +22,55 @@ TEMPLATE = """ + #include + #include + +-#define MAX_STRIDE {max_stride} + #define STRIDE {stride} + #define PTRUE svptrue_b{prec_short} + #define SV_LOAD svld1_f{prec_short} + #define SV_STORE svst1_f{prec_short} + #define REQUIRE_SVE + +-#define CALL_BENCH_FUNC(v, i) (__extension__ ({{ \\ +- {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), variants[v].in[i].arg0), PTRUE()); \\ ++#define CALL_BENCH_FUNC_1(v, i) (__extension__ ({{ \\ ++ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), PTRUE()); \\ + mx0; }})) + +-struct args ++#define CALL_BENCH_FUNC_2(v, i) (__extension__ ({{ \\ ++ {rtype} mx0 = {fname}(SV_LOAD (PTRUE(), &variants[v].in->arg0[i * STRIDE]), \\ ++ SV_LOAD (PTRUE(), &variants[v].in->arg1[i * STRIDE]), \\ ++ PTRUE()); \\ ++ mx0; }})) ++ ++struct args_1 + {{ +- {stype} arg0[MAX_STRIDE]; +- double timing; ++ {stype} arg0[{nelems}]; ++}}; ++ ++struct args_2 ++{{ ++ {stype} arg0[{nelems}]; ++ {stype} arg1[{nelems}]; + }}; + + struct _variants + {{ + const char *name; +- int count; +- const struct args *in; ++ const struct args_{arity} *in; + }}; + +-static const struct args in0[{rowcount}] = {{ ++static const struct args_{arity} in0 = {{ + {in_data} + }}; + + static const struct _variants variants[1] = {{ +- {{"", {rowcount}, in0}}, ++ {{"", &in0}}, + }}; + + #define NUM_VARIANTS 1 +-#define NUM_SAMPLES(i) (variants[i].count) ++#define NUM_SAMPLES(i) ({nelems} / STRIDE) + #define VARIANT(i) (variants[i].name) + + // Cannot pass volatile pointer to svst1. This still does not appear to get optimised out. +-static {stype} /*volatile*/ ret[MAX_STRIDE]; ++static {stype} /*volatile*/ ret[{rowlen}]; + +-#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC(i, j)); }}) ++#define BENCH_FUNC(i, j) ({{ SV_STORE(PTRUE(), ret, CALL_BENCH_FUNC_{arity}(i, j)); }}) + #define FUNCNAME "{fname}" + #include + """ +@@ -69,23 +78,29 @@ static {stype} /*volatile*/ ret[MAX_STRIDE]; + def main(name): + _, prec, _, func = name.split("-") + scalar_to_sve_type = {"double": "svfloat64_t", "float": "svfloat32_t"} +- + stride = {"double": "svcntd()", "float": "svcntw()"}[prec] + rtype = scalar_to_sve_type[prec] + atype = scalar_to_sve_type[prec] +- fname = f"_ZGVsMxv_{func}{'f' if prec == 'float' else ''}" + prec_short = {"double": 64, "float": 32}[prec] + # Max SVE vector length is 2048 bits. To ensure benchmarks are + # vector-length-agnostic, but still use as wide vectors as + # possible on any given target, divide input data into 2048-bit + # rows, then load/store as many elements as the target will allow. +- max_stride = 2048 // prec_short +- +- with open(f"../benchtests/libmvec/{func}-inputs") as f: +- in_vals = [l.strip() for l in f.readlines() if l and not l.startswith("#")] +- in_vals = [in_vals[i:i+max_stride] for i in range(0, len(in_vals), max_stride)] +- rowcount= len(in_vals) +- in_data = ",\n".join("{{" + ", ".join(row) + "}, 0}" for row in in_vals) ++ rowlen = {"double": 32, "float": 64}[prec] ++ input_filename = {"double": f"{func}-inputs", "float": f"{func}f-inputs"}[prec] ++ ++ with open(f"../benchtests/libmvec/{input_filename}") as f: ++ input_file = f.readlines() ++ in_vals = (l.strip() for l in input_file if l and not l.startswith("#")) ++ # Split in case of multivariate signature ++ in_vals = (l.split(", ") for l in in_vals) ++ # Transpose ++ in_vals = list(zip(*in_vals)) ++ in_data = ",\n".join("{" + (", ".join(val for val in col) + "}") ++ for col in in_vals) ++ ++ arity = [l for l in input_file if l.startswith("## args: ")][0].count(prec) ++ fname = f"_ZGVsMx{'v' * arity}_{func}{'f' if prec == 'float' else ''}" + + print(TEMPLATE.format(stride=stride, + rtype=rtype, +@@ -93,9 +108,10 @@ def main(name): + fname=fname, + prec_short=prec_short, + in_data=in_data, +- rowcount=rowcount, + stype=prec, +- max_stride=max_stride)) ++ rowlen=rowlen, ++ arity=arity, ++ nelems=len(in_vals[0]))) + + + if __name__ == "__main__": diff --git a/glibc-rh2244688.patch b/glibc-rh2244688.patch deleted file mode 100644 index e2c5f37..0000000 --- a/glibc-rh2244688.patch +++ /dev/null @@ -1,967 +0,0 @@ -Author: Florian Weimer -Date: Wed Oct 18 11:12:29 2023 +0200 - - Revert "x86: Prepare `strrchr-evex` and `strrchr-evex512` for AVX10" - - This reverts commit a3c50bf46a1ca6d9d2b7d879176d345abf95a9de. - -diff --git a/sysdeps/x86_64/multiarch/strrchr-evex-base.S b/sysdeps/x86_64/multiarch/strrchr-evex-base.S -index cd6a0a870a02b9bd..58b2853ab69265e8 100644 ---- a/sysdeps/x86_64/multiarch/strrchr-evex-base.S -+++ b/sysdeps/x86_64/multiarch/strrchr-evex-base.S -@@ -1,4 +1,4 @@ --/* Implementation for strrchr using evex256 and evex512. -+/* Placeholder function, not used by any processor at the moment. - Copyright (C) 2022-2023 Free Software Foundation, Inc. - This file is part of the GNU C Library. - -@@ -16,6 +16,8 @@ - License along with the GNU C Library; if not, see - . */ - -+/* UNUSED. Exists purely as reference implementation. */ -+ - #include - - #if ISA_SHOULD_BUILD (4) -@@ -23,351 +25,240 @@ - # include - - # ifdef USE_AS_WCSRCHR --# if VEC_SIZE == 64 --# define RCX_M cx --# define KORTEST_M kortestw --# else --# define RCX_M cl --# define KORTEST_M kortestb --# endif -- --# define SHIFT_REG VRCX - # define CHAR_SIZE 4 --# define VPCMP vpcmpd --# define VPMIN vpminud --# define VPCOMPRESS vpcompressd --# define VPTESTN vptestnmd --# define VPTEST vptestmd --# define VPBROADCAST vpbroadcastd -+# define VPBROADCAST vpbroadcastd - # define VPCMPEQ vpcmpeqd -- -+# define VPMINU vpminud -+# define VPTESTN vptestnmd - # else --# define SHIFT_REG VRDI - # define CHAR_SIZE 1 --# define VPCMP vpcmpb --# define VPMIN vpminub --# define VPCOMPRESS vpcompressb --# define VPTESTN vptestnmb --# define VPTEST vptestmb --# define VPBROADCAST vpbroadcastb -+# define VPBROADCAST vpbroadcastb - # define VPCMPEQ vpcmpeqb -- --# define RCX_M VRCX --# define KORTEST_M KORTEST -+# define VPMINU vpminub -+# define VPTESTN vptestnmb - # endif - --# define VMATCH VMM(0) --# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - # define PAGE_SIZE 4096 -+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) - - .section SECTION(.text), "ax", @progbits -- /* Aligning entry point to 64 byte, provides better performance for -- one vector length string. */ --ENTRY_P2ALIGN(STRRCHR, 6) -- movl %edi, %eax -- /* Broadcast CHAR to VMATCH. */ -- VPBROADCAST %esi, %VMATCH -+/* Aligning entry point to 64 byte, provides better performance for -+ one vector length string. */ -+ENTRY_P2ALIGN (STRRCHR, 6) - -- andl $(PAGE_SIZE - 1), %eax -- cmpl $(PAGE_SIZE - VEC_SIZE), %eax -- jg L(cross_page_boundary) -+ /* Broadcast CHAR to VMM(0). */ -+ VPBROADCAST %esi, %VMM(0) -+ movl %edi, %eax -+ sall $20, %eax -+ cmpl $((PAGE_SIZE - VEC_SIZE) << 20), %eax -+ ja L(page_cross) - -+L(page_cross_continue): -+ /* Compare [w]char for null, mask bit will be set for match. */ - VMOVU (%rdi), %VMM(1) -- /* k0 has a 1 for each zero CHAR in YMM1. */ -- VPTESTN %VMM(1), %VMM(1), %k0 -- KMOV %k0, %VGPR(rsi) -- test %VGPR(rsi), %VGPR(rsi) -- jz L(aligned_more) -- /* fallthrough: zero CHAR in first VEC. */ --L(page_cross_return): -- /* K1 has a 1 for each search CHAR match in VEC(1). */ -- VPCMPEQ %VMATCH, %VMM(1), %k1 -- KMOV %k1, %VGPR(rax) -- /* Build mask up until first zero CHAR (used to mask of -- potential search CHAR matches past the end of the string). */ -- blsmsk %VGPR(rsi), %VGPR(rsi) -- /* Use `and` here to remove any out of bounds matches so we can -- do a reverse scan on `rax` to find the last match. */ -- and %VGPR(rsi), %VGPR(rax) -- jz L(ret0) -- /* Get last match. */ -- bsr %VGPR(rax), %VGPR(rax) -+ -+ VPTESTN %VMM(1), %VMM(1), %k1 -+ KMOV %k1, %VRCX -+ test %VRCX, %VRCX -+ jz L(align_more) -+ -+ VPCMPEQ %VMM(1), %VMM(0), %k0 -+ KMOV %k0, %VRAX -+ BLSMSK %VRCX, %VRCX -+ and %VRCX, %VRAX -+ jz L(ret) -+ -+ BSR %VRAX, %VRAX - # ifdef USE_AS_WCSRCHR - leaq (%rdi, %rax, CHAR_SIZE), %rax - # else -- addq %rdi, %rax -+ add %rdi, %rax - # endif --L(ret0): -+L(ret): - ret - -- /* Returns for first vec x1/x2/x3 have hard coded backward -- search path for earlier matches. */ -- .p2align 4,, 6 --L(first_vec_x1): -- VPCMPEQ %VMATCH, %VMM(2), %k1 -- KMOV %k1, %VGPR(rax) -- blsmsk %VGPR(rcx), %VGPR(rcx) -- /* eax non-zero if search CHAR in range. */ -- and %VGPR(rcx), %VGPR(rax) -- jnz L(first_vec_x1_return) -- -- /* fallthrough: no match in YMM2 then need to check for earlier -- matches (in YMM1). */ -- .p2align 4,, 4 --L(first_vec_x0_test): -- VPCMPEQ %VMATCH, %VMM(1), %k1 -- KMOV %k1, %VGPR(rax) -- test %VGPR(rax), %VGPR(rax) -- jz L(ret1) -- bsr %VGPR(rax), %VGPR(rax) -+L(vector_x2_end): -+ VPCMPEQ %VMM(2), %VMM(0), %k2 -+ KMOV %k2, %VRAX -+ BLSMSK %VRCX, %VRCX -+ and %VRCX, %VRAX -+ jz L(vector_x1_ret) -+ -+ BSR %VRAX, %VRAX -+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+ /* Check the first vector at very last to look for match. */ -+L(vector_x1_ret): -+ VPCMPEQ %VMM(1), %VMM(0), %k2 -+ KMOV %k2, %VRAX -+ test %VRAX, %VRAX -+ jz L(ret) -+ -+ BSR %VRAX, %VRAX - # ifdef USE_AS_WCSRCHR - leaq (%rsi, %rax, CHAR_SIZE), %rax - # else -- addq %rsi, %rax -+ add %rsi, %rax - # endif --L(ret1): - ret - -- .p2align 4,, 10 --L(first_vec_x3): -- VPCMPEQ %VMATCH, %VMM(4), %k1 -- KMOV %k1, %VGPR(rax) -- blsmsk %VGPR(rcx), %VGPR(rcx) -- /* If no search CHAR match in range check YMM1/YMM2/YMM3. */ -- and %VGPR(rcx), %VGPR(rax) -- jz L(first_vec_x1_or_x2) -- bsr %VGPR(rax), %VGPR(rax) -- leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax -- ret -- .p2align 4,, 4 -- --L(first_vec_x2): -- VPCMPEQ %VMATCH, %VMM(3), %k1 -- KMOV %k1, %VGPR(rax) -- blsmsk %VGPR(rcx), %VGPR(rcx) -- /* Check YMM3 for last match first. If no match try YMM2/YMM1. */ -- and %VGPR(rcx), %VGPR(rax) -- jz L(first_vec_x0_x1_test) -- bsr %VGPR(rax), %VGPR(rax) -- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax -- ret -- -- .p2align 4,, 6 --L(first_vec_x0_x1_test): -- VPCMPEQ %VMATCH, %VMM(2), %k1 -- KMOV %k1, %VGPR(rax) -- /* Check YMM2 for last match first. If no match try YMM1. */ -- test %VGPR(rax), %VGPR(rax) -- jz L(first_vec_x0_test) -- .p2align 4,, 4 --L(first_vec_x1_return): -- bsr %VGPR(rax), %VGPR(rax) -- leaq (VEC_SIZE)(%r8, %rax, CHAR_SIZE), %rax -- ret -- -- .p2align 4,, 12 --L(aligned_more): --L(page_cross_continue): -- /* Need to keep original pointer incase VEC(1) has last match. */ -+L(align_more): -+ /* Zero r8 to store match result. */ -+ xorl %r8d, %r8d -+ /* Save pointer of first vector, in case if no match found. */ - movq %rdi, %rsi -+ /* Align pointer to vector size. */ - andq $-VEC_SIZE, %rdi -- -- VMOVU VEC_SIZE(%rdi), %VMM(2) -+ /* Loop unroll for 2 vector loop. */ -+ VMOVA (VEC_SIZE)(%rdi), %VMM(2) - VPTESTN %VMM(2), %VMM(2), %k0 - KMOV %k0, %VRCX -- movq %rdi, %r8 - test %VRCX, %VRCX -- jnz L(first_vec_x1) -- -- VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3) -- VPTESTN %VMM(3), %VMM(3), %k0 -- KMOV %k0, %VRCX -- -- test %VRCX, %VRCX -- jnz L(first_vec_x2) -- -- VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4) -- VPTESTN %VMM(4), %VMM(4), %k0 -- KMOV %k0, %VRCX -- -- /* Intentionally use 64-bit here. EVEX256 version needs 1-byte -- padding for efficient nop before loop alignment. */ -- test %rcx, %rcx -- jnz L(first_vec_x3) -+ jnz L(vector_x2_end) - -+ /* Save pointer of second vector, in case if no match -+ found. */ -+ movq %rdi, %r9 -+ /* Align address to VEC_SIZE * 2 for loop. */ - andq $-(VEC_SIZE * 2), %rdi -- .p2align 4 --L(first_aligned_loop): -- /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can -- gurantee they don't store a match. */ -- VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5) -- VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6) -- -- VPCMP $4, %VMM(5), %VMATCH, %k2 -- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2} -- -- VPMIN %VMM(5), %VMM(6), %VMM(7) - -- VPTEST %VMM(7), %VMM(7), %k1{%k3} -- subq $(VEC_SIZE * -2), %rdi -- KORTEST_M %k1, %k1 -- jc L(first_aligned_loop) -+ .p2align 4,,11 -+L(loop): -+ /* 2 vector loop, as it provide better performance as compared -+ to 4 vector loop. */ -+ VMOVA (VEC_SIZE * 2)(%rdi), %VMM(3) -+ VMOVA (VEC_SIZE * 3)(%rdi), %VMM(4) -+ VPCMPEQ %VMM(3), %VMM(0), %k1 -+ VPCMPEQ %VMM(4), %VMM(0), %k2 -+ VPMINU %VMM(3), %VMM(4), %VMM(5) -+ VPTESTN %VMM(5), %VMM(5), %k0 -+ KOR %k1, %k2, %k3 -+ subq $-(VEC_SIZE * 2), %rdi -+ /* If k0 and k3 zero, match and end of string not found. */ -+ KORTEST %k0, %k3 -+ jz L(loop) -+ -+ /* If k0 is non zero, end of string found. */ -+ KORTEST %k0, %k0 -+ jnz L(endloop) -+ -+ lea VEC_SIZE(%rdi), %r8 -+ /* A match found, it need to be stored in r8 before loop -+ continue. */ -+ /* Check second vector first. */ -+ KMOV %k2, %VRDX -+ test %VRDX, %VRDX -+ jnz L(loop_vec_x2_match) - -- VPTESTN %VMM(7), %VMM(7), %k1 - KMOV %k1, %VRDX -- test %VRDX, %VRDX -- jz L(second_aligned_loop_prep) -+ /* Match is in first vector, rdi offset need to be subtracted -+ by VEC_SIZE. */ -+ sub $VEC_SIZE, %r8 -+ -+ /* If second vector doesn't have match, first vector must -+ have match. */ -+L(loop_vec_x2_match): -+ BSR %VRDX, %VRDX -+# ifdef USE_AS_WCSRCHR -+ sal $2, %rdx -+# endif -+ add %rdx, %r8 -+ jmp L(loop) - -- KORTEST_M %k3, %k3 -- jnc L(return_first_aligned_loop) -+L(endloop): -+ /* Check if string end in first loop vector. */ -+ VPTESTN %VMM(3), %VMM(3), %k0 -+ KMOV %k0, %VRCX -+ test %VRCX, %VRCX -+ jnz L(loop_vector_x1_end) - -- .p2align 4,, 6 --L(first_vec_x1_or_x2_or_x3): -- VPCMPEQ %VMM(4), %VMATCH, %k4 -- KMOV %k4, %VRAX -+ /* Check if it has match in first loop vector. */ -+ KMOV %k1, %VRAX - test %VRAX, %VRAX -- jz L(first_vec_x1_or_x2) -- bsr %VRAX, %VRAX -- leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax -- ret -+ jz L(loop_vector_x2_end) - -- .p2align 4,, 8 --L(return_first_aligned_loop): -- VPTESTN %VMM(5), %VMM(5), %k0 -+ BSR %VRAX, %VRAX -+ leaq (%rdi, %rax, CHAR_SIZE), %r8 -+ -+ /* String must end in second loop vector. */ -+L(loop_vector_x2_end): -+ VPTESTN %VMM(4), %VMM(4), %k0 - KMOV %k0, %VRCX -- blsmsk %VRCX, %VRCX -- jnc L(return_first_new_match_first) -- blsmsk %VRDX, %VRDX -- VPCMPEQ %VMM(6), %VMATCH, %k0 -- KMOV %k0, %VRAX -- addq $VEC_SIZE, %rdi -- and %VRDX, %VRAX -- jnz L(return_first_new_match_ret) -- subq $VEC_SIZE, %rdi --L(return_first_new_match_first): - KMOV %k2, %VRAX --# ifdef USE_AS_WCSRCHR -- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX -+ BLSMSK %VRCX, %VRCX -+ /* Check if it has match in second loop vector. */ - and %VRCX, %VRAX --# else -- andn %VRCX, %VRAX, %VRAX --# endif -- jz L(first_vec_x1_or_x2_or_x3) --L(return_first_new_match_ret): -- bsr %VRAX, %VRAX -- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -- ret -+ jz L(check_last_match) - -- .p2align 4,, 10 --L(first_vec_x1_or_x2): -- VPCMPEQ %VMM(3), %VMATCH, %k3 -- KMOV %k3, %VRAX -- test %VRAX, %VRAX -- jz L(first_vec_x0_x1_test) -- bsr %VRAX, %VRAX -- leaq (VEC_SIZE * 2)(%r8, %rax, CHAR_SIZE), %rax -+ BSR %VRAX, %VRAX -+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax - ret - -- .p2align 4 -- /* We can throw away the work done for the first 4x checks here -- as we have a later match. This is the 'fast' path persay. */ --L(second_aligned_loop_prep): --L(second_aligned_loop_set_furthest_match): -- movq %rdi, %rsi -- VMOVA %VMM(5), %VMM(7) -- VMOVA %VMM(6), %VMM(8) -- .p2align 4 --L(second_aligned_loop): -- VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5) -- VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6) -- VPCMP $4, %VMM(5), %VMATCH, %k2 -- VPCMP $4, %VMM(6), %VMATCH, %k3{%k2} -- -- VPMIN %VMM(5), %VMM(6), %VMM(4) -- -- VPTEST %VMM(4), %VMM(4), %k1{%k3} -- subq $(VEC_SIZE * -2), %rdi -- KMOV %k1, %VRCX -- inc %RCX_M -- jz L(second_aligned_loop) -- VPTESTN %VMM(4), %VMM(4), %k1 -- KMOV %k1, %VRDX -- test %VRDX, %VRDX -- jz L(second_aligned_loop_set_furthest_match) -- -- KORTEST_M %k3, %k3 -- jnc L(return_new_match) -- /* branch here because there is a significant advantage interms -- of output dependency chance in using edx. */ -+ /* String end in first loop vector. */ -+L(loop_vector_x1_end): -+ KMOV %k1, %VRAX -+ BLSMSK %VRCX, %VRCX -+ /* Check if it has match in second loop vector. */ -+ and %VRCX, %VRAX -+ jz L(check_last_match) - --L(return_old_match): -- VPCMPEQ %VMM(8), %VMATCH, %k0 -- KMOV %k0, %VRCX -- bsr %VRCX, %VRCX -- jnz L(return_old_match_ret) -+ BSR %VRAX, %VRAX -+ leaq (%rdi, %rax, CHAR_SIZE), %rax -+ ret - -- VPCMPEQ %VMM(7), %VMATCH, %k0 -- KMOV %k0, %VRCX -- bsr %VRCX, %VRCX -- subq $VEC_SIZE, %rsi --L(return_old_match_ret): -- leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax -+ /* No match in first and second loop vector. */ -+L(check_last_match): -+ /* Check if any match recorded in r8. */ -+ test %r8, %r8 -+ jz L(vector_x2_ret) -+ movq %r8, %rax - ret - --L(return_new_match): -- VPTESTN %VMM(5), %VMM(5), %k0 -- KMOV %k0, %VRCX -- blsmsk %VRCX, %VRCX -- jnc L(return_new_match_first) -- dec %VRDX -- VPCMPEQ %VMM(6), %VMATCH, %k0 -- KMOV %k0, %VRAX -- addq $VEC_SIZE, %rdi -- and %VRDX, %VRAX -- jnz L(return_new_match_ret) -- subq $VEC_SIZE, %rdi --L(return_new_match_first): -+ /* No match recorded in r8. Check the second saved vector -+ in beginning. */ -+L(vector_x2_ret): -+ VPCMPEQ %VMM(2), %VMM(0), %k2 - KMOV %k2, %VRAX --# ifdef USE_AS_WCSRCHR -- xorl $((1 << CHAR_PER_VEC)- 1), %VRAX -- and %VRCX, %VRAX --# else -- andn %VRCX, %VRAX, %VRAX --# endif -- jz L(return_old_match) --L(return_new_match_ret): -- bsr %VRAX, %VRAX -- leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -+ test %VRAX, %VRAX -+ jz L(vector_x1_ret) -+ -+ /* Match found in the second saved vector. */ -+ BSR %VRAX, %VRAX -+ leaq (VEC_SIZE)(%r9, %rax, CHAR_SIZE), %rax - ret - -- .p2align 4,, 4 --L(cross_page_boundary): -- xorq %rdi, %rax -- mov $-1, %VRDX -- VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(6) -- VPTESTN %VMM(6), %VMM(6), %k0 -- KMOV %k0, %VRSI -+L(page_cross): -+ mov %rdi, %rax -+ movl %edi, %ecx - - # ifdef USE_AS_WCSRCHR -- movl %edi, %ecx -- and $(VEC_SIZE - 1), %ecx -- shrl $2, %ecx -+ /* Calculate number of compare result bits to be skipped for -+ wide string alignment adjustment. */ -+ andl $(VEC_SIZE - 1), %ecx -+ sarl $2, %ecx - # endif -- shlx %SHIFT_REG, %VRDX, %VRDX -- -+ /* ecx contains number of w[char] to be skipped as a result -+ of address alignment. */ -+ andq $-VEC_SIZE, %rax -+ VMOVA (%rax), %VMM(1) -+ VPTESTN %VMM(1), %VMM(1), %k1 -+ KMOV %k1, %VRAX -+ SHR %cl, %VRAX -+ jz L(page_cross_continue) -+ VPCMPEQ %VMM(1), %VMM(0), %k0 -+ KMOV %k0, %VRDX -+ SHR %cl, %VRDX -+ BLSMSK %VRAX, %VRAX -+ and %VRDX, %VRAX -+ jz L(ret) -+ BSR %VRAX, %VRAX - # ifdef USE_AS_WCSRCHR -- kmovw %edx, %k1 -+ leaq (%rdi, %rax, CHAR_SIZE), %rax - # else -- KMOV %VRDX, %k1 -+ add %rdi, %rax - # endif - -- VPCOMPRESS %VMM(6), %VMM(1){%k1}{z} -- /* We could technically just jmp back after the vpcompress but -- it doesn't save any 16-byte blocks. */ -- shrx %SHIFT_REG, %VRSI, %VRSI -- test %VRSI, %VRSI -- jnz L(page_cross_return) -- jmp L(page_cross_continue) -- /* 1-byte from cache line. */ --END(STRRCHR) -+ ret -+END (STRRCHR) - #endif -diff --git a/sysdeps/x86_64/multiarch/strrchr-evex.S b/sysdeps/x86_64/multiarch/strrchr-evex.S -index 3bf6a5101422e4d1..85e3b0119f5dc923 100644 ---- a/sysdeps/x86_64/multiarch/strrchr-evex.S -+++ b/sysdeps/x86_64/multiarch/strrchr-evex.S -@@ -1,8 +1,394 @@ -+/* strrchr/wcsrchr optimized with 256-bit EVEX instructions. -+ Copyright (C) 2021-2023 Free Software Foundation, Inc. -+ This file is part of the GNU C Library. -+ -+ The GNU C Library is free software; you can redistribute it and/or -+ modify it under the terms of the GNU Lesser General Public -+ License as published by the Free Software Foundation; either -+ version 2.1 of the License, or (at your option) any later version. -+ -+ The GNU C Library is distributed in the hope that it will be useful, -+ but WITHOUT ANY WARRANTY; without even the implied warranty of -+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU -+ Lesser General Public License for more details. -+ -+ You should have received a copy of the GNU Lesser General Public -+ License along with the GNU C Library; if not, see -+ . */ -+ -+#include -+ -+#if ISA_SHOULD_BUILD (4) -+ -+# include -+ - # ifndef STRRCHR - # define STRRCHR __strrchr_evex - # endif - --#include "x86-evex256-vecs.h" --#include "reg-macros.h" -+# include "x86-evex256-vecs.h" -+ -+# ifdef USE_AS_WCSRCHR -+# define SHIFT_REG rsi -+# define kunpck_2x kunpckbw -+# define kmov_2x kmovd -+# define maskz_2x ecx -+# define maskm_2x eax -+# define CHAR_SIZE 4 -+# define VPMIN vpminud -+# define VPTESTN vptestnmd -+# define VPTEST vptestmd -+# define VPBROADCAST vpbroadcastd -+# define VPCMPEQ vpcmpeqd -+# define VPCMP vpcmpd -+ -+# define USE_WIDE_CHAR -+# else -+# define SHIFT_REG rdi -+# define kunpck_2x kunpckdq -+# define kmov_2x kmovq -+# define maskz_2x rcx -+# define maskm_2x rax -+ -+# define CHAR_SIZE 1 -+# define VPMIN vpminub -+# define VPTESTN vptestnmb -+# define VPTEST vptestmb -+# define VPBROADCAST vpbroadcastb -+# define VPCMPEQ vpcmpeqb -+# define VPCMP vpcmpb -+# endif -+ -+# include "reg-macros.h" -+ -+# define VMATCH VMM(0) -+# define CHAR_PER_VEC (VEC_SIZE / CHAR_SIZE) -+# define PAGE_SIZE 4096 -+ -+ .section SECTION(.text), "ax", @progbits -+ENTRY_P2ALIGN(STRRCHR, 6) -+ movl %edi, %eax -+ /* Broadcast CHAR to VMATCH. */ -+ VPBROADCAST %esi, %VMATCH -+ -+ andl $(PAGE_SIZE - 1), %eax -+ cmpl $(PAGE_SIZE - VEC_SIZE), %eax -+ jg L(cross_page_boundary) -+L(page_cross_continue): -+ VMOVU (%rdi), %VMM(1) -+ /* k0 has a 1 for each zero CHAR in VEC(1). */ -+ VPTESTN %VMM(1), %VMM(1), %k0 -+ KMOV %k0, %VRSI -+ test %VRSI, %VRSI -+ jz L(aligned_more) -+ /* fallthrough: zero CHAR in first VEC. */ -+ /* K1 has a 1 for each search CHAR match in VEC(1). */ -+ VPCMPEQ %VMATCH, %VMM(1), %k1 -+ KMOV %k1, %VRAX -+ /* Build mask up until first zero CHAR (used to mask of -+ potential search CHAR matches past the end of the string). -+ */ -+ blsmsk %VRSI, %VRSI -+ and %VRSI, %VRAX -+ jz L(ret0) -+ /* Get last match (the `and` removed any out of bounds matches). -+ */ -+ bsr %VRAX, %VRAX -+# ifdef USE_AS_WCSRCHR -+ leaq (%rdi, %rax, CHAR_SIZE), %rax -+# else -+ addq %rdi, %rax -+# endif -+L(ret0): -+ ret -+ -+ /* Returns for first vec x1/x2/x3 have hard coded backward -+ search path for earlier matches. */ -+ .p2align 4,, 6 -+L(first_vec_x1): -+ VPCMPEQ %VMATCH, %VMM(2), %k1 -+ KMOV %k1, %VRAX -+ blsmsk %VRCX, %VRCX -+ /* eax non-zero if search CHAR in range. */ -+ and %VRCX, %VRAX -+ jnz L(first_vec_x1_return) -+ -+ /* fallthrough: no match in VEC(2) then need to check for -+ earlier matches (in VEC(1)). */ -+ .p2align 4,, 4 -+L(first_vec_x0_test): -+ VPCMPEQ %VMATCH, %VMM(1), %k1 -+ KMOV %k1, %VRAX -+ test %VRAX, %VRAX -+ jz L(ret1) -+ bsr %VRAX, %VRAX -+# ifdef USE_AS_WCSRCHR -+ leaq (%rsi, %rax, CHAR_SIZE), %rax -+# else -+ addq %rsi, %rax -+# endif -+L(ret1): -+ ret -+ -+ .p2align 4,, 10 -+L(first_vec_x1_or_x2): -+ VPCMPEQ %VMM(3), %VMATCH, %k3 -+ VPCMPEQ %VMM(2), %VMATCH, %k2 -+ /* K2 and K3 have 1 for any search CHAR match. Test if any -+ matches between either of them. Otherwise check VEC(1). */ -+ KORTEST %k2, %k3 -+ jz L(first_vec_x0_test) -+ -+ /* Guaranteed that VEC(2) and VEC(3) are within range so merge -+ the two bitmasks then get last result. */ -+ kunpck_2x %k2, %k3, %k3 -+ kmov_2x %k3, %maskm_2x -+ bsr %maskm_2x, %maskm_2x -+ leaq (VEC_SIZE * 1)(%r8, %rax, CHAR_SIZE), %rax -+ ret -+ -+ .p2align 4,, 7 -+L(first_vec_x3): -+ VPCMPEQ %VMATCH, %VMM(4), %k1 -+ KMOV %k1, %VRAX -+ blsmsk %VRCX, %VRCX -+ /* If no search CHAR match in range check VEC(1)/VEC(2)/VEC(3). -+ */ -+ and %VRCX, %VRAX -+ jz L(first_vec_x1_or_x2) -+ bsr %VRAX, %VRAX -+ leaq (VEC_SIZE * 3)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+ -+ .p2align 4,, 6 -+L(first_vec_x0_x1_test): -+ VPCMPEQ %VMATCH, %VMM(2), %k1 -+ KMOV %k1, %VRAX -+ /* Check VEC(2) for last match first. If no match try VEC(1). -+ */ -+ test %VRAX, %VRAX -+ jz L(first_vec_x0_test) -+ .p2align 4,, 4 -+L(first_vec_x1_return): -+ bsr %VRAX, %VRAX -+ leaq (VEC_SIZE)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+ -+ .p2align 4,, 10 -+L(first_vec_x2): -+ VPCMPEQ %VMATCH, %VMM(3), %k1 -+ KMOV %k1, %VRAX -+ blsmsk %VRCX, %VRCX -+ /* Check VEC(3) for last match first. If no match try -+ VEC(2)/VEC(1). */ -+ and %VRCX, %VRAX -+ jz L(first_vec_x0_x1_test) -+ bsr %VRAX, %VRAX -+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+ -+ .p2align 4,, 12 -+L(aligned_more): -+ /* Need to keep original pointer in case VEC(1) has last match. -+ */ -+ movq %rdi, %rsi -+ andq $-VEC_SIZE, %rdi -+ -+ VMOVU VEC_SIZE(%rdi), %VMM(2) -+ VPTESTN %VMM(2), %VMM(2), %k0 -+ KMOV %k0, %VRCX -+ -+ test %VRCX, %VRCX -+ jnz L(first_vec_x1) -+ -+ VMOVU (VEC_SIZE * 2)(%rdi), %VMM(3) -+ VPTESTN %VMM(3), %VMM(3), %k0 -+ KMOV %k0, %VRCX -+ -+ test %VRCX, %VRCX -+ jnz L(first_vec_x2) -+ -+ VMOVU (VEC_SIZE * 3)(%rdi), %VMM(4) -+ VPTESTN %VMM(4), %VMM(4), %k0 -+ KMOV %k0, %VRCX -+ movq %rdi, %r8 -+ test %VRCX, %VRCX -+ jnz L(first_vec_x3) -+ -+ andq $-(VEC_SIZE * 2), %rdi -+ .p2align 4,, 10 -+L(first_aligned_loop): -+ /* Preserve VEC(1), VEC(2), VEC(3), and VEC(4) until we can -+ guarantee they don't store a match. */ -+ VMOVA (VEC_SIZE * 4)(%rdi), %VMM(5) -+ VMOVA (VEC_SIZE * 5)(%rdi), %VMM(6) -+ -+ VPCMPEQ %VMM(5), %VMATCH, %k2 -+ vpxord %VMM(6), %VMATCH, %VMM(7) -+ -+ VPMIN %VMM(5), %VMM(6), %VMM(8) -+ VPMIN %VMM(8), %VMM(7), %VMM(7) -+ -+ VPTESTN %VMM(7), %VMM(7), %k1 -+ subq $(VEC_SIZE * -2), %rdi -+ KORTEST %k1, %k2 -+ jz L(first_aligned_loop) -+ -+ VPCMPEQ %VMM(6), %VMATCH, %k3 -+ VPTESTN %VMM(8), %VMM(8), %k1 -+ -+ /* If k1 is zero, then we found a CHAR match but no null-term. -+ We can now safely throw out VEC1-4. */ -+ KTEST %k1, %k1 -+ jz L(second_aligned_loop_prep) -+ -+ KORTEST %k2, %k3 -+ jnz L(return_first_aligned_loop) -+ -+ -+ .p2align 4,, 6 -+L(first_vec_x1_or_x2_or_x3): -+ VPCMPEQ %VMM(4), %VMATCH, %k4 -+ KMOV %k4, %VRAX -+ bsr %VRAX, %VRAX -+ jz L(first_vec_x1_or_x2) -+ leaq (VEC_SIZE * 3)(%r8, %rax, CHAR_SIZE), %rax -+ ret -+ -+ -+ .p2align 4,, 8 -+L(return_first_aligned_loop): -+ VPTESTN %VMM(5), %VMM(5), %k0 -+ -+ /* Combined results from VEC5/6. */ -+ kunpck_2x %k0, %k1, %k0 -+ kmov_2x %k0, %maskz_2x -+ -+ blsmsk %maskz_2x, %maskz_2x -+ kunpck_2x %k2, %k3, %k3 -+ kmov_2x %k3, %maskm_2x -+ and %maskz_2x, %maskm_2x -+ jz L(first_vec_x1_or_x2_or_x3) -+ -+ bsr %maskm_2x, %maskm_2x -+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+ .p2align 4 -+ /* We can throw away the work done for the first 4x checks here -+ as we have a later match. This is the 'fast' path persay. -+ */ -+L(second_aligned_loop_prep): -+L(second_aligned_loop_set_furthest_match): -+ movq %rdi, %rsi -+ /* Ideally we would safe k2/k3 but `kmov/kunpck` take uops on -+ port0 and have noticeable overhead in the loop. */ -+ VMOVA %VMM(5), %VMM(7) -+ VMOVA %VMM(6), %VMM(8) -+ .p2align 4 -+L(second_aligned_loop): -+ VMOVU (VEC_SIZE * 4)(%rdi), %VMM(5) -+ VMOVU (VEC_SIZE * 5)(%rdi), %VMM(6) -+ VPCMPEQ %VMM(5), %VMATCH, %k2 -+ vpxord %VMM(6), %VMATCH, %VMM(3) -+ -+ VPMIN %VMM(5), %VMM(6), %VMM(4) -+ VPMIN %VMM(3), %VMM(4), %VMM(3) -+ -+ VPTESTN %VMM(3), %VMM(3), %k1 -+ subq $(VEC_SIZE * -2), %rdi -+ KORTEST %k1, %k2 -+ jz L(second_aligned_loop) -+ VPCMPEQ %VMM(6), %VMATCH, %k3 -+ VPTESTN %VMM(4), %VMM(4), %k1 -+ KTEST %k1, %k1 -+ jz L(second_aligned_loop_set_furthest_match) -+ -+ /* branch here because we know we have a match in VEC7/8 but -+ might not in VEC5/6 so the latter is expected to be less -+ likely. */ -+ KORTEST %k2, %k3 -+ jnz L(return_new_match) -+ -+L(return_old_match): -+ VPCMPEQ %VMM(8), %VMATCH, %k0 -+ KMOV %k0, %VRCX -+ bsr %VRCX, %VRCX -+ jnz L(return_old_match_ret) -+ -+ VPCMPEQ %VMM(7), %VMATCH, %k0 -+ KMOV %k0, %VRCX -+ bsr %VRCX, %VRCX -+ subq $VEC_SIZE, %rsi -+L(return_old_match_ret): -+ leaq (VEC_SIZE * 3)(%rsi, %rcx, CHAR_SIZE), %rax -+ ret -+ -+ .p2align 4,, 10 -+L(return_new_match): -+ VPTESTN %VMM(5), %VMM(5), %k0 -+ -+ /* Combined results from VEC5/6. */ -+ kunpck_2x %k0, %k1, %k0 -+ kmov_2x %k0, %maskz_2x -+ -+ blsmsk %maskz_2x, %maskz_2x -+ kunpck_2x %k2, %k3, %k3 -+ kmov_2x %k3, %maskm_2x -+ -+ /* Match at end was out-of-bounds so use last known match. */ -+ and %maskz_2x, %maskm_2x -+ jz L(return_old_match) -+ -+ bsr %maskm_2x, %maskm_2x -+ leaq (VEC_SIZE * 2)(%rdi, %rax, CHAR_SIZE), %rax -+ ret -+ -+L(cross_page_boundary): -+ /* eax contains all the page offset bits of src (rdi). `xor rdi, -+ rax` sets pointer will all page offset bits cleared so -+ offset of (PAGE_SIZE - VEC_SIZE) will get last aligned VEC -+ before page cross (guaranteed to be safe to read). Doing this -+ as opposed to `movq %rdi, %rax; andq $-VEC_SIZE, %rax` saves -+ a bit of code size. */ -+ xorq %rdi, %rax -+ VMOVU (PAGE_SIZE - VEC_SIZE)(%rax), %VMM(1) -+ VPTESTN %VMM(1), %VMM(1), %k0 -+ KMOV %k0, %VRCX -+ -+ /* Shift out zero CHAR matches that are before the beginning of -+ src (rdi). */ -+# ifdef USE_AS_WCSRCHR -+ movl %edi, %esi -+ andl $(VEC_SIZE - 1), %esi -+ shrl $2, %esi -+# endif -+ shrx %VGPR(SHIFT_REG), %VRCX, %VRCX -+ -+ test %VRCX, %VRCX -+ jz L(page_cross_continue) - --#include "strrchr-evex-base.S" -+ /* Found zero CHAR so need to test for search CHAR. */ -+ VPCMP $0, %VMATCH, %VMM(1), %k1 -+ KMOV %k1, %VRAX -+ /* Shift out search CHAR matches that are before the beginning of -+ src (rdi). */ -+ shrx %VGPR(SHIFT_REG), %VRAX, %VRAX -+ -+ /* Check if any search CHAR match in range. */ -+ blsmsk %VRCX, %VRCX -+ and %VRCX, %VRAX -+ jz L(ret3) -+ bsr %VRAX, %VRAX -+# ifdef USE_AS_WCSRCHR -+ leaq (%rdi, %rax, CHAR_SIZE), %rax -+# else -+ addq %rdi, %rax -+# endif -+L(ret3): -+ ret -+END(STRRCHR) -+#endif -diff --git a/sysdeps/x86_64/multiarch/wcsrchr-evex.S b/sysdeps/x86_64/multiarch/wcsrchr-evex.S -index a584cd3f430ba9d5..e5c5fe3bf28a5966 100644 ---- a/sysdeps/x86_64/multiarch/wcsrchr-evex.S -+++ b/sysdeps/x86_64/multiarch/wcsrchr-evex.S -@@ -4,5 +4,4 @@ - - #define STRRCHR WCSRCHR - #define USE_AS_WCSRCHR 1 --#define USE_WIDE_CHAR 1 - #include "strrchr-evex.S" diff --git a/glibc-rh2244992.patch b/glibc-rh2244992.patch deleted file mode 100644 index b97861f..0000000 --- a/glibc-rh2244992.patch +++ /dev/null @@ -1,110 +0,0 @@ -commit ffe333d46c4ef7c5221cccad2743cac3e149e615 -Author: Florian Weimer -Date: Tue Nov 7 11:23:15 2023 +0100 - - elf: Fix force_first handling in dlclose (bug 30785) - - The force_first parameter was ineffective because the dlclose'd - object was not necessarily the first in the maps array. Also - enable force_first handling unconditionally, regardless of namespace. - The initial object in a namespace should be destructed first, too. - - The _dl_sort_maps_dfs function had early returns for relocation - dependency processing which broke force_first handling, too, and - this is fixed in this change as well. - -diff --git a/elf/dl-close.c b/elf/dl-close.c -index 1c7a861db140259d..a97a1efa45eb7409 100644 ---- a/elf/dl-close.c -+++ b/elf/dl-close.c -@@ -153,6 +153,16 @@ _dl_close_worker (struct link_map *map, bool force) - } - assert (idx == nloaded); - -+ /* Put the dlclose'd map first, so that its destructor runs first. -+ The map variable is NULL after a retry. */ -+ if (map != NULL) -+ { -+ maps[map->l_idx] = maps[0]; -+ maps[map->l_idx]->l_idx = map->l_idx; -+ maps[0] = map; -+ maps[0]->l_idx = 0; -+ } -+ - /* Keep track of the lowest index link map we have covered already. */ - int done_index = -1; - while (++done_index < nloaded) -@@ -226,9 +236,10 @@ _dl_close_worker (struct link_map *map, bool force) - } - } - -- /* Sort the entries. We can skip looking for the binary itself which is -- at the front of the search list for the main namespace. */ -- _dl_sort_maps (maps, nloaded, (nsid == LM_ID_BASE), true); -+ /* Sort the entries. Unless retrying, the maps[0] object (the -+ original argument to dlclose) needs to remain first, so that its -+ destructor runs first. */ -+ _dl_sort_maps (maps, nloaded, /* force_first */ map != NULL, true); - - /* Call all termination functions at once. */ - bool unload_any = false; -@@ -732,7 +743,11 @@ _dl_close_worker (struct link_map *map, bool force) - /* Recheck if we need to retry, release the lock. */ - out: - if (dl_close_state == rerun) -- goto retry; -+ { -+ /* The map may have been deallocated. */ -+ map = NULL; -+ goto retry; -+ } - - dl_close_state = not_pending; - } -diff --git a/elf/dl-sort-maps.c b/elf/dl-sort-maps.c -index 5616c8a6a33ebb1e..5c846c7c6f9d88bc 100644 ---- a/elf/dl-sort-maps.c -+++ b/elf/dl-sort-maps.c -@@ -255,13 +255,12 @@ _dl_sort_maps_dfs (struct link_map **maps, unsigned int nmaps, - The below memcpy is not needed in the do_reldeps case here, - since we wrote back to maps[] during DFS traversal. */ - if (maps_head == maps) -- return; -+ break; - } - assert (maps_head == maps); -- return; - } -- -- memcpy (maps, rpo, sizeof (struct link_map *) * nmaps); -+ else -+ memcpy (maps, rpo, sizeof (struct link_map *) * nmaps); - - /* Skipping the first object at maps[0] is not valid in general, - since traversing along object dependency-links may "find" that -diff --git a/elf/dso-sort-tests-1.def b/elf/dso-sort-tests-1.def -index 4bf9052db16fb352..cf6453e9eb85ac65 100644 ---- a/elf/dso-sort-tests-1.def -+++ b/elf/dso-sort-tests-1.def -@@ -56,14 +56,16 @@ output: b>a>{}b->c->d order). --# The older dynamic_sort=1 algorithm does not achieve this, while the DFS-based --# dynamic_sort=2 algorithm does, although it is still arguable whether going --# beyond spec to do this is the right thing to do. -+# The older dynamic_sort=1 algorithm originally did not achieve this, -+# but this was a bug in the way _dl_sort_maps was called from _dl_close_worker, -+# effectively disabling proper force_first handling. -+# The new dynamic_sort=2 algorithm shows the effect of the simpler force_first -+# handling: the a object is simply moved to the front. - # The below expected outputs are what the two algorithms currently produce - # respectively, for regression testing purposes. - tst-bz15311: {+a;+e;+f;+g;+d;%d;-d;-g;-f;-e;-a};a->b->c->d;d=>[ba];c=>a;b=>e=>a;c=>f=>b;d=>g=>c --output(glibc.rtld.dynamic_sort=1): {+a[d>c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[c>b>a>];+e[e>];+f[f>];+g[g>];+d[];%d(b(e(a()))a()g(c(a()f(b(e(a()))))));-d[];-g[];-f[];-e[];-a[ -Date: Wed Nov 15 06:31:40 2023 +0100 - - stdlib: Avoid another self-comparison in qsort - - In the insertion phase, we could run off the start of the array if the - comparison function never runs zero. In that case, it never finds the - initial element that terminates the iteration. - -diff --git a/stdlib/qsort.c b/stdlib/qsort.c -index ad110e8a892a66e1..1ea31ff4247da7a4 100644 ---- a/stdlib/qsort.c -+++ b/stdlib/qsort.c -@@ -217,7 +217,7 @@ insertion_sort_qsort_partitions (void *const pbase, size_t total_elems, - while ((run_ptr += size) <= end_ptr) - { - tmp_ptr = run_ptr - size; -- while (cmp (run_ptr, tmp_ptr, arg) < 0) -+ while (tmp_ptr != base_ptr && cmp (run_ptr, tmp_ptr, arg) < 0) - tmp_ptr -= size; - - tmp_ptr += size; diff --git a/glibc-rh2248915.patch b/glibc-rh2248915.patch deleted file mode 100644 index 8ff2042..0000000 --- a/glibc-rh2248915.patch +++ /dev/null @@ -1,60 +0,0 @@ -Author: Florian Weimer -Date: Mon Oct 23 12:53:16 2023 +0200 - - ldconfig: Fixes for skipping temporary files. - - Arguments to a memchr call were swapped, causing incorrect skipping - of files. - - Files related to dpkg have different names: they actually end in - .dpkg-new and .dpkg-tmp, not .tmp as I mistakenly assumed. - - Fixes commit 2aa0974d2573441bffd59 ("elf: ldconfig should skip - temporary files created by package managers"). - -diff --git a/elf/ldconfig.c b/elf/ldconfig.c -index 02387a169c902f01..bccd386761d8cbb2 100644 ---- a/elf/ldconfig.c -+++ b/elf/ldconfig.c -@@ -661,6 +661,17 @@ struct dlib_entry - struct dlib_entry *next; - }; - -+/* Return true if the N bytes at NAME end with with the characters in -+ the string SUFFIX. (NAME[N + 1] does not have to be a null byte.) -+ Expected to be called with a string literal for SUFFIX. */ -+static inline bool -+endswithn (const char *name, size_t n, const char *suffix) -+{ -+ return (n >= strlen (suffix) -+ && memcmp (name + n - strlen (suffix), suffix, -+ strlen (suffix)) == 0); -+} -+ - /* Skip some temporary DSO files. These files may be partially written - and lead to ldconfig crashes when examined. */ - static bool -@@ -670,8 +681,7 @@ skip_dso_based_on_name (const char *name, size_t len) - names like these are never really DSOs we want to look at. */ - if (len >= sizeof (".#prelink#") - 1) - { -- if (strcmp (name + len - sizeof (".#prelink#") + 1, -- ".#prelink#") == 0) -+ if (endswithn (name, len, ".#prelink#")) - return true; - if (len >= sizeof (".#prelink#.XXXXXX") - 1 - && memcmp (name + len - sizeof (".#prelink#.XXXXXX") -@@ -679,10 +689,11 @@ skip_dso_based_on_name (const char *name, size_t len) - return true; - } - /* Skip temporary files created by RPM. */ -- if (memchr (name, len, ';') != NULL) -+ if (memchr (name, ';', len) != NULL) - return true; - /* Skip temporary files created by dpkg. */ -- if (len > 4 && memcmp (name + len - 4, ".tmp", 4) == 0) -+ if (endswithn (name, len, ".dpkg-new") -+ || endswithn (name, len, ".dpkg-tmp")) - return true; - return false; - } diff --git a/glibc.spec b/glibc.spec index b08aa54..9a3f44f 100644 --- a/glibc.spec +++ b/glibc.spec @@ -1,4 +1,4 @@ -%global glibcsrcdir glibc-2.38.9000-244-gd1dcb565a1 +%global glibcsrcdir glibc-2.38.9000-295-g5d7f1bce7d %global glibcversion 2.38.9000 # Pre-release tarballs are pulled in from git using a command that is # effectively: @@ -159,7 +159,7 @@ Version: %{glibcversion} # - It allows using the Release number without the %%dist tag in the dependency # generator to make the generated requires interchangeable between Rawhide # and ELN (.elnYY < .fcXX). -%global baserelease 22 +%global baserelease 23 Release: %{baserelease}%{?dist} # In general, GPLv2+ is used by programs, LGPLv2+ is used for @@ -230,10 +230,7 @@ Patch9: glibc-rh827510.patch Patch13: glibc-fedora-localedata-rh61908.patch Patch17: glibc-cs-path.patch Patch23: glibc-python3.patch -Patch24: glibc-rh2244688.patch -Patch25: glibc-rh2244992.patch -Patch26: glibc-rh2248915.patch -Patch27: glibc-rh2248502-3.patch +Patch24: glibc-benchtests-aarch64.patch ############################################################################## # Continued list of core "glibc" package information: @@ -2204,6 +2201,60 @@ update_gconv_modules_cache () %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %changelog +* Wed Nov 22 2023 Florian Weimer - 2.38.9000-23 +- Apply glibc-benchtests-aarch64.patch to fix an aarch64 build failure. +- Drop glibc-rh2244688.patch revert. Fix applied upstream. +- Drop glibc-rh2244992.patch, glibc-rh2248915.patch, glibc-rh2248502-3.patch. + All applied upstream. +- Auto-sync with upstream branch master, + commit 5d7f1bce7d8eea31f4baeb68bcc3124b35acc751: +- posix: Revert the removal of the crypt prototype from +- elf: Add comments on how LD_AUDIT and LD_PRELOAD handle __libc_enable_secure +- elf: Ignore LD_LIBRARY_PATH and debug env var for setuid for static +- elf: Remove any_debug from dl_main_state +- elf: Remove LD_PROFILE for static binaries +- elf: Ignore LD_PROFILE for setuid binaries +- s390: Use dl-symbol-redir-ifunc.h on cpu-tunables +- x86: Use dl-symbol-redir-ifunc.h on cpu-tunables +- elf: Emit warning if tunable is ill-formatted +- elf: Fix _dl_debug_vdprintf to work before self-relocation +- elf: Do not parse ill-formatted strings +- elf: Do not process invalid tunable format +- elf: Add all malloc tunable to unsecvars +- elf: Ignore GLIBC_TUNABLES for setuid/setgid binaries +- elf: Add GLIBC_TUNABLES to unsecvars +- elf: Remove /etc/suid-debug support +- stdlib: The qsort implementation needs to use heapsort in more cases +- stdlib: Handle various corner cases in the fallback heapsort for qsort +- stdlib: Avoid another self-comparison in qsort +- hurd: fix restarting reauth_dtable on signal +- hurd: Prevent the final file_exec_paths call from signals +- manual: Fix termios.c example. (Bug 31078) +- aarch64: Add vector implementations of expm1 routines +- linux: Use fchmodat2 on fchmod for flags different than 0 (BZ 26401) +- intl: Add test case for bug 16621 +- resolv: free only initialized items from gai pool +- ldconfig: Fixes for skipping temporary files. +- nptl: Link tst-execstack-threads-mod.so with -z execstack +- nptl: Rename tst-execstack to tst-execstack-threads +- localedata: Convert oc_FR locale to UTF-8 +- localedata: Add information for Occitan +- elf: Fix force_first handling in dlclose (bug 30981) +- elf: Handle non-directory name in search path (BZ 31035) +- New Zealand locales (en_NZ & mi_NZ) first day of week should be Monday +- x86: Fix unchecked AVX512-VBMI2 usage in strrchr-evex-base.S +- posix: Check pidfd_spawn with tst-spawn7-pid +- y2038: Fix support for 64-bit time on legacy ABIs +- AArch64: Remove Falkor memcpy +- AArch64: Add memset_zva64 +- AArch64: Cleanup emag memset +- test: Run the tst-tls-allocation-failure-static-patched with test-wrapper. +- aarch64: Add vector implementations of log1p routines +- aarch64: Add vector implementations of atan2 routines +- aarch64: Add vector implementations of atan routines +- aarch64: Add vector implementations of acos routines +- aarch64: Add vector implementations of asin routines + * Wed Nov 15 2023 Florian Weimer - 2.38.9000-22 - Work around another self-comparison application issue in qsort (#2248502) diff --git a/sources b/sources index a0f3c83..1924b89 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (glibc-2.38.9000-244-gd1dcb565a1.tar.xz) = 74d23268abb91447d74906ae477f926baa2f54a95d87fc18d8972a976524987e98f6141f3767406aef6293d2ef7f7bc16cfedaacd2d33bca207253ec7073c915 +SHA512 (glibc-2.38.9000-295-g5d7f1bce7d.tar.xz) = 7e531ed030bef65198d86d9974dc280ea2897d3d89c0ceecdd2667470ce4418c42f5a1633927786c8a9f4d7d967cec2a462e4a6918dfb92fc419e5ec9ecf28e3