forked from rpms/glibc
import glibc-2.28-168.el8
This commit is contained in:
parent
6e9cfdd108
commit
b6400f3e09
306
SOURCES/glibc-rh1983203-1.patch
Normal file
306
SOURCES/glibc-rh1983203-1.patch
Normal file
@ -0,0 +1,306 @@
|
|||||||
|
commit a55e2da2702e235fa0ae66a116d304d1bffc060a
|
||||||
|
Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
|
||||||
|
Date: Thu May 6 17:01:52 2021 -0300
|
||||||
|
|
||||||
|
powerpc: Optimized memcmp for power10
|
||||||
|
|
||||||
|
This patch was based on the __memcmp_power8 and the recent
|
||||||
|
__strlen_power10.
|
||||||
|
|
||||||
|
Improvements from __memcmp_power8:
|
||||||
|
|
||||||
|
1. Don't need alignment code.
|
||||||
|
|
||||||
|
On POWER10 lxvp and lxvl do not generate alignment interrupts, so
|
||||||
|
they are safe for use on caching-inhibited memory. Notice that the
|
||||||
|
comparison on the main loop will wait for both VSR to be ready.
|
||||||
|
Therefore aligning one of the input addresses does not improve
|
||||||
|
performance. In order to align both registers a vperm is necessary
|
||||||
|
which adds too much overhead.
|
||||||
|
|
||||||
|
2. Uses new POWER10 instructions
|
||||||
|
|
||||||
|
This code uses lxvp to decrease contention on load by loading 32 bytes
|
||||||
|
per instruction.
|
||||||
|
The vextractbm is used to have a smaller tail code for calculating the
|
||||||
|
return value.
|
||||||
|
|
||||||
|
3. Performance improvement
|
||||||
|
|
||||||
|
This version has around 35% better performance on average. I saw no
|
||||||
|
performance regressions for any length or alignment.
|
||||||
|
|
||||||
|
Thanks Matheus for helping me out with some details.
|
||||||
|
|
||||||
|
Co-authored-by: Matheus Castanho <msc@linux.ibm.com>
|
||||||
|
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
|
||||||
|
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcmp.S b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..52f244e7e77cbdf9
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/le/power10/memcmp.S
|
||||||
|
@@ -0,0 +1,179 @@
|
||||||
|
+/* Optimized memcmp implementation for POWER10.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+/* TODO: Replace macros by the actual instructions when minimum binutils becomes
|
||||||
|
+ >= 2.35. This is used to keep compatibility with older versions. */
|
||||||
|
+#define VEXTRACTBM(rt,vrb) \
|
||||||
|
+ .long(((4)<<(32-6)) \
|
||||||
|
+ | ((rt)<<(32-11)) \
|
||||||
|
+ | ((8)<<(32-16)) \
|
||||||
|
+ | ((vrb)<<(32-21)) \
|
||||||
|
+ | 1602)
|
||||||
|
+
|
||||||
|
+#define LXVP(xtp,dq,ra) \
|
||||||
|
+ .long(((6)<<(32-6)) \
|
||||||
|
+ | ((((xtp)-32)>>1)<<(32-10)) \
|
||||||
|
+ | ((1)<<(32-11)) \
|
||||||
|
+ | ((ra)<<(32-16)) \
|
||||||
|
+ | dq)
|
||||||
|
+
|
||||||
|
+/* Compare 32 bytes. */
|
||||||
|
+#define COMPARE_32(vr1,vr2,offset,tail_1,tail_2)\
|
||||||
|
+ LXVP(32+vr1,offset,r3); \
|
||||||
|
+ LXVP(32+vr2,offset,r4); \
|
||||||
|
+ vcmpneb. v5,vr1+1,vr2+1; \
|
||||||
|
+ bne cr6,L(tail_2); \
|
||||||
|
+ vcmpneb. v4,vr1,vr2; \
|
||||||
|
+ bne cr6,L(tail_1); \
|
||||||
|
+
|
||||||
|
+#define TAIL(v_res,s1,s2) \
|
||||||
|
+ vctzlsbb r7,v_res; \
|
||||||
|
+ vextubrx r8,r7,s1; \
|
||||||
|
+ vextubrx r9,r7,s2; \
|
||||||
|
+ subf r3,r9,r8; \
|
||||||
|
+ blr; \
|
||||||
|
+
|
||||||
|
+/* int [r3] memcmp (const char *s1 [r3], const char *s2 [r4],
|
||||||
|
+ size_t size [r5]) */
|
||||||
|
+
|
||||||
|
+#ifndef MEMCMP
|
||||||
|
+# define MEMCMP memcmp
|
||||||
|
+#endif
|
||||||
|
+ .machine power9
|
||||||
|
+ENTRY_TOCLESS (MEMCMP, 4)
|
||||||
|
+ CALL_MCOUNT 3
|
||||||
|
+
|
||||||
|
+ cmpldi cr6,r5,64
|
||||||
|
+ bgt cr6,L(loop_head)
|
||||||
|
+
|
||||||
|
+/* Compare 64 bytes. This section is used for lengths <= 64 and for the last
|
||||||
|
+ bytes for larger lengths. */
|
||||||
|
+L(last_compare):
|
||||||
|
+ li r8,16
|
||||||
|
+
|
||||||
|
+ sldi r9,r5,56
|
||||||
|
+ sldi r8,r8,56
|
||||||
|
+ addi r6,r3,16
|
||||||
|
+ addi r7,r4,16
|
||||||
|
+
|
||||||
|
+ /* Align up to 16 bytes. */
|
||||||
|
+ lxvl 32+v0,r3,r9
|
||||||
|
+ lxvl 32+v2,r4,r9
|
||||||
|
+
|
||||||
|
+ /* The sub. and vcmpneb. results are concatenated by the crnand in order
|
||||||
|
+ to do a single branch. It's doing a NOT(CR0.GT AND CR6.EQ) then
|
||||||
|
+ loading to CR0.LT. That means r9 is not bigger than 0 or v4 is not
|
||||||
|
+ all equal to 0. */
|
||||||
|
+ sub. r9,r9,r8
|
||||||
|
+ vcmpneb. v4,v0,v2
|
||||||
|
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
||||||
|
+ bt 4*cr0+lt,L(tail1)
|
||||||
|
+
|
||||||
|
+ addi r3,r3,32
|
||||||
|
+ addi r4,r4,32
|
||||||
|
+
|
||||||
|
+ lxvl 32+v1,r6,r9
|
||||||
|
+ lxvl 32+v3,r7,r9
|
||||||
|
+ sub. r9,r9,r8
|
||||||
|
+ vcmpneb. v5,v1,v3
|
||||||
|
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
||||||
|
+ bt 4*cr0+lt,L(tail2)
|
||||||
|
+
|
||||||
|
+ addi r6,r3,16
|
||||||
|
+ addi r7,r4,16
|
||||||
|
+
|
||||||
|
+ lxvl 32+v6,r3,r9
|
||||||
|
+ lxvl 32+v8,r4,r9
|
||||||
|
+ sub. r9,r9,r8
|
||||||
|
+ vcmpneb. v4,v6,v8
|
||||||
|
+ crnand 4*cr0+lt,4*cr0+gt,4*cr6+eq
|
||||||
|
+ bt 4*cr0+lt,L(tail3)
|
||||||
|
+
|
||||||
|
+ lxvl 32+v7,r6,r9
|
||||||
|
+ lxvl 32+v9,r7,r9
|
||||||
|
+ vcmpneb. v5,v7,v9
|
||||||
|
+ bne cr6,L(tail4)
|
||||||
|
+
|
||||||
|
+L(finish):
|
||||||
|
+ /* The contents are equal. */
|
||||||
|
+ li r3,0
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+L(loop_head):
|
||||||
|
+ /* Calculate how many loops to run. */
|
||||||
|
+ srdi. r8,r5,7
|
||||||
|
+ beq L(loop_tail)
|
||||||
|
+ mtctr r8
|
||||||
|
+
|
||||||
|
+/* Main loop. Compares 128 bytes each loop. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(loop_128):
|
||||||
|
+ COMPARE_32(v0,v2,0,tail1,tail2)
|
||||||
|
+ COMPARE_32(v6,v8,32,tail3,tail4)
|
||||||
|
+ COMPARE_32(v10,v12,64,tail5,tail6)
|
||||||
|
+ COMPARE_32(v14,v16,96,tail7,tail8)
|
||||||
|
+
|
||||||
|
+ addi r3,r3,128
|
||||||
|
+ addi r4,r4,128
|
||||||
|
+ bdnz L(loop_128)
|
||||||
|
+
|
||||||
|
+ /* Account loop comparisons. */
|
||||||
|
+ clrldi. r5,r5,57
|
||||||
|
+ beq L(finish)
|
||||||
|
+
|
||||||
|
+/* Compares 64 bytes if length is still bigger than 64 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(loop_tail):
|
||||||
|
+ cmpldi r5,64
|
||||||
|
+ ble L(last_compare)
|
||||||
|
+ COMPARE_32(v0,v2,0,tail1,tail2)
|
||||||
|
+ COMPARE_32(v6,v8,32,tail3,tail4)
|
||||||
|
+ addi r3,r3,64
|
||||||
|
+ addi r4,r4,64
|
||||||
|
+ subi r5,r5,64
|
||||||
|
+ b L(last_compare)
|
||||||
|
+
|
||||||
|
+L(tail1):
|
||||||
|
+ TAIL(v4,v0,v2)
|
||||||
|
+
|
||||||
|
+L(tail2):
|
||||||
|
+ TAIL(v5,v1,v3)
|
||||||
|
+
|
||||||
|
+L(tail3):
|
||||||
|
+ TAIL(v4,v6,v8)
|
||||||
|
+
|
||||||
|
+L(tail4):
|
||||||
|
+ TAIL(v5,v7,v9)
|
||||||
|
+
|
||||||
|
+L(tail5):
|
||||||
|
+ TAIL(v4,v10,v12)
|
||||||
|
+
|
||||||
|
+L(tail6):
|
||||||
|
+ TAIL(v5,v11,v13)
|
||||||
|
+
|
||||||
|
+L(tail7):
|
||||||
|
+ TAIL(v4,v14,v16)
|
||||||
|
+
|
||||||
|
+L(tail8):
|
||||||
|
+ TAIL(v5,v15,v17)
|
||||||
|
+
|
||||||
|
+END (MEMCMP)
|
||||||
|
+libc_hidden_builtin_def (memcmp)
|
||||||
|
+weak_alias (memcmp, bcmp)
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
index ac2446aca62cc4ab..ee98417f4a383356 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||||||
|
strncase-power8
|
||||||
|
|
||||||
|
ifneq (,$(filter %le,$(config-machine)))
|
||||||
|
-sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
|
||||||
|
+sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
|
||||||
|
rawmemchr-power9 rawmemchr-power10 \
|
||||||
|
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||||
|
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
index 127af84b32a8196f..5213abdf87c79c88 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
@@ -184,6 +184,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
|
||||||
|
/* Support sysdeps/powerpc/powerpc64/multiarch/memcmp.c. */
|
||||||
|
IFUNC_IMPL (i, name, memcmp,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, memcmp,
|
||||||
|
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||||
|
+ && hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
+ __memcmp_power10)
|
||||||
|
+#endif
|
||||||
|
IFUNC_IMPL_ADD (array, i, memcmp, hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||||||
|
__memcmp_power8)
|
||||||
|
IFUNC_IMPL_ADD (array, i, memcmp, hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..73a0debd4a811d8e
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp-power10.S
|
||||||
|
@@ -0,0 +1,26 @@
|
||||||
|
+/* Optimized memcmp implementation for POWER10.
|
||||||
|
+ Copyright (C) 2017-2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#define MEMCMP __memcmp_power10
|
||||||
|
+
|
||||||
|
+#undef libc_hidden_builtin_def
|
||||||
|
+#define libc_hidden_builtin_def(name)
|
||||||
|
+#undef weak_alias
|
||||||
|
+#define weak_alias(name,alias)
|
||||||
|
+
|
||||||
|
+#include <sysdeps/powerpc/powerpc64/le/power10/memcmp.S>
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
||||||
|
index 2c7a083a6560f920..0b8c0c1d8aa3f90a 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcmp.c
|
||||||
|
@@ -27,11 +27,17 @@ extern __typeof (memcmp) __memcmp_ppc attribute_hidden;
|
||||||
|
extern __typeof (memcmp) __memcmp_power4 attribute_hidden;
|
||||||
|
extern __typeof (memcmp) __memcmp_power7 attribute_hidden;
|
||||||
|
extern __typeof (memcmp) __memcmp_power8 attribute_hidden;
|
||||||
|
+extern __typeof (memcmp) __memcmp_power10 attribute_hidden;
|
||||||
|
# undef memcmp
|
||||||
|
|
||||||
|
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
|
||||||
|
ifunc symbol properly. */
|
||||||
|
libc_ifunc_redirected (__redirect_memcmp, memcmp,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||||
|
+ && hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
+ ? __memcmp_power10 :
|
||||||
|
+#endif
|
||||||
|
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
|
||||||
|
? __memcmp_power8 :
|
||||||
|
(hwcap & PPC_FEATURE_HAS_VSX)
|
278
SOURCES/glibc-rh1983203-2.patch
Normal file
278
SOURCES/glibc-rh1983203-2.patch
Normal file
@ -0,0 +1,278 @@
|
|||||||
|
commit 813c6ec808556553be9d39e900a3fc97ceb32330
|
||||||
|
Author: Pedro Franco de Carvalho <pedromfc@linux.ibm.com>
|
||||||
|
Date: Wed Jun 30 12:36:07 2021 -0300
|
||||||
|
|
||||||
|
powerpc: optimize strcpy/stpcpy for POWER9/10
|
||||||
|
|
||||||
|
This patch modifies the current POWER9 implementation of strcpy and
|
||||||
|
stpcpy to optimize it for POWER9/10.
|
||||||
|
|
||||||
|
Since no new POWER10 instructions are used, the original POWER9 strcpy is
|
||||||
|
modified instead of creating a new implementation for POWER10. This
|
||||||
|
implementation is based on both the original POWER9 implementation of
|
||||||
|
strcpy and the preamble of the new POWER10 implementation of strlen.
|
||||||
|
|
||||||
|
The changes also affect stpcpy, which uses the same implementation with
|
||||||
|
some additional code before returning.
|
||||||
|
|
||||||
|
On POWER9, averaging improvements across the benchmark
|
||||||
|
inputs (length/source alignment/destination alignment), for an
|
||||||
|
experiment that ran the benchmark five times, bench-strcpy showed an
|
||||||
|
improvement of 5.23%, and bench-stpcpy showed an improvement of 6.59%.
|
||||||
|
|
||||||
|
On POWER10, bench-strcpy showed 13.16%, and bench-stpcpy showed 13.59%.
|
||||||
|
|
||||||
|
The changes are:
|
||||||
|
|
||||||
|
1. Removed the null string optimization.
|
||||||
|
|
||||||
|
Although this results in a few extra cycles for the null string, in
|
||||||
|
combination with the second change, this resulted in improvements for
|
||||||
|
other cases.
|
||||||
|
|
||||||
|
2. Adapted the preamble from strlen for POWER10.
|
||||||
|
|
||||||
|
This is the part of the function that handles up to the first 16 bytes
|
||||||
|
of the string.
|
||||||
|
|
||||||
|
3. Increased number of unrolled iterations in the main loop to 6.
|
||||||
|
|
||||||
|
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
|
||||||
|
Tested-by: Matheus Castanho <msc@linux.ibm.com>
|
||||||
|
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
||||||
|
index ce8f50329177fd06..9845a1d4cf0e1e5d 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
||||||
|
@@ -45,91 +45,78 @@
|
||||||
|
The implementation can load bytes past a null terminator, but only
|
||||||
|
up to the next 16B boundary, so it never crosses a page. */
|
||||||
|
|
||||||
|
+/* Load quadword at addr+offset to vreg, check for null bytes,
|
||||||
|
+ and branch to label if any are found. */
|
||||||
|
+#define CHECK16(vreg,offset,addr,label) \
|
||||||
|
+ lxv vreg+32,offset(addr); \
|
||||||
|
+ vcmpequb. v6,vreg,v18; \
|
||||||
|
+ bne cr6,L(label);
|
||||||
|
+
|
||||||
|
.machine power9
|
||||||
|
ENTRY_TOCLESS (FUNC_NAME, 4)
|
||||||
|
CALL_MCOUNT 2
|
||||||
|
|
||||||
|
- /* NULL string optimisation */
|
||||||
|
- lbz r0,0(r4)
|
||||||
|
- stb r0,0(r3)
|
||||||
|
- cmpwi r0,0
|
||||||
|
- beqlr
|
||||||
|
-
|
||||||
|
- addi r4,r4,1
|
||||||
|
- addi r11,r3,1
|
||||||
|
-
|
||||||
|
vspltisb v18,0 /* Zeroes in v18 */
|
||||||
|
+ vspltisb v19,-1 /* 0xFF bytes in v19 */
|
||||||
|
|
||||||
|
- neg r5,r4
|
||||||
|
- rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
|
||||||
|
+ /* Next 16B-aligned address. Prepare address for L(loop). */
|
||||||
|
+ addi r5,r4,16
|
||||||
|
+ clrrdi r5,r5,4
|
||||||
|
+ subf r8,r4,r5
|
||||||
|
+ add r11,r3,r8
|
||||||
|
|
||||||
|
- /* Get source 16B aligned */
|
||||||
|
+ /* Align data and fill bytes not loaded with non matching char. */
|
||||||
|
lvx v0,0,r4
|
||||||
|
lvsr v1,0,r4
|
||||||
|
- vperm v0,v18,v0,v1
|
||||||
|
-
|
||||||
|
- vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
|
||||||
|
- vctzlsbb r7,v6 /* Number of trailing zeroes */
|
||||||
|
- addi r8,r7,1 /* Add null terminator */
|
||||||
|
+ vperm v0,v19,v0,v1
|
||||||
|
|
||||||
|
- /* r8 = bytes including null
|
||||||
|
- r9 = bytes to get source 16B aligned
|
||||||
|
- if r8 > r9
|
||||||
|
- no null, copy r9 bytes
|
||||||
|
- else
|
||||||
|
- there is a null, copy r8 bytes and return. */
|
||||||
|
- cmpd r8,r9
|
||||||
|
- bgt L(no_null)
|
||||||
|
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
|
||||||
|
+ beq cr6,L(no_null)
|
||||||
|
|
||||||
|
- sldi r10,r8,56 /* stxvl wants size in top 8 bits */
|
||||||
|
- stxvl 32+v0,r11,r10 /* Partial store */
|
||||||
|
+ /* There's a null byte. */
|
||||||
|
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
|
||||||
|
+ addi r9,r8,1 /* Add null byte. */
|
||||||
|
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits. */
|
||||||
|
+ stxvl 32+v0,r3,r10 /* Partial store */
|
||||||
|
|
||||||
|
#ifdef USE_AS_STPCPY
|
||||||
|
/* stpcpy returns the dest address plus the size not counting the
|
||||||
|
final '\0'. */
|
||||||
|
- add r3,r11,r7
|
||||||
|
+ add r3,r3,r8
|
||||||
|
#endif
|
||||||
|
blr
|
||||||
|
|
||||||
|
L(no_null):
|
||||||
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
||||||
|
- stxvl 32+v0,r11,r10 /* Partial store */
|
||||||
|
-
|
||||||
|
- add r4,r4,r9
|
||||||
|
- add r11,r11,r9
|
||||||
|
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
|
||||||
|
+ stxvl 32+v0,r3,r10 /* Partial store */
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
L(loop):
|
||||||
|
- lxv 32+v0,0(r4)
|
||||||
|
- vcmpequb. v6,v0,v18 /* Any zero bytes? */
|
||||||
|
- bne cr6,L(tail1)
|
||||||
|
-
|
||||||
|
- lxv 32+v1,16(r4)
|
||||||
|
- vcmpequb. v6,v1,v18 /* Any zero bytes? */
|
||||||
|
- bne cr6,L(tail2)
|
||||||
|
-
|
||||||
|
- lxv 32+v2,32(r4)
|
||||||
|
- vcmpequb. v6,v2,v18 /* Any zero bytes? */
|
||||||
|
- bne cr6,L(tail3)
|
||||||
|
-
|
||||||
|
- lxv 32+v3,48(r4)
|
||||||
|
- vcmpequb. v6,v3,v18 /* Any zero bytes? */
|
||||||
|
- bne cr6,L(tail4)
|
||||||
|
+ CHECK16(v0,0,r5,tail1)
|
||||||
|
+ CHECK16(v1,16,r5,tail2)
|
||||||
|
+ CHECK16(v2,32,r5,tail3)
|
||||||
|
+ CHECK16(v3,48,r5,tail4)
|
||||||
|
+ CHECK16(v4,64,r5,tail5)
|
||||||
|
+ CHECK16(v5,80,r5,tail6)
|
||||||
|
|
||||||
|
stxv 32+v0,0(r11)
|
||||||
|
stxv 32+v1,16(r11)
|
||||||
|
stxv 32+v2,32(r11)
|
||||||
|
stxv 32+v3,48(r11)
|
||||||
|
+ stxv 32+v4,64(r11)
|
||||||
|
+ stxv 32+v5,80(r11)
|
||||||
|
|
||||||
|
- addi r4,r4,64
|
||||||
|
- addi r11,r11,64
|
||||||
|
+ addi r5,r5,96
|
||||||
|
+ addi r11,r11,96
|
||||||
|
|
||||||
|
b L(loop)
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
L(tail1):
|
||||||
|
- vctzlsbb r8,v6
|
||||||
|
- addi r9,r8,1
|
||||||
|
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
|
||||||
|
+ addi r9,r8,1 /* Add null terminator */
|
||||||
|
sldi r9,r9,56 /* stxvl wants size in top 8 bits */
|
||||||
|
- stxvl 32+v0,r11,r9
|
||||||
|
+ stxvl 32+v0,r11,r9 /* Partial store */
|
||||||
|
#ifdef USE_AS_STPCPY
|
||||||
|
/* stpcpy returns the dest address plus the size not counting the
|
||||||
|
final '\0'. */
|
||||||
|
@@ -137,50 +124,81 @@ L(tail1):
|
||||||
|
#endif
|
||||||
|
blr
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
L(tail2):
|
||||||
|
stxv 32+v0,0(r11)
|
||||||
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
||||||
|
- addi r9,r8,1 /* Add null terminator */
|
||||||
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
||||||
|
+ vctzlsbb r8,v6
|
||||||
|
+ addi r9,r8,1
|
||||||
|
+ sldi r9,r9,56
|
||||||
|
addi r11,r11,16
|
||||||
|
- stxvl 32+v1,r11,r10 /* Partial store */
|
||||||
|
+ stxvl 32+v1,r11,r9
|
||||||
|
#ifdef USE_AS_STPCPY
|
||||||
|
- /* stpcpy returns the dest address plus the size not counting the
|
||||||
|
- final '\0'. */
|
||||||
|
add r3,r11,r8
|
||||||
|
#endif
|
||||||
|
blr
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
L(tail3):
|
||||||
|
stxv 32+v0,0(r11)
|
||||||
|
stxv 32+v1,16(r11)
|
||||||
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
||||||
|
- addi r9,r8,1 /* Add null terminator */
|
||||||
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
||||||
|
+ vctzlsbb r8,v6
|
||||||
|
+ addi r9,r8,1
|
||||||
|
+ sldi r9,r9,56
|
||||||
|
addi r11,r11,32
|
||||||
|
- stxvl 32+v2,r11,r10 /* Partial store */
|
||||||
|
+ stxvl 32+v2,r11,r9
|
||||||
|
#ifdef USE_AS_STPCPY
|
||||||
|
- /* stpcpy returns the dest address plus the size not counting the
|
||||||
|
- final '\0'. */
|
||||||
|
add r3,r11,r8
|
||||||
|
#endif
|
||||||
|
blr
|
||||||
|
|
||||||
|
+ .p2align 4
|
||||||
|
L(tail4):
|
||||||
|
stxv 32+v0,0(r11)
|
||||||
|
stxv 32+v1,16(r11)
|
||||||
|
stxv 32+v2,32(r11)
|
||||||
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
||||||
|
- addi r9,r8,1 /* Add null terminator */
|
||||||
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
||||||
|
+ vctzlsbb r8,v6
|
||||||
|
+ addi r9,r8,1
|
||||||
|
+ sldi r9,r9,56
|
||||||
|
addi r11,r11,48
|
||||||
|
- stxvl 32+v3,r11,r10 /* Partial store */
|
||||||
|
+ stxvl 32+v3,r11,r9
|
||||||
|
#ifdef USE_AS_STPCPY
|
||||||
|
- /* stpcpy returns the dest address plus the size not counting the
|
||||||
|
- final '\0'. */
|
||||||
|
add r3,r11,r8
|
||||||
|
#endif
|
||||||
|
blr
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(tail5):
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+ stxv 32+v3,48(r11)
|
||||||
|
+ vctzlsbb r8,v6
|
||||||
|
+ addi r9,r8,1
|
||||||
|
+ sldi r9,r9,56
|
||||||
|
+ addi r11,r11,64
|
||||||
|
+ stxvl 32+v4,r11,r9
|
||||||
|
+#ifdef USE_AS_STPCPY
|
||||||
|
+ add r3,r11,r8
|
||||||
|
+#endif
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+ .p2align 4
|
||||||
|
+L(tail6):
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+ stxv 32+v3,48(r11)
|
||||||
|
+ stxv 32+v4,64(r11)
|
||||||
|
+ vctzlsbb r8,v6
|
||||||
|
+ addi r9,r8,1
|
||||||
|
+ sldi r9,r9,56
|
||||||
|
+ addi r11,r11,80
|
||||||
|
+ stxvl 32+v5,r11,r9
|
||||||
|
+#ifdef USE_AS_STPCPY
|
||||||
|
+ add r3,r11,r8
|
||||||
|
+#endif
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
END (FUNC_NAME)
|
||||||
|
#ifndef USE_AS_STPCPY
|
||||||
|
libc_hidden_builtin_def (strcpy)
|
@ -1,6 +1,6 @@
|
|||||||
%define glibcsrcdir glibc-2.28
|
%define glibcsrcdir glibc-2.28
|
||||||
%define glibcversion 2.28
|
%define glibcversion 2.28
|
||||||
%define glibcrelease 167%{?dist}
|
%define glibcrelease 168%{?dist}
|
||||||
# Pre-release tarballs are pulled in from git using a command that is
|
# Pre-release tarballs are pulled in from git using a command that is
|
||||||
# effectively:
|
# effectively:
|
||||||
#
|
#
|
||||||
@ -776,6 +776,8 @@ Patch598: glibc-rh1971664-13.patch
|
|||||||
Patch599: glibc-rh1971664-14.patch
|
Patch599: glibc-rh1971664-14.patch
|
||||||
Patch600: glibc-rh1971664-15.patch
|
Patch600: glibc-rh1971664-15.patch
|
||||||
Patch601: glibc-rh1977614.patch
|
Patch601: glibc-rh1977614.patch
|
||||||
|
Patch602: glibc-rh1983203-1.patch
|
||||||
|
Patch603: glibc-rh1983203-2.patch
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
# Continued list of core "glibc" package information:
|
# Continued list of core "glibc" package information:
|
||||||
@ -2726,6 +2728,9 @@ fi
|
|||||||
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Fri Oct 29 2021 Arjun Shankar <arjun@redhat.com> - 2.28-168
|
||||||
|
- Optimize memcmp, strcpy, and stpcpy for IBM POWER10 (#1983203)
|
||||||
|
|
||||||
* Wed Oct 13 2021 Arjun Shankar <arjun@redhat.com> - 2.28-167
|
* Wed Oct 13 2021 Arjun Shankar <arjun@redhat.com> - 2.28-167
|
||||||
- malloc: Initiate tcache shutdown even without allocations (#1977614)
|
- malloc: Initiate tcache shutdown even without allocations (#1977614)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user