import glibc-2.28-160.el8

This commit is contained in:
CentOS Sources 2021-06-19 04:21:01 +00:00 committed by Andrew Lukoshko
parent c04956366c
commit c11d47b279
9 changed files with 2286 additions and 1 deletions

View File

@ -0,0 +1,100 @@
commit 56c81132ccc6f468fa4fc29c536db060e18e9d87
Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
Date: Tue Feb 23 14:14:37 2021 -0300
powerpc: Add optimized ilogb* for POWER9
The instructions xsxexpdp and xsxexpqp, introduced on POWER9, extract
the exponent from a double-precision and a quad-precision floating-point
value, respectively; thus they can improve ilogb, ilogbf and ilogbf128.
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
index e642d6c8237578ea..5bbc468829062a48 100644
--- a/sysdeps/powerpc/fpu/math_private.h
+++ b/sysdeps/powerpc/fpu/math_private.h
@@ -26,7 +26,28 @@
#include_next <math_private.h>
-#if defined _ARCH_PWR9 && __HAVE_DISTINCT_FLOAT128
+#ifdef _ARCH_PWR9
+
+#if __GNUC_PREREQ (8, 0)
+# define _GL_HAS_BUILTIN_ILOGB 1
+#elif defined __has_builtin
+# define _GL_HAS_BUILTIN_ILOGB __has_builtin (__builtin_vsx_scalar_extract_exp)
+#else
+# define _GL_HAS_BUILTIN_ILOGB 0
+#endif
+
+#define __builtin_test_dc_ilogbf __builtin_test_dc_ilogb
+#define __builtin_ilogbf __builtin_ilogb
+/* double: data-class test and unbiased exponent (exponent bias 0x3ff). */
+#define __builtin_test_dc_ilogb(x, y) \
+ __builtin_vsx_scalar_test_data_class_dp(x, y)
+#define __builtin_ilogb(x) (__builtin_vsx_scalar_extract_exp(x) - 0x3ff)
+/* _Float128: same helpers, exponent bias 0x3fff. */
+#define __builtin_test_dc_ilogbf128(x, y) \
+ __builtin_vsx_scalar_test_data_class_qp(x, y)
+#define __builtin_ilogbf128(x) (__builtin_vsx_scalar_extract_expq(x) - 0x3fff)
+
+#if __HAVE_DISTINCT_FLOAT128
extern __always_inline _Float128
__ieee754_sqrtf128 (_Float128 __x)
{
@@ -35,6 +56,9 @@ __ieee754_sqrtf128 (_Float128 __x)
return __z;
}
#endif
+#else /* !_ARCH_PWR9 */
+#define _GL_HAS_BUILTIN_ILOGB 0
+#endif
#if defined _ARCH_PWR5X
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
new file mode 100644
index 0000000000000000..b5c1c0aa9db86f3d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
@@ -0,0 +1,30 @@
+#include <math.h>
+#include <errno.h>
+#include <limits.h>
+#include <math_private.h>
+#include <fenv.h>
+
+#if _GL_HAS_BUILTIN_ILOGB
+int
+M_DECL_FUNC (__ilogb) (FLOAT x)
+{
+ int r;
+ /* Fast path: x matches no data class selected by mask 0x7f. */
+ if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
+ r = M_SUF (__builtin_ilogb) (x);
+ else
+ /* Fall back to the generic ilogb if x is NaN, Inf, zero or subnormal. */
+ r = M_SUF (__ieee754_ilogb) (x);
+ if (__builtin_expect (r == FP_ILOGB0, 0)
+ || __builtin_expect (r == FP_ILOGBNAN, 0)
+ || __builtin_expect (r == INT_MAX, 0))
+ {
+ __set_errno (EDOM);
+ __feraiseexcept (FE_INVALID);
+ }
+ return r;
+}
+declare_mgen_alias (__ilogb, ilogb)
+#else
+#include <math/w_ilogb_template.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
new file mode 100644
index 0000000000000000..205f154f0089a269
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
@@ -0,0 +1,4 @@
+/* Skip the optimization for long double as ibm128 does not provide an
+ optimized builtin. */
+#include <math-type-macros-ldouble.h>
+#include <math/w_ilogb_template.c>

View File

@ -0,0 +1,64 @@
commit a7d88506c260e7a0e4268803e76fc19e38ed041f
Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
Date: Thu Feb 25 09:58:52 2021 -0300
powerpc: Add optimized llogb* for POWER9
The POWER9 builtins used to improve the ilogb* functions can be
used in the llogb* functions as well.
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
new file mode 100644
index 0000000000000000..d00b71d2a34e28da
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
@@ -0,0 +1,39 @@
+#include <math.h>
+#include <errno.h>
+#include <limits.h>
+#include <math_private.h>
+#include <fenv.h>
+
+#if _GL_HAS_BUILTIN_ILOGB
+long int
+M_DECL_FUNC (__llogb) (FLOAT x)
+{
+ int r;
+ /* Fast path: x matches no data class selected by mask 0x7f. */
+ if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
+ r = M_SUF (__builtin_ilogb) (x);
+ else
+ /* Fall back to the generic ilogb if x is NaN, Inf, zero or subnormal. */
+ r = M_SUF (__ieee754_ilogb) (x);
+ long int lr = r;
+ if (__glibc_unlikely (r == FP_ILOGB0)
+ || __glibc_unlikely (r == FP_ILOGBNAN)
+ || __glibc_unlikely (r == INT_MAX))
+ {
+#if LONG_MAX != INT_MAX
+ if (r == FP_ILOGB0)
+ lr = FP_LLOGB0;
+ else if (r == FP_ILOGBNAN)
+ lr = FP_LLOGBNAN;
+ else
+ lr = LONG_MAX;
+#endif
+ __set_errno (EDOM);
+ __feraiseexcept (FE_INVALID);
+ }
+ return lr;
+}
+declare_mgen_alias (__llogb, llogb)
+#else
+#include <math/w_llogb_template.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
new file mode 100644
index 0000000000000000..69477a37ae82c476
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
@@ -0,0 +1,4 @@
+/* Skip the optimization for long double as ibm128 does not provide an
+ optimized builtin. */
+#include <math-type-macros-ldouble.h>
+#include <math/w_llogb_template.c>

View File

@ -0,0 +1,334 @@
commit 10624a97e8e47004985740cbb04060a84cfada76
Author: Matheus Castanho <msc@linux.ibm.com>
Date: Tue Sep 29 15:40:08 2020 -0300
powerpc: Add optimized strlen for POWER10
Improvements compared to POWER9 version:
1. Take into account first 16B comparison for aligned strings
The previous version compares the first 16B and increments r4 by the number
of bytes until the address is 16B-aligned, then starts doing aligned loads at
that address. For aligned strings, this causes the first 16B to be compared
twice, because the increment is 0. Here we calculate the next 16B-aligned
address differently, which avoids that issue.
2. Use simple comparisons for the first ~192 bytes
The main loop is good for big strings, but comparing 16B each time is better
for smaller strings. So after aligning the address to 16 bytes, we check
176 more bytes in 16B chunks. There may be some overlaps with the main loop for
unaligned strings, but we avoid using the more aggressive strategy too soon,
and also allow the loop to start at a 64B-aligned address. This greatly
benefits smaller strings and avoids overlapping checks if the string is
already aligned at a 64B boundary.
3. Reduce dependencies between load blocks caused by address calculation on loop
Doing a precise time tracing on the code showed many loads in the loop were
stalled waiting for updates to r4 from previous code blocks. This
implementation avoids that as much as possible by using 2 registers (r4 and
r5) to hold addresses to be used by different parts of the code.
Also, the previous code aligned the address to 16B, then to 64B by doing a
few 48B loops (if needed) until the address was aligned. The main loop could
not start until that 48B loop had finished and r4 was updated with the
current address. Here we calculate the address used by the loop very early,
so it can start sooner.
The main loop now uses 2 pointers 128B apart to make pointer updates less
frequent, and also unrolls 1 iteration to guarantee there is enough time
between iterations to update the pointers, reducing stalled cycles.
4. Use new P10 instructions
lxvp is used to load 32B with a single instruction, reducing contention in
the load queue.
vextractbm allows simplifying the tail code for the loop, replacing
vbpermq and avoiding having to generate a permute control vector.
Reviewed-by: Paul E Murphy <murphyp@linux.ibm.com>
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
new file mode 100644
index 0000000000000000..ca7e9eb3d84c9b00
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -0,0 +1,221 @@
+/* Optimized strlen implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+#endif
+
+/* TODO: Replace macros by the actual instructions when minimum binutils becomes
+ >= 2.35. This is used to keep compatibility with older versions. */
+#define VEXTRACTBM(rt,vrb) \
+ .long(((4)<<(32-6)) \
+ | ((rt)<<(32-11)) \
+ | ((8)<<(32-16)) \
+ | ((vrb)<<(32-21)) \
+ | 1602)
+
+#define LXVP(xtp,dq,ra) \
+ .long(((6)<<(32-6)) \
+ | ((((xtp)-32)>>1)<<(32-10)) \
+ | ((1)<<(32-11)) \
+ | ((ra)<<(32-16)) \
+ | (dq)) /* dq may be an expression such as offset+32. */
+
+#define CHECK16(vreg,offset,addr,label) \
+ lxv vreg+32,offset(addr); \
+ vcmpequb. vreg,vreg,v18; \
+ bne cr6,L(label);
+
+/* Load 4 quadwords, merge into one VR for speed and check for null bytes.
+ r6 has the number of bytes already checked. */
+#define CHECK64(offset,addr,label) \
+ li r6,offset; \
+ LXVP(v4+32,offset,addr); \
+ LXVP(v6+32,offset+32,addr); \
+ vminub v14,v4,v5; \
+ vminub v15,v6,v7; \
+ vminub v16,v14,v15; \
+ vcmpequb. v0,v16,v18; \
+ bne cr6,L(label)
+
+#define TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ subf r3,r3,r5; \
+ addi r4,r4,increment; \
+ add r3,r3,r4; \
+ blr
+
+/* Implements the function
+
+ size_t [r3] strlen (const char *s [r3])
+
+ The implementation can load bytes past a matching byte, but only
+ up to the next 64B boundary, so it never crosses a page. */
+
+.machine power9
+
+ENTRY_TOCLESS (STRLEN, 4)
+ CALL_MCOUNT 1
+
+ vspltisb v18,0
+ vspltisb v19,-1
+
+ /* Next 16B-aligned address. Prepare address for L(aligned). */
+ addi r5,r3,16
+ clrrdi r5,r5,4
+
+ /* Align data and fill bytes not loaded with non matching char. */
+ lvx v0,0,r3
+ lvsr v1,0,r3
+ vperm v0,v19,v0,v1
+
+ vcmpequb. v6,v0,v18
+ beq cr6,L(aligned)
+
+ vctzlsbb r3,v6
+ blr
+
+ /* Test next 176B, 16B at a time. The main loop is optimized for longer
+ strings, so checking the first bytes in 16B chunks greatly benefits
+ small strings. */
+ .p2align 5
+L(aligned):
+ /* Prepare address for the loop. */
+ addi r4,r3,192
+ clrrdi r4,r4,6
+
+ CHECK16(v0,0,r5,tail1)
+ CHECK16(v1,16,r5,tail2)
+ CHECK16(v2,32,r5,tail3)
+ CHECK16(v3,48,r5,tail4)
+ CHECK16(v4,64,r5,tail5)
+ CHECK16(v5,80,r5,tail6)
+ CHECK16(v6,96,r5,tail7)
+ CHECK16(v7,112,r5,tail8)
+ CHECK16(v8,128,r5,tail9)
+ CHECK16(v9,144,r5,tail10)
+ CHECK16(v10,160,r5,tail11)
+
+ addi r5,r4,128
+
+ /* Switch to a more aggressive approach checking 64B each time. Use 2
+ pointers 128B apart and unroll the loop once to make the pointer
+ updates and usages separated enough to avoid stalls waiting for
+ address calculation. */
+ .p2align 5
+L(loop):
+ CHECK64(0,r4,pre_tail_64b)
+ CHECK64(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64(0,r5,tail_64b)
+ CHECK64(64,r5,tail_64b)
+ addi r5,r5,256
+
+ b L(loop)
+
+ .p2align 5
+L(pre_tail_64b):
+ mr r5,r4
+L(tail_64b):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
+ low 16B bytes into vx+1, and the high into vx, so the order here is
+ v5, v4, v7, v6. */
+ vcmpequb v1,v5,v18
+ vcmpequb v2,v4,v18
+ vcmpequb v3,v7,v18
+ vcmpequb v4,v6,v18
+
+ /* Take into account the other 64B blocks we had already checked. */
+ add r5,r5,r6
+
+ /* Extract first bit of each byte. */
+ VEXTRACTBM(r7,v1)
+ VEXTRACTBM(r8,v2)
+ VEXTRACTBM(r9,v3)
+ VEXTRACTBM(r10,v4)
+
+ /* Shift each value into their corresponding position. */
+ sldi r8,r8,16
+ sldi r9,r9,32
+ sldi r10,r10,48
+
+ /* Merge the results. */
+ or r7,r7,r8
+ or r8,r9,r10
+ or r10,r8,r7
+
+ cnttzd r0,r10 /* Count trailing zeros before the match. */
+ subf r5,r3,r5
+ add r3,r5,r0 /* Compute final length. */
+ blr
+
+ .p2align 5
+L(tail1):
+ TAIL(v0,0)
+
+ .p2align 5
+L(tail2):
+ TAIL(v1,16)
+
+ .p2align 5
+L(tail3):
+ TAIL(v2,32)
+
+ .p2align 5
+L(tail4):
+ TAIL(v3,48)
+
+ .p2align 5
+L(tail5):
+ TAIL(v4,64)
+
+ .p2align 5
+L(tail6):
+ TAIL(v5,80)
+
+ .p2align 5
+L(tail7):
+ TAIL(v6,96)
+
+ .p2align 5
+L(tail8):
+ TAIL(v7,112)
+
+ .p2align 5
+L(tail9):
+ TAIL(v8,128)
+
+ .p2align 5
+L(tail10):
+ TAIL(v9,144)
+
+ .p2align 5
+L(tail11):
+ TAIL(v10,160)
+
+END (STRLEN)
+
+#ifdef DEFINE_STRLEN_HIDDEN_DEF
+weak_alias (__strlen, strlen)
+libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index a9e13e05e90601cd..61652b65dd223018 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
+ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
+ strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b30bc53930fc0e36..46d5956adda72b86 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */
IFUNC_IMPL (i, name, strlen,
#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
+ __strlen_power10)
IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
__strlen_power9)
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
new file mode 100644
index 0000000000000000..6a774fad58c77179
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
@@ -0,0 +1,2 @@
+#define STRLEN __strlen_power10
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index b7f0fbb13fb97783..11bdb96de2d2aa66 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -31,9 +31,12 @@ extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;
libc_ifunc (__libc_strlen,
# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ ? __strlen_power10 :
(hwcap2 & PPC_FEATURE2_ARCH_3_00)
? __strlen_power9 :
# endif

View File

@ -0,0 +1,527 @@
commit dd59655e9371af86043b97e38953f43bd9496699
Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Date: Fri Apr 30 18:12:08 2021 -0300
powerpc64le: Optimized memmove for POWER10
This patch was initially based on the __memmove_power7 with some ideas
from strncpy implementation for Power 9.
Improvements from __memmove_power7:
1. Use lxvl/stxvl for alignment code.
The code for Power 7 uses branches when the input is not naturally
aligned to the width of a vector. The new implementation uses
lxvl/stxvl instead which reduces pressure on GPRs. It also allows
the removal of branch instructions, implicitly removing branch stalls
and mispredictions.
2. Use of lxv/stxv and lxvl/stxvl pair is safe to use on Cache Inhibited
memory.
On Power 10 vector load and stores are safe to use on CI memory for
addresses unaligned to 16B. This code takes advantage of this to
do unaligned loads.
The unaligned loads don't have a significant performance impact by
themselves. However doing so decreases register pressure on GPRs
and interdependence stalls on load/store pairs. This also improved
readability as there are now fewer code paths for different alignments.
Finally this reduces the overall code size.
3. Improved performance.
This version runs on average about 30% better than memmove_power7
for lengths larger than 8KB. For input lengths shorter than 8KB
the improvement is smaller, it has on average about 17% better
performance.
This version has a degradation of about 50% for input lengths
in the 0 to 31 bytes range when dest is unaligned.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memmove.S b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
new file mode 100644
index 0000000000000000..7dfd57edeb37e8e4
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
@@ -0,0 +1,320 @@
+/* Optimized memmove implementation for POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
+
+ This optimization checks if 'src' and 'dst' overlap. If they do not
+ or 'src' is ahead of 'dest' then it copies forward.
+ Otherwise, an optimized backward copy is used. */
+
+#ifndef MEMMOVE
+# define MEMMOVE memmove
+#endif
+ .machine power9
+ENTRY_TOCLESS (MEMMOVE, 5)
+ CALL_MCOUNT 3
+
+L(_memmove):
+ .p2align 5
+ /* Check if there is overlap, if so it will branch to backward copy. */
+ subf r9,r4,r3
+ cmpld cr7,r9,r5
+ blt cr7,L(memmove_bwd)
+
+ /* Fast path for length shorter than 16 bytes. */
+ sldi r7,r5,56
+ lxvl 32+v2,r4,r7
+ stxvl 32+v2,r3,r7
+ subic. r8,r5,16
+ blelr
+
+ /* For shorter lengths aligning the dest address to 16 bytes either
+ decreases performance or is irrelevant. Use this comparison to skip
+ the alignment when the length is below 256 bytes. */
+ cmpldi cr6,r5,256
+ bge cr6,L(ge_256)
+ /* Account for the first 16-byte copy. */
+ addi r4,r4,16
+ addi r11,r3,16 /* use r11 to keep dest address on r3. */
+ subi r5,r5,16
+ b L(loop_head)
+
+ .p2align 5
+L(ge_256):
+ /* Account for the first copy <= 16 bytes. This is necessary for
+ memmove because at this point the src address can be in front of the
+ dest address. */
+ clrldi r9,r5,56
+ li r8,16
+ cmpldi r9,16
+ iselgt r9,r8,r9
+ add r4,r4,r9
+ add r11,r3,r9 /* use r11 to keep dest address on r3. */
+ sub r5,r5,r9
+
+ /* Align dest to 16 bytes. */
+ neg r7,r3
+ clrldi. r9,r7,60
+ beq L(loop_head)
+
+ .p2align 5
+ sldi r6,r9,56
+ lxvl 32+v0,r4,r6
+ stxvl 32+v0,r11,r6
+ sub r5,r5,r9
+ add r4,r4,r9
+ add r11,r11,r9
+
+L(loop_head):
+ cmpldi r5,63
+ ble L(final_64)
+
+ srdi. r7,r5,7
+ beq L(loop_tail)
+
+ mtctr r7
+
+/* Main loop that copies 128 bytes each iteration. */
+ .p2align 5
+L(loop):
+ addi r9,r4,64
+ addi r10,r11,64
+
+ lxv 32+v0,0(r4)
+ lxv 32+v1,16(r4)
+ lxv 32+v2,32(r4)
+ lxv 32+v3,48(r4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,128
+ addi r11,r11,128
+
+ lxv 32+v4,0(r9)
+ lxv 32+v5,16(r9)
+ lxv 32+v6,32(r9)
+ lxv 32+v7,48(r9)
+
+ stxv 32+v4,0(r10)
+ stxv 32+v5,16(r10)
+ stxv 32+v6,32(r10)
+ stxv 32+v7,48(r10)
+
+ bdnz L(loop)
+ clrldi. r5,r5,57
+ beqlr
+
+/* Copy 64 bytes. */
+ .p2align 5
+L(loop_tail):
+ cmpldi cr5,r5,63
+ ble cr5,L(final_64)
+
+ lxv 32+v0,0(r4)
+ lxv 32+v1,16(r4)
+ lxv 32+v2,32(r4)
+ lxv 32+v3,48(r4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ addi r4,r4,64
+ addi r11,r11,64
+ subi r5,r5,64
+
+/* Copies the last 1-63 bytes. */
+ .p2align 5
+L(final_64):
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
+ clrrdi. r8,r5,4
+ beq L(tail1)
+
+ cmpldi cr5,r5,32
+ lxv 32+v0,0(r4)
+ blt cr5,L(tail2)
+
+ cmpldi cr6,r5,48
+ lxv 32+v1,16(r4)
+ blt cr6,L(tail3)
+
+ .p2align 5
+ lxv 32+v2,32(r4)
+ stxv 32+v2,32(r11)
+L(tail3):
+ stxv 32+v1,16(r11)
+L(tail2):
+ stxv 32+v0,0(r11)
+ sub r5,r5,r8
+ add r4,r4,r8
+ add r11,r11,r8
+ .p2align 5
+L(tail1):
+ sldi r6,r5,56
+ lxvl v4,r4,r6
+ stxvl v4,r11,r6
+ blr
+
+/* If dest and src overlap, we should copy backwards. */
+L(memmove_bwd):
+ add r11,r3,r5
+ add r4,r4,r5
+
+ /* Optimization for length smaller than 16 bytes. */
+ cmpldi cr5,r5,15
+ ble cr5,L(tail1_bwd)
+
+ /* For shorter lengths the alignment either slows down or is irrelevant.
+ The forward copy uses an already needed comparison against 256 for that.
+ Here 128 is used instead, as it reduces code and improves readability. */
+ cmpldi cr7,r5,128
+ blt cr7,L(bwd_loop_tail)
+
+ /* Align dest address to 16 bytes. */
+ .p2align 5
+ clrldi. r9,r11,60
+ beq L(bwd_loop_head)
+ sub r4,r4,r9
+ sub r11,r11,r9
+ lxv 32+v0,0(r4)
+ sldi r6,r9,56
+ stxvl 32+v0,r11,r6
+ sub r5,r5,r9
+
+L(bwd_loop_head):
+ srdi. r7,r5,7
+ beq L(bwd_loop_tail)
+
+ mtctr r7
+
+/* Main loop that copies 128 bytes every iteration. */
+ .p2align 5
+L(bwd_loop):
+ addi r9,r4,-64
+ addi r10,r11,-64
+
+ lxv 32+v0,-16(r4)
+ lxv 32+v1,-32(r4)
+ lxv 32+v2,-48(r4)
+ lxv 32+v3,-64(r4)
+
+ stxv 32+v0,-16(r11)
+ stxv 32+v1,-32(r11)
+ stxv 32+v2,-48(r11)
+ stxv 32+v3,-64(r11)
+
+ addi r4,r4,-128
+ addi r11,r11,-128
+
+ lxv 32+v0,-16(r9)
+ lxv 32+v1,-32(r9)
+ lxv 32+v2,-48(r9)
+ lxv 32+v3,-64(r9)
+
+ stxv 32+v0,-16(r10)
+ stxv 32+v1,-32(r10)
+ stxv 32+v2,-48(r10)
+ stxv 32+v3,-64(r10)
+
+ bdnz L(bwd_loop)
+ clrldi. r5,r5,57
+ beqlr
+
+/* Copy 64 bytes. */
+ .p2align 5
+L(bwd_loop_tail):
+ cmpldi cr5,r5,63
+ ble cr5,L(bwd_final_64)
+
+ addi r4,r4,-64
+ addi r11,r11,-64
+
+ lxv 32+v0,0(r4)
+ lxv 32+v1,16(r4)
+ lxv 32+v2,32(r4)
+ lxv 32+v3,48(r4)
+
+ stxv 32+v0,0(r11)
+ stxv 32+v1,16(r11)
+ stxv 32+v2,32(r11)
+ stxv 32+v3,48(r11)
+
+ subi r5,r5,64
+
+/* Copies the last 1-63 bytes. */
+ .p2align 5
+L(bwd_final_64):
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
+ clrrdi. r8,r5,4
+ beq L(tail1_bwd)
+
+ cmpldi cr5,r5,32
+ lxv 32+v2,-16(r4)
+ blt cr5,L(tail2_bwd)
+
+ cmpldi cr6,r5,48
+ lxv 32+v1,-32(r4)
+ blt cr6,L(tail3_bwd)
+
+ .p2align 5
+ lxv 32+v0,-48(r4)
+ stxv 32+v0,-48(r11)
+L(tail3_bwd):
+ stxv 32+v1,-32(r11)
+L(tail2_bwd):
+ stxv 32+v2,-16(r11)
+ sub r4,r4,r5
+ sub r11,r11,r5
+ sub r5,r5,r8
+ sldi r6,r5,56
+ lxvl v4,r4,r6
+ stxvl v4,r11,r6
+ blr
+
+/* Copy the remaining bytes (fewer than 16). */
+ .p2align 5
+L(tail1_bwd):
+ sub r4,r4,r5
+ sub r11,r11,r5
+ sldi r6,r5,56
+ lxvl v4,r4,r6
+ stxvl v4,r11,r6
+ blr
+
+END_GEN_TB (MEMMOVE,TB_TOCLESS)
+libc_hidden_builtin_def (memmove)
+
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
+ Implemented in this file to keep the linker from creating a stub
+ function call for the branch to '_memmove'. */
+ENTRY_TOCLESS (__bcopy)
+ mr r6,r3
+ mr r3,r4
+ mr r4,r6
+ b L(_memmove)
+END (__bcopy)
+#ifndef __bcopy
+weak_alias (__bcopy, bcopy)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 61652b65dd223018..66f8c6ace9824d4a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
+sysdep_routines += memmove-power10 \
+ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
strlen-power10
endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 1c4a229b1fc5654a..705fef33d4e57557 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -22,8 +22,17 @@
extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
/* __bcopy_power7 symbol is implemented at memmove-power7.S */
extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
+#ifdef __LITTLE_ENDIAN__
+extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
+#endif
libc_ifunc (bcopy,
+#ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Require both bits, not either. */
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
+ && (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bcopy_power10 :
+#endif
(hwcap & PPC_FEATURE_HAS_VSX)
? __bcopy_power7
: __bcopy_ppc);
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 46d5956adda72b86..4ce04bc51574cca1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -67,6 +67,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */
IFUNC_IMPL (i, name, memmove,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memmove,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Require both bits. */
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __memmove_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
__memmove_power7)
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -186,6 +193,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */
IFUNC_IMPL (i, name, bcopy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, bcopy,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Require both bits. */
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __bcopy_power10)
+#endif
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
__bcopy_power7)
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
new file mode 100644
index 0000000000000000..171b32921a0a4d47
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
@@ -0,0 +1,27 @@
+/* Optimized memmove implementation for POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMMOVE __memmove_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bcopy
+#define __bcopy __bcopy_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
index 0b251d0f5f087874..fb5261ecda64d061 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -21,7 +21,7 @@
#undef libc_hidden_builtin_def
#define libc_hidden_builtin_def(name)
-#undef bcopy
-#define bcopy __bcopy_power7
+#undef __bcopy
+#define __bcopy __bcopy_power7
#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index 39987155cc7d3624..2fd7b6d309e4bedd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -28,14 +28,22 @@
# include "init-arch.h"
extern __typeof (__redirect_memmove) __libc_memmove;
-
extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
+#ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
+#endif
libc_ifunc (__libc_memmove,
- (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memmove_power7
- : __memmove_ppc);
+#ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1) /* Require both bits, not either. */
+ && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
+ && (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memmove_power10 :
+#endif
+ (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memmove_power7
+ : __memmove_ppc);
#undef memmove
strong_alias (__libc_memmove, memmove);
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
index b7f3dc28d1a8eac3..9e4cabb07ef9b732 100644
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
mr r4,r6
b L(_memmove)
END (__bcopy)
+#ifndef __bcopy
weak_alias (__bcopy, bcopy)
+#endif

View File

@ -0,0 +1,308 @@
commit e941e0ae80626b7661c1db8953a673cafd3b8b19
Author: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
Date: Fri Apr 30 18:12:08 2021 -0300
powerpc64le: Optimize memcpy for POWER10
This implementation is based on __memcpy_power8_cached and integrates
suggestions from Anton Blanchard.
It benefits from loads and stores with length for short lengths and for
tail code, simplifying the code.
All unaligned memory accesses use instructions that do not generate
alignment interrupts on POWER10, making it safe to use on
caching-inhibited memory.
The main loop has also been modified in order to increase instruction
throughput by reducing the dependency on updates from previous iterations.
On average, this implementation provides around 30% improvement when
compared to __memcpy_power7 and 10% improvement in comparison to
__memcpy_power8_cached.
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
new file mode 100644
index 0000000000000000..ad1414db4a3a8b9f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
@@ -0,0 +1,198 @@
+/* Optimized memcpy implementation for POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <http://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+
+#ifndef MEMCPY
+# define MEMCPY memcpy
+#endif
+
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
+ Returns 'dst'. */
+
+ .machine power9
+ENTRY_TOCLESS (MEMCPY, 5)
+ CALL_MCOUNT 3
+
+ /* Copy up to 16 bytes. */
+ sldi r6,r5,56 /* Prepare [l|st]xvl counter. */
+ lxvl v10,r4,r6
+ stxvl v10,r3,r6
+ subic. r6,r5,16 /* Return if len <= 16. */
+ blelr
+
+ /* If len >= 256, assume nothing got copied before and copy
+ again. This might cause issues with overlapped memory, but memcpy
+ is not expected to treat overlapped memory. */
+ cmpdi r5,256
+ bge L(copy_ge_256)
+ /* 16 < len < 256 and the first 16 bytes have already been copied. */
+ addi r10,r3,16 /* Keep r3 intact as return value. */
+ addi r4,r4,16
+ subi r5,r5,16
+ b L(copy_lt_256) /* Avoid the main loop if len < 256. */
+
+ .p2align 5
+L(copy_ge_256):
+ mr r10,r3 /* Keep r3 intact as return value. */
+ /* Align dst to 16 bytes. */
+ andi. r9,r10,0xf
+ beq L(dst_is_align_16)
+ lxv v10,0(r4)
+ subfic r12,r9,16
+ subf r5,r12,r5
+ add r4,r4,r12
+ stxv v10,0(r3)
+ add r10,r3,r12
+
+L(dst_is_align_16):
+ srdi r9,r5,7 /* Divide by 128. */
+ mtctr r9
+ addi r6,r4,64
+ addi r7,r10,64
+
+
+ /* Main loop, copy 128 bytes per iteration.
+ Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
+ r4 and r10. */
+ .p2align 5
+L(copy_128):
+
+ lxv v10, 0(r4)
+ lxv v11, 16(r4)
+ lxv v12, 32(r4)
+ lxv v13, 48(r4)
+
+ addi r4,r4,128
+
+ stxv v10, 0(r10)
+ stxv v11, 16(r10)
+ stxv v12, 32(r10)
+ stxv v13, 48(r10)
+
+ addi r10,r10,128
+
+ lxv v10, 0(r6)
+ lxv v11, 16(r6)
+ lxv v12, 32(r6)
+ lxv v13, 48(r6)
+
+ addi r6,r6,128
+
+ stxv v10, 0(r7)
+ stxv v11, 16(r7)
+ stxv v12, 32(r7)
+ stxv v13, 48(r7)
+
+ addi r7,r7,128
+
+ bdnz L(copy_128)
+
+ clrldi. r5,r5,64-7 /* Have we copied everything? */
+ beqlr
+
+ .p2align 5
+L(copy_lt_256):
+ cmpdi r5,16
+ ble L(copy_le_16)
+ srdi. r9,r5,5 /* Divide by 32. */
+ beq L(copy_lt_32)
+ mtctr r9
+ /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
+ the dependency on r4 and r10. */
+ addi r6,r4,32
+ addi r7,r10,32
+ addi r8,r4,64
+ addi r9,r10,64
+
+ .p2align 5
+ /* Copy 32 bytes at a time, unaligned.
+ The loop is unrolled 3 times in order to reduce the dependency on
+ r4 and r10, copying up-to 96 bytes per iteration. */
+L(copy_32):
+ lxv v10, 0(r4)
+ lxv v11, 16(r4)
+ stxv v10, 0(r10)
+ stxv v11, 16(r10)
+ bdz L(end_copy_32a)
+ addi r4,r4,96
+ addi r10,r10,96
+
+ lxv v10, 0(r6)
+ lxv v11, 16(r6)
+ addi r6,r6,96
+ stxv v10, 0(r7)
+ stxv v11, 16(r7)
+ bdz L(end_copy_32b)
+ addi r7,r7,96
+
+ lxv v12, 0(r8)
+ lxv v13, 16(r8)
+ addi r8,r8,96
+ stxv v12, 0(r9)
+ stxv v13, 16(r9)
+ addi r9,r9,96
+ bdnz L(copy_32)
+
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
+ beqlr
+ cmpdi r5,16
+ ble L(copy_le_16)
+ b L(copy_lt_32)
+
+ .p2align 5
+L(end_copy_32a):
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
+ beqlr
+ /* 32 bytes have been copied since the last update of r4 and r10. */
+ addi r4,r4,32
+ addi r10,r10,32
+ cmpdi r5,16
+ ble L(copy_le_16)
+ b L(copy_lt_32)
+
+ .p2align 5
+L(end_copy_32b):
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
+ beqlr
+ /* The last iteration of the loop copied 64 bytes. Update r4 and r10
+ accordingly. */
+ addi r4,r4,-32
+ addi r10,r10,-32
+ cmpdi r5,16
+ ble L(copy_le_16)
+
+ .p2align 5
+L(copy_lt_32):
+ lxv v10, 0(r4)
+ stxv v10, 0(r10)
+ addi r4,r4,16
+ addi r10,r10,16
+ subi r5,r5,16
+
+ .p2align 5
+L(copy_le_16):
+ sldi r6,r5,56
+ lxvl v10,r4,r6
+ stxvl v10,r10,r6
+ blr
+
+
+END_GEN_TB (MEMCPY,TB_TOCLESS)
+libc_hidden_builtin_def (memcpy)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memmove-power10 \
+sysdep_routines += memcpy-power10 memmove-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
strlen-power10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 4ce04bc51574cca1..9d5a14e480c02171 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
#ifdef SHARED
/* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */
IFUNC_IMPL (i, name, memcpy,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memcpy,
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __memcpy_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memcpy_power8_cached)
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
new file mode 100644
index 0000000000000000..70e0fc3ed610cdc3
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
@@ -0,0 +1,26 @@
+/* Optimized memcpy implementation for POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define MEMCPY __memcpy_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
index 44dea594f3770673..be0e47f32dde2ccf 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden;
+# if defined __LITTLE_ENDIAN__
+extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden;
+# endif
libc_ifunc (__libc_memcpy,
+# if defined __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memcpy_power10 :
+# endif
((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
? __memcpy_power8_cached :
(hwcap & PPC_FEATURE_HAS_VSX)

View File

@ -0,0 +1,420 @@
commit 23fdf8178cce3c2ec320dd5eca8b544245bcaef0
Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
Date: Fri Apr 30 18:12:08 2021 -0300
powerpc64le: Optimize memset for POWER10
This implementation is based on __memset_power8 and integrates a lot
of suggestions from Anton Blanchard.
The biggest difference is that it makes extensive use of stxvl in the
alignment and tail code to avoid branches and small stores. It has
three main execution paths:
a) "Short lengths" for lengths up to 64 bytes, avoiding as many
branches as possible.
b) "General case" for larger lengths, it has an alignment section
using stxvl to avoid branches, a 128 bytes loop and then a tail
code, again using stxvl with few branches.
c) "Zeroing cache blocks" for lengths from 256 bytes upwards and set
value being zero. It is mostly the __memset_power8 code but the
alignment phase was simplified because, at this point, address is
already 16-bytes aligned and also changed to use vector stores.
The tail code was also simplified to reuse the general case tail.
All unaligned stores use stxvl instructions that do not generate
alignment interrupts on POWER10, making it safe to use on
caching-inhibited memory.
On average, this implementation provides something around 30%
improvement when compared to __memset_power8.
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memset.S b/sysdeps/powerpc/powerpc64/le/power10/memset.S
new file mode 100644
index 0000000000000000..6b8e2cfdaf25fd30
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memset.S
@@ -0,0 +1,256 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+ .machine power9
+ENTRY_TOCLESS (MEMSET, 5)
+ CALL_MCOUNT 3
+
+L(_memset):
+ /* Assume memset of zero length is uncommon, and just let it go
+ through the small path below. */
+ cmpldi r5,64
+
+ /* Replicate byte to quad word. */
+ mtvsrd v0+32,r4
+ vspltb v0,v0,7
+
+ li r7,16
+ sldi r8,r7,56
+
+ bgt L(large)
+
+ /* For short lengths we want to avoid as many branches as possible.
+ We use store VSX vector with length instructions to do this.
+ It takes advantage of the fact that if the length passed to stxvl
+ is zero nothing is done, effectively a no-op. */
+ sldi r5,r5,56
+
+ addi r10,r3,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11 /* Saturate the subtraction to zero. */
+
+ stxvl v0+32,r3,r5
+ stxvl v0+32,r10,r11
+
+ addi r9,r3,32
+ addi r10,r3,48
+
+ sub. r11,r11,r8
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(large):
+ mr r6,r3 /* Don't modify r3 since we need to return it. */
+
+ /* Get dest 16B aligned. */
+ neg r0,r3
+ clrldi. r7,r0,(64-4)
+ beq L(aligned)
+ rldic r9,r0,56,4 /* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56". */
+
+ stxvl v0+32,r6,r9 /* Store up to 15B until aligned address. */
+
+ add r6,r6,r7
+ sub r5,r5,r7
+
+ /* Go to tail if there is less than 64B left after alignment. */
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(aligned):
+ /* Go to tail if there is less than 128B left after alignment. */
+ srdi. r0,r5,7
+ beq L(tail_128)
+
+ /* If c == 0 && n >= 256 use dcbz to zero out full cache blocks. */
+ cmpldi cr5,r5,255
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(dcbz)
+
+ mtctr r0
+
+ .balign 32
+L(loop):
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ stxv v0+32,64(r6)
+ stxv v0+32,80(r6)
+ stxv v0+32,96(r6)
+ stxv v0+32,112(r6)
+ addi r6,r6,128
+ bdnz L(loop)
+
+ .balign 16
+L(tail):
+ /* 127B or less left, finish the tail or return. */
+ andi. r5,r5,127
+ beqlr
+
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(tail_128):
+ /* Stores a minimum of 64B and up to 128B and return. */
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+ andi. r5,r5,63
+ beqlr
+
+ .balign 16
+L(tail_64):
+ /* Stores up to 64B and return. */
+ sldi r5,r5,56
+
+ addi r10,r6,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11
+
+ stxvl v0+32,r6,r5
+ stxvl v0+32,r10,r11
+
+ sub. r11,r11,r8
+ blelr
+
+ addi r9,r6,32
+ addi r10,r6,48
+
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(dcbz):
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
+ Before using dcbz though, we need to get the destination 128-byte
+ aligned. */
+ neg r0,r6
+ clrldi. r0,r0,(64-7)
+ beq L(dcbz_aligned)
+
+ sub r5,r5,r0
+ mtocrf 0x2,r0 /* copying bits 57..59 to cr6. The ones for sizes 64,
+ 32 and 16 which need to be checked. */
+
+ /* Write 16-128 bytes until DST is aligned to 128 bytes. */
+64: bf 25,32f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+
+32: bf 26,16f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ addi r6,r6,32
+
+16: bf 27,L(dcbz_aligned)
+ stxv v0+32,0(r6)
+ addi r6,r6,16
+
+ .balign 16
+L(dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi. r0,r5,9
+ li r9,128
+ beq L(bcdz_tail)
+ li r10,256
+ li r11,384
+ mtctr r0
+
+ .balign 16
+L(dcbz_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r6
+ dcbz r9,r6
+ dcbz r10,r6
+ dcbz r11,r6
+ addi r6,r6,512
+ bdnz L(dcbz_loop)
+
+ andi. r5,r5,511
+ beqlr
+
+ .balign 16
+L(bcdz_tail):
+ /* We have 1-511 bytes remaining. */
+ srdi. r0,r5,7
+ beq L(tail)
+
+ mtocrf 0x1,r0
+
+256: bf 30,128f
+ dcbz 0,r6
+ dcbz r9,r6
+ addi r6,r6,256
+
+128: bf 31,L(tail)
+ dcbz 0,r6
+ addi r6,r6,128
+
+ b L(tail)
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY_TOCLESS (__bzero)
+ CALL_MCOUNT 2
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 2e3c8f2e8a81cda4..1d517698429e1230 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memcpy-power10 memmove-power10 \
+sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
strlen-power10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index f8cb05bea8a3505b..4ce98e324d12a31e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -27,8 +27,16 @@ extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
extern __typeof (bzero) __bzero_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (bzero) __bzero_power10 attribute_hidden;
+# endif
libc_ifunc (__bzero,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __bzero_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 9d5a14e480c02171..11532f77d4d03b2a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -86,6 +86,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memset,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __memset_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -187,6 +194,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, bzero,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __bzero_power10)
+#endif
IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
new file mode 100644
index 0000000000000000..548e99789735296c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
@@ -0,0 +1,27 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMSET __memset_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 1a7c46fecf78ab1f..4c97622c7d7eb8aa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -33,10 +33,18 @@ extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
+# endif
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memset_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)

View File

@ -0,0 +1,131 @@
commit 17a73a6d8b4c46f3e87fc53c7c25fa7cec01d707
Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
Date: Mon May 3 16:59:35 2021 -0300
powerpc64le: Fix ifunc selection for memset, memmove, bzero and bcopy
The hwcap2 check for the aforementioned functions should check for
both PPC_FEATURE2_ARCH_3_1 and PPC_FEATURE2_HAS_ISEL but was
mistakenly checking for either one of them, enabling the ISA 3.1
versions of the functions on incompatible processors, like POWER8.
Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 705fef33d4e57557..3c6528e5dbccfdbd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -28,10 +28,10 @@ extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
libc_ifunc (bcopy,
#ifdef __LITTLE_ENDIAN__
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX)
- ? __bcopy_power10 :
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bcopy_power10 :
#endif
(hwcap & PPC_FEATURE_HAS_VSX)
? __bcopy_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index 4ce98e324d12a31e..b08b381b4a3999f1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -33,7 +33,8 @@ extern __typeof (bzero) __bzero_power10 attribute_hidden;
libc_ifunc (__bzero,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX)
? __bzero_power10 :
# endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 11532f77d4d03b2a..6e36659d1903448a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -75,9 +75,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memmove,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, memmove,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX),
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX,
__memmove_power10)
#endif
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
@@ -88,8 +88,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memset,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, memset,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX,
__memset_power10)
#endif
@@ -196,8 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, bzero,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, bzero,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power10)
#endif
@@ -215,9 +215,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, bcopy,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, bcopy,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX),
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX,
__bcopy_power10)
#endif
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index 2fd7b6d309e4bedd..27895faad0cab40e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -36,10 +36,10 @@ extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
libc_ifunc (__libc_memmove,
#ifdef __LITTLE_ENDIAN__
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memmove_power10 :
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memmove_power10 :
#endif
(hwcap & PPC_FEATURE_HAS_VSX)
? __memmove_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 4c97622c7d7eb8aa..685623ae870a0725 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -41,7 +41,8 @@ extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
ifunc symbol properly. */
libc_ifunc (__libc_memset,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX)
? __memset_power10 :
# endif

View File

@ -0,0 +1,387 @@
commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
Author: Matheus Castanho <msc@linux.ibm.com>
Date: Tue May 11 17:53:07 2021 -0300
powerpc: Add optimized rawmemchr for POWER10
Reuse code for optimized strlen to implement a faster version of rawmemchr.
This takes advantage of the same benefits provided by the strlen implementation,
but needs some extra steps. __strlen_power10 code should be unchanged after this
change.
rawmemchr returns a pointer to the char found, while strlen returns only the
length, so we have to take that into account when preparing the return value.
To quickly check 64B, the loop on __strlen_power10 merges the whole block into
16B by using unsigned minimum vector operations (vminub) and checks if there are
any \0 on the resulting vector. The same code is used by rawmemchr if the char c
is 0. However, this approach does not work when c != 0. We first need to
subtract c from each byte, so that the value we are looking for is converted
to 0; then taking the minimum and checking for nulls works again.
The new code branches after it has compared ~256 bytes and chooses which of the
two strategies above will be used in the main loop, based on the char c. This
extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
by the faster loop for larger sizes.
Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
Because of the optimized main loop, the improvement becomes ~35% for c != 0
and ~50% for c = 0 for strings longer than 256.
Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
new file mode 100644
index 0000000000000000..5351c2634f6086bf
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
@@ -0,0 +1,22 @@
+/* Optimized rawmemchr implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define USE_AS_RAWMEMCHR 1
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -18,10 +18,50 @@
#include <sysdep.h>
-#ifndef STRLEN
-# define STRLEN __strlen
-# define DEFINE_STRLEN_HIDDEN_DEF 1
-#endif
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
+ strlen implementation:
+ - Sum the initial value of r3 with the position at which the char was
+ found, to guarantee we return a pointer and not the length.
+ - In the main loop, subtract each byte by the char we are looking for,
+ so we can keep using vminub to quickly check 64B at once. */
+#ifdef USE_AS_RAWMEMCHR
+# ifndef RAWMEMCHR
+# define FUNCNAME __rawmemchr
+# else
+# define FUNCNAME RAWMEMCHR
+# endif
+# define MCOUNT_NARGS 2
+# define VREG_ZERO v20
+# define OFF_START_LOOP 256
+# define RAWMEMCHR_SUBTRACT_VECTORS \
+ vsububm v4,v4,v18; \
+ vsububm v5,v5,v18; \
+ vsububm v6,v6,v18; \
+ vsububm v7,v7,v18;
+# define TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ addi r4,r4,increment; \
+ add r3,r5,r4; \
+ blr
+
+#else /* strlen */
+
+# ifndef STRLEN
+# define FUNCNAME __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+# else
+# define FUNCNAME STRLEN
+# endif
+# define MCOUNT_NARGS 1
+# define VREG_ZERO v18
+# define OFF_START_LOOP 192
+# define TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ subf r3,r3,r5; \
+ addi r4,r4,increment; \
+ add r3,r3,r4; \
+ blr
+#endif /* USE_AS_RAWMEMCHR */
/* TODO: Replace macros by the actual instructions when minimum binutils becomes
>= 2.35. This is used to keep compatibility with older versions. */
@@ -50,33 +90,41 @@
li r6,offset; \
LXVP(v4+32,offset,addr); \
LXVP(v6+32,offset+32,addr); \
+ RAWMEMCHR_SUBTRACT_VECTORS; \
vminub v14,v4,v5; \
vminub v15,v6,v7; \
vminub v16,v14,v15; \
- vcmpequb. v0,v16,v18; \
+ vcmpequb. v0,v16,VREG_ZERO; \
bne cr6,L(label)
-#define TAIL(vreg,increment) \
- vctzlsbb r4,vreg; \
- subf r3,r3,r5; \
- addi r4,r4,increment; \
- add r3,r3,r4; \
- blr
-
/* Implements the function
int [r3] strlen (const void *s [r3])
+ but when USE_AS_RAWMEMCHR is set, implements the function
+
+ void* [r3] rawmemchr (const void *s [r3], int c [r4])
+
The implementation can load bytes past a matching byte, but only
up to the next 64B boundary, so it never crosses a page. */
.machine power9
-ENTRY_TOCLESS (STRLEN, 4)
- CALL_MCOUNT 1
+ENTRY_TOCLESS (FUNCNAME, 4)
+ CALL_MCOUNT MCOUNT_NARGS
- vspltisb v18,0
+#ifdef USE_AS_RAWMEMCHR
+ xori r5,r4,0xff
+
+ mtvsrd v18+32,r4 /* matching char in v18 */
+ mtvsrd v19+32,r5 /* non matching char in v19 */
+
+ vspltb v18,v18,7 /* replicate */
+ vspltb v19,v19,7 /* replicate */
+#else
vspltisb v19,-1
+#endif
+ vspltisb VREG_ZERO,0
/* Next 16B-aligned address. Prepare address for L(aligned). */
addi r5,r3,16
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
vcmpequb. v6,v0,v18
beq cr6,L(aligned)
+#ifdef USE_AS_RAWMEMCHR
+ vctzlsbb r6,v6
+ add r3,r3,r6
+#else
vctzlsbb r3,v6
+#endif
blr
- /* Test next 176B, 16B at a time. The main loop is optimized for longer
- strings, so checking the first bytes in 16B chunks benefits a lot
- small strings. */
+ /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
+ optimized for longer strings, so checking the first bytes in 16B
+ chunks benefits a lot small strings. */
.p2align 5
L(aligned):
+#ifdef USE_AS_RAWMEMCHR
+ cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
+ choose how we will perform the main loop. */
+#endif
/* Prepare address for the loop. */
- addi r4,r3,192
+ addi r4,r3,OFF_START_LOOP
clrrdi r4,r4,6
CHECK16(v0,0,r5,tail1)
@@ -113,15 +170,43 @@ L(aligned):
CHECK16(v8,128,r5,tail9)
CHECK16(v9,144,r5,tail10)
CHECK16(v10,160,r5,tail11)
+#ifdef USE_AS_RAWMEMCHR
+ CHECK16(v0,176,r5,tail12)
+ CHECK16(v1,192,r5,tail13)
+ CHECK16(v2,208,r5,tail14)
+ CHECK16(v3,224,r5,tail15)
+#endif
addi r5,r4,128
+#ifdef USE_AS_RAWMEMCHR
+ /* If c == 0, use the same loop as strlen, without the vsububm. */
+ beq cr5,L(loop)
+
+ /* This is very similar to the block after L(loop). The difference is
+ that here RAWMEMCHR_SUBTRACT_VECTORS is not empty: the char we are
+ looking for is subtracted from each byte loaded, so that we can keep
+ using vminub to merge the results and check for nulls. */
+ .p2align 5
+L(rawmemchr_loop):
+ CHECK64(0,r4,pre_tail_64b)
+ CHECK64(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64(0,r5,tail_64b)
+ CHECK64(64,r5,tail_64b)
+ addi r5,r5,256
+
+ b L(rawmemchr_loop)
+#endif
/* Switch to a more aggressive approach checking 64B each time. Use 2
pointers 128B apart and unroll the loop once to make the pointer
updates and usages separated enough to avoid stalls waiting for
address calculation. */
.p2align 5
L(loop):
+#undef RAWMEMCHR_SUBTRACT_VECTORS
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
CHECK64(0,r4,pre_tail_64b)
CHECK64(64,r4,pre_tail_64b)
addi r4,r4,256
@@ -140,10 +225,10 @@ L(tail_64b):
block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
low 16B bytes into vx+1, and the high into vx, so the order here is
v5, v4, v7, v6. */
- vcmpequb v1,v5,v18
- vcmpequb v2,v4,v18
- vcmpequb v3,v7,v18
- vcmpequb v4,v6,v18
+ vcmpequb v1,v5,VREG_ZERO
+ vcmpequb v2,v4,VREG_ZERO
+ vcmpequb v3,v7,VREG_ZERO
+ vcmpequb v4,v6,VREG_ZERO
/* Take into account the other 64B blocks we had already checked. */
add r5,r5,r6
@@ -165,7 +250,9 @@ L(tail_64b):
or r10,r8,r7
cnttzd r0,r10 /* Count trailing zeros before the match. */
+#ifndef USE_AS_RAWMEMCHR
subf r5,r3,r5
+#endif
add r3,r5,r0 /* Compute final length. */
blr
@@ -213,9 +300,32 @@ L(tail10):
L(tail11):
TAIL(v10,160)
-END (STRLEN)
+#ifdef USE_AS_RAWMEMCHR
+ .p2align 5
+L(tail12):
+ TAIL(v0,176)
+
+ .p2align 5
+L(tail13):
+ TAIL(v1,192)
+
+ .p2align 5
+L(tail14):
+ TAIL(v2,208)
+
+ .p2align 5
+L(tail15):
+ TAIL(v3,224)
+#endif
+
+END (FUNCNAME)
-#ifdef DEFINE_STRLEN_HIDDEN_DEF
+#ifdef USE_AS_RAWMEMCHR
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
+#else
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
+# endif
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 1d517698429e1230..ac2446aca62cc4ab 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
+ rawmemchr-power9 rawmemchr-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
- strlen-power10
+ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6e36659d1903448a..127af84b32a8196f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __rawmemchr_power10)
IFUNC_IMPL_ADD (array, i, rawmemchr,
hwcap2 & PPC_FEATURE2_ARCH_3_00,
__rawmemchr_power9)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
new file mode 100644
index 0000000000000000..bf1ed7e1941f922d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
@@ -0,0 +1,21 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define RAWMEMCHR __rawmemchr_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index 2a7ae5a1ed02e556..369d6359e8987052 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
# ifdef __LITTLE_ENDIAN__
extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
# endif
# undef __rawmemchr
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
ifunc symbol properly. */
libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ && (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __rawmemchr_power10 :
(hwcap2 & PPC_FEATURE2_ARCH_3_00)
? __rawmemchr_power9 :
# endif

View File

@ -1,6 +1,6 @@
%define glibcsrcdir glibc-2.28 %define glibcsrcdir glibc-2.28
%define glibcversion 2.28 %define glibcversion 2.28
%define glibcrelease 158%{?dist} %define glibcrelease 160%{?dist}
# Pre-release tarballs are pulled in from git using a command that is # Pre-release tarballs are pulled in from git using a command that is
# effectively: # effectively:
# #
@ -706,6 +706,14 @@ Patch569: glibc-rh1934155-3.patch
Patch570: glibc-rh1934155-4.patch Patch570: glibc-rh1934155-4.patch
Patch571: glibc-rh1934155-5.patch Patch571: glibc-rh1934155-5.patch
Patch572: glibc-rh1934155-6.patch Patch572: glibc-rh1934155-6.patch
Patch573: glibc-rh1956357-1.patch
Patch574: glibc-rh1956357-2.patch
Patch575: glibc-rh1956357-3.patch
Patch576: glibc-rh1956357-4.patch
Patch577: glibc-rh1956357-5.patch
Patch578: glibc-rh1956357-6.patch
Patch579: glibc-rh1956357-7.patch
Patch580: glibc-rh1956357-8.patch
############################################################################## ##############################################################################
# Continued list of core "glibc" package information: # Continued list of core "glibc" package information:
@ -2617,6 +2625,12 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
%changelog %changelog
* Mon May 31 2021 Arjun Shankar <arjun@redhat.com> - 2.28-160
- Backport POWER10 optimized rawmemchr for ppc64le (#1956357)
* Thu May 27 2021 Arjun Shankar <arjun@redhat.com> - 2.28-159
- Backport additional ifunc optimizations for ppc64le (#1956357)
* Thu Apr 22 2021 Florian Weimer <fweimer@redhat.com> - 2.28-158 * Thu Apr 22 2021 Florian Weimer <fweimer@redhat.com> - 2.28-158
- Rebuild with new binutils (#1946518) - Rebuild with new binutils (#1946518)