forked from rpms/glibc
import glibc-2.28-160.el8
parent c04956366c, commit c11d47b279

SOURCES/glibc-rh1956357-1.patch (new file, 100 lines)
@@ -0,0 +1,100 @@
commit 56c81132ccc6f468fa4fc29c536db060e18e9d87
Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
Date:   Tue Feb 23 14:14:37 2021 -0300

    powerpc: Add optimized ilogb* for POWER9

    The instructions xsxexpdp and xsxexpqp introduced on POWER9 extract the
    exponent from a double-precision and a quad-precision floating-point value,
    respectively, so they can be used to improve ilogb, ilogbf and ilogbf128.
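A rough C sketch of the fast path this patch wires into the ilogb wrapper, not part of the patch itself: for a normal double, the biased exponent extracted by xsxexpdp minus the bias 0x3ff is exactly ilogb(x), and the data-class test (mask 0x7f covers NaN, infinity, zero and denormals) decides when to fall back to the generic routine. The standalone function name is hypothetical; the builtins and the mask are the ones the patch uses.

#include <math.h>

/* Hypothetical sketch, assuming a powerpc64le compiler with -mcpu=power9
   (so _ARCH_PWR9 and the VSX scalar builtins are available).  */
static int
ilogb_power9_sketch (double x)
{
#ifdef _ARCH_PWR9
  /* Nonzero for NaN, +/-Inf, +/-0 and denormals (mask 0x7f).  */
  if (!__builtin_vsx_scalar_test_data_class_dp (x, 0x7f))
    /* xsxexpdp returns the biased exponent; 0x3ff is the double bias.  */
    return __builtin_vsx_scalar_extract_exp (x) - 0x3ff;
#endif
  return ilogb (x);  /* Generic path for the exceptional classes.  */
}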
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
index e642d6c8237578ea..5bbc468829062a48 100644
--- a/sysdeps/powerpc/fpu/math_private.h
+++ b/sysdeps/powerpc/fpu/math_private.h
@@ -26,7 +26,28 @@

 #include_next <math_private.h>

-#if defined _ARCH_PWR9 && __HAVE_DISTINCT_FLOAT128
+#ifdef _ARCH_PWR9
+
+#if __GNUC_PREREQ (8, 0)
+# define _GL_HAS_BUILTIN_ILOGB 1
+#elif defined __has_builtin
+# define _GL_HAS_BUILTIN_ILOGB __has_builtin (__builtin_vsx_scalar_extract_exp)
+#else
+# define _GL_HAS_BUILTIN_ILOGB 0
+#endif
+
+#define __builtin_test_dc_ilogbf __builtin_test_dc_ilogb
+#define __builtin_ilogbf __builtin_ilogb
+
+#define __builtin_test_dc_ilogb(x, y) \
+        __builtin_vsx_scalar_test_data_class_dp(x, y)
+#define __builtin_ilogb(x) __builtin_vsx_scalar_extract_exp(x) - 0x3ff
+
+#define __builtin_test_dc_ilogbf128(x, y) \
+        __builtin_vsx_scalar_test_data_class_qp(x, y)
+#define __builtin_ilogbf128(x) __builtin_vsx_scalar_extract_expq(x) - 0x3fff
+
+#if __HAVE_DISTINCT_FLOAT128
 extern __always_inline _Float128
 __ieee754_sqrtf128 (_Float128 __x)
 {
@@ -35,6 +56,9 @@ __ieee754_sqrtf128 (_Float128 __x)
   return __z;
 }
 #endif
+#else /* !_ARCH_PWR9 */
+#define _GL_HAS_BUILTIN_ILOGB 0
+#endif

 #if defined _ARCH_PWR5X

diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
new file mode 100644
index 0000000000000000..b5c1c0aa9db86f3d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
@@ -0,0 +1,30 @@
+#include <math.h>
+#include <errno.h>
+#include <limits.h>
+#include <math_private.h>
+#include <fenv.h>
+
+#if _GL_HAS_BUILTIN_ILOGB
+int
+M_DECL_FUNC (__ilogb) (FLOAT x)
+{
+  int r;
+  /* Check for exceptional cases.  */
+  if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
+    r = M_SUF (__builtin_ilogb) (x);
+  else
+    /* Fallback to the generic ilogb if x is NaN, Inf or subnormal.  */
+    r = M_SUF (__ieee754_ilogb) (x);
+  if (__builtin_expect (r == FP_ILOGB0, 0)
+      || __builtin_expect (r == FP_ILOGBNAN, 0)
+      || __builtin_expect (r == INT_MAX, 0))
+    {
+      __set_errno (EDOM);
+      __feraiseexcept (FE_INVALID);
+    }
+  return r;
+}
+declare_mgen_alias (__ilogb, ilogb)
+#else
+#include <math/w_ilogb_template.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
new file mode 100644
index 0000000000000000..205f154f0089a269
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
@@ -0,0 +1,4 @@
+/* Skip the optimization for long double as ibm128 does not provide an
+   optimized builtin.  */
+#include <math-type-macros-ldouble.h>
+#include <math/w_ilogb_template.c>
SOURCES/glibc-rh1956357-2.patch (new file, 64 lines)
@@ -0,0 +1,64 @@
commit a7d88506c260e7a0e4268803e76fc19e38ed041f
Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
Date:   Thu Feb 25 09:58:52 2021 -0300

    powerpc: Add optimized llogb* for POWER9

    The POWER9 builtins used to improve the ilogb* functions can be
    used in the llogb* functions as well.
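The only extra work compared to ilogb is widening the result to long int. A minimal sketch of that mapping, assuming glibc's FP_LLOGB0/FP_LLOGBNAN constants, follows; the helper name is hypothetical, the template in the patch does this inline.

#include <limits.h>
#include <math.h>

/* Hypothetical helper: translate an ilogb-style int result into the
   llogb return value when long int is wider than int.  */
static long int
llogb_from_ilogb (int r)
{
#if LONG_MAX != INT_MAX
  if (r == FP_ILOGB0)
    return FP_LLOGB0;
  if (r == FP_ILOGBNAN)
    return FP_LLOGBNAN;
  if (r == INT_MAX)
    return LONG_MAX;
#endif
  return r;
}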
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
new file mode 100644
index 0000000000000000..d00b71d2a34e28da
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
@@ -0,0 +1,39 @@
+#include <math.h>
+#include <errno.h>
+#include <limits.h>
+#include <math_private.h>
+#include <fenv.h>
+
+#if _GL_HAS_BUILTIN_ILOGB
+long int
+M_DECL_FUNC (__llogb) (FLOAT x)
+{
+  int r;
+  /* Check for exceptional cases.  */
+  if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
+    r = M_SUF (__builtin_ilogb) (x);
+  else
+    /* Fallback to the generic ilogb if x is NaN, Inf or subnormal.  */
+    r = M_SUF (__ieee754_ilogb) (x);
+  long int lr = r;
+  if (__glibc_unlikely (r == FP_ILOGB0)
+      || __glibc_unlikely (r == FP_ILOGBNAN)
+      || __glibc_unlikely (r == INT_MAX))
+    {
+#if LONG_MAX != INT_MAX
+      if (r == FP_ILOGB0)
+        lr = FP_LLOGB0;
+      else if (r == FP_ILOGBNAN)
+        lr = FP_LLOGBNAN;
+      else
+        lr = LONG_MAX;
+#endif
+      __set_errno (EDOM);
+      __feraiseexcept (FE_INVALID);
+    }
+  return lr;
+}
+declare_mgen_alias (__llogb, llogb)
+#else
+#include <math/w_llogb_template.c>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
new file mode 100644
index 0000000000000000..69477a37ae82c476
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
@@ -0,0 +1,4 @@
+/* Skip the optimization for long double as ibm128 does not provide an
+   optimized builtin.  */
+#include <math-type-macros-ldouble.h>
+#include <math/w_llogb_template.c>
SOURCES/glibc-rh1956357-3.patch (new file, 334 lines)
@@ -0,0 +1,334 @@
commit 10624a97e8e47004985740cbb04060a84cfada76
Author: Matheus Castanho <msc@linux.ibm.com>
Date:   Tue Sep 29 15:40:08 2020 -0300

    powerpc: Add optimized strlen for POWER10

    Improvements compared to the POWER9 version:

    1. Take into account the first 16B comparison for aligned strings

       The previous version compares the first 16B and increments r4 by the number
       of bytes until the address is 16B-aligned, then starts doing aligned loads at
       that address.  For aligned strings, this causes the first 16B to be compared
       twice, because the increment is 0.  Here we calculate the next 16B-aligned
       address differently, which avoids that issue (see the C sketch after this
       message).

    2. Use simple comparisons for the first ~192 bytes

       The main loop is good for big strings, but comparing 16B at a time is better
       for smaller strings.  So after aligning the address to 16 bytes, we check
       another 176B in 16B chunks.  There may be some overlap with the main loop for
       unaligned strings, but we avoid using the more aggressive strategy too soon,
       and also allow the loop to start at a 64B-aligned address.  This greatly
       benefits smaller strings and avoids overlapping checks if the string is
       already aligned at a 64B boundary.

    3. Reduce dependencies between load blocks caused by address calculation in the loop

       Precise time tracing of the code showed that many loads in the loop were
       stalled waiting for updates to r4 from previous code blocks.  This
       implementation avoids that as much as possible by using 2 registers (r4 and
       r5) to hold addresses used by different parts of the code.

       Also, the previous code aligned the address to 16B, then to 64B by doing a
       few 48B loops (if needed) until the address was aligned.  The main loop could
       not start until that 48B loop had finished and r4 was updated with the
       current address.  Here we calculate the address used by the loop very early,
       so it can start sooner.

       The main loop now uses 2 pointers 128B apart to make pointer updates less
       frequent, and also unrolls 1 iteration to guarantee there is enough time
       between iterations to update the pointers, reducing stalled cycles.

    4. Use new P10 instructions

       lxvp is used to load 32B with a single instruction, reducing contention in
       the load queue.

       vextractbm allows simplifying the tail code for the loop, replacing
       vbpermq and avoiding having to generate a permute control vector.

    Reviewed-by: Paul E Murphy <murphyp@linux.ibm.com>
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
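A C sketch of the address computation behind item 1, not part of the patch (the assembly does it with addi r5,r3,16 followed by clrrdi r5,r5,4); the helper name is illustrative:

#include <stdint.h>

/* Hypothetical sketch: address of the first 16B block NOT covered by the
   initial (possibly unaligned) 16-byte check.  Rounding s+16 down to a
   multiple of 16 always moves past at least one byte, so an already
   aligned string is not compared twice.  */
static inline const char *
next_16b_block (const char *s)
{
  return (const char *) (((uintptr_t) s + 16) & ~(uintptr_t) 15);
}

The older approach, s plus (16 - (s mod 16)) mod 16, yields s itself when s is already 16B-aligned, which is what caused the double comparison.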
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
new file mode 100644
index 0000000000000000..ca7e9eb3d84c9b00
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -0,0 +1,221 @@
+/* Optimized strlen implementation for POWER10 LE.
+   Copyright (C) 2021 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+#ifndef STRLEN
+# define STRLEN __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+#endif
+
+/* TODO: Replace macros by the actual instructions when minimum binutils becomes
+   >= 2.35.  This is used to keep compatibility with older versions.  */
+#define VEXTRACTBM(rt,vrb)       \
+  .long(((4)<<(32-6))            \
+        | ((rt)<<(32-11))        \
+        | ((8)<<(32-16))         \
+        | ((vrb)<<(32-21))       \
+        | 1602)
+
+#define LXVP(xtp,dq,ra)                \
+  .long(((6)<<(32-6))                  \
+        | ((((xtp)-32)>>1)<<(32-10))   \
+        | ((1)<<(32-11))               \
+        | ((ra)<<(32-16))              \
+        | dq)
+
+#define CHECK16(vreg,offset,addr,label) \
+  lxv vreg+32,offset(addr);             \
+  vcmpequb. vreg,vreg,v18;              \
+  bne cr6,L(label);
+
+/* Load 4 quadwords, merge into one VR for speed and check for NULLs.  r6 has #
+   of bytes already checked.  */
+#define CHECK64(offset,addr,label) \
+  li r6,offset;                    \
+  LXVP(v4+32,offset,addr);         \
+  LXVP(v6+32,offset+32,addr);      \
+  vminub v14,v4,v5;                \
+  vminub v15,v6,v7;                \
+  vminub v16,v14,v15;              \
+  vcmpequb. v0,v16,v18;            \
+  bne cr6,L(label)
+
+#define TAIL(vreg,increment) \
+  vctzlsbb r4,vreg;          \
+  subf r3,r3,r5;             \
+  addi r4,r4,increment;      \
+  add r3,r3,r4;              \
+  blr
+
+/* Implements the function
+
+   int [r3] strlen (const void *s [r3])
+
+   The implementation can load bytes past a matching byte, but only
+   up to the next 64B boundary, so it never crosses a page.  */
+
+.machine power9
+
+ENTRY_TOCLESS (STRLEN, 4)
+  CALL_MCOUNT 1
+
+  vspltisb v18,0
+  vspltisb v19,-1
+
+  /* Next 16B-aligned address.  Prepare address for L(aligned).  */
+  addi r5,r3,16
+  clrrdi r5,r5,4
+
+  /* Align data and fill bytes not loaded with non matching char.  */
+  lvx v0,0,r3
+  lvsr v1,0,r3
+  vperm v0,v19,v0,v1
+
+  vcmpequb. v6,v0,v18
+  beq cr6,L(aligned)
+
+  vctzlsbb r3,v6
+  blr
+
+  /* Test next 176B, 16B at a time.  The main loop is optimized for longer
+     strings, so checking the first bytes in 16B chunks benefits a lot
+     small strings.  */
+  .p2align 5
+L(aligned):
+  /* Prepare address for the loop.  */
+  addi r4,r3,192
+  clrrdi r4,r4,6
+
+  CHECK16(v0,0,r5,tail1)
+  CHECK16(v1,16,r5,tail2)
+  CHECK16(v2,32,r5,tail3)
+  CHECK16(v3,48,r5,tail4)
+  CHECK16(v4,64,r5,tail5)
+  CHECK16(v5,80,r5,tail6)
+  CHECK16(v6,96,r5,tail7)
+  CHECK16(v7,112,r5,tail8)
+  CHECK16(v8,128,r5,tail9)
+  CHECK16(v9,144,r5,tail10)
+  CHECK16(v10,160,r5,tail11)
+
+  addi r5,r4,128
+
+  /* Switch to a more aggressive approach checking 64B each time.  Use 2
+     pointers 128B apart and unroll the loop once to make the pointer
+     updates and usages separated enough to avoid stalls waiting for
+     address calculation.  */
+  .p2align 5
+L(loop):
+  CHECK64(0,r4,pre_tail_64b)
+  CHECK64(64,r4,pre_tail_64b)
+  addi r4,r4,256
+
+  CHECK64(0,r5,tail_64b)
+  CHECK64(64,r5,tail_64b)
+  addi r5,r5,256
+
+  b L(loop)
+
+  .p2align 5
+L(pre_tail_64b):
+  mr r5,r4
+L(tail_64b):
+  /* OK, we found a null byte.  Let's look for it in the current 64-byte
+     block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
+     low 16B bytes into vx+1, and the high into vx, so the order here is
+     v5, v4, v7, v6.  */
+  vcmpequb v1,v5,v18
+  vcmpequb v2,v4,v18
+  vcmpequb v3,v7,v18
+  vcmpequb v4,v6,v18
+
+  /* Take into account the other 64B blocks we had already checked.  */
+  add r5,r5,r6
+
+  /* Extract first bit of each byte.  */
+  VEXTRACTBM(r7,v1)
+  VEXTRACTBM(r8,v2)
+  VEXTRACTBM(r9,v3)
+  VEXTRACTBM(r10,v4)
+
+  /* Shift each value into their corresponding position.  */
+  sldi r8,r8,16
+  sldi r9,r9,32
+  sldi r10,r10,48
+
+  /* Merge the results.  */
+  or r7,r7,r8
+  or r8,r9,r10
+  or r10,r8,r7
+
+  cnttzd r0,r10   /* Count trailing zeros before the match.  */
+  subf r5,r3,r5
+  add r3,r5,r0    /* Compute final length.  */
+  blr
+
+  .p2align 5
+L(tail1):
+  TAIL(v0,0)
+
+  .p2align 5
+L(tail2):
+  TAIL(v1,16)
+
+  .p2align 5
+L(tail3):
+  TAIL(v2,32)
+
+  .p2align 5
+L(tail4):
+  TAIL(v3,48)
+
+  .p2align 5
+L(tail5):
+  TAIL(v4,64)
+
+  .p2align 5
+L(tail6):
+  TAIL(v5,80)
+
+  .p2align 5
+L(tail7):
+  TAIL(v6,96)
+
+  .p2align 5
+L(tail8):
+  TAIL(v7,112)
+
+  .p2align 5
+L(tail9):
+  TAIL(v8,128)
+
+  .p2align 5
+L(tail10):
+  TAIL(v9,144)
+
+  .p2align 5
+L(tail11):
+  TAIL(v10,160)
+
+END (STRLEN)
+
+#ifdef DEFINE_STRLEN_HIDDEN_DEF
+weak_alias (__strlen, strlen)
+libc_hidden_builtin_def (strlen)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index a9e13e05e90601cd..61652b65dd223018 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \

 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
-                   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
+                   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
+                   strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index b30bc53930fc0e36..46d5956adda72b86 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 #ifdef __LITTLE_ENDIAN__
+              IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
+                              __strlen_power10)
               IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
                               __strlen_power9)
 #endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
new file mode 100644
index 0000000000000000..6a774fad58c77179
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
@@ -0,0 +1,2 @@
+#define STRLEN __strlen_power10
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
index b7f0fbb13fb97783..11bdb96de2d2aa66 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -31,9 +31,12 @@ extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
+extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;

 libc_ifunc (__libc_strlen,
 # ifdef __LITTLE_ENDIAN__
+            (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+            ? __strlen_power10 :
             (hwcap2 & PPC_FEATURE2_ARCH_3_00)
             ? __strlen_power9 :
 # endif
SOURCES/glibc-rh1956357-4.patch (new file, 527 lines)
@@ -0,0 +1,527 @@
commit dd59655e9371af86043b97e38953f43bd9496699
Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Date:   Fri Apr 30 18:12:08 2021 -0300

    powerpc64le: Optimized memmove for POWER10

    This patch was initially based on __memmove_power7, with some ideas
    from the strncpy implementation for POWER9.

    Improvements over __memmove_power7:

    1. Use lxvl/stxvl for the alignment code.

       The POWER7 code uses branches when the input is not naturally
       aligned to the width of a vector.  The new implementation uses
       lxvl/stxvl instead, which reduces pressure on GPRs.  It also allows
       the removal of branch instructions, implicitly removing branch stalls
       and mispredictions.  (A C sketch of the idea follows this message.)

    2. The lxv/stxv and lxvl/stxvl pairs are safe to use on cache-inhibited
       memory.

       On POWER10, vector loads and stores are safe to use on CI memory for
       addresses unaligned to 16B, and this code takes advantage of that to
       do unaligned loads.

       The unaligned loads don't have a significant performance impact by
       themselves.  However, they decrease register pressure on GPRs and
       interdependence stalls on load/store pairs.  This also improves
       readability, as there are now fewer code paths for different
       alignments, and it reduces the overall code size.

    3. Improved performance.

       This version runs on average about 30% faster than __memmove_power7
       for lengths larger than 8KB.  For input lengths shorter than 8KB
       the improvement is smaller, on average about 17%.

       This version shows a degradation of about 50% for input lengths
       in the 0 to 31 byte range when dest is unaligned.

    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
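A hedged C sketch of the load/store-with-length idea from item 1, using the vec_xl_len/vec_xst_len intrinsics that map to lxvl/stxvl on POWER9 and later; the helper name is illustrative and not part of the patch.

#include <altivec.h>
#include <stddef.h>

/* Hypothetical sketch: copy 0..16 leading bytes with a single
   load/store-with-length pair, with no branch on the misalignment.
   A zero length simply transfers nothing.  */
static inline void
copy_head_with_length (unsigned char *dst, const unsigned char *src, size_t n)
{
  vector unsigned char v = vec_xl_len ((unsigned char *) src, n);
  vec_xst_len (v, dst, n);
}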
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memmove.S b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..7dfd57edeb37e8e4
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
|
||||||
|
@@ -0,0 +1,320 @@
|
||||||
|
+/* Optimized memmove implementation for POWER10.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
|
||||||
|
+
|
||||||
|
+ This optimization checks if 'src' and 'dst' overlap. If they do not
|
||||||
|
+ or 'src' is ahead of 'dest' then it copies forward.
|
||||||
|
+ Otherwise, an optimized backward copy is used. */
|
||||||
|
+
|
||||||
|
+#ifndef MEMMOVE
|
||||||
|
+# define MEMMOVE memmove
|
||||||
|
+#endif
|
||||||
|
+ .machine power9
|
||||||
|
+ENTRY_TOCLESS (MEMMOVE, 5)
|
||||||
|
+ CALL_MCOUNT 3
|
||||||
|
+
|
||||||
|
+L(_memmove):
|
||||||
|
+ .p2align 5
|
||||||
|
+ /* Check if there is overlap, if so it will branch to backward copy. */
|
||||||
|
+ subf r9,r4,r3
|
||||||
|
+ cmpld cr7,r9,r5
|
||||||
|
+ blt cr7,L(memmove_bwd)
|
||||||
|
+
|
||||||
|
+ /* Fast path for length shorter than 16 bytes. */
|
||||||
|
+ sldi r7,r5,56
|
||||||
|
+ lxvl 32+v2,r4,r7
|
||||||
|
+ stxvl 32+v2,r3,r7
|
||||||
|
+ subic. r8,r5,16
|
||||||
|
+ blelr
|
||||||
|
+
|
||||||
|
+ /* For shorter lengths aligning the dest address to 16 bytes either
|
||||||
|
+ decreases performance or is irrelevant. I'm making use of this
|
||||||
|
+ comparison to skip the alignment in. */
|
||||||
|
+ cmpldi cr6,r5,256
|
||||||
|
+ bge cr6,L(ge_256)
|
||||||
|
+ /* Account for the first 16-byte copy. */
|
||||||
|
+ addi r4,r4,16
|
||||||
|
+ addi r11,r3,16 /* use r11 to keep dest address on r3. */
|
||||||
|
+ subi r5,r5,16
|
||||||
|
+ b L(loop_head)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(ge_256):
|
||||||
|
+ /* Account for the first copy <= 16 bytes. This is necessary for
|
||||||
|
+ memmove because at this point the src address can be in front of the
|
||||||
|
+ dest address. */
|
||||||
|
+ clrldi r9,r5,56
|
||||||
|
+ li r8,16
|
||||||
|
+ cmpldi r9,16
|
||||||
|
+ iselgt r9,r8,r9
|
||||||
|
+ add r4,r4,r9
|
||||||
|
+ add r11,r3,r9 /* use r11 to keep dest address on r3. */
|
||||||
|
+ sub r5,r5,r9
|
||||||
|
+
|
||||||
|
+ /* Align dest to 16 bytes. */
|
||||||
|
+ neg r7,r3
|
||||||
|
+ clrldi. r9,r7,60
|
||||||
|
+ beq L(loop_head)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+ sldi r6,r9,56
|
||||||
|
+ lxvl 32+v0,r4,r6
|
||||||
|
+ stxvl 32+v0,r11,r6
|
||||||
|
+ sub r5,r5,r9
|
||||||
|
+ add r4,r4,r9
|
||||||
|
+ add r11,r11,r9
|
||||||
|
+
|
||||||
|
+L(loop_head):
|
||||||
|
+ cmpldi r5,63
|
||||||
|
+ ble L(final_64)
|
||||||
|
+
|
||||||
|
+ srdi. r7,r5,7
|
||||||
|
+ beq L(loop_tail)
|
||||||
|
+
|
||||||
|
+ mtctr r7
|
||||||
|
+
|
||||||
|
+/* Main loop that copies 128 bytes each iteration. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(loop):
|
||||||
|
+ addi r9,r4,64
|
||||||
|
+ addi r10,r11,64
|
||||||
|
+
|
||||||
|
+ lxv 32+v0,0(r4)
|
||||||
|
+ lxv 32+v1,16(r4)
|
||||||
|
+ lxv 32+v2,32(r4)
|
||||||
|
+ lxv 32+v3,48(r4)
|
||||||
|
+
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+ stxv 32+v3,48(r11)
|
||||||
|
+
|
||||||
|
+ addi r4,r4,128
|
||||||
|
+ addi r11,r11,128
|
||||||
|
+
|
||||||
|
+ lxv 32+v4,0(r9)
|
||||||
|
+ lxv 32+v5,16(r9)
|
||||||
|
+ lxv 32+v6,32(r9)
|
||||||
|
+ lxv 32+v7,48(r9)
|
||||||
|
+
|
||||||
|
+ stxv 32+v4,0(r10)
|
||||||
|
+ stxv 32+v5,16(r10)
|
||||||
|
+ stxv 32+v6,32(r10)
|
||||||
|
+ stxv 32+v7,48(r10)
|
||||||
|
+
|
||||||
|
+ bdnz L(loop)
|
||||||
|
+ clrldi. r5,r5,57
|
||||||
|
+ beqlr
|
||||||
|
+
|
||||||
|
+/* Copy 64 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(loop_tail):
|
||||||
|
+ cmpldi cr5,r5,63
|
||||||
|
+ ble cr5,L(final_64)
|
||||||
|
+
|
||||||
|
+ lxv 32+v0,0(r4)
|
||||||
|
+ lxv 32+v1,16(r4)
|
||||||
|
+ lxv 32+v2,32(r4)
|
||||||
|
+ lxv 32+v3,48(r4)
|
||||||
|
+
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+ stxv 32+v3,48(r11)
|
||||||
|
+
|
||||||
|
+ addi r4,r4,64
|
||||||
|
+ addi r11,r11,64
|
||||||
|
+ subi r5,r5,64
|
||||||
|
+
|
||||||
|
+/* Copies the last 1-63 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(final_64):
|
||||||
|
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
|
||||||
|
+ clrrdi. r8,r5,4
|
||||||
|
+ beq L(tail1)
|
||||||
|
+
|
||||||
|
+ cmpldi cr5,r5,32
|
||||||
|
+ lxv 32+v0,0(r4)
|
||||||
|
+ blt cr5,L(tail2)
|
||||||
|
+
|
||||||
|
+ cmpldi cr6,r5,48
|
||||||
|
+ lxv 32+v1,16(r4)
|
||||||
|
+ blt cr6,L(tail3)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+ lxv 32+v2,32(r4)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+L(tail3):
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+L(tail2):
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ sub r5,r5,r8
|
||||||
|
+ add r4,r4,r8
|
||||||
|
+ add r11,r11,r8
|
||||||
|
+ .p2align 5
|
||||||
|
+L(tail1):
|
||||||
|
+ sldi r6,r5,56
|
||||||
|
+ lxvl v4,r4,r6
|
||||||
|
+ stxvl v4,r11,r6
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+/* If dest and src overlap, we should copy backwards. */
|
||||||
|
+L(memmove_bwd):
|
||||||
|
+ add r11,r3,r5
|
||||||
|
+ add r4,r4,r5
|
||||||
|
+
|
||||||
|
+ /* Optimization for length smaller than 16 bytes. */
|
||||||
|
+ cmpldi cr5,r5,15
|
||||||
|
+ ble cr5,L(tail1_bwd)
|
||||||
|
+
|
||||||
|
+ /* For shorter lengths the alignment either slows down or is irrelevant.
|
||||||
|
+ The forward copy uses a already need 256 comparison for that. Here
|
||||||
|
+ it's using 128 as it will reduce code and improve readability. */
|
||||||
|
+ cmpldi cr7,r5,128
|
||||||
|
+ blt cr7,L(bwd_loop_tail)
|
||||||
|
+
|
||||||
|
+ /* Align dest address to 16 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+ clrldi. r9,r11,60
|
||||||
|
+ beq L(bwd_loop_head)
|
||||||
|
+ sub r4,r4,r9
|
||||||
|
+ sub r11,r11,r9
|
||||||
|
+ lxv 32+v0,0(r4)
|
||||||
|
+ sldi r6,r9,56
|
||||||
|
+ stxvl 32+v0,r11,r6
|
||||||
|
+ sub r5,r5,r9
|
||||||
|
+
|
||||||
|
+L(bwd_loop_head):
|
||||||
|
+ srdi. r7,r5,7
|
||||||
|
+ beq L(bwd_loop_tail)
|
||||||
|
+
|
||||||
|
+ mtctr r7
|
||||||
|
+
|
||||||
|
+/* Main loop that copies 128 bytes every iteration. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(bwd_loop):
|
||||||
|
+ addi r9,r4,-64
|
||||||
|
+ addi r10,r11,-64
|
||||||
|
+
|
||||||
|
+ lxv 32+v0,-16(r4)
|
||||||
|
+ lxv 32+v1,-32(r4)
|
||||||
|
+ lxv 32+v2,-48(r4)
|
||||||
|
+ lxv 32+v3,-64(r4)
|
||||||
|
+
|
||||||
|
+ stxv 32+v0,-16(r11)
|
||||||
|
+ stxv 32+v1,-32(r11)
|
||||||
|
+ stxv 32+v2,-48(r11)
|
||||||
|
+ stxv 32+v3,-64(r11)
|
||||||
|
+
|
||||||
|
+ addi r4,r4,-128
|
||||||
|
+ addi r11,r11,-128
|
||||||
|
+
|
||||||
|
+ lxv 32+v0,-16(r9)
|
||||||
|
+ lxv 32+v1,-32(r9)
|
||||||
|
+ lxv 32+v2,-48(r9)
|
||||||
|
+ lxv 32+v3,-64(r9)
|
||||||
|
+
|
||||||
|
+ stxv 32+v0,-16(r10)
|
||||||
|
+ stxv 32+v1,-32(r10)
|
||||||
|
+ stxv 32+v2,-48(r10)
|
||||||
|
+ stxv 32+v3,-64(r10)
|
||||||
|
+
|
||||||
|
+ bdnz L(bwd_loop)
|
||||||
|
+ clrldi. r5,r5,57
|
||||||
|
+ beqlr
|
||||||
|
+
|
||||||
|
+/* Copy 64 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(bwd_loop_tail):
|
||||||
|
+ cmpldi cr5,r5,63
|
||||||
|
+ ble cr5,L(bwd_final_64)
|
||||||
|
+
|
||||||
|
+ addi r4,r4,-64
|
||||||
|
+ addi r11,r11,-64
|
||||||
|
+
|
||||||
|
+ lxv 32+v0,0(r4)
|
||||||
|
+ lxv 32+v1,16(r4)
|
||||||
|
+ lxv 32+v2,32(r4)
|
||||||
|
+ lxv 32+v3,48(r4)
|
||||||
|
+
|
||||||
|
+ stxv 32+v0,0(r11)
|
||||||
|
+ stxv 32+v1,16(r11)
|
||||||
|
+ stxv 32+v2,32(r11)
|
||||||
|
+ stxv 32+v3,48(r11)
|
||||||
|
+
|
||||||
|
+ subi r5,r5,64
|
||||||
|
+
|
||||||
|
+/* Copies the last 1-63 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(bwd_final_64):
|
||||||
|
+ /* r8 holds the number of bytes that will be copied with lxv/stxv. */
|
||||||
|
+ clrrdi. r8,r5,4
|
||||||
|
+ beq L(tail1_bwd)
|
||||||
|
+
|
||||||
|
+ cmpldi cr5,r5,32
|
||||||
|
+ lxv 32+v2,-16(r4)
|
||||||
|
+ blt cr5,L(tail2_bwd)
|
||||||
|
+
|
||||||
|
+ cmpldi cr6,r5,48
|
||||||
|
+ lxv 32+v1,-32(r4)
|
||||||
|
+ blt cr6,L(tail3_bwd)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+ lxv 32+v0,-48(r4)
|
||||||
|
+ stxv 32+v0,-48(r11)
|
||||||
|
+L(tail3_bwd):
|
||||||
|
+ stxv 32+v1,-32(r11)
|
||||||
|
+L(tail2_bwd):
|
||||||
|
+ stxv 32+v2,-16(r11)
|
||||||
|
+ sub r4,r4,r5
|
||||||
|
+ sub r11,r11,r5
|
||||||
|
+ sub r5,r5,r8
|
||||||
|
+ sldi r6,r5,56
|
||||||
|
+ lxvl v4,r4,r6
|
||||||
|
+ stxvl v4,r11,r6
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+/* Copy last 16 bytes. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(tail1_bwd):
|
||||||
|
+ sub r4,r4,r5
|
||||||
|
+ sub r11,r11,r5
|
||||||
|
+ sldi r6,r5,56
|
||||||
|
+ lxvl v4,r4,r6
|
||||||
|
+ stxvl v4,r11,r6
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+END_GEN_TB (MEMMOVE,TB_TOCLESS)
|
||||||
|
+libc_hidden_builtin_def (memmove)
|
||||||
|
+
|
||||||
|
+/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
|
||||||
|
+ Implemented in this file to avoid linker create a stub function call
|
||||||
|
+ in the branch to '_memmove'. */
|
||||||
|
+ENTRY_TOCLESS (__bcopy)
|
||||||
|
+ mr r6,r3
|
||||||
|
+ mr r3,r4
|
||||||
|
+ mr r4,r6
|
||||||
|
+ b L(_memmove)
|
||||||
|
+END (__bcopy)
|
||||||
|
+#ifndef __bcopy
|
||||||
|
+weak_alias (__bcopy, bcopy)
|
||||||
|
+#endif
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
index 61652b65dd223018..66f8c6ace9824d4a 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
@@ -32,7 +32,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||||||
|
strncase-power8
|
||||||
|
|
||||||
|
ifneq (,$(filter %le,$(config-machine)))
|
||||||
|
-sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||||
|
+sysdep_routines += memmove-power10 \
|
||||||
|
+ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||||
|
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
|
||||||
|
strlen-power10
|
||||||
|
endif
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
||||||
|
index 1c4a229b1fc5654a..705fef33d4e57557 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
|
||||||
|
@@ -22,8 +22,17 @@
|
||||||
|
extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
|
||||||
|
/* __bcopy_power7 symbol is implemented at memmove-power7.S */
|
||||||
|
extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
|
||||||
|
+#endif
|
||||||
|
|
||||||
|
libc_ifunc (bcopy,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
|
||||||
|
+ PPC_FEATURE2_HAS_ISEL)
|
||||||
|
+ && (hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
+ ? __bcopy_power10 :
|
||||||
|
+#endif
|
||||||
|
(hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
? __bcopy_power7
|
||||||
|
: __bcopy_ppc);
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
index 46d5956adda72b86..4ce04bc51574cca1 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
@@ -67,6 +67,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
|
||||||
|
/* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */
|
||||||
|
IFUNC_IMPL (i, name, memmove,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, memmove,
|
||||||
|
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
|
||||||
|
+ PPC_FEATURE2_HAS_ISEL)
|
||||||
|
+ && (hwcap & PPC_FEATURE_HAS_VSX),
|
||||||
|
+ __memmove_power10)
|
||||||
|
+#endif
|
||||||
|
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
__memmove_power7)
|
||||||
|
IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
|
||||||
|
@@ -186,6 +193,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
|
||||||
|
/* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */
|
||||||
|
IFUNC_IMPL (i, name, bcopy,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, bcopy,
|
||||||
|
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
|
||||||
|
+ PPC_FEATURE2_HAS_ISEL)
|
||||||
|
+ && (hwcap & PPC_FEATURE_HAS_VSX),
|
||||||
|
+ __bcopy_power10)
|
||||||
|
+#endif
|
||||||
|
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
__bcopy_power7)
|
||||||
|
IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..171b32921a0a4d47
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
|
||||||
|
@@ -0,0 +1,27 @@
|
||||||
|
+/* Optimized memmove implementation for POWER10.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#define MEMMOVE __memmove_power10
|
||||||
|
+
|
||||||
|
+#undef libc_hidden_builtin_def
|
||||||
|
+#define libc_hidden_builtin_def(name)
|
||||||
|
+
|
||||||
|
+#undef __bcopy
|
||||||
|
+#define __bcopy __bcopy_power10
|
||||||
|
+
|
||||||
|
+#include <sysdeps/powerpc/powerpc64/le/power10/memmove.S>
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
||||||
|
index 0b251d0f5f087874..fb5261ecda64d061 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
|
||||||
|
@@ -21,7 +21,7 @@
|
||||||
|
#undef libc_hidden_builtin_def
|
||||||
|
#define libc_hidden_builtin_def(name)
|
||||||
|
|
||||||
|
-#undef bcopy
|
||||||
|
-#define bcopy __bcopy_power7
|
||||||
|
+#undef __bcopy
|
||||||
|
+#define __bcopy __bcopy_power7
|
||||||
|
|
||||||
|
#include <sysdeps/powerpc/powerpc64/power7/memmove.S>
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
||||||
|
index 39987155cc7d3624..2fd7b6d309e4bedd 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
|
||||||
|
@@ -28,14 +28,22 @@
|
||||||
|
# include "init-arch.h"
|
||||||
|
|
||||||
|
extern __typeof (__redirect_memmove) __libc_memmove;
|
||||||
|
-
|
||||||
|
extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
|
||||||
|
+#endif
|
||||||
|
|
||||||
|
libc_ifunc (__libc_memmove,
|
||||||
|
- (hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
- ? __memmove_power7
|
||||||
|
- : __memmove_ppc);
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
|
||||||
|
+ PPC_FEATURE2_HAS_ISEL)
|
||||||
|
+ && (hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
+ ? __memmove_power10 :
|
||||||
|
+#endif
|
||||||
|
+ (hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
+ ? __memmove_power7
|
||||||
|
+ : __memmove_ppc);
|
||||||
|
|
||||||
|
#undef memmove
|
||||||
|
strong_alias (__libc_memmove, memmove);
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
|
||||||
|
index b7f3dc28d1a8eac3..9e4cabb07ef9b732 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/power7/memmove.S
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
|
||||||
|
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
|
||||||
|
mr r4,r6
|
||||||
|
b L(_memmove)
|
||||||
|
END (__bcopy)
|
||||||
|
+#ifndef __bcopy
|
||||||
|
weak_alias (__bcopy, bcopy)
|
||||||
|
+#endif
|
SOURCES/glibc-rh1956357-5.patch (new file, 308 lines)
@@ -0,0 +1,308 @@
commit e941e0ae80626b7661c1db8953a673cafd3b8b19
Author: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
Date:   Fri Apr 30 18:12:08 2021 -0300

    powerpc64le: Optimize memcpy for POWER10

    This implementation is based on __memcpy_power8_cached and integrates
    suggestions from Anton Blanchard.
    It benefits from loads and stores with length for short lengths and for
    tail code, simplifying the code.

    All unaligned memory accesses use instructions that do not generate
    alignment interrupts on POWER10, making it safe to use on
    caching-inhibited memory.

    The main loop has also been modified in order to increase instruction
    throughput by reducing the dependency on updates from previous iterations.

    On average, this implementation provides around 30% improvement when
    compared to __memcpy_power7 and 10% improvement in comparison to
    __memcpy_power8_cached.
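The dependency-reduction point in the last two paragraphs can be pictured in C: the 128-byte main loop keeps a second pair of pointers 64 bytes ahead, so the second half of an iteration does not wait on the pointer update of the first half. This is a rough analogue under that assumption, not the patch's code; the function name is hypothetical.

#include <stddef.h>
#include <string.h>

/* Hypothetical sketch: copy 'blocks' chunks of 128 bytes using two
   independent pointer pairs, mirroring the separate base registers the
   assembly uses to shorten dependency chains.  */
static void
copy_128_per_iteration (unsigned char *dst, const unsigned char *src,
                        size_t blocks)
{
  const unsigned char *src2 = src + 64;
  unsigned char *dst2 = dst + 64;
  while (blocks--)
    {
      memcpy (dst, src, 64);    /* first half, addressed from src/dst */
      memcpy (dst2, src2, 64);  /* second half, addressed independently */
      src += 128;  dst += 128;
      src2 += 128; dst2 += 128;
    }
}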
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..ad1414db4a3a8b9f
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
|
||||||
|
@@ -0,0 +1,198 @@
|
||||||
|
+/* Optimized memcpy implementation for POWER10.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <http://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#include <sysdep.h>
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+#ifndef MEMCPY
|
||||||
|
+# define MEMCPY memcpy
|
||||||
|
+#endif
|
||||||
|
+
|
||||||
|
+/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
|
||||||
|
+ Returns 'dst'. */
|
||||||
|
+
|
||||||
|
+ .machine power9
|
||||||
|
+ENTRY_TOCLESS (MEMCPY, 5)
|
||||||
|
+ CALL_MCOUNT 3
|
||||||
|
+
|
||||||
|
+ /* Copy up to 16 bytes. */
|
||||||
|
+ sldi r6,r5,56 /* Prepare [l|st]xvl counter. */
|
||||||
|
+ lxvl v10,r4,r6
|
||||||
|
+ stxvl v10,r3,r6
|
||||||
|
+ subic. r6,r5,16 /* Return if len <= 16. */
|
||||||
|
+ blelr
|
||||||
|
+
|
||||||
|
+ /* If len >= 256, assume nothing got copied before and copy
|
||||||
|
+ again. This might cause issues with overlapped memory, but memcpy
|
||||||
|
+ is not expected to treat overlapped memory. */
|
||||||
|
+ cmpdi r5,256
|
||||||
|
+ bge L(copy_ge_256)
|
||||||
|
+ /* 16 < len < 256 and the first 16 bytes have already been copied. */
|
||||||
|
+ addi r10,r3,16 /* Keep r3 intact as return value. */
|
||||||
|
+ addi r4,r4,16
|
||||||
|
+ subi r5,r5,16
|
||||||
|
+ b L(copy_lt_256) /* Avoid the main loop if len < 256. */
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(copy_ge_256):
|
||||||
|
+ mr r10,r3 /* Keep r3 intact as return value. */
|
||||||
|
+ /* Align dst to 16 bytes. */
|
||||||
|
+ andi. r9,r10,0xf
|
||||||
|
+ beq L(dst_is_align_16)
|
||||||
|
+ lxv v10,0(r4)
|
||||||
|
+ subfic r12,r9,16
|
||||||
|
+ subf r5,r12,r5
|
||||||
|
+ add r4,r4,r12
|
||||||
|
+ stxv v10,0(r3)
|
||||||
|
+ add r10,r3,r12
|
||||||
|
+
|
||||||
|
+L(dst_is_align_16):
|
||||||
|
+ srdi r9,r5,7 /* Divide by 128. */
|
||||||
|
+ mtctr r9
|
||||||
|
+ addi r6,r4,64
|
||||||
|
+ addi r7,r10,64
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+ /* Main loop, copy 128 bytes per iteration.
|
||||||
|
+ Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
|
||||||
|
+ r4 and r10. */
|
||||||
|
+ .p2align 5
|
||||||
|
+L(copy_128):
|
||||||
|
+
|
||||||
|
+ lxv v10, 0(r4)
|
||||||
|
+ lxv v11, 16(r4)
|
||||||
|
+ lxv v12, 32(r4)
|
||||||
|
+ lxv v13, 48(r4)
|
||||||
|
+
|
||||||
|
+ addi r4,r4,128
|
||||||
|
+
|
||||||
|
+ stxv v10, 0(r10)
|
||||||
|
+ stxv v11, 16(r10)
|
||||||
|
+ stxv v12, 32(r10)
|
||||||
|
+ stxv v13, 48(r10)
|
||||||
|
+
|
||||||
|
+ addi r10,r10,128
|
||||||
|
+
|
||||||
|
+ lxv v10, 0(r6)
|
||||||
|
+ lxv v11, 16(r6)
|
||||||
|
+ lxv v12, 32(r6)
|
||||||
|
+ lxv v13, 48(r6)
|
||||||
|
+
|
||||||
|
+ addi r6,r6,128
|
||||||
|
+
|
||||||
|
+ stxv v10, 0(r7)
|
||||||
|
+ stxv v11, 16(r7)
|
||||||
|
+ stxv v12, 32(r7)
|
||||||
|
+ stxv v13, 48(r7)
|
||||||
|
+
|
||||||
|
+ addi r7,r7,128
|
||||||
|
+
|
||||||
|
+ bdnz L(copy_128)
|
||||||
|
+
|
||||||
|
+ clrldi. r5,r5,64-7 /* Have we copied everything? */
|
||||||
|
+ beqlr
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(copy_lt_256):
|
||||||
|
+ cmpdi r5,16
|
||||||
|
+ ble L(copy_le_16)
|
||||||
|
+ srdi. r9,r5,5 /* Divide by 32. */
|
||||||
|
+ beq L(copy_lt_32)
|
||||||
|
+ mtctr r9
|
||||||
|
+ /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
|
||||||
|
+ the dependency on r4 and r10. */
|
||||||
|
+ addi r6,r4,32
|
||||||
|
+ addi r7,r10,32
|
||||||
|
+ addi r8,r4,64
|
||||||
|
+ addi r9,r10,64
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+ /* Copy 32 bytes at a time, unaligned.
|
||||||
|
+ The loop is unrolled 3 times in order to reduce the dependency on
|
||||||
|
+ r4 and r10, copying up-to 96 bytes per iteration. */
|
||||||
|
+L(copy_32):
|
||||||
|
+ lxv v10, 0(r4)
|
||||||
|
+ lxv v11, 16(r4)
|
||||||
|
+ stxv v10, 0(r10)
|
||||||
|
+ stxv v11, 16(r10)
|
||||||
|
+ bdz L(end_copy_32a)
|
||||||
|
+ addi r4,r4,96
|
||||||
|
+ addi r10,r10,96
|
||||||
|
+
|
||||||
|
+ lxv v10, 0(r6)
|
||||||
|
+ lxv v11, 16(r6)
|
||||||
|
+ addi r6,r6,96
|
||||||
|
+ stxv v10, 0(r7)
|
||||||
|
+ stxv v11, 16(r7)
|
||||||
|
+ bdz L(end_copy_32b)
|
||||||
|
+ addi r7,r7,96
|
||||||
|
+
|
||||||
|
+ lxv v12, 0(r8)
|
||||||
|
+ lxv v13, 16(r8)
|
||||||
|
+ addi r8,r8,96
|
||||||
|
+ stxv v12, 0(r9)
|
||||||
|
+ stxv v13, 16(r9)
|
||||||
|
+ addi r9,r9,96
|
||||||
|
+ bdnz L(copy_32)
|
||||||
|
+
|
||||||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||||||
|
+ beqlr
|
||||||
|
+ cmpdi r5,16
|
||||||
|
+ ble L(copy_le_16)
|
||||||
|
+ b L(copy_lt_32)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(end_copy_32a):
|
||||||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||||||
|
+ beqlr
|
||||||
|
+ /* 32 bytes have been copied since the last update of r4 and r10. */
|
||||||
|
+ addi r4,r4,32
|
||||||
|
+ addi r10,r10,32
|
||||||
|
+ cmpdi r5,16
|
||||||
|
+ ble L(copy_le_16)
|
||||||
|
+ b L(copy_lt_32)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(end_copy_32b):
|
||||||
|
+ clrldi. r5,r5,64-5 /* Have we copied everything? */
|
||||||
|
+ beqlr
|
||||||
|
+ /* The last iteration of the loop copied 64 bytes. Update r4 and r10
|
||||||
|
+ accordingly. */
|
||||||
|
+ addi r4,r4,-32
|
||||||
|
+ addi r10,r10,-32
|
||||||
|
+ cmpdi r5,16
|
||||||
|
+ ble L(copy_le_16)
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(copy_lt_32):
|
||||||
|
+ lxv v10, 0(r4)
|
||||||
|
+ stxv v10, 0(r10)
|
||||||
|
+ addi r4,r4,16
|
||||||
|
+ addi r10,r10,16
|
||||||
|
+ subi r5,r5,16
|
||||||
|
+
|
||||||
|
+ .p2align 5
|
||||||
|
+L(copy_le_16):
|
||||||
|
+ sldi r6,r5,56
|
||||||
|
+ lxvl v10,r4,r6
|
||||||
|
+ stxvl v10,r10,r6
|
||||||
|
+ blr
|
||||||
|
+
|
||||||
|
+
|
||||||
|
+END_GEN_TB (MEMCPY,TB_TOCLESS)
|
||||||
|
+libc_hidden_builtin_def (memcpy)
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
|
||||||
|
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
|
||||||
|
strncase-power8
|
||||||
|
|
||||||
|
ifneq (,$(filter %le,$(config-machine)))
|
||||||
|
-sysdep_routines += memmove-power10 \
|
||||||
|
+sysdep_routines += memcpy-power10 memmove-power10 \
|
||||||
|
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
|
||||||
|
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
|
||||||
|
strlen-power10
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
index 4ce04bc51574cca1..9d5a14e480c02171 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
|
||||||
|
@@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
|
||||||
|
#ifdef SHARED
|
||||||
|
/* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */
|
||||||
|
IFUNC_IMPL (i, name, memcpy,
|
||||||
|
+#ifdef __LITTLE_ENDIAN__
|
||||||
|
+ IFUNC_IMPL_ADD (array, i, memcpy,
|
||||||
|
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
|
||||||
|
+ && hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
+ __memcpy_power10)
|
||||||
|
+#endif
|
||||||
|
IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
|
||||||
|
__memcpy_power8_cached)
|
||||||
|
IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
|
||||||
|
new file mode 100644
|
||||||
|
index 0000000000000000..70e0fc3ed610cdc3
|
||||||
|
--- /dev/null
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
|
||||||
|
@@ -0,0 +1,26 @@
|
||||||
|
+/* Optimized memcpy implementation for POWER10.
|
||||||
|
+ Copyright (C) 2021 Free Software Foundation, Inc.
|
||||||
|
+ This file is part of the GNU C Library.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is free software; you can redistribute it and/or
|
||||||
|
+ modify it under the terms of the GNU Lesser General Public
|
||||||
|
+ License as published by the Free Software Foundation; either
|
||||||
|
+ version 2.1 of the License, or (at your option) any later version.
|
||||||
|
+
|
||||||
|
+ The GNU C Library is distributed in the hope that it will be useful,
|
||||||
|
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
|
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
|
+ Lesser General Public License for more details.
|
||||||
|
+
|
||||||
|
+ You should have received a copy of the GNU Lesser General Public
|
||||||
|
+ License along with the GNU C Library; if not, see
|
||||||
|
+ <https://www.gnu.org/licenses/>. */
|
||||||
|
+
|
||||||
|
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
|
||||||
|
+#define MEMCPY __memcpy_power10
|
||||||
|
+
|
||||||
|
+#undef libc_hidden_builtin_def
|
||||||
|
+#define libc_hidden_builtin_def(name)
|
||||||
|
+
|
||||||
|
+#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S>
|
||||||
|
+#endif
|
||||||
|
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||||||
|
index 44dea594f3770673..be0e47f32dde2ccf 100644
|
||||||
|
--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||||||
|
+++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
|
||||||
|
@@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
|
||||||
|
extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden;
|
||||||
|
+# if defined __LITTLE_ENDIAN__
|
||||||
|
+extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden;
|
||||||
|
+# endif
|
||||||
|
|
||||||
|
libc_ifunc (__libc_memcpy,
|
||||||
|
+# if defined __LITTLE_ENDIAN__
|
||||||
|
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
|
||||||
|
+ ? __memcpy_power10 :
|
||||||
|
+# endif
|
||||||
|
((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
|
||||||
|
? __memcpy_power8_cached :
|
||||||
|
(hwcap & PPC_FEATURE_HAS_VSX)
|
SOURCES/glibc-rh1956357-6.patch (new file, 420 lines)
@@ -0,0 +1,420 @@
commit 23fdf8178cce3c2ec320dd5eca8b544245bcaef0
Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
Date:   Fri Apr 30 18:12:08 2021 -0300

    powerpc64le: Optimize memset for POWER10

    This implementation is based on __memset_power8 and integrates a lot
    of suggestions from Anton Blanchard.

    The biggest difference is that it makes extensive use of stxvl in the
    alignment and tail code to avoid branches and small stores.  It has
    three main execution paths:

    a) "Short lengths", for lengths up to 64 bytes, avoiding as many
       branches as possible (a C sketch of this path follows the message).

    b) "General case", for larger lengths; it has an alignment section
       using stxvl to avoid branches, a 128-byte loop and then tail
       code, again using stxvl with few branches.

    c) "Zeroing cache blocks", for lengths from 256 bytes upwards when the
       set value is zero.  It is mostly the __memset_power8 code, but the
       alignment phase was simplified because, at this point, the address
       is already 16-byte aligned, and it was changed to use vector stores.
       The tail code was also simplified to reuse the general-case tail.

    All unaligned stores use stxvl instructions that do not generate
    alignment interrupts on POWER10, making it safe to use on
    caching-inhibited memory.

    On average, this implementation provides something around 30%
    improvement when compared to __memset_power8.

    Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
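A rough C analogue of the branch-avoiding short path in a), assuming the vec_xst_len intrinsic (stxvl): the store length for each 16-byte slot is saturated at zero once the end is passed, and a zero-length store is a no-op, so any size up to 64 bytes is handled by four unconditional stores. The function name is hypothetical.

#include <altivec.h>
#include <stddef.h>

/* Hypothetical sketch of the <= 64-byte memset path.  */
static void
memset_le_64 (unsigned char *dst, unsigned char c, size_t n)
{
  vector unsigned char v = vec_splats (c);
  size_t r0 = n;
  size_t r1 = r0 > 16 ? r0 - 16 : 0;   /* bytes left after the 1st store */
  size_t r2 = r1 > 16 ? r1 - 16 : 0;
  size_t r3 = r2 > 16 ? r2 - 16 : 0;
  vec_xst_len (v, dst,      r0 > 16 ? 16 : r0);
  vec_xst_len (v, dst + 16, r1 > 16 ? 16 : r1);
  vec_xst_len (v, dst + 32, r2 > 16 ? 16 : r2);
  vec_xst_len (v, dst + 48, r3 > 16 ? 16 : r3);
}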
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memset.S b/sysdeps/powerpc/powerpc64/le/power10/memset.S
new file mode 100644
index 0000000000000000..6b8e2cfdaf25fd30
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memset.S
@@ -0,0 +1,256 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]));
+ Returns 's'. */
+
+#ifndef MEMSET
+# define MEMSET memset
+#endif
+
+ .machine power9
+ENTRY_TOCLESS (MEMSET, 5)
+ CALL_MCOUNT 3
+
+L(_memset):
+ /* Assume memset of zero length is uncommon, and just let it go
+ through the small path below. */
+ cmpldi r5,64
+
+ /* Replicate byte to quad word. */
+ mtvsrd v0+32,r4
+ vspltb v0,v0,7
+
+ li r7,16
+ sldi r8,r7,56
+
+ bgt L(large)
+
+ /* For short lengths we want to avoid as many branches as possible.
+ We use store VSX vector with length instructions to do this.
+ It takes advantage of the fact that if the length passed to stxvl
+ is zero nothing is done, effectively a no-op. */
+ sldi r5,r5,56
+
+ addi r10,r3,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11 /* Saturate the subtraction to zero. */
+
+ stxvl v0+32,r3,r5
+ stxvl v0+32,r10,r11
+
+ addi r9,r3,32
+ addi r10,r3,48
+
+ sub. r11,r11,r8
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(large):
+ mr r6,r3 /* Don't modify r3 since we need to return it. */
+
+ /* Get dest 16B aligned. */
+ neg r0,r3
+ clrldi. r7,r0,(64-4)
+ beq L(aligned)
+ rldic r9,r0,56,4 /* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56". */
+
+ stxvl v0+32,r6,r9 /* Store up to 15B until aligned address. */
+
+ add r6,r6,r7
+ sub r5,r5,r7
+
+ /* Go to tail if there is less than 64B left after alignment. */
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(aligned):
+ /* Go to tail if there is less than 128B left after alignment. */
+ srdi. r0,r5,7
+ beq L(tail_128)
+
+ /* If c == 0 && n >= 256 use dcbz to zero out full cache blocks. */
+ cmpldi cr5,r5,255
+ cmpldi cr6,r4,0
+ crand 27,26,21
+ bt 27,L(dcbz)
+
+ mtctr r0
+
+ .balign 32
+L(loop):
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ stxv v0+32,64(r6)
+ stxv v0+32,80(r6)
+ stxv v0+32,96(r6)
+ stxv v0+32,112(r6)
+ addi r6,r6,128
+ bdnz L(loop)
+
+ .balign 16
+L(tail):
+ /* 127B or less left, finish the tail or return. */
+ andi. r5,r5,127
+ beqlr
+
+ cmpldi r5,64
+ blt L(tail_64)
+
+ .balign 16
+L(tail_128):
+ /* Stores a minimum of 64B and up to 128B and return. */
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+ andi. r5,r5,63
+ beqlr
+
+ .balign 16
+L(tail_64):
+ /* Stores up to 64B and return. */
+ sldi r5,r5,56
+
+ addi r10,r6,16
+
+ sub. r11,r5,r8
+ isellt r11,0,r11
+
+ stxvl v0+32,r6,r5
+ stxvl v0+32,r10,r11
+
+ sub. r11,r11,r8
+ blelr
+
+ addi r9,r6,32
+ addi r10,r6,48
+
+ isellt r11,0,r11
+
+ sub. r5,r11,r8
+ isellt r5,0,r5
+
+ stxvl v0+32,r9,r11
+ stxvl v0+32,r10,r5
+
+ blr
+
+ .balign 16
+L(dcbz):
+ /* Special case when value is 0 and we have a long length to deal
+ with. Use dcbz to zero out a full cacheline of 128 bytes at a time.
+ Before using dcbz though, we need to get the destination 128-byte
+ aligned. */
+ neg r0,r6
+ clrldi. r0,r0,(64-7)
+ beq L(dcbz_aligned)
+
+ sub r5,r5,r0
+ mtocrf 0x2,r0 /* copying bits 57..59 to cr6. The ones for sizes 64,
+ 32 and 16 which need to be checked. */
+
+ /* Write 16-128 bytes until DST is aligned to 128 bytes. */
+64: bf 25,32f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ stxv v0+32,32(r6)
+ stxv v0+32,48(r6)
+ addi r6,r6,64
+
+32: bf 26,16f
+ stxv v0+32,0(r6)
+ stxv v0+32,16(r6)
+ addi r6,r6,32
+
+16: bf 27,L(dcbz_aligned)
+ stxv v0+32,0(r6)
+ addi r6,r6,16
+
+ .balign 16
+L(dcbz_aligned):
+ /* Setup dcbz unroll offsets and count numbers. */
+ srdi. r0,r5,9
+ li r9,128
+ beq L(bcdz_tail)
+ li r10,256
+ li r11,384
+ mtctr r0
+
+ .balign 16
+L(dcbz_loop):
+ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows
+ a throughput boost for large sizes (2048 bytes or higher). */
+ dcbz 0,r6
+ dcbz r9,r6
+ dcbz r10,r6
+ dcbz r11,r6
+ addi r6,r6,512
+ bdnz L(dcbz_loop)
+
+ andi. r5,r5,511
+ beqlr
+
+ .balign 16
+L(bcdz_tail):
+ /* We have 1-511 bytes remaining. */
+ srdi. r0,r5,7
+ beq L(tail)
+
+ mtocrf 0x1,r0
+
+256: bf 30,128f
+ dcbz 0,r6
+ dcbz r9,r6
+ addi r6,r6,256
+
+128: bf 31,L(tail)
+ dcbz 0,r6
+ addi r6,r6,128
+
+ b L(tail)
+
+END_GEN_TB (MEMSET,TB_TOCLESS)
+libc_hidden_builtin_def (memset)
+
+/* Copied from bzero.S to prevent the linker from inserting a stub
+ between bzero and memset. */
+ENTRY_TOCLESS (__bzero)
+ CALL_MCOUNT 2
+ mr r5,r4
+ li r4,0
+ b L(_memset)
+END (__bzero)
+#ifndef __bzero
+weak_alias (__bzero, bzero)
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 2e3c8f2e8a81cda4..1d517698429e1230 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8

ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memcpy-power10 memmove-power10 \
+sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
strlen-power10
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index f8cb05bea8a3505b..4ce98e324d12a31e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -27,8 +27,16 @@ extern __typeof (bzero) __bzero_power4 attribute_hidden;
extern __typeof (bzero) __bzero_power6 attribute_hidden;
extern __typeof (bzero) __bzero_power7 attribute_hidden;
extern __typeof (bzero) __bzero_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (bzero) __bzero_power10 attribute_hidden;
+# endif

libc_ifunc (__bzero,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bzero_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __bzero_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 9d5a14e480c02171..11532f77d4d03b2a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -86,6 +86,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

/* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */
IFUNC_IMPL (i, name, memset,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memset,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __memset_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__memset_power8)
IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -187,6 +194,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,

/* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */
IFUNC_IMPL (i, name, bzero,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, bzero,
+ hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
+ PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __bzero_power10)
+#endif
IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
__bzero_power8)
IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
new file mode 100644
index 0000000000000000..548e99789735296c
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
@@ -0,0 +1,27 @@
+/* Optimized memset implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define MEMSET __memset_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#undef __bzero
+#define __bzero __bzero_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memset.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 1a7c46fecf78ab1f..4c97622c7d7eb8aa 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -33,10 +33,18 @@ extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
+# endif

/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__libc_memset,
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memset_power10 :
+# endif
(hwcap2 & PPC_FEATURE2_ARCH_2_07)
? __memset_power8 :
(hwcap & PPC_FEATURE_HAS_VSX)
131
SOURCES/glibc-rh1956357-7.patch
Normal file
@@ -0,0 +1,131 @@
commit 17a73a6d8b4c46f3e87fc53c7c25fa7cec01d707
Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
Date: Mon May 3 16:59:35 2021 -0300

powerpc64le: Fix ifunc selection for memset, memmove, bzero and bcopy

The hwcap2 check for the aforementioned functions should check for
both PPC_FEATURE2_ARCH_3_1 and PPC_FEATURE2_HAS_ISEL but was
mistakenly checking for any one of them, enabling isa 3.1 version of
the functions in incompatible processors, like POWER8.

Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>

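For illustration only (not part of the upstream patch): a small C program showing the any-of versus both-of distinction that this fix is about. The macro names and bit values below are made up for the demo and are not the real PPC_FEATURE2_* constants.

#include <stdio.h>

#define DEMO_ARCH_3_1  0x1u   /* stand-in bit, not the real constant */
#define DEMO_HAS_ISEL  0x2u   /* stand-in bit, not the real constant */

int
main (void)
{
  /* A CPU with ISEL but without ISA 3.1, like the POWER8 case in the
     commit message.  */
  unsigned int hwcap2 = DEMO_HAS_ISEL;

  /* Buggy form: true if *either* bit is set.  */
  int any  = (hwcap2 & (DEMO_ARCH_3_1 | DEMO_HAS_ISEL)) != 0;
  /* Fixed form: true only if *both* bits are set.  */
  int both = (hwcap2 & DEMO_ARCH_3_1) && (hwcap2 & DEMO_HAS_ISEL);

  printf ("any=%d both=%d\n", any, both);   /* prints any=1 both=0 */
  return 0;
}

Only the "both" form keeps the POWER10 variants off unsupported processors, which is exactly the change the hunks below make.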
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
index 705fef33d4e57557..3c6528e5dbccfdbd 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -28,10 +28,10 @@ extern __typeof (bcopy) __bcopy_power10 attribute_hidden;

libc_ifunc (bcopy,
#ifdef __LITTLE_ENDIAN__
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX)
- ? __bcopy_power10 :
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __bcopy_power10 :
#endif
(hwcap & PPC_FEATURE_HAS_VSX)
? __bcopy_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
index 4ce98e324d12a31e..b08b381b4a3999f1 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -33,7 +33,8 @@ extern __typeof (bzero) __bzero_power10 attribute_hidden;

libc_ifunc (__bzero,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX)
? __bzero_power10 :
# endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 11532f77d4d03b2a..6e36659d1903448a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -75,9 +75,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memmove,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, memmove,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX),
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX,
__memmove_power10)
#endif
IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
@@ -88,8 +88,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, memset,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, memset,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX,
__memset_power10)
#endif
@@ -196,8 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, bzero,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, bzero,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX,
__bzero_power10)
#endif
@@ -215,9 +215,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
IFUNC_IMPL (i, name, bcopy,
#ifdef __LITTLE_ENDIAN__
IFUNC_IMPL_ADD (array, i, bcopy,
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX),
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX,
__bcopy_power10)
#endif
IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
index 2fd7b6d309e4bedd..27895faad0cab40e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -36,10 +36,10 @@ extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;

libc_ifunc (__libc_memmove,
#ifdef __LITTLE_ENDIAN__
- hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
- PPC_FEATURE2_HAS_ISEL)
- && (hwcap & PPC_FEATURE_HAS_VSX)
- ? __memmove_power10 :
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memmove_power10 :
#endif
(hwcap & PPC_FEATURE_HAS_VSX)
? __memmove_power7
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
index 4c97622c7d7eb8aa..685623ae870a0725 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -41,7 +41,8 @@ extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
ifunc symbol properly. */
libc_ifunc (__libc_memset,
# ifdef __LITTLE_ENDIAN__
- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap2 & PPC_FEATURE2_HAS_ISEL
&& hwcap & PPC_FEATURE_HAS_VSX)
? __memset_power10 :
# endif
387
SOURCES/glibc-rh1956357-8.patch
Normal file
@@ -0,0 +1,387 @@
commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
Author: Matheus Castanho <msc@linux.ibm.com>
Date: Tue May 11 17:53:07 2021 -0300

powerpc: Add optimized rawmemchr for POWER10

Reuse code for optimized strlen to implement a faster version of rawmemchr.
This takes advantage of the same benefits provided by the strlen implementation,
but needs some extra steps. __strlen_power10 code should be unchanged after this
change.

rawmemchr returns a pointer to the char found, while strlen returns only the
length, so we have to take that into account when preparing the return value.

To quickly check 64B, the loop on __strlen_power10 merges the whole block into
16B by using unsigned minimum vector operations (vminub) and checks if there are
any \0 on the resulting vector. The same code is used by rawmemchr if the char c
is 0. However, this approach does not work when c != 0. We first need to
subtract each byte by c, so that the value we are looking for is converted to a
0, then taking the minimum and checking for nulls works again.

The new code branches after it has compared ~256 bytes and chooses which of the
two strategies above will be used in the main loop, based on the char c. This
extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
by the faster loop for larger sizes.

Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
Because of the optimized main loop, the improvement becomes ~35% for c != 0
and ~50% for c = 0 for strings longer than 256.

Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>

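For illustration only (not part of the upstream patch): a scalar C model of the subtract-then-test-for-zero trick described above. The function rawmemchr_model and its 16-byte blocks are assumptions for this demo; the real code works on 64-byte chunks with vsububm, vminub and vcmpequb.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Hypothetical model: the vector loop can only ask "is any byte zero?",
   so for c != 0 every loaded byte is first subtracted by c, turning the
   searched-for byte into 0 so the same zero test applies.  Like the real
   rawmemchr, it assumes the byte does occur in s.  */
static const void *
rawmemchr_model (const void *s, int c)
{
  const uint8_t *p = s;
  uint8_t target = (uint8_t) c;

  for (;; p += 16)                       /* one 16-byte "vector" per step */
    {
      uint8_t block[16];
      memcpy (block, p, 16);             /* models the vector load */

      if (target != 0)                   /* models RAWMEMCHR_SUBTRACT_VECTORS */
        for (int i = 0; i < 16; i++)
          block[i] = (uint8_t) (block[i] - target);

      for (int i = 0; i < 16; i++)       /* models the zero test + vctzlsbb */
        if (block[i] == 0)
          return p + i;
    }
}

int
main (void)
{
  const char str[32] = "power10 rawmemchr";
  printf ("%s\n", (const char *) rawmemchr_model (str, 'r'));  /* r10 rawmemchr */
  return 0;
}

When c is 0 the subtraction step is skipped, which is why the patch below keeps the original strlen loop for that case and only adds L(rawmemchr_loop) for c != 0.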
diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
new file mode 100644
index 0000000000000000..5351c2634f6086bf
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
@@ -0,0 +1,22 @@
+/* Optimized rawmemchr implementation for POWER10 LE.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+#define USE_AS_RAWMEMCHR 1
+#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -18,10 +18,50 @@

#include <sysdep.h>

-#ifndef STRLEN
-# define STRLEN __strlen
-# define DEFINE_STRLEN_HIDDEN_DEF 1
-#endif
+/* To reuse the code for rawmemchr, we have some extra steps compared to the
+ strlen implementation:
+ - Sum the initial value of r3 with the position at which the char was
+ found, to guarantee we return a pointer and not the length.
+ - In the main loop, subtract each byte by the char we are looking for,
+ so we can keep using vminub to quickly check 64B at once. */
+#ifdef USE_AS_RAWMEMCHR
+# ifndef RAWMEMCHR
+# define FUNCNAME __rawmemchr
+# else
+# define FUNCNAME RAWMEMCHR
+# endif
+# define MCOUNT_NARGS 2
+# define VREG_ZERO v20
+# define OFF_START_LOOP 256
+# define RAWMEMCHR_SUBTRACT_VECTORS \
+ vsububm v4,v4,v18; \
+ vsububm v5,v5,v18; \
+ vsububm v6,v6,v18; \
+ vsububm v7,v7,v18;
+# define TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ addi r4,r4,increment; \
+ add r3,r5,r4; \
+ blr
+
+#else /* strlen */
+
+# ifndef STRLEN
+# define FUNCNAME __strlen
+# define DEFINE_STRLEN_HIDDEN_DEF 1
+# else
+# define FUNCNAME STRLEN
+# endif
+# define MCOUNT_NARGS 1
+# define VREG_ZERO v18
+# define OFF_START_LOOP 192
+# define TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ subf r3,r3,r5; \
+ addi r4,r4,increment; \
+ add r3,r3,r4; \
+ blr
+#endif /* USE_AS_RAWMEMCHR */

/* TODO: Replace macros by the actual instructions when minimum binutils becomes
>= 2.35. This is used to keep compatibility with older versions. */
@@ -50,33 +90,41 @@
li r6,offset; \
LXVP(v4+32,offset,addr); \
LXVP(v6+32,offset+32,addr); \
+ RAWMEMCHR_SUBTRACT_VECTORS; \
vminub v14,v4,v5; \
vminub v15,v6,v7; \
vminub v16,v14,v15; \
- vcmpequb. v0,v16,v18; \
+ vcmpequb. v0,v16,VREG_ZERO; \
bne cr6,L(label)

-#define TAIL(vreg,increment) \
- vctzlsbb r4,vreg; \
- subf r3,r3,r5; \
- addi r4,r4,increment; \
- add r3,r3,r4; \
- blr
-
/* Implements the function

int [r3] strlen (const void *s [r3])

+ but when USE_AS_RAWMEMCHR is set, implements the function
+
+ void* [r3] rawmemchr (const void *s [r3], int c [r4])
+
The implementation can load bytes past a matching byte, but only
up to the next 64B boundary, so it never crosses a page. */

.machine power9

-ENTRY_TOCLESS (STRLEN, 4)
- CALL_MCOUNT 1
+ENTRY_TOCLESS (FUNCNAME, 4)
+ CALL_MCOUNT MCOUNT_NARGS

- vspltisb v18,0
+#ifdef USE_AS_RAWMEMCHR
+ xori r5,r4,0xff
+
+ mtvsrd v18+32,r4 /* matching char in v18 */
+ mtvsrd v19+32,r5 /* non matching char in v19 */
+
+ vspltb v18,v18,7 /* replicate */
+ vspltb v19,v19,7 /* replicate */
+#else
vspltisb v19,-1
+#endif
+ vspltisb VREG_ZERO,0

/* Next 16B-aligned address. Prepare address for L(aligned). */
addi r5,r3,16
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
vcmpequb. v6,v0,v18
beq cr6,L(aligned)

+#ifdef USE_AS_RAWMEMCHR
+ vctzlsbb r6,v6
+ add r3,r3,r6
+#else
vctzlsbb r3,v6
+#endif
blr

- /* Test next 176B, 16B at a time. The main loop is optimized for longer
- strings, so checking the first bytes in 16B chunks benefits a lot
- small strings. */
+ /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
+ optimized for longer strings, so checking the first bytes in 16B
+ chunks benefits a lot small strings. */
.p2align 5
L(aligned):
+#ifdef USE_AS_RAWMEMCHR
+ cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
+ choose how we will perform the main loop. */
+#endif
/* Prepare address for the loop. */
- addi r4,r3,192
+ addi r4,r3,OFF_START_LOOP
clrrdi r4,r4,6

CHECK16(v0,0,r5,tail1)
@@ -113,15 +170,43 @@ L(aligned):
CHECK16(v8,128,r5,tail9)
CHECK16(v9,144,r5,tail10)
CHECK16(v10,160,r5,tail11)
+#ifdef USE_AS_RAWMEMCHR
+ CHECK16(v0,176,r5,tail12)
+ CHECK16(v1,192,r5,tail13)
+ CHECK16(v2,208,r5,tail14)
+ CHECK16(v3,224,r5,tail15)
+#endif

addi r5,r4,128

+#ifdef USE_AS_RAWMEMCHR
+ /* If c == 0, use the same loop as strlen, without the vsububm. */
+ beq cr5,L(loop)
+
+ /* This is very similar to the block after L(loop), the difference is
+ that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
+ each byte loaded by the char we are looking for, this way we can keep
+ using vminub to merge the results and checking for nulls. */
+ .p2align 5
+L(rawmemchr_loop):
+ CHECK64(0,r4,pre_tail_64b)
+ CHECK64(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64(0,r5,tail_64b)
+ CHECK64(64,r5,tail_64b)
+ addi r5,r5,256
+
+ b L(rawmemchr_loop)
+#endif
/* Switch to a more aggressive approach checking 64B each time. Use 2
pointers 128B apart and unroll the loop once to make the pointer
updates and usages separated enough to avoid stalls waiting for
address calculation. */
.p2align 5
L(loop):
+#undef RAWMEMCHR_SUBTRACT_VECTORS
+#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
CHECK64(0,r4,pre_tail_64b)
CHECK64(64,r4,pre_tail_64b)
addi r4,r4,256
@@ -140,10 +225,10 @@ L(tail_64b):
block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
low 16B bytes into vx+1, and the high into vx, so the order here is
v5, v4, v7, v6. */
- vcmpequb v1,v5,v18
- vcmpequb v2,v4,v18
- vcmpequb v3,v7,v18
- vcmpequb v4,v6,v18
+ vcmpequb v1,v5,VREG_ZERO
+ vcmpequb v2,v4,VREG_ZERO
+ vcmpequb v3,v7,VREG_ZERO
+ vcmpequb v4,v6,VREG_ZERO

/* Take into account the other 64B blocks we had already checked. */
add r5,r5,r6
@@ -165,7 +250,9 @@ L(tail_64b):
or r10,r8,r7

cnttzd r0,r10 /* Count trailing zeros before the match. */
+#ifndef USE_AS_RAWMEMCHR
subf r5,r3,r5
+#endif
add r3,r5,r0 /* Compute final length. */
blr

@@ -213,9 +300,32 @@ L(tail10):
L(tail11):
TAIL(v10,160)

-END (STRLEN)
+#ifdef USE_AS_RAWMEMCHR
+ .p2align 5
+L(tail12):
+ TAIL(v0,176)
+
+ .p2align 5
+L(tail13):
+ TAIL(v1,192)
+
+ .p2align 5
+L(tail14):
+ TAIL(v2,208)
+
+ .p2align 5
+L(tail15):
+ TAIL(v3,224)
+#endif
+
+END (FUNCNAME)

-#ifdef DEFINE_STRLEN_HIDDEN_DEF
+#ifdef USE_AS_RAWMEMCHR
+weak_alias (__rawmemchr,rawmemchr)
+libc_hidden_builtin_def (__rawmemchr)
+#else
+# ifdef DEFINE_STRLEN_HIDDEN_DEF
weak_alias (__strlen, strlen)
libc_hidden_builtin_def (strlen)
+# endif
#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 1d517698429e1230..ac2446aca62cc4ab 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \

ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
+ rawmemchr-power9 rawmemchr-power10 \
strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
- rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
- strlen-power10
+ strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 6e36659d1903448a..127af84b32a8196f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c. */
IFUNC_IMPL (i, name, rawmemchr,
#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, rawmemchr,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __rawmemchr_power10)
IFUNC_IMPL_ADD (array, i, rawmemchr,
hwcap2 & PPC_FEATURE2_ARCH_3_00,
__rawmemchr_power9)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
new file mode 100644
index 0000000000000000..bf1ed7e1941f922d
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
@@ -0,0 +1,21 @@
+/* Optimized rawmemchr implementation for PowerPC64/POWER10.
+ Copyright (C) 2021 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#define RAWMEMCHR __rawmemchr_power10
+
+#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
index 2a7ae5a1ed02e556..369d6359e8987052 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
# ifdef __LITTLE_ENDIAN__
extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
# endif

# undef __rawmemchr
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
ifunc symbol properly. */
libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ && (hwcap & PPC_FEATURE_HAS_VSX)
+ ? __rawmemchr_power10 :
(hwcap2 & PPC_FEATURE2_ARCH_3_00)
? __rawmemchr_power9 :
# endif
@@ -1,6 +1,6 @@
%define glibcsrcdir glibc-2.28
%define glibcversion 2.28
-%define glibcrelease 158%{?dist}
+%define glibcrelease 160%{?dist}
# Pre-release tarballs are pulled in from git using a command that is
# effectively:
#
@@ -706,6 +706,14 @@ Patch569: glibc-rh1934155-3.patch
Patch570: glibc-rh1934155-4.patch
Patch571: glibc-rh1934155-5.patch
Patch572: glibc-rh1934155-6.patch
+Patch573: glibc-rh1956357-1.patch
+Patch574: glibc-rh1956357-2.patch
+Patch575: glibc-rh1956357-3.patch
+Patch576: glibc-rh1956357-4.patch
+Patch577: glibc-rh1956357-5.patch
+Patch578: glibc-rh1956357-6.patch
+Patch579: glibc-rh1956357-7.patch
+Patch580: glibc-rh1956357-8.patch

##############################################################################
# Continued list of core "glibc" package information:
@@ -2617,6 +2625,12 @@ fi
%files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared

%changelog
+* Mon May 31 2021 Arjun Shankar <arjun@redhat.com> - 2.28-160
+- Backport POWER10 optimized rawmemchr for ppc64le (#1956357)
+
+* Thu May 27 2021 Arjun Shankar <arjun@redhat.com> - 2.28-159
+- Backport additional ifunc optimizations for ppc64le (#1956357)
+
* Thu Apr 22 2021 Florian Weimer <fweimer@redhat.com> - 2.28-158
- Rebuild with new binutils (#1946518)