import glibc-2.28-160.el8

2021-06-19 04:21:01 +00:00 · 2021-06-19 04:21:01 +00:00 · c11d47b279
commit c11d47b279
parent c04956366c
9 changed files with 2286 additions and 1 deletions
--- a/SOURCES/glibc-rh1956357-1.patch
+++ b/SOURCES/glibc-rh1956357-1.patch
@ -0,0 +1,100 @@
 commit 56c81132ccc6f468fa4fc29c536db060e18e9d87
 Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
 Date:   Tue Feb 23 14:14:37 2021 -0300
    powerpc: Add optimized ilogb* for POWER9
    The instructions xsxexpdp and xsxexpqp introduced on POWER9 extract
    the exponent from a double-precision and quad-precision floating-point
    respectively, thus they can be used to improve ilogb, ilogbf and ilogbf128.
 diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h
 index e642d6c8237578ea..5bbc468829062a48 100644
 --- a/sysdeps/powerpc/fpu/math_private.h
 +++ b/sysdeps/powerpc/fpu/math_private.h
@@ -26,7 +26,28 @@
 #include_next <math_private.h>
 -#if defined _ARCH_PWR9 && __HAVE_DISTINCT_FLOAT128
 +#ifdef _ARCH_PWR9
 +
 +#if __GNUC_PREREQ (8, 0)
 +# define _GL_HAS_BUILTIN_ILOGB 1
 +#elif defined __has_builtin
 +# define _GL_HAS_BUILTIN_ILOGB __has_builtin (__builtin_vsx_scalar_extract_exp)
 +#else
 +# define _GL_HAS_BUILTIN_ILOGB 0
 +#endif
 +
 +#define __builtin_test_dc_ilogbf __builtin_test_dc_ilogb
 +#define __builtin_ilogbf __builtin_ilogb
 +/* Double precision: data-class test, and exponent minus the 0x3ff bias.  */
 +#define __builtin_test_dc_ilogb(x, y) \
 +        __builtin_vsx_scalar_test_data_class_dp(x, y)
 +#define __builtin_ilogb(x) (__builtin_vsx_scalar_extract_exp(x) - 0x3ff)
 +/* Quad precision: same, with the 0x3fff bias.  Expansions are parenthesized.  */
 +#define __builtin_test_dc_ilogbf128(x, y) \
 +        __builtin_vsx_scalar_test_data_class_qp(x, y)
 +#define __builtin_ilogbf128(x) (__builtin_vsx_scalar_extract_expq(x) - 0x3fff)
 +
 +#if __HAVE_DISTINCT_FLOAT128
 extern __always_inline _Float128
 __ieee754_sqrtf128 (_Float128 __x)
 {
@@ -35,6 +56,9 @@ __ieee754_sqrtf128 (_Float128 __x)
   return __z;
 }
 #endif
 +#else /* !_ARCH_PWR9 */
 +#define _GL_HAS_BUILTIN_ILOGB 0
 +#endif
 #if defined _ARCH_PWR5X
 diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
 new file mode 100644
 index 0000000000000000..b5c1c0aa9db86f3d
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c
@@ -0,0 +1,30 @@
 +#include <math.h>
 +#include <errno.h>
 +#include <limits.h>
 +#include <math_private.h>
 +#include <fenv.h>
 +
 +#if _GL_HAS_BUILTIN_ILOGB
 +int
 +M_DECL_FUNC (__ilogb) (FLOAT x)
 +{
 +  int r;
 +  /* Fast path when x is in none of the data classes selected by 0x7f.  */
 +  if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
 +    r = M_SUF (__builtin_ilogb) (x);
 +  else
 +    /* Fall back to the generic ilogb if x is NaN, Inf, zero or subnormal.  */
 +    r = M_SUF (__ieee754_ilogb) (x);
 +  if (__builtin_expect (r == FP_ILOGB0, 0)
 +      || __builtin_expect (r == FP_ILOGBNAN, 0)
 +      || __builtin_expect (r == INT_MAX, 0))
 +    {
 +      __set_errno (EDOM);	/* Special result: report a domain error.  */
 +      __feraiseexcept (FE_INVALID);
 +    }
 +  return r;
 +}
 +declare_mgen_alias (__ilogb, ilogb)
 +#else
 +#include <math/w_ilogb_template.c>
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
 new file mode 100644
 index 0000000000000000..205f154f0089a269
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c
@@ -0,0 +1,4 @@
 +/* Skip the optimization for long double as ibm128 does not provide an
 +   optimized builtin. */
 +#include <math-type-macros-ldouble.h>
 +#include <math/w_ilogb_template.c>
--- a/SOURCES/glibc-rh1956357-2.patch
+++ b/SOURCES/glibc-rh1956357-2.patch
@ -0,0 +1,64 @@
 commit a7d88506c260e7a0e4268803e76fc19e38ed041f
 Author: Raphael Moreira Zinsly <rzinsly@linux.ibm.com>
 Date:   Thu Feb 25 09:58:52 2021 -0300
    powerpc: Add optimized llogb* for POWER9
    The POWER9 builtins used to improve the ilogb* functions can be
    used in the llogb* functions as well.
 diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
 new file mode 100644
 index 0000000000000000..d00b71d2a34e28da
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c
@@ -0,0 +1,39 @@
 +#include <math.h>
 +#include <errno.h>
 +#include <limits.h>
 +#include <math_private.h>
 +#include <fenv.h>
 +
 +#if _GL_HAS_BUILTIN_ILOGB
 +long int
 +M_DECL_FUNC (__llogb) (FLOAT x)
 +{
 +  int r;
 +  /* Fast path when x is in none of the data classes selected by 0x7f.  */
 +  if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f))
 +    r = M_SUF (__builtin_ilogb) (x);
 +  else
 +    /* Fall back to the generic ilogb if x is NaN, Inf, zero or subnormal.  */
 +    r = M_SUF (__ieee754_ilogb) (x);
 +  long int lr = r;
 +  if (__glibc_unlikely (r == FP_ILOGB0)
 +      || __glibc_unlikely (r == FP_ILOGBNAN)
 +      || __glibc_unlikely (r == INT_MAX))
 +    {
 +#if LONG_MAX != INT_MAX
 +      if (r == FP_ILOGB0)	/* Map int sentinels to long int ones.  */
 +	lr = FP_LLOGB0;
 +      else if (r == FP_ILOGBNAN)
 +	lr = FP_LLOGBNAN;
 +      else
 +	lr = LONG_MAX;
 +#endif
 +      __set_errno (EDOM);	/* Special result: report a domain error.  */
 +      __feraiseexcept (FE_INVALID);
 +    }
 +  return lr;
 +}
 +declare_mgen_alias (__llogb, llogb)
 +#else
 +#include <math/w_llogb_template.c>
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
 new file mode 100644
 index 0000000000000000..69477a37ae82c476
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c
@@ -0,0 +1,4 @@
 +/* Skip the optimization for long double as ibm128 does not provide an
 +   optimized builtin. */
 +#include <math-type-macros-ldouble.h>
 +#include <math/w_llogb_template.c>
--- a/SOURCES/glibc-rh1956357-3.patch
+++ b/SOURCES/glibc-rh1956357-3.patch
@ -0,0 +1,334 @@
 commit 10624a97e8e47004985740cbb04060a84cfada76
 Author: Matheus Castanho <msc@linux.ibm.com>
 Date:   Tue Sep 29 15:40:08 2020 -0300
    powerpc: Add optimized strlen for POWER10
    Improvements compared to POWER9 version:
    1. Take into account first 16B comparison for aligned strings
       The previous version compares the first 16B and increments r4 by the number
       of bytes until the address is 16B-aligned, then starts doing aligned loads at
       that address. For aligned strings, this causes the first 16B to be compared
       twice, because the increment is 0. Here we calculate the next 16B-aligned
       address differently, which avoids that issue.
    2. Use simple comparisons for the first ~192 bytes
       The main loop is good for big strings, but comparing 16B each time is better
       for smaller strings.  So after aligning the address to 16 Bytes, we check
       more 176B in 16B chunks.  There may be some overlaps with the main loop for
       unaligned strings, but we avoid using the more aggressive strategy too soon,
       and also allow the loop to start at a 64B-aligned address.  This greatly
       benefits smaller strings and avoids overlapping checks if the string is
       already aligned at a 64B boundary.
    3. Reduce dependencies between load blocks caused by address calculation on loop
       Doing a precise time tracing on the code showed many loads in the loop were
       stalled waiting for updates to r4 from previous code blocks.  This
       implementation avoids that as much as possible by using 2 registers (r4 and
       r5) to hold addresses to be used by different parts of the code.
       Also, the previous code aligned the address to 16B, then to 64B by doing a
       few 48B loops (if needed) until the address was aligned. The main loop could
       not start until that 48B loop had finished and r4 was updated with the
       current address. Here we calculate the address used by the loop very early,
       so it can start sooner.
       The main loop now uses 2 pointers 128B apart to make pointer updates less
       frequent, and also unrolls 1 iteration to guarantee there is enough time
       between iterations to update the pointers, reducing stalled cycles.
    4. Use new P10 instructions
       lxvp is used to load 32B with a single instruction, reducing contention in
       the load queue.
       vextractbm allows simplifying the tail code for the loop, replacing
       vbpermq and avoiding having to generate a permute control vector.
    Reviewed-by: Paul E Murphy <murphyp@linux.ibm.com>
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
 new file mode 100644
 index 0000000000000000..ca7e9eb3d84c9b00
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -0,0 +1,221 @@
 +/* Optimized strlen implementation for POWER10 LE.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +#ifndef STRLEN
 +# define STRLEN __strlen
 +# define DEFINE_STRLEN_HIDDEN_DEF 1
 +#endif
 +
 +/* TODO: Replace macros by the actual instructions when minimum binutils becomes
 +   >= 2.35.  This is used to keep compatibility with older versions.  */
 +#define VEXTRACTBM(rt,vrb)	 \
 +	.long(((4)<<(32-6))	 \
 +	      | ((rt)<<(32-11))	 \
 +	      | ((8)<<(32-16))	 \
 +	      | ((vrb)<<(32-21)) \
 +	      | 1602)
 +
 +#define LXVP(xtp,dq,ra)		   \
 +	.long(((6)<<(32-6))		   \
 +	      | ((((xtp)-32)>>1)<<(32-10)) \
 +	      | ((1)<<(32-11))		   \
 +	      | ((ra)<<(32-16))		   \
 +	      | (dq))	/* Parenthesized: callers pass sums such as offset+32.  */
 +
 +#define CHECK16(vreg,offset,addr,label) \
 +	lxv	  vreg+32,offset(addr);	\
 +	vcmpequb. vreg,vreg,v18;	\
 +	bne	  cr6,L(label);
 +
 +/* Load 4 quadwords, merge into one VR for speed and check for NULLs.  r6 has #
 +   of bytes already checked.  */
 +#define CHECK64(offset,addr,label)	    \
 +	li	  r6,offset;		    \
 +	LXVP(v4+32,offset,addr);	    \
 +	LXVP(v6+32,offset+32,addr);	    \
 +	vminub	  v14,v4,v5;		    \
 +	vminub	  v15,v6,v7;		    \
 +	vminub	  v16,v14,v15;		    \
 +	vcmpequb. v0,v16,v18;		    \
 +	bne	  cr6,L(label)
 +
 +#define TAIL(vreg,increment)	   \
 +	vctzlsbb  r4,vreg;	   \
 +	subf	  r3,r3,r5;	   \
 +	addi	  r4,r4,increment; \
 +	add	  r3,r3,r4;	   \
 +	blr
 +
 +/* Implements the function
 +
 +   size_t [r3] strlen (const char *s [r3])
 +
 +   The implementation can load bytes past a matching byte, but only
 +   up to the next 64B boundary, so it never crosses a page.  */
 +
 +.machine power9
 +
 +ENTRY_TOCLESS (STRLEN, 4)
 +	CALL_MCOUNT 1
 +
 +	vspltisb  v18,0
 +	vspltisb  v19,-1
 +
 +	/* Next 16B-aligned address. Prepare address for L(aligned).  */
 +	addi	  r5,r3,16
 +	clrrdi	  r5,r5,4
 +
 +	/* Align data and fill bytes not loaded with non matching char.	 */
 +	lvx	  v0,0,r3
 +	lvsr	  v1,0,r3
 +	vperm	  v0,v19,v0,v1
 +
 +	vcmpequb. v6,v0,v18
 +	beq	  cr6,L(aligned)
 +
 +	vctzlsbb  r3,v6
 +	blr
 +
 +	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
 +	   strings, so checking the first bytes in 16B chunks greatly benefits
 +	   small strings.  */
 +	.p2align 5
 +L(aligned):
 +	/* Prepare address for the loop.  */
 +	addi	  r4,r3,192
 +	clrrdi	  r4,r4,6
 +
 +	CHECK16(v0,0,r5,tail1)
 +	CHECK16(v1,16,r5,tail2)
 +	CHECK16(v2,32,r5,tail3)
 +	CHECK16(v3,48,r5,tail4)
 +	CHECK16(v4,64,r5,tail5)
 +	CHECK16(v5,80,r5,tail6)
 +	CHECK16(v6,96,r5,tail7)
 +	CHECK16(v7,112,r5,tail8)
 +	CHECK16(v8,128,r5,tail9)
 +	CHECK16(v9,144,r5,tail10)
 +	CHECK16(v10,160,r5,tail11)
 +
 +	addi	  r5,r4,128
 +
 +	/* Switch to a more aggressive approach checking 64B each time.  Use 2
 +	   pointers 128B apart and unroll the loop once to make the pointer
 +	   updates and usages separated enough to avoid stalls waiting for
 +	   address calculation.  */
 +	.p2align 5
 +L(loop):
 +	CHECK64(0,r4,pre_tail_64b)
 +	CHECK64(64,r4,pre_tail_64b)
 +	addi	  r4,r4,256
 +
 +	CHECK64(0,r5,tail_64b)
 +	CHECK64(64,r5,tail_64b)
 +	addi	  r5,r5,256
 +
 +	b	  L(loop)
 +
 +	.p2align  5
 +L(pre_tail_64b):
 +	mr	r5,r4
 +L(tail_64b):
 +	/* OK, we found a null byte.  Let's look for it in the current 64-byte
 +	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
 +	   low 16B bytes into vx+1, and the high into vx, so the order here is
 +	   v5, v4, v7, v6.  */
 +	vcmpequb  v1,v5,v18
 +	vcmpequb  v2,v4,v18
 +	vcmpequb  v3,v7,v18
 +	vcmpequb  v4,v6,v18
 +
 +	/* Take into account the other 64B blocks we had already checked.  */
 +	add	r5,r5,r6
 +
 +	/* Extract first bit of each byte.  */
 +	VEXTRACTBM(r7,v1)
 +	VEXTRACTBM(r8,v2)
 +	VEXTRACTBM(r9,v3)
 +	VEXTRACTBM(r10,v4)
 +
 +	/* Shift each value into their corresponding position.  */
 +	sldi	  r8,r8,16
 +	sldi	  r9,r9,32
 +	sldi	  r10,r10,48
 +
 +	/* Merge the results.  */
 +	or	  r7,r7,r8
 +	or	  r8,r9,r10
 +	or	  r10,r8,r7
 +
 +	cnttzd	  r0,r10	  /* Count trailing zeros before the match.  */
 +	subf	  r5,r3,r5
 +	add	  r3,r5,r0	  /* Compute final length.  */
 +	blr
 +
 +	.p2align  5
 +L(tail1):
 +	TAIL(v0,0)
 +
 +	.p2align  5
 +L(tail2):
 +	TAIL(v1,16)
 +
 +	.p2align  5
 +L(tail3):
 +	TAIL(v2,32)
 +
 +	.p2align  5
 +L(tail4):
 +	TAIL(v3,48)
 +
 +	.p2align  5
 +L(tail5):
 +	TAIL(v4,64)
 +
 +	.p2align  5
 +L(tail6):
 +	TAIL(v5,80)
 +
 +	.p2align  5
 +L(tail7):
 +	TAIL(v6,96)
 +
 +	.p2align  5
 +L(tail8):
 +	TAIL(v7,112)
 +
 +	.p2align  5
 +L(tail9):
 +	TAIL(v8,128)
 +
 +	.p2align  5
 +L(tail10):
 +	TAIL(v9,144)
 +
 +	.p2align  5
 +L(tail11):
 +	TAIL(v10,160)
 +
 +END (STRLEN)
 +
 +#ifdef DEFINE_STRLEN_HIDDEN_DEF
 +weak_alias (__strlen, strlen)
 +libc_hidden_builtin_def (strlen)
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
 index a9e13e05e90601cd..61652b65dd223018 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
 +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 -		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9
 +		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
 +		   strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index b30bc53930fc0e36..46d5956adda72b86 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -112,6 +112,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c.  */
   IFUNC_IMPL (i, name, strlen,
 #ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1,
 +			      __strlen_power10)
 	      IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00,
 			      __strlen_power9)
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
 new file mode 100644
 index 0000000000000000..6a774fad58c77179
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S
@@ -0,0 +1,2 @@
 +#define STRLEN __strlen_power10
 +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 index b7f0fbb13fb97783..11bdb96de2d2aa66 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c
@@ -31,9 +31,12 @@ extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden;
 extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden;
 +extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden;
 libc_ifunc (__libc_strlen,
 # ifdef __LITTLE_ENDIAN__
 +	(hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +	? __strlen_power10 :
 	  (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 	  ? __strlen_power9 :
 # endif
--- a/SOURCES/glibc-rh1956357-4.patch
+++ b/SOURCES/glibc-rh1956357-4.patch
@ -0,0 +1,527 @@
 commit dd59655e9371af86043b97e38953f43bd9496699
 Author: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
 Date:   Fri Apr 30 18:12:08 2021 -0300
    powerpc64le: Optimized memmove for POWER10
    This patch was initially based on the __memmove_power7 with some ideas
    from strncpy implementation for Power 9.
    Improvements from __memmove_power7:
    1. Use lxvl/stxvl for alignment code.
       The code for Power 7 uses branches when the input is not naturally
       aligned to the width of a vector. The new implementation uses
       lxvl/stxvl instead which reduces pressure on GPRs. It also allows
       the removal of branch instructions, implicitly removing branch stalls
       and mispredictions.
    2. Use of lxv/stxv and lxvl/stxvl pair is safe to use on Cache Inhibited
       memory.
       On Power 10 vector load and stores are safe to use on CI memory for
       addresses unaligned to 16B. This code takes advantage of this to
       do unaligned loads.
       The unaligned loads don't have a significant performance impact by
       themselves. However doing so decreases register pressure on GPRs
       and interdependence stalls on load/store pairs. This also improved
       readability as there are now less code paths for different alignments.
       Finally this reduces the overall code size.
    3. Improved performance.
       This version runs on average about 30% better than memmove_power7
       for lengths  larger than 8KB. For input lengths shorter than 8KB
       the improvement is smaller, it has on average about 17% better
       performance.
       This version has a degradation of about 50% for input lengths
       in the 0 to 31 bytes range when dest is unaligned.
    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/memmove.S b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
 new file mode 100644
 index 0000000000000000..7dfd57edeb37e8e4
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/power10/memmove.S
@@ -0,0 +1,320 @@
 +/* Optimized memmove implementation for POWER10.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +
 +/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5])
 +
 +   This optimization checks if 'src' and 'dst' overlap.  If they do not
 +   or 'src' is ahead of 'dest' then it copies forward.
 +   Otherwise, an optimized backward copy is used.  */
 +
 +#ifndef MEMMOVE
 +# define MEMMOVE memmove
 +#endif
 +	.machine power9
 +ENTRY_TOCLESS (MEMMOVE, 5)
 +	CALL_MCOUNT 3
 +
 +L(_memmove):
 +	.p2align 5
 +	/* Check if there is overlap, if so it will branch to backward copy.  */
 +	subf	r9,r4,r3
 +	cmpld	cr7,r9,r5
 +	blt	cr7,L(memmove_bwd)
 +
 +	/* Fast path for length shorter than 16 bytes.  */
 +	sldi	r7,r5,56
 +	lxvl	32+v2,r4,r7
 +	stxvl	32+v2,r3,r7
 +	subic.	r8,r5,16
 +	blelr
 +
 +	/* For shorter lengths aligning the dest address to 16 bytes either
 +	   decreases performance or is irrelevant.  Use this comparison to
 +	   skip the alignment for lengths below 256 bytes.  */
 +	cmpldi	cr6,r5,256
 +	bge	cr6,L(ge_256)
 +	/* Account for the first 16-byte copy.  */
 +	addi	r4,r4,16
 +	addi	r11,r3,16	/* use r11 to keep dest address on r3.  */
 +	subi	r5,r5,16
 +	b	L(loop_head)
 +
 +	.p2align 5
 +L(ge_256):
 +	/* Account for the first copy <= 16 bytes.  This is necessary for
 +	   memmove because at this point the src address can be in front of the
 +	   dest address.  */
 +	clrldi	r9,r5,56
 +	li	r8,16
 +	cmpldi	r9,16
 +	iselgt	r9,r8,r9
 +	add	r4,r4,r9
 +	add	r11,r3,r9	/* use r11 to keep dest address on r3.  */
 +	sub	r5,r5,r9
 +
 +	/* Align dest to 16 bytes.  */
 +	neg	r7,r3
 +	clrldi.	r9,r7,60
 +	beq	L(loop_head)
 +
 +	.p2align 5
 +	sldi	r6,r9,56
 +	lxvl	32+v0,r4,r6
 +	stxvl	32+v0,r11,r6
 +	sub	r5,r5,r9
 +	add	r4,r4,r9
 +	add	r11,r11,r9
 +
 +L(loop_head):
 +	cmpldi	r5,63
 +	ble	L(final_64)
 +
 +	srdi.	r7,r5,7
 +	beq	L(loop_tail)
 +
 +	mtctr	r7
 +
 +/* Main loop that copies 128 bytes each iteration.  */
 +	.p2align 5
 +L(loop):
 +	addi	r9,r4,64
 +	addi	r10,r11,64
 +
 +	lxv	32+v0,0(r4)
 +	lxv	32+v1,16(r4)
 +	lxv	32+v2,32(r4)
 +	lxv	32+v3,48(r4)
 +
 +	stxv	32+v0,0(r11)
 +	stxv	32+v1,16(r11)
 +	stxv	32+v2,32(r11)
 +	stxv	32+v3,48(r11)
 +
 +	addi	r4,r4,128
 +	addi	r11,r11,128
 +
 +	lxv	32+v4,0(r9)
 +	lxv	32+v5,16(r9)
 +	lxv	32+v6,32(r9)
 +	lxv	32+v7,48(r9)
 +
 +	stxv	32+v4,0(r10)
 +	stxv	32+v5,16(r10)
 +	stxv	32+v6,32(r10)
 +	stxv	32+v7,48(r10)
 +
 +	bdnz	L(loop)
 +	clrldi.	r5,r5,57
 +	beqlr
 +
 +/* Copy 64 bytes.  */
 +	.p2align 5
 +L(loop_tail):
 +	cmpldi 	cr5,r5,63
 +	ble	cr5,L(final_64)
 +
 +	lxv	32+v0,0(r4)
 +	lxv	32+v1,16(r4)
 +	lxv	32+v2,32(r4)
 +	lxv	32+v3,48(r4)
 +
 +	stxv	32+v0,0(r11)
 +	stxv	32+v1,16(r11)
 +	stxv	32+v2,32(r11)
 +	stxv	32+v3,48(r11)
 +
 +	addi	r4,r4,64
 +	addi	r11,r11,64
 +	subi	r5,r5,64
 +
 +/* Copies the last 1-63 bytes.  */
 +	.p2align 5
 +L(final_64):
 +	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
 +	clrrdi.	r8,r5,4
 +	beq	L(tail1)
 +
 +	cmpldi  cr5,r5,32
 +	lxv	32+v0,0(r4)
 +	blt	cr5,L(tail2)
 +
 +	cmpldi	cr6,r5,48
 +	lxv	32+v1,16(r4)
 +	blt	cr6,L(tail3)
 +
 +	.p2align 5
 +	lxv	32+v2,32(r4)
 +	stxv	32+v2,32(r11)
 +L(tail3):
 +	stxv	32+v1,16(r11)
 +L(tail2):
 +	stxv	32+v0,0(r11)
 +	sub	r5,r5,r8
 +	add	r4,r4,r8
 +	add	r11,r11,r8
 +	.p2align 5
 +L(tail1):
 +	sldi	r6,r5,56
 +	lxvl	v4,r4,r6
 +	stxvl	v4,r11,r6
 +	blr
 +
 +/* If dest and src overlap, we should copy backwards.  */
 +L(memmove_bwd):
 +	add	r11,r3,r5
 +	add	r4,r4,r5
 +
 +	/* Optimization for length smaller than 16 bytes.  */
 +	cmpldi	cr5,r5,15
 +	ble	cr5,L(tail1_bwd)
 +
 +	/* For shorter lengths the alignment either slows down or is irrelevant.
 +	   The forward copy reuses a comparison against 256 it already needs.
 +	   Here 128 is used, as it reduces code and improves readability.  */
 +	cmpldi	cr7,r5,128
 +	blt	cr7,L(bwd_loop_tail)
 +
 +	/* Align dest address to 16 bytes.  */
 +	.p2align 5
 +	clrldi.	r9,r11,60
 +	beq	L(bwd_loop_head)
 +	sub	r4,r4,r9
 +	sub	r11,r11,r9
 +	lxv	32+v0,0(r4)
 +	sldi	r6,r9,56
 +	stxvl   32+v0,r11,r6
 +	sub	r5,r5,r9
 +
 +L(bwd_loop_head):
 +	srdi.	r7,r5,7
 +	beq	L(bwd_loop_tail)
 +
 +	mtctr	r7
 +
 +/* Main loop that copies 128 bytes every iteration.  */
 +	.p2align 5
 +L(bwd_loop):
 +	addi	r9,r4,-64
 +	addi	r10,r11,-64
 +
 +	lxv	32+v0,-16(r4)
 +	lxv	32+v1,-32(r4)
 +	lxv	32+v2,-48(r4)
 +	lxv	32+v3,-64(r4)
 +
 +	stxv	32+v0,-16(r11)
 +	stxv	32+v1,-32(r11)
 +	stxv	32+v2,-48(r11)
 +	stxv	32+v3,-64(r11)
 +
 +	addi	r4,r4,-128
 +	addi	r11,r11,-128
 +
 +	lxv	32+v0,-16(r9)
 +	lxv	32+v1,-32(r9)
 +	lxv	32+v2,-48(r9)
 +	lxv	32+v3,-64(r9)
 +
 +	stxv	32+v0,-16(r10)
 +	stxv	32+v1,-32(r10)
 +	stxv	32+v2,-48(r10)
 +	stxv	32+v3,-64(r10)
 +
 +	bdnz	L(bwd_loop)
 +	clrldi.	r5,r5,57
 +	beqlr
 +
 +/* Copy 64 bytes.  */
 +	.p2align 5
 +L(bwd_loop_tail):
 +	cmpldi 	cr5,r5,63
 +	ble	cr5,L(bwd_final_64)
 +
 +	addi	r4,r4,-64
 +	addi	r11,r11,-64
 +
 +	lxv	32+v0,0(r4)
 +	lxv	32+v1,16(r4)
 +	lxv	32+v2,32(r4)
 +	lxv	32+v3,48(r4)
 +
 +	stxv	32+v0,0(r11)
 +	stxv	32+v1,16(r11)
 +	stxv	32+v2,32(r11)
 +	stxv	32+v3,48(r11)
 +
 +	subi	r5,r5,64
 +
 +/* Copies the last 1-63 bytes.  */
 +	.p2align 5
 +L(bwd_final_64):
 +	/* r8 holds the number of bytes that will be copied with lxv/stxv.  */
 +	clrrdi.	r8,r5,4
 +	beq	L(tail1_bwd)
 +
 +	cmpldi	cr5,r5,32
 +	lxv	32+v2,-16(r4)
 +	blt	cr5,L(tail2_bwd)
 +
 +	cmpldi	cr6,r5,48
 +	lxv	32+v1,-32(r4)
 +	blt	cr6,L(tail3_bwd)
 +
 +	.p2align 5
 +	lxv	32+v0,-48(r4)
 +	stxv	32+v0,-48(r11)
 +L(tail3_bwd):
 +	stxv	32+v1,-32(r11)
 +L(tail2_bwd):
 +	stxv	32+v2,-16(r11)
 +	sub	r4,r4,r5
 +	sub	r11,r11,r5
 +	sub	r5,r5,r8
 +	sldi	r6,r5,56
 +	lxvl	v4,r4,r6
 +	stxvl	v4,r11,r6
 +	blr
 +
 +/* Copy the last 0-15 bytes.  */
 +	.p2align 5
 +L(tail1_bwd):
 +	sub	r4,r4,r5
 +	sub	r11,r11,r5
 +	sldi	r6,r5,56
 +	lxvl	v4,r4,r6
 +	stxvl	v4,r11,r6
 +	blr
 +
 +END_GEN_TB (MEMMOVE,TB_TOCLESS)
 +libc_hidden_builtin_def (memmove)
 +
 +/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5])
 +   Implemented in this file to avoid the linker creating a stub function
 +   call in the branch to '_memmove'.  */
 +ENTRY_TOCLESS (__bcopy)
 +	mr	r6,r3		/* Save src while the arguments are swapped.  */
 +	mr	r3,r4		/* r3 = dest, as memmove expects.  */
 +	mr	r4,r6		/* r4 = src.  */
 +	b	L(_memmove)
 +END (__bcopy)
 +#ifndef __bcopy
 +weak_alias (__bcopy, bcopy)
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
 index 61652b65dd223018..66f8c6ace9824d4a 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
 +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 ifneq (,$(filter %le,$(config-machine)))
 -sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 +sysdep_routines += memmove-power10 \
 +		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
 		   strlen-power10
 endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
 index 1c4a229b1fc5654a..705fef33d4e57557 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -22,8 +22,17 @@
 extern __typeof (bcopy) __bcopy_ppc attribute_hidden;
 /* __bcopy_power7 symbol is implemented at memmove-power7.S  */
 extern __typeof (bcopy) __bcopy_power7 attribute_hidden;
 +#ifdef __LITTLE_ENDIAN__
 +extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
 +#endif
 libc_ifunc (bcopy,
 +#ifdef __LITTLE_ENDIAN__
 +	     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +	     && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
 +	     && (hwcap & PPC_FEATURE_HAS_VSX)
 +	     ? __bcopy_power10 :	/* Needs ISA 3.1, isel and VSX together.  */
 +#endif
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __bcopy_power7
             : __bcopy_ppc);
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 46d5956adda72b86..4ce04bc51574cca1 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -67,6 +67,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c.  */
   IFUNC_IMPL (i, name, memmove,
 +#ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, memmove,
 +			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
 +			      && (hwcap & PPC_FEATURE_HAS_VSX),
 +			      __memmove_power10) /* All three required.  */
 +#endif
 	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
 			      __memmove_power7)
 	      IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc))
@@ -186,6 +193,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c.  */
   IFUNC_IMPL (i, name, bcopy,
 +#ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, bcopy,
 +			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +			      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
 +			      && (hwcap & PPC_FEATURE_HAS_VSX),
 +			      __bcopy_power10) /* All three required.  */
 +#endif
 	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
 			      __bcopy_power7)
 	      IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc))
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
 new file mode 100644
 index 0000000000000000..171b32921a0a4d47
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S
@@ -0,0 +1,27 @@
 +/* Optimized memmove implementation for POWER10.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#define MEMMOVE __memmove_power10
 +
 +#undef libc_hidden_builtin_def
 +#define libc_hidden_builtin_def(name)
 +
 +#undef __bcopy
 +#define __bcopy __bcopy_power10
 +
 +#include <sysdeps/powerpc/powerpc64/le/power10/memmove.S>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
 index 0b251d0f5f087874..fb5261ecda64d061 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S
@@ -21,7 +21,7 @@
 #undef libc_hidden_builtin_def
 #define libc_hidden_builtin_def(name)
 -#undef bcopy
 -#define bcopy __bcopy_power7
 +#undef __bcopy
 +#define __bcopy __bcopy_power7
 #include <sysdeps/powerpc/powerpc64/power7/memmove.S>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 index 39987155cc7d3624..2fd7b6d309e4bedd 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -28,14 +28,22 @@
 # include "init-arch.h"
 extern __typeof (__redirect_memmove) __libc_memmove;
 -
 extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden;
 extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden;
 +#ifdef __LITTLE_ENDIAN__
 +extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
 +#endif
 libc_ifunc (__libc_memmove,
 -            (hwcap & PPC_FEATURE_HAS_VSX)
 -            ? __memmove_power7
 -            : __memmove_ppc);
 +#ifdef __LITTLE_ENDIAN__
 +	     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +	     && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
 +	     && (hwcap & PPC_FEATURE_HAS_VSX)
 +	     ? __memmove_power10 :	/* Needs ISA 3.1, isel and VSX together.  */
 +#endif
 +		     (hwcap & PPC_FEATURE_HAS_VSX)
 +		     ? __memmove_power7
 +		     : __memmove_ppc);
 #undef memmove
 strong_alias (__libc_memmove, memmove);
 diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S
 index b7f3dc28d1a8eac3..9e4cabb07ef9b732 100644
 --- a/sysdeps/powerpc/powerpc64/power7/memmove.S
 +++ b/sysdeps/powerpc/powerpc64/power7/memmove.S
@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy)
 	mr	r4,r6
 	b	L(_memmove)
 END (__bcopy)
 +#ifndef __bcopy
 weak_alias (__bcopy, bcopy)
 +#endif
--- a/SOURCES/glibc-rh1956357-5.patch
+++ b/SOURCES/glibc-rh1956357-5.patch
@ -0,0 +1,308 @@
 commit e941e0ae80626b7661c1db8953a673cafd3b8b19
 Author: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 Date:   Fri Apr 30 18:12:08 2021 -0300
    powerpc64le: Optimize memcpy for POWER10
    This implementation is based on __memcpy_power8_cached and integrates
    suggestions from Anton Blanchard.
    It benefits from loads and stores with length for short lengths and for
    tail code, simplifying the code.
    All unaligned memory accesses use instructions that do not generate
    alignment interrupts on POWER10, making it safe to use on
    caching-inhibited memory.
    The main loop has also been modified in order to increase instruction
    throughput by reducing the dependency on updates from previous iterations.
    On average, this implementation provides around 30% improvement when
    compared to __memcpy_power7 and 10% improvement in comparison to
    __memcpy_power8_cached.
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
 new file mode 100644
 index 0000000000000000..ad1414db4a3a8b9f
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S
@@ -0,0 +1,198 @@
 +/* Optimized memcpy implementation for POWER10.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <http://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +
 +#ifndef MEMCPY
 +# define MEMCPY memcpy
 +#endif
 +
 +/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]);
 +	   Returns 'dst'.  */
 +
 +	.machine power9
 +ENTRY_TOCLESS (MEMCPY, 5)
 +	CALL_MCOUNT 3
 +
 +	/* Copy up to 16 bytes.  */
 +	sldi	r6,r5,56	/* Prepare [l|st]xvl counter.  */
 +	lxvl	v10,r4,r6
 +	stxvl	v10,r3,r6
 +	subic.	r6,r5,16	/* Return if len <= 16.  */
 +	blelr
 +
 +	/* If len >= 256, the [l|st]xvl counter above held only the low byte
 +	   of len, so assume nothing got copied and copy again.  This might
 +	   misbehave on overlapping memory, which memcpy need not handle.  */
 +	cmpdi	r5,256
 +	bge	L(copy_ge_256)
 +	/* 16 < len < 256 and the first 16 bytes have already been copied.  */
 +	addi	r10,r3,16	/* Keep r3 intact as return value.  */
 +	addi	r4,r4,16
 +	subi	r5,r5,16
 +	b	L(copy_lt_256)	/* Avoid the main loop if len < 256.  */
 +
 +	.p2align 5
 +L(copy_ge_256):
 +	mr	r10,r3		/* Keep r3 intact as return value.  */
 +	/* Align dst to 16 bytes.  */
 +	andi.	r9,r10,0xf
 +	beq	L(dst_is_align_16)
 +	lxv	v10,0(r4)
 +	subfic	r12,r9,16
 +	subf	r5,r12,r5
 +	add	r4,r4,r12
 +	stxv	v10,0(r3)
 +	add	r10,r3,r12
 +
 +L(dst_is_align_16):
 +	srdi	r9,r5,7		/* Divide by 128.  */
 +	mtctr	r9
 +	addi	r6,r4,64
 +	addi	r7,r10,64
 +
 +
 +	/* Main loop, copy 128 bytes per iteration.
 +	   Use r6=src+64 and r7=dest+64 in order to reduce the dependency on
 +	   r4 and r10.  */
 +	.p2align 5
 +L(copy_128):
 +
 +	lxv	v10, 0(r4)
 +	lxv	v11, 16(r4)
 +	lxv	v12, 32(r4)
 +	lxv	v13, 48(r4)
 +
 +	addi	r4,r4,128
 +
 +	stxv	v10, 0(r10)
 +	stxv	v11, 16(r10)
 +	stxv	v12, 32(r10)
 +	stxv	v13, 48(r10)
 +
 +	addi	r10,r10,128
 +
 +	lxv	v10, 0(r6)
 +	lxv	v11, 16(r6)
 +	lxv	v12, 32(r6)
 +	lxv	v13, 48(r6)
 +
 +	addi	r6,r6,128
 +
 +	stxv	v10, 0(r7)
 +	stxv	v11, 16(r7)
 +	stxv	v12, 32(r7)
 +	stxv	v13, 48(r7)
 +
 +	addi	r7,r7,128
 +
 +	bdnz	L(copy_128)
 +
 +	clrldi.	r5,r5,64-7	/* Have we copied everything?  */
 +	beqlr
 +
 +	.p2align 5
 +L(copy_lt_256):
 +	cmpdi	r5,16
 +	ble	L(copy_le_16)
 +	srdi.	r9,r5,5		/* Divide by 32.  */
 +	beq	L(copy_lt_32)
 +	mtctr	r9
 +	/* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce
 +	   the dependency on r4 and r10.  */
 +	addi	r6,r4,32
 +	addi	r7,r10,32
 +	addi	r8,r4,64
 +	addi	r9,r10,64
 +
 +	.p2align 5
 +	/* Copy 32 bytes at a time, unaligned.
 +	   The loop is unrolled 3 times in order to reduce the dependency on
 +	   r4 and r10, copying up to 96 bytes per iteration.  */
 +L(copy_32):
 +	lxv	v10, 0(r4)
 +	lxv	v11, 16(r4)
 +	stxv	v10, 0(r10)
 +	stxv	v11, 16(r10)
 +	bdz	L(end_copy_32a)
 +	addi	r4,r4,96
 +	addi	r10,r10,96
 +
 +	lxv	v10, 0(r6)
 +	lxv	v11, 16(r6)
 +	addi	r6,r6,96
 +	stxv	v10, 0(r7)
 +	stxv	v11, 16(r7)
 +	bdz	L(end_copy_32b)
 +	addi	r7,r7,96
 +
 +	lxv	v12, 0(r8)
 +	lxv	v13, 16(r8)
 +	addi	r8,r8,96
 +	stxv	v12, 0(r9)
 +	stxv	v13, 16(r9)
 +	addi	r9,r9,96
 +	bdnz	L(copy_32)
 +
 +	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
 +	beqlr
 +	cmpdi	r5,16
 +	ble	L(copy_le_16)
 +	b	L(copy_lt_32)
 +
 +	.p2align 5
 +L(end_copy_32a):
 +	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
 +	beqlr
 +	/* 32 bytes have been copied since the last update of r4 and r10.  */
 +	addi	r4,r4,32
 +	addi	r10,r10,32
 +	cmpdi	r5,16
 +	ble	L(copy_le_16)
 +	b	L(copy_lt_32)
 +
 +	.p2align 5
 +L(end_copy_32b):
 +	clrldi.	r5,r5,64-5	/* Have we copied everything?  */
 +	beqlr
 +	/* The last iteration of the loop copied 64 bytes.  Update r4 and r10
 +	   accordingly.  */
 +	addi	r4,r4,-32
 +	addi	r10,r10,-32
 +	cmpdi	r5,16
 +	ble	L(copy_le_16)
 +
 +	.p2align 5
 +L(copy_lt_32):
 +	lxv	v10, 0(r4)
 +	stxv	v10, 0(r10)
 +	addi	r4,r4,16
 +	addi	r10,r10,16
 +	subi	r5,r5,16
 +
 +	.p2align 5
 +L(copy_le_16):
 +	sldi	r6,r5,56
 +	lxvl	v10,r4,r6
 +	stxvl	v10,r10,r6
 +	blr
 +
 +
 +END_GEN_TB (MEMCPY,TB_TOCLESS)
 +libc_hidden_builtin_def (memcpy)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
 index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
 +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 ifneq (,$(filter %le,$(config-machine)))
 -sysdep_routines += memmove-power10 \
 +sysdep_routines += memcpy-power10 memmove-power10 \
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
 		   strlen-power10
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 4ce04bc51574cca1..9d5a14e480c02171 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 #ifdef SHARED
   /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c.  */
   IFUNC_IMPL (i, name, memcpy,
 +#ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, memcpy,
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 +			      __memcpy_power10)
 +#endif
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memcpy_power8_cached)
 	      IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX,
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
 new file mode 100644
 index 0000000000000000..70e0fc3ed610cdc3
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S
@@ -0,0 +1,26 @@
 +/* Optimized memcpy implementation for POWER10.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
 +#define MEMCPY __memcpy_power10
 +
 +#undef libc_hidden_builtin_def
 +#define libc_hidden_builtin_def(name)
 +
 +#include <sysdeps/powerpc/powerpc64/le/power10/memcpy.S>
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
 index 44dea594f3770673..be0e47f32dde2ccf 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c
@@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden;
 extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden;
 +# if defined __LITTLE_ENDIAN__
 +extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden;
 +# endif
 libc_ifunc (__libc_memcpy,
 +# if defined __LITTLE_ENDIAN__
 +	    (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX)
 +	    ? __memcpy_power10 :
 +# endif
 	    ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt)
 	    ? __memcpy_power8_cached :
 	      (hwcap & PPC_FEATURE_HAS_VSX)
--- a/SOURCES/glibc-rh1956357-6.patch
+++ b/SOURCES/glibc-rh1956357-6.patch
@ -0,0 +1,420 @@
 commit 23fdf8178cce3c2ec320dd5eca8b544245bcaef0
 Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
 Date:   Fri Apr 30 18:12:08 2021 -0300
    powerpc64le: Optimize memset for POWER10
    This implementation is based on __memset_power8 and integrates a lot
    of suggestions from Anton Blanchard.
    The biggest difference is that it makes extensive use of stxvl in the
    alignment and tail code to avoid branches and small stores.  It has
    three main execution paths:
    a) "Short lengths" for lengths up to 64 bytes, avoiding as many
       branches as possible.
    b) "General case" for larger lengths, it has an alignment section
       using stxvl to avoid branches, a 128 bytes loop and then a tail
       code, again using stxvl with few branches.
    c) "Zeroing cache blocks" for lengths from 256 bytes upwards and set
       value being zero.  It is mostly the __memset_power8 code but the
       alignment phase was simplified because, at this point, address is
       already 16-bytes aligned and also changed to use vector stores.
       The tail code was also simplified to reuse the general case tail.
    All unaligned stores use stxvl instructions that do not generate
    alignment interrupts on POWER10, making it safe to use on
    caching-inhibited memory.
    On average, this implementation provides something around 30%
    improvement when compared to __memset_power8.
    Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/memset.S b/sysdeps/powerpc/powerpc64/le/power10/memset.S
 new file mode 100644
 index 0000000000000000..6b8e2cfdaf25fd30
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/power10/memset.S
@@ -0,0 +1,256 @@
 +/* Optimized memset implementation for POWER10 LE.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5]);
 +   Returns 's'.  */
 +
 +#ifndef MEMSET
 +# define MEMSET memset
 +#endif
 +
 +	.machine  power9
 +ENTRY_TOCLESS (MEMSET, 5)
 +	CALL_MCOUNT 3
 +
 +L(_memset):
 +	/* Assume memset of zero length is uncommon, and just let it go
 +	   through the small path below.  */
 +	cmpldi	r5,64
 +
 +	/* Replicate byte to quad word.  */
 +	mtvsrd	v0+32,r4
 +	vspltb	v0,v0,7
 +
 +	li	r7,16
 +	sldi	r8,r7,56
 +
 +	bgt	L(large)
 +
 +	/* For short lengths we want to avoid as many branches as possible.
 +	   We use store VSX vector with length instructions to do this.
 +	   It takes advantage of the fact that if the length passed to stxvl
 +	   is zero nothing is done, effectively a no-op.  */
 +	sldi	r5,r5,56
 +
 +	addi	r10,r3,16
 +
 +	sub.	r11,r5,r8
 +	isellt	r11,0,r11	/* Saturate the subtraction to zero.  */
 +
 +	stxvl	v0+32,r3,r5
 +	stxvl	v0+32,r10,r11
 +
 +	addi	r9,r3,32
 +	addi	r10,r3,48
 +
 +	sub.	r11,r11,r8
 +	isellt	r11,0,r11
 +
 +	sub.	r5,r11,r8
 +	isellt	r5,0,r5
 +
 +	stxvl	v0+32,r9,r11
 +	stxvl	v0+32,r10,r5
 +
 +	blr
 +
 +	.balign	16
 +L(large):
 +	mr	r6,r3	/* Don't modify r3 since we need to return it.  */
 +
 +	/* Get dest 16B aligned.  */
 +	neg	r0,r3
 +	clrldi.	r7,r0,(64-4)
 +	beq	L(aligned)
 +	rldic	r9,r0,56,4	/* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56".  */
 +
 +	stxvl	v0+32,r6,r9	/* Store up to 15B until aligned address.  */
 +
 +	add	r6,r6,r7
 +	sub	r5,r5,r7
 +
 +	/* Go to tail if there is less than 64B left after alignment.  */
 +	cmpldi	r5,64
 +	blt	L(tail_64)
 +
 +	.balign	16
 +L(aligned):
 +	/* Go to tail if there is less than 128B left after alignment.  */
 +	srdi.	r0,r5,7
 +	beq	L(tail_128)
 +
 +	/* If c == 0 && n >= 256 use dcbz to zero out full cache blocks.  */
 +	cmpldi	cr5,r5,255
 +	cmpldi	cr6,r4,0
 +	crand	27,26,21
 +	bt	27,L(dcbz)
 +
 +	mtctr	r0
 +
 +	.balign	32
 +L(loop):
 +	stxv	v0+32,0(r6)
 +	stxv	v0+32,16(r6)
 +	stxv	v0+32,32(r6)
 +	stxv	v0+32,48(r6)
 +	stxv	v0+32,64(r6)
 +	stxv	v0+32,80(r6)
 +	stxv	v0+32,96(r6)
 +	stxv	v0+32,112(r6)
 +	addi	r6,r6,128
 +	bdnz	L(loop)
 +
 +	.balign	16
 +L(tail):
 +	/* 127B or less left, finish the tail or return.  */
 +	andi.	r5,r5,127
 +	beqlr
 +
 +	cmpldi	r5,64
 +	blt	L(tail_64)
 +
 +	.balign	16
 +L(tail_128):
 +	/* Stores a minimum of 64B and up to 128B and return.  */
 +	stxv	v0+32,0(r6)
 +	stxv	v0+32,16(r6)
 +	stxv	v0+32,32(r6)
 +	stxv	v0+32,48(r6)
 +	addi	r6,r6,64
 +	andi.	r5,r5,63
 +	beqlr
 +
 +	.balign	16
 +L(tail_64):
 +	/* Stores up to 64B and return.  */
 +	sldi	r5,r5,56
 +
 +	addi	r10,r6,16
 +
 +	sub.	r11,r5,r8
 +	isellt	r11,0,r11
 +
 +	stxvl	v0+32,r6,r5
 +	stxvl	v0+32,r10,r11
 +
 +	sub.	r11,r11,r8
 +	blelr
 +
 +	addi	r9,r6,32
 +	addi	r10,r6,48
 +
 +	isellt	r11,0,r11
 +
 +	sub.	r5,r11,r8
 +	isellt	r5,0,r5
 +
 +	stxvl	v0+32,r9,r11
 +	stxvl	v0+32,r10,r5
 +
 +	blr
 +
 +	.balign	16
 +L(dcbz):
 +	/* Special case when value is 0 and we have a long length to deal
 +	   with.  Use dcbz to zero out a full cacheline of 128 bytes at a time.
 +	   Before using dcbz though, we need to get the destination 128-byte
 +	   aligned.  */
 +	neg	r0,r6
 +	clrldi.	r0,r0,(64-7)
 +	beq	L(dcbz_aligned)
 +
 +	sub	r5,r5,r0
 +	mtocrf	0x2,r0	/* copying bits 57..59 to cr6. The ones for sizes 64,
 +			   32 and 16 which need to be checked.  */
 +
 +	/* Write 16-128 bytes until DST is aligned to 128 bytes.  */
 +64:	bf	25,32f
 +	stxv	v0+32,0(r6)
 +	stxv	v0+32,16(r6)
 +	stxv	v0+32,32(r6)
 +	stxv	v0+32,48(r6)
 +	addi	r6,r6,64
 +
 +32:	bf	26,16f
 +	stxv	v0+32,0(r6)
 +	stxv	v0+32,16(r6)
 +	addi	r6,r6,32
 +
 +16:	bf	27,L(dcbz_aligned)
 +	stxv	v0+32,0(r6)
 +	addi	r6,r6,16
 +
 +	.balign	16
 +L(dcbz_aligned):
 +	/* Setup dcbz unroll offsets and count numbers.  */
 +	srdi.	r0,r5,9
 +	li	r9,128
 +	beq	L(bcdz_tail)
 +	li	r10,256
 +	li	r11,384
 +	mtctr	r0
 +
 +	.balign	16
 +L(dcbz_loop):
 +	/* Sets 512 bytes to zero in each iteration; the loop unrolling shows
 +	   a throughput boost for large sizes (2048 bytes or higher).  */
 +	dcbz	0,r6
 +	dcbz	r9,r6
 +	dcbz	r10,r6
 +	dcbz	r11,r6
 +	addi	r6,r6,512
 +	bdnz	L(dcbz_loop)
 +
 +	andi.	r5,r5,511
 +	beqlr
 +
 +	.balign	16
 +L(bcdz_tail):
 +	/* We have 1-511 bytes remaining.  */
 +	srdi.	r0,r5,7
 +	beq	L(tail)
 +
 +	mtocrf	0x1,r0
 +
 +256:	bf	30,128f
 +	dcbz	0,r6
 +	dcbz	r9,r6
 +	addi	r6,r6,256
 +
 +128:	bf	31,L(tail)
 +	dcbz	0,r6
 +	addi	r6,r6,128
 +
 +	b	L(tail)
 +
 +END_GEN_TB (MEMSET,TB_TOCLESS)
 +libc_hidden_builtin_def (memset)
 +
 +/* Copied from bzero.S to prevent the linker from inserting a stub
 +   between bzero and memset.  */
 +ENTRY_TOCLESS (__bzero)
 +	CALL_MCOUNT 2
 +	mr	r5,r4		/* n: bzero's 2nd arg is memset's 3rd.  */
 +	li	r4,0		/* The fill value is zero.  */
 +	b	L(_memset)	/* Enter memset after its own prologue.  */
 +END (__bzero)
 +#ifndef __bzero
 +weak_alias (__bzero, bzero)
 +#endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
 index 2e3c8f2e8a81cda4..1d517698429e1230 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
 +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 ifneq (,$(filter %le,$(config-machine)))
 -sysdep_routines += memcpy-power10 memmove-power10 \
 +sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
 		   strlen-power10
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 index f8cb05bea8a3505b..4ce98e324d12a31e 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -27,8 +27,16 @@ extern __typeof (bzero) __bzero_power4 attribute_hidden;
 extern __typeof (bzero) __bzero_power6 attribute_hidden;
 extern __typeof (bzero) __bzero_power7 attribute_hidden;
 extern __typeof (bzero) __bzero_power8 attribute_hidden;
 +# ifdef __LITTLE_ENDIAN__
 +extern __typeof (bzero) __bzero_power10 attribute_hidden;
 +# endif
 libc_ifunc (__bzero,
 +# ifdef __LITTLE_ENDIAN__
 +	    (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 +	    ? __bzero_power10 :
 +# endif
             (hwcap2 & PPC_FEATURE2_ARCH_2_07)
             ? __bzero_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 9d5a14e480c02171..11532f77d4d03b2a 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -86,6 +86,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c.  */
   IFUNC_IMPL (i, name, memset,
 +#ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, memset,
 +			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 +					PPC_FEATURE2_HAS_ISEL)
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 +			      __memset_power10)
 +#endif
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __memset_power8)
 	      IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX,
@@ -187,6 +194,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c.  */
   IFUNC_IMPL (i, name, bzero,
 +#ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, bzero,
 +			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 +					PPC_FEATURE2_HAS_ISEL)
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 +			      __bzero_power10)
 +#endif
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07,
 			      __bzero_power8)
 	      IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX,
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
 new file mode 100644
 index 0000000000000000..548e99789735296c
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S
@@ -0,0 +1,27 @@
 +/* Optimized memset implementation for POWER10 LE.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#define MEMSET __memset_power10
 +
 +#undef libc_hidden_builtin_def
 +#define libc_hidden_builtin_def(name)
 +
 +#undef __bzero
 +#define __bzero __bzero_power10
 +
 +#include <sysdeps/powerpc/powerpc64/le/power10/memset.S>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
 index 1a7c46fecf78ab1f..4c97622c7d7eb8aa 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -33,10 +33,18 @@ extern __typeof (__redirect_memset) __memset_power4 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power6 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power7 attribute_hidden;
 extern __typeof (__redirect_memset) __memset_power8 attribute_hidden;
 +# ifdef __LITTLE_ENDIAN__
 +extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
 +# endif
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__libc_memset,
 +# ifdef __LITTLE_ENDIAN__
 +	    (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 +	    ? __memset_power10 :
 +# endif
             (hwcap2 & PPC_FEATURE2_ARCH_2_07)
             ? __memset_power8 :
 	      (hwcap & PPC_FEATURE_HAS_VSX)
--- a/SOURCES/glibc-rh1956357-7.patch
+++ b/SOURCES/glibc-rh1956357-7.patch
@ -0,0 +1,131 @@
 commit 17a73a6d8b4c46f3e87fc53c7c25fa7cec01d707
 Author: Raoni Fassina Firmino <raoni@linux.ibm.com>
 Date:   Mon May 3 16:59:35 2021 -0300
    powerpc64le: Fix ifunc selection for memset, memmove, bzero and bcopy
    The hwcap2 check for the aforementioned functions should check for
    both PPC_FEATURE2_ARCH_3_1 and PPC_FEATURE2_HAS_ISEL but was
    mistakenly checking for either one of them, enabling the ISA 3.1 versions
    of the functions on incompatible processors, like POWER8.
    Reviewed-by: Tulio Magno Quites Machado Filho <tuliom@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
 index 705fef33d4e57557..3c6528e5dbccfdbd 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c
@@ -28,10 +28,10 @@ extern __typeof (bcopy) __bcopy_power10 attribute_hidden;
 libc_ifunc (bcopy,
 #ifdef __LITTLE_ENDIAN__
 -	     hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -		       PPC_FEATURE2_HAS_ISEL)
 -	     && (hwcap & PPC_FEATURE_HAS_VSX)
 -	     ? __bcopy_power10 :
 +	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
 +	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 +	    ? __bcopy_power10 :
 #endif
             (hwcap & PPC_FEATURE_HAS_VSX)
             ? __bcopy_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 index 4ce98e324d12a31e..b08b381b4a3999f1 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c
@@ -33,7 +33,8 @@ extern __typeof (bzero) __bzero_power10 attribute_hidden;
 libc_ifunc (__bzero,
 # ifdef __LITTLE_ENDIAN__
 -	    (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
 +	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
 	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __bzero_power10 :
 # endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 11532f77d4d03b2a..6e36659d1903448a 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -75,9 +75,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memmove,
 #ifdef __LITTLE_ENDIAN__
 	      IFUNC_IMPL_ADD (array, i, memmove,
 -			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -					PPC_FEATURE2_HAS_ISEL)
 -			      && (hwcap & PPC_FEATURE_HAS_VSX),
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __memmove_power10)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX,
@@ -88,8 +88,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, memset,
 #ifdef __LITTLE_ENDIAN__
 	      IFUNC_IMPL_ADD (array, i, memset,
 -			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -					PPC_FEATURE2_HAS_ISEL)
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
 			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __memset_power10)
 #endif
@@ -196,8 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, bzero,
 #ifdef __LITTLE_ENDIAN__
 	      IFUNC_IMPL_ADD (array, i, bzero,
 -			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -					PPC_FEATURE2_HAS_ISEL)
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
 			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __bzero_power10)
 #endif
@@ -215,9 +215,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   IFUNC_IMPL (i, name, bcopy,
 #ifdef __LITTLE_ENDIAN__
 	      IFUNC_IMPL_ADD (array, i, bcopy,
 -			      hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -					PPC_FEATURE2_HAS_ISEL)
 -			      && (hwcap & PPC_FEATURE_HAS_VSX),
 +			      hwcap2 & PPC_FEATURE2_ARCH_3_1
 +			      && hwcap2 & PPC_FEATURE2_HAS_ISEL
 +			      && hwcap & PPC_FEATURE_HAS_VSX,
 			      __bcopy_power10)
 #endif
 	      IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX,
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 index 2fd7b6d309e4bedd..27895faad0cab40e 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c
@@ -36,10 +36,10 @@ extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden;
 libc_ifunc (__libc_memmove,
 #ifdef __LITTLE_ENDIAN__
 -	     hwcap2 & (PPC_FEATURE2_ARCH_3_1 |
 -		       PPC_FEATURE2_HAS_ISEL)
 -	     && (hwcap & PPC_FEATURE_HAS_VSX)
 -	     ? __memmove_power10 :
 +	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
 +	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
 +	     && hwcap & PPC_FEATURE_HAS_VSX)
 +	    ? __memmove_power10 :
 #endif
 		     (hwcap & PPC_FEATURE_HAS_VSX)
 		     ? __memmove_power7
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c
 index 4c97622c7d7eb8aa..685623ae870a0725 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/memset.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c
@@ -41,7 +41,8 @@ extern __typeof (__redirect_memset) __memset_power10 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc (__libc_memset,
 # ifdef __LITTLE_ENDIAN__
 -	    (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL)
 +	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
 +	     && hwcap2 & PPC_FEATURE2_HAS_ISEL
 	     && hwcap & PPC_FEATURE_HAS_VSX)
 	    ? __memset_power10 :
 # endif
--- a/SOURCES/glibc-rh1956357-8.patch
+++ b/SOURCES/glibc-rh1956357-8.patch
@ -0,0 +1,387 @@
 commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
 Author: Matheus Castanho <msc@linux.ibm.com>
 Date:   Tue May 11 17:53:07 2021 -0300
    powerpc: Add optimized rawmemchr for POWER10
    Reuse code for optimized strlen to implement a faster version of rawmemchr.
    This takes advantage of the same benefits provided by the strlen implementation,
    but needs some extra steps. __strlen_power10 code should be unchanged after this
    change.
    rawmemchr returns a pointer to the char found, while strlen returns only the
    length, so we have to take that into account when preparing the return value.
    To quickly check 64B, the loop on __strlen_power10 merges the whole block into
    16B by using unsigned minimum vector operations (vminub) and checks if there are
    any \0 on the resulting vector. The same code is used by rawmemchr if the char c
    is 0. However, this approach does not work when c != 0.  We first need to
    subtract c from each byte, so that the value we are looking for is converted
    to 0; then taking the minimum and checking for nulls works again.
    The new code branches after it has compared ~256 bytes and chooses which of the
    two strategies above will be used in the main loop, based on the char c. This
    extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
    by the faster loop for larger sizes.
    Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
    Because of the optimized main loop, the improvement becomes ~35% for c != 0
    and ~50% for c = 0 for strings longer than 256.
    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
 new file mode 100644
 index 0000000000000000..5351c2634f6086bf
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
@@ -0,0 +1,22 @@
 +/* Optimized rawmemchr implementation for POWER10 LE.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#include <sysdep.h>
 +
 +#define USE_AS_RAWMEMCHR 1
 +#include <sysdeps/powerpc/powerpc64/le/power10/strlen.S>
 diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
 index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
 --- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
 +++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
@@ -18,10 +18,50 @@
 #include <sysdep.h>
 -#ifndef STRLEN
 -# define STRLEN __strlen
 -# define DEFINE_STRLEN_HIDDEN_DEF 1
 -#endif
 +/* To reuse the code for rawmemchr, we have some extra steps compared to the
 +   strlen implementation:
 +      - Sum the initial value of r3 with the position at which the char was
 +        found, to guarantee we return a pointer and not the length.
 +      - In the main loop, subtract the char we are looking for from each byte,
 +        so a match becomes zero and vminub can still check 64B at once.  */
 +#ifdef USE_AS_RAWMEMCHR
 +# ifndef RAWMEMCHR
 +#  define FUNCNAME __rawmemchr
 +# else
 +#  define FUNCNAME RAWMEMCHR
 +# endif
 +# define MCOUNT_NARGS 2
 +# define VREG_ZERO v20
 +# define OFF_START_LOOP 256
 +# define RAWMEMCHR_SUBTRACT_VECTORS \
 +	vsububm   v4,v4,v18;	    \
 +	vsububm   v5,v5,v18;	    \
 +	vsububm   v6,v6,v18;	    \
 +	vsububm   v7,v7,v18;
 +# define TAIL(vreg,increment)	   \
 +	vctzlsbb  r4,vreg;	   \
 +	addi	  r4,r4,increment; \
 +	add	  r3,r5,r4;	   \
 +	blr
 +
 +#else /* strlen */
 +
 +# ifndef STRLEN
 +#  define FUNCNAME __strlen
 +#  define DEFINE_STRLEN_HIDDEN_DEF 1
 +# else
 +#  define FUNCNAME STRLEN
 +# endif
 +# define MCOUNT_NARGS 1
 +# define VREG_ZERO v18
 +# define OFF_START_LOOP 192
 +# define TAIL(vreg,increment)	   \
 +	vctzlsbb  r4,vreg;	   \
 +	subf	  r3,r3,r5;	   \
 +	addi	  r4,r4,increment; \
 +	add	  r3,r3,r4;	   \
 +	blr
 +#endif /* USE_AS_RAWMEMCHR */
 /* TODO: Replace macros by the actual instructions when minimum binutils becomes
    >= 2.35.  This is used to keep compatibility with older versions.  */
@@ -50,33 +90,41 @@
 	li	  r6,offset;		    \
 	LXVP(v4+32,offset,addr);	    \
 	LXVP(v6+32,offset+32,addr);	    \
 +	RAWMEMCHR_SUBTRACT_VECTORS;	    \
 	vminub	  v14,v4,v5;		    \
 	vminub	  v15,v6,v7;		    \
 	vminub	  v16,v14,v15;		    \
 -	vcmpequb. v0,v16,v18;		    \
 +	vcmpequb. v0,v16,VREG_ZERO;	    \
 	bne	  cr6,L(label)
 -#define TAIL(vreg,increment)	   \
 -	vctzlsbb  r4,vreg;	   \
 -	subf	  r3,r3,r5;	   \
 -	addi	  r4,r4,increment; \
 -	add	  r3,r3,r4;	   \
 -	blr
 -
 /* Implements the function
    int [r3] strlen (const void *s [r3])
 +   but when USE_AS_RAWMEMCHR is set, implements the function
 +
 +   void* [r3] rawmemchr (const void *s [r3], int c [r4])
 +
    The implementation can load bytes past a matching byte, but only
    up to the next 64B boundary, so it never crosses a page.  */
 .machine power9
 -ENTRY_TOCLESS (STRLEN, 4)
 -	CALL_MCOUNT 1
 +ENTRY_TOCLESS (FUNCNAME, 4)
 +	CALL_MCOUNT MCOUNT_NARGS
 -	vspltisb  v18,0
 +#ifdef USE_AS_RAWMEMCHR
 +	xori	r5,r4,0xff
 +
 +	mtvsrd	v18+32,r4	/* matching char in v18  */
 +	mtvsrd	v19+32,r5	/* non matching char in v19  */
 +
 +	vspltb	v18,v18,7	/* replicate  */
 +	vspltb	v19,v19,7	/* replicate  */
 +#else
 	vspltisb  v19,-1
 +#endif
 +	vspltisb  VREG_ZERO,0
 	/* Next 16B-aligned address. Prepare address for L(aligned).  */
 	addi	  r5,r3,16
@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
 	vcmpequb. v6,v0,v18
 	beq	  cr6,L(aligned)
 +#ifdef USE_AS_RAWMEMCHR
 +	vctzlsbb  r6,v6
 +	add	  r3,r3,r6
 +#else
 	vctzlsbb  r3,v6
 +#endif
 	blr
 -	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
 -	   strings, so checking the first bytes in 16B chunks benefits a lot
 -	   small strings.  */
 +	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
 +	   optimized for longer strings, so checking the first bytes in 16B
 +	   chunks greatly benefits small strings.  */
 	.p2align 5
 L(aligned):
 +#ifdef USE_AS_RAWMEMCHR
 +	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
 +				  choose how we will perform the main loop.  */
 +#endif
 	/* Prepare address for the loop.  */
 -	addi	  r4,r3,192
 +	addi	  r4,r3,OFF_START_LOOP
 	clrrdi	  r4,r4,6
 	CHECK16(v0,0,r5,tail1)
@@ -113,15 +170,43 @@ L(aligned):
 	CHECK16(v8,128,r5,tail9)
 	CHECK16(v9,144,r5,tail10)
 	CHECK16(v10,160,r5,tail11)
 +#ifdef USE_AS_RAWMEMCHR
 +	CHECK16(v0,176,r5,tail12)
 +	CHECK16(v1,192,r5,tail13)
 +	CHECK16(v2,208,r5,tail14)
 +	CHECK16(v3,224,r5,tail15)
 +#endif
 	addi	  r5,r4,128
 +#ifdef USE_AS_RAWMEMCHR
 +	/* If c == 0, use the same loop as strlen, without the vsububm.  */
 +	beq	cr5,L(loop)
 +
 +	/* This is very similar to the block after L(loop); the difference is
 +	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, so we subtract
 +	   the char we are looking for from each byte loaded.  This way we can
 +	   keep using vminub to merge the results and check for nulls.  */
 +	.p2align 5
 +L(rawmemchr_loop):
 +	CHECK64(0,r4,pre_tail_64b)
 +	CHECK64(64,r4,pre_tail_64b)
 +	addi	  r4,r4,256
 +
 +	CHECK64(0,r5,tail_64b)
 +	CHECK64(64,r5,tail_64b)
 +	addi	  r5,r5,256
 +
 +	b	  L(rawmemchr_loop)
 +#endif
 	/* Switch to a more aggressive approach checking 64B each time.  Use 2
 	   pointers 128B apart and unroll the loop once to make the pointer
 	   updates and usages separated enough to avoid stalls waiting for
 	   address calculation.  */
 	.p2align 5
 L(loop):
 +#undef RAWMEMCHR_SUBTRACT_VECTORS
 +#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
 	CHECK64(0,r4,pre_tail_64b)
 	CHECK64(64,r4,pre_tail_64b)
 	addi	  r4,r4,256
@@ -140,10 +225,10 @@ L(tail_64b):
 	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
 	   low 16B bytes into vx+1, and the high into vx, so the order here is
 	   v5, v4, v7, v6.  */
 -	vcmpequb  v1,v5,v18
 -	vcmpequb  v2,v4,v18
 -	vcmpequb  v3,v7,v18
 -	vcmpequb  v4,v6,v18
 +	vcmpequb  v1,v5,VREG_ZERO
 +	vcmpequb  v2,v4,VREG_ZERO
 +	vcmpequb  v3,v7,VREG_ZERO
 +	vcmpequb  v4,v6,VREG_ZERO
 	/* Take into account the other 64B blocks we had already checked.  */
 	add	r5,r5,r6
@@ -165,7 +250,9 @@ L(tail_64b):
 	or	  r10,r8,r7
 	cnttzd	  r0,r10	  /* Count trailing zeros before the match.  */
 +#ifndef USE_AS_RAWMEMCHR
 	subf	  r5,r3,r5
 +#endif
 	add	  r3,r5,r0	  /* Compute final length.  */
 	blr
@@ -213,9 +300,32 @@ L(tail10):
 L(tail11):
 	TAIL(v10,160)
 -END (STRLEN)
 +#ifdef USE_AS_RAWMEMCHR
 +	.p2align  5
 +L(tail12):
 +	TAIL(v0,176)
 +
 +	.p2align  5
 +L(tail13):
 +	TAIL(v1,192)
 +
 +	.p2align  5
 +L(tail14):
 +	TAIL(v2,208)
 +
 +	.p2align  5
 +L(tail15):
 +	TAIL(v3,224)
 +#endif
 +
 +END (FUNCNAME)
 -#ifdef DEFINE_STRLEN_HIDDEN_DEF
 +#ifdef USE_AS_RAWMEMCHR
 +weak_alias (__rawmemchr,rawmemchr)
 +libc_hidden_builtin_def (__rawmemchr)
 +#else
 +# ifdef DEFINE_STRLEN_HIDDEN_DEF
 weak_alias (__strlen, strlen)
 libc_hidden_builtin_def (strlen)
 +# endif
 #endif
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
 index 1d517698429e1230..ac2446aca62cc4ab 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
 +++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 ifneq (,$(filter %le,$(config-machine)))
 sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
 +		   rawmemchr-power9 rawmemchr-power10 \
 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
 -		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
 -		   strlen-power10
 +		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 index 6e36659d1903448a..127af84b32a8196f 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
   IFUNC_IMPL (i, name, rawmemchr,
 #ifdef __LITTLE_ENDIAN__
 +	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 +			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +                              && (hwcap & PPC_FEATURE_HAS_VSX),
 +                              __rawmemchr_power10)
 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
 			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
 			      __rawmemchr_power9)
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
 new file mode 100644
 index 0000000000000000..bf1ed7e1941f922d
 --- /dev/null
 +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
@@ -0,0 +1,21 @@
 +/* Optimized rawmemchr implementation for PowerPC64/POWER10.
 +   Copyright (C) 2021 Free Software Foundation, Inc.
 +   This file is part of the GNU C Library.
 +
 +   The GNU C Library is free software; you can redistribute it and/or
 +   modify it under the terms of the GNU Lesser General Public
 +   License as published by the Free Software Foundation; either
 +   version 2.1 of the License, or (at your option) any later version.
 +
 +   The GNU C Library is distributed in the hope that it will be useful,
 +   but WITHOUT ANY WARRANTY; without even the implied warranty of
 +   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 +   Lesser General Public License for more details.
 +
 +   You should have received a copy of the GNU Lesser General Public
 +   License along with the GNU C Library; if not, see
 +   <https://www.gnu.org/licenses/>.  */
 +
 +#define RAWMEMCHR __rawmemchr_power10
 +
 +#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
 diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 index 2a7ae5a1ed02e556..369d6359e8987052 100644
 --- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
 +++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
 extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
 +extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
 # endif
 # undef __rawmemchr
@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
 # ifdef __LITTLE_ENDIAN__
 +		     (hwcap2 & PPC_FEATURE2_ARCH_3_1)
 +		     && (hwcap & PPC_FEATURE_HAS_VSX)
 +		     ? __rawmemchr_power10 :
 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
 		       ? __rawmemchr_power9 :
 # endif
--- a/SPECS/glibc.spec
+++ b/SPECS/glibc.spec
@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 158%{?dist}
+%define glibcrelease 160%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@ -706,6 +706,14 @@ Patch569: glibc-rh1934155-3.patch
 Patch570: glibc-rh1934155-4.patch
 Patch571: glibc-rh1934155-5.patch
 Patch572: glibc-rh1934155-6.patch
 Patch573: glibc-rh1956357-1.patch
 Patch574: glibc-rh1956357-2.patch
 Patch575: glibc-rh1956357-3.patch
 Patch576: glibc-rh1956357-4.patch
 Patch577: glibc-rh1956357-5.patch
 Patch578: glibc-rh1956357-6.patch
 Patch579: glibc-rh1956357-7.patch
 Patch580: glibc-rh1956357-8.patch
 ##############################################################################
 # Continued list of core "glibc" package information:
@ -2617,6 +2625,12 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 %changelog
 * Mon May 31 2021 Arjun Shankar <arjun@redhat.com> - 2.28-160
 - Backport POWER10 optimized rawmemchr for ppc64le (#1956357)
 * Thu May 27 2021 Arjun Shankar <arjun@redhat.com> - 2.28-159
 - Backport additional ifunc optimizations for ppc64le (#1956357)
 * Thu Apr 22 2021 Florian Weimer <fweimer@redhat.com> - 2.28-158
 - Rebuild with new binutils (#1946518)