diff --git a/SOURCES/glibc-rh1956357-1.patch b/SOURCES/glibc-rh1956357-1.patch new file mode 100644 index 0000000..2c4fca1 --- /dev/null +++ b/SOURCES/glibc-rh1956357-1.patch @@ -0,0 +1,100 @@ +commit 56c81132ccc6f468fa4fc29c536db060e18e9d87 +Author: Raphael Moreira Zinsly +Date: Tue Feb 23 14:14:37 2021 -0300 + + powerpc: Add optimized ilogb* for POWER9 + + The instructions xsxexpdp and xsxexpqp introduced on POWER9 extract + the exponent from a double-precision and quad-precision floating-point + value respectively, so they can be used to improve ilogb, ilogbf and ilogbf128. +
diff --git a/sysdeps/powerpc/fpu/math_private.h b/sysdeps/powerpc/fpu/math_private.h index e642d6c8237578ea..5bbc468829062a48 100644 --- a/sysdeps/powerpc/fpu/math_private.h +++ b/sysdeps/powerpc/fpu/math_private.h @@ -26,7 +26,28 @@ + + #include_next <math_private.h> + +-#if defined _ARCH_PWR9 && __HAVE_DISTINCT_FLOAT128 ++#ifdef _ARCH_PWR9 ++ ++#if __GNUC_PREREQ (8, 0) ++# define _GL_HAS_BUILTIN_ILOGB 1 ++#elif defined __has_builtin ++# define _GL_HAS_BUILTIN_ILOGB __has_builtin (__builtin_vsx_scalar_extract_exp) ++#else ++# define _GL_HAS_BUILTIN_ILOGB 0 ++#endif ++ ++#define __builtin_test_dc_ilogbf __builtin_test_dc_ilogb ++#define __builtin_ilogbf __builtin_ilogb ++ ++#define __builtin_test_dc_ilogb(x, y) \ ++ __builtin_vsx_scalar_test_data_class_dp(x, y) ++#define __builtin_ilogb(x) __builtin_vsx_scalar_extract_exp(x) - 0x3ff ++ ++#define __builtin_test_dc_ilogbf128(x, y) \ ++ __builtin_vsx_scalar_test_data_class_qp(x, y) ++#define __builtin_ilogbf128(x) __builtin_vsx_scalar_extract_expq(x) - 0x3fff ++ ++#if __HAVE_DISTINCT_FLOAT128 + extern __always_inline _Float128 + __ieee754_sqrtf128 (_Float128 __x) + { +@@ -35,6 +56,9 @@ __ieee754_sqrtf128 (_Float128 __x) + return __z; + } + #endif ++#else /* !_ARCH_PWR9 */ ++#define _GL_HAS_BUILTIN_ILOGB 0 ++#endif + + #if defined _ARCH_PWR5X +
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c new file mode 100644 index 0000000000000000..b5c1c0aa9db86f3d --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogb_template.c @@ -0,0 +1,30 @@ ++#include <math.h> ++#include <errno.h> ++#include <limits.h> ++#include <fenv.h> ++#include <math_private.h> ++ ++#if _GL_HAS_BUILTIN_ILOGB ++int ++M_DECL_FUNC (__ilogb) (FLOAT x) ++{ ++ int r; ++ /* Check for exceptional cases. */ ++ if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f)) ++ r = M_SUF (__builtin_ilogb) (x); ++ else ++ /* Fall back to the generic ilogb if x is NaN, Inf or subnormal. */ ++ r = M_SUF (__ieee754_ilogb) (x); ++ if (__builtin_expect (r == FP_ILOGB0, 0) ++ || __builtin_expect (r == FP_ILOGBNAN, 0) ++ || __builtin_expect (r == INT_MAX, 0)) ++ { ++ __set_errno (EDOM); ++ __feraiseexcept (FE_INVALID); ++ } ++ return r; ++} ++declare_mgen_alias (__ilogb, ilogb) ++#else ++#include <math/w_ilogb_template.c> ++#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c new file mode 100644 index 0000000000000000..205f154f0089a269 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_ilogbl.c @@ -0,0 +1,4 @@ ++/* Skip the optimization for long double as ibm128 does not provide an ++ optimized builtin. */ ++#include <math-type-macros-ldouble.h> ++#include <w_ilogb_template.c>
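For reference, the fast path these macros set up is just a bias subtraction on the raw exponent field. A minimal C sketch, assuming a POWER9 target (-mcpu=power9) and using the same GCC builtins the patch uses; the wrapper name is hypothetical:

/* ilogb fast path for a normal, finite double: xsxexpdp (via
   __builtin_vsx_scalar_extract_exp) returns the biased exponent, and
   subtracting the IEEE 754 double bias 0x3ff gives ilogb (x).  Zero,
   subnormal, Inf and NaN must be filtered out first, which the
   template above does with __builtin_vsx_scalar_test_data_class_dp
   and the all-special-classes mask 0x7f.  */
static inline int
ilogb_normal_double (double x)
{
  return __builtin_vsx_scalar_extract_exp (x) - 0x3ff;
}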
diff --git a/SOURCES/glibc-rh1956357-2.patch b/SOURCES/glibc-rh1956357-2.patch new file mode 100644 index 0000000..c1da766 --- /dev/null +++ b/SOURCES/glibc-rh1956357-2.patch @@ -0,0 +1,64 @@ +commit a7d88506c260e7a0e4268803e76fc19e38ed041f +Author: Raphael Moreira Zinsly +Date: Thu Feb 25 09:58:52 2021 -0300 + + powerpc: Add optimized llogb* for POWER9 + + The POWER9 builtins used to improve the ilogb* functions can be + used in the llogb* functions as well. +
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c new file mode 100644 index 0000000000000000..d00b71d2a34e28da --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogb_template.c @@ -0,0 +1,39 @@ ++#include <math.h> ++#include <errno.h> ++#include <limits.h> ++#include <fenv.h> ++#include <math_private.h> ++ ++#if _GL_HAS_BUILTIN_ILOGB ++long int ++M_DECL_FUNC (__llogb) (FLOAT x) ++{ ++ int r; ++ /* Check for exceptional cases. */ ++ if (! M_SUF(__builtin_test_dc_ilogb) (x, 0x7f)) ++ r = M_SUF (__builtin_ilogb) (x); ++ else ++ /* Fall back to the generic ilogb if x is NaN, Inf or subnormal. */ ++ r = M_SUF (__ieee754_ilogb) (x); ++ long int lr = r; ++ if (__glibc_unlikely (r == FP_ILOGB0) ++ || __glibc_unlikely (r == FP_ILOGBNAN) ++ || __glibc_unlikely (r == INT_MAX)) ++ { ++#if LONG_MAX != INT_MAX ++ if (r == FP_ILOGB0) ++ lr = FP_LLOGB0; ++ else if (r == FP_ILOGBNAN) ++ lr = FP_LLOGBNAN; ++ else ++ lr = LONG_MAX; ++#endif ++ __set_errno (EDOM); ++ __feraiseexcept (FE_INVALID); ++ } ++ return lr; ++} ++declare_mgen_alias (__llogb, llogb) ++#else ++#include <math/w_llogb_template.c> ++#endif
diff --git a/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c new file mode 100644 index 0000000000000000..69477a37ae82c476 --- /dev/null +++ b/sysdeps/powerpc/powerpc64/le/fpu/w_llogbl.c @@ -0,0 +1,4 @@ ++/* Skip the optimization for long double as ibm128 does not provide an ++ optimized builtin. */ ++#include <math-type-macros-ldouble.h> ++#include <w_llogb_template.c>
diff --git a/SOURCES/glibc-rh1956357-3.patch b/SOURCES/glibc-rh1956357-3.patch new file mode 100644 index 0000000..6596582 --- /dev/null +++ b/SOURCES/glibc-rh1956357-3.patch @@ -0,0 +1,334 @@ +commit 10624a97e8e47004985740cbb04060a84cfada76 +Author: Matheus Castanho +Date: Tue Sep 29 15:40:08 2020 -0300 + + powerpc: Add optimized strlen for POWER10 + + Improvements compared to POWER9 version: + + 1. Take into account first 16B comparison for aligned strings + + The previous version compares the first 16B and increments r4 by the number + of bytes until the address is 16B-aligned, then starts doing aligned loads at + that address. For aligned strings, this causes the first 16B to be compared + twice, because the increment is 0. Here we calculate the next 16B-aligned + address differently, which avoids that issue. + + 2. Use simple comparisons for the first ~192 bytes + + The main loop is good for big strings, but comparing 16B each time is better + for smaller strings. So after aligning the address to 16 bytes, we check + 176 more bytes in 16B chunks. There may be some overlap with the main loop for + unaligned strings, but we avoid using the more aggressive strategy too soon, + and also allow the loop to start at a 64B-aligned address. This greatly + benefits smaller strings and avoids overlapping checks if the string is + already aligned at a 64B boundary. + + 3.
Reduce dependencies between load blocks caused by address calculation on loop + + Doing a precise time tracing on the code showed many loads in the loop were + stalled waiting for updates to r4 from previous code blocks. This + implementation avoids that as much as possible by using 2 registers (r4 and + r5) to hold addresses to be used by different parts of the code. + + Also, the previous code aligned the address to 16B, then to 64B by doing a + few 48B loops (if needed) until the address was aligned. The main loop could + not start until that 48B loop had finished and r4 was updated with the + current address. Here we calculate the address used by the loop very early, + so it can start sooner. + + The main loop now uses 2 pointers 128B apart to make pointer updates less + frequent, and also unrolls 1 iteration to guarantee there is enough time + between iterations to update the pointers, reducing stalled cycles. + + 4. Use new P10 instructions + + lxvp is used to load 32B with a single instruction, reducing contention in + the load queue. + + vextractbm allows simplifying the tail code for the loop, replacing + vbpermq and avoiding having to generate a permute control vector. + + Reviewed-by: Paul E Murphy + Reviewed-by: Raphael M Zinsly + Reviewed-by: Lucas A. M. Magalhaes + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S +new file mode 100644 +index 0000000000000000..ca7e9eb3d84c9b00 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S +@@ -0,0 +1,221 @@ ++/* Optimized strlen implementation for POWER10 LE. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++#ifndef STRLEN ++# define STRLEN __strlen ++# define DEFINE_STRLEN_HIDDEN_DEF 1 ++#endif ++ ++/* TODO: Replace macros by the actual instructions when minimum binutils becomes ++ >= 2.35. This is used to keep compatibility with older versions. */ ++#define VEXTRACTBM(rt,vrb) \ ++ .long(((4)<<(32-6)) \ ++ | ((rt)<<(32-11)) \ ++ | ((8)<<(32-16)) \ ++ | ((vrb)<<(32-21)) \ ++ | 1602) ++ ++#define LXVP(xtp,dq,ra) \ ++ .long(((6)<<(32-6)) \ ++ | ((((xtp)-32)>>1)<<(32-10)) \ ++ | ((1)<<(32-11)) \ ++ | ((ra)<<(32-16)) \ ++ | dq) ++ ++#define CHECK16(vreg,offset,addr,label) \ ++ lxv vreg+32,offset(addr); \ ++ vcmpequb. vreg,vreg,v18; \ ++ bne cr6,L(label); ++ ++/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has # ++ of bytes already checked. */ ++#define CHECK64(offset,addr,label) \ ++ li r6,offset; \ ++ LXVP(v4+32,offset,addr); \ ++ LXVP(v6+32,offset+32,addr); \ ++ vminub v14,v4,v5; \ ++ vminub v15,v6,v7; \ ++ vminub v16,v14,v15; \ ++ vcmpequb. 
v0,v16,v18; \ ++ bne cr6,L(label) ++ ++#define TAIL(vreg,increment) \ ++ vctzlsbb r4,vreg; \ ++ subf r3,r3,r5; \ ++ addi r4,r4,increment; \ ++ add r3,r3,r4; \ ++ blr ++ ++/* Implements the function ++ ++ int [r3] strlen (const void *s [r3]) ++ ++ The implementation can load bytes past a matching byte, but only ++ up to the next 64B boundary, so it never crosses a page. */ ++ ++.machine power9 ++ ++ENTRY_TOCLESS (STRLEN, 4) ++ CALL_MCOUNT 1 ++ ++ vspltisb v18,0 ++ vspltisb v19,-1 ++ ++ /* Next 16B-aligned address. Prepare address for L(aligned). */ ++ addi r5,r3,16 ++ clrrdi r5,r5,4 ++ ++ /* Align data and fill bytes not loaded with non matching char. */ ++ lvx v0,0,r3 ++ lvsr v1,0,r3 ++ vperm v0,v19,v0,v1 ++ ++ vcmpequb. v6,v0,v18 ++ beq cr6,L(aligned) ++ ++ vctzlsbb r3,v6 ++ blr ++ ++ /* Test next 176B, 16B at a time. The main loop is optimized for longer ++ strings, so checking the first bytes in 16B chunks benefits a lot ++ small strings. */ ++ .p2align 5 ++L(aligned): ++ /* Prepare address for the loop. */ ++ addi r4,r3,192 ++ clrrdi r4,r4,6 ++ ++ CHECK16(v0,0,r5,tail1) ++ CHECK16(v1,16,r5,tail2) ++ CHECK16(v2,32,r5,tail3) ++ CHECK16(v3,48,r5,tail4) ++ CHECK16(v4,64,r5,tail5) ++ CHECK16(v5,80,r5,tail6) ++ CHECK16(v6,96,r5,tail7) ++ CHECK16(v7,112,r5,tail8) ++ CHECK16(v8,128,r5,tail9) ++ CHECK16(v9,144,r5,tail10) ++ CHECK16(v10,160,r5,tail11) ++ ++ addi r5,r4,128 ++ ++ /* Switch to a more aggressive approach checking 64B each time. Use 2 ++ pointers 128B apart and unroll the loop once to make the pointer ++ updates and usages separated enough to avoid stalls waiting for ++ address calculation. */ ++ .p2align 5 ++L(loop): ++ CHECK64(0,r4,pre_tail_64b) ++ CHECK64(64,r4,pre_tail_64b) ++ addi r4,r4,256 ++ ++ CHECK64(0,r5,tail_64b) ++ CHECK64(64,r5,tail_64b) ++ addi r5,r5,256 ++ ++ b L(loop) ++ ++ .p2align 5 ++L(pre_tail_64b): ++ mr r5,r4 ++L(tail_64b): ++ /* OK, we found a null byte. Let's look for it in the current 64-byte ++ block and mark it in its corresponding VR. lxvp vx,0(ry) puts the ++ low 16B bytes into vx+1, and the high into vx, so the order here is ++ v5, v4, v7, v6. */ ++ vcmpequb v1,v5,v18 ++ vcmpequb v2,v4,v18 ++ vcmpequb v3,v7,v18 ++ vcmpequb v4,v6,v18 ++ ++ /* Take into account the other 64B blocks we had already checked. */ ++ add r5,r5,r6 ++ ++ /* Extract first bit of each byte. */ ++ VEXTRACTBM(r7,v1) ++ VEXTRACTBM(r8,v2) ++ VEXTRACTBM(r9,v3) ++ VEXTRACTBM(r10,v4) ++ ++ /* Shift each value into their corresponding position. */ ++ sldi r8,r8,16 ++ sldi r9,r9,32 ++ sldi r10,r10,48 ++ ++ /* Merge the results. */ ++ or r7,r7,r8 ++ or r8,r9,r10 ++ or r10,r8,r7 ++ ++ cnttzd r0,r10 /* Count trailing zeros before the match. */ ++ subf r5,r3,r5 ++ add r3,r5,r0 /* Compute final length. 
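The tail code above compresses four 16B compare results into one 64-bit byte map before counting into it. A scalar sketch of that reduction, assuming the four 16-bit masks produced by VEXTRACTBM (the helper name is hypothetical):

#include <stdint.h>

/* Each VEXTRACTBM yields a 16-bit mask with one bit per byte of a 16B
   compare result.  Shifting the masks into place (sldi) and OR-ing
   them builds a 64-bit map of the 64B block; the count of trailing
   zeros (cnttzd) is the offset of the first NUL.  The caller
   guarantees a NUL exists, so map != 0.  */
static inline long
first_nul_in_64b (uint16_t m0, uint16_t m1, uint16_t m2, uint16_t m3)
{
  uint64_t map = (uint64_t) m0 | ((uint64_t) m1 << 16)
                 | ((uint64_t) m2 << 32) | ((uint64_t) m3 << 48);
  return __builtin_ctzll (map);
}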
*/ ++ blr ++ ++ .p2align 5 ++L(tail1): ++ TAIL(v0,0) ++ ++ .p2align 5 ++L(tail2): ++ TAIL(v1,16) ++ ++ .p2align 5 ++L(tail3): ++ TAIL(v2,32) ++ ++ .p2align 5 ++L(tail4): ++ TAIL(v3,48) ++ ++ .p2align 5 ++L(tail5): ++ TAIL(v4,64) ++ ++ .p2align 5 ++L(tail6): ++ TAIL(v5,80) ++ ++ .p2align 5 ++L(tail7): ++ TAIL(v6,96) ++ ++ .p2align 5 ++L(tail8): ++ TAIL(v7,112) ++ ++ .p2align 5 ++L(tail9): ++ TAIL(v8,128) ++ ++ .p2align 5 ++L(tail10): ++ TAIL(v9,144) ++ ++ .p2align 5 ++L(tail11): ++ TAIL(v10,160) ++ ++END (STRLEN) ++ ++#ifdef DEFINE_STRLEN_HIDDEN_DEF ++weak_alias (__strlen, strlen) ++libc_hidden_builtin_def (strlen) ++#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index a9e13e05e90601cd..61652b65dd223018 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + + ifneq (,$(filter %le,$(config-machine))) + sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ +- rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 ++ rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ ++ strlen-power10 + endif + CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops + CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index b30bc53930fc0e36..46d5956adda72b86 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -112,6 +112,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + /* Support sysdeps/powerpc/powerpc64/multiarch/strlen.c. */ + IFUNC_IMPL (i, name, strlen, + #ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_1, ++ __strlen_power10) + IFUNC_IMPL_ADD (array, i, strlen, hwcap2 & PPC_FEATURE2_ARCH_3_00, + __strlen_power9) + #endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S +new file mode 100644 +index 0000000000000000..6a774fad58c77179 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/multiarch/strlen-power10.S +@@ -0,0 +1,2 @@ ++#define STRLEN __strlen_power10 ++#include +diff --git a/sysdeps/powerpc/powerpc64/multiarch/strlen.c b/sysdeps/powerpc/powerpc64/multiarch/strlen.c +index b7f0fbb13fb97783..11bdb96de2d2aa66 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/strlen.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/strlen.c +@@ -31,9 +31,12 @@ extern __typeof (__redirect_strlen) __strlen_ppc attribute_hidden; + extern __typeof (__redirect_strlen) __strlen_power7 attribute_hidden; + extern __typeof (__redirect_strlen) __strlen_power8 attribute_hidden; + extern __typeof (__redirect_strlen) __strlen_power9 attribute_hidden; ++extern __typeof (__redirect_strlen) __strlen_power10 attribute_hidden; + + libc_ifunc (__libc_strlen, + # ifdef __LITTLE_ENDIAN__ ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1) ++ ? __strlen_power10 : + (hwcap2 & PPC_FEATURE2_ARCH_3_00) + ? __strlen_power9 : + # endif diff --git a/SOURCES/glibc-rh1956357-4.patch b/SOURCES/glibc-rh1956357-4.patch new file mode 100644 index 0000000..ad6aa06 --- /dev/null +++ b/SOURCES/glibc-rh1956357-4.patch @@ -0,0 +1,527 @@ +commit dd59655e9371af86043b97e38953f43bd9496699 +Author: Lucas A. M. 
Magalhaes +Date: Fri Apr 30 18:12:08 2021 -0300 + + powerpc64le: Optimized memmove for POWER10 + + This patch was initially based on the __memmove_power7 with some ideas + from strncpy implementation for Power 9. + + Improvements from __memmove_power7: + + 1. Use lxvl/stxvl for alignment code. + + The code for Power 7 uses branches when the input is not naturally + aligned to the width of a vector. The new implementation uses + lxvl/stxvl instead which reduces pressure on GPRs. It also allows + the removal of branch instructions, implicitly removing branch stalls + and mispredictions. + + 2. Use of lxv/stxv and lxvl/stxvl pair is safe to use on Cache Inhibited + memory. + + On Power 10 vector load and stores are safe to use on CI memory for + addresses unaligned to 16B. This code takes advantage of this to + do unaligned loads. + + The unaligned loads don't have a significant performance impact by + themselves. However doing so decreases register pressure on GPRs + and interdependence stalls on load/store pairs. This also improved + readability as there are now less code paths for different alignments. + Finally this reduces the overall code size. + + 3. Improved performance. + + This version runs on average about 30% better than memmove_power7 + for lengths larger than 8KB. For input lengths shorter than 8KB + the improvement is smaller, it has on average about 17% better + performance. + + This version has a degradation of about 50% for input lengths + in the 0 to 31 bytes range when dest is unaligned. + + Reviewed-by: Tulio Magno Quites Machado Filho + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/memmove.S b/sysdeps/powerpc/powerpc64/le/power10/memmove.S +new file mode 100644 +index 0000000000000000..7dfd57edeb37e8e4 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power10/memmove.S +@@ -0,0 +1,320 @@ ++/* Optimized memmove implementation for POWER10. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ ++/* void* [r3] memmove (void *dest [r3], const void *src [r4], size_t len [r5]) ++ ++ This optimization checks if 'src' and 'dst' overlap. If they do not ++ or 'src' is ahead of 'dest' then it copies forward. ++ Otherwise, an optimized backward copy is used. */ ++ ++#ifndef MEMMOVE ++# define MEMMOVE memmove ++#endif ++ .machine power9 ++ENTRY_TOCLESS (MEMMOVE, 5) ++ CALL_MCOUNT 3 ++ ++L(_memmove): ++ .p2align 5 ++ /* Check if there is overlap, if so it will branch to backward copy. */ ++ subf r9,r4,r3 ++ cmpld cr7,r9,r5 ++ blt cr7,L(memmove_bwd) ++ ++ /* Fast path for length shorter than 16 bytes. */ ++ sldi r7,r5,56 ++ lxvl 32+v2,r4,r7 ++ stxvl 32+v2,r3,r7 ++ subic. r8,r5,16 ++ blelr ++ ++ /* For shorter lengths aligning the dest address to 16 bytes either ++ decreases performance or is irrelevant. I'm making use of this ++ comparison to skip the alignment in. 
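The sldi-by-56 before every lxvl/stxvl in this file exists because those instructions take the byte count in bits 0:7 (the top byte) of the GPR. A C-level sketch of the same <=16B copy, assuming GCC's ISA 3.0 builtins (-mcpu=power9); the helper name is hypothetical:

#include <altivec.h>
#include <stddef.h>

/* vec_xl_len/vec_xst_len compile to lxvl/stxvl: they transfer
   min (len, 16) bytes and are a no-op for len == 0, which is what
   makes the branchless "copy up to 16 bytes" fast path work.  */
static inline void
copy_up_to_16 (unsigned char *dst, unsigned char *src, size_t len)
{
  vector unsigned char v = vec_xl_len (src, len);
  vec_xst_len (v, dst, len);
}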
*/ ++ cmpldi cr6,r5,256 ++ bge cr6,L(ge_256) ++ /* Account for the first 16-byte copy. */ ++ addi r4,r4,16 ++ addi r11,r3,16 /* use r11 to keep dest address on r3. */ ++ subi r5,r5,16 ++ b L(loop_head) ++ ++ .p2align 5 ++L(ge_256): ++ /* Account for the first copy <= 16 bytes. This is necessary for ++ memmove because at this point the src address can be in front of the ++ dest address. */ ++ clrldi r9,r5,56 ++ li r8,16 ++ cmpldi r9,16 ++ iselgt r9,r8,r9 ++ add r4,r4,r9 ++ add r11,r3,r9 /* use r11 to keep dest address on r3. */ ++ sub r5,r5,r9 ++ ++ /* Align dest to 16 bytes. */ ++ neg r7,r3 ++ clrldi. r9,r7,60 ++ beq L(loop_head) ++ ++ .p2align 5 ++ sldi r6,r9,56 ++ lxvl 32+v0,r4,r6 ++ stxvl 32+v0,r11,r6 ++ sub r5,r5,r9 ++ add r4,r4,r9 ++ add r11,r11,r9 ++ ++L(loop_head): ++ cmpldi r5,63 ++ ble L(final_64) ++ ++ srdi. r7,r5,7 ++ beq L(loop_tail) ++ ++ mtctr r7 ++ ++/* Main loop that copies 128 bytes each iteration. */ ++ .p2align 5 ++L(loop): ++ addi r9,r4,64 ++ addi r10,r11,64 ++ ++ lxv 32+v0,0(r4) ++ lxv 32+v1,16(r4) ++ lxv 32+v2,32(r4) ++ lxv 32+v3,48(r4) ++ ++ stxv 32+v0,0(r11) ++ stxv 32+v1,16(r11) ++ stxv 32+v2,32(r11) ++ stxv 32+v3,48(r11) ++ ++ addi r4,r4,128 ++ addi r11,r11,128 ++ ++ lxv 32+v4,0(r9) ++ lxv 32+v5,16(r9) ++ lxv 32+v6,32(r9) ++ lxv 32+v7,48(r9) ++ ++ stxv 32+v4,0(r10) ++ stxv 32+v5,16(r10) ++ stxv 32+v6,32(r10) ++ stxv 32+v7,48(r10) ++ ++ bdnz L(loop) ++ clrldi. r5,r5,57 ++ beqlr ++ ++/* Copy 64 bytes. */ ++ .p2align 5 ++L(loop_tail): ++ cmpldi cr5,r5,63 ++ ble cr5,L(final_64) ++ ++ lxv 32+v0,0(r4) ++ lxv 32+v1,16(r4) ++ lxv 32+v2,32(r4) ++ lxv 32+v3,48(r4) ++ ++ stxv 32+v0,0(r11) ++ stxv 32+v1,16(r11) ++ stxv 32+v2,32(r11) ++ stxv 32+v3,48(r11) ++ ++ addi r4,r4,64 ++ addi r11,r11,64 ++ subi r5,r5,64 ++ ++/* Copies the last 1-63 bytes. */ ++ .p2align 5 ++L(final_64): ++ /* r8 holds the number of bytes that will be copied with lxv/stxv. */ ++ clrrdi. r8,r5,4 ++ beq L(tail1) ++ ++ cmpldi cr5,r5,32 ++ lxv 32+v0,0(r4) ++ blt cr5,L(tail2) ++ ++ cmpldi cr6,r5,48 ++ lxv 32+v1,16(r4) ++ blt cr6,L(tail3) ++ ++ .p2align 5 ++ lxv 32+v2,32(r4) ++ stxv 32+v2,32(r11) ++L(tail3): ++ stxv 32+v1,16(r11) ++L(tail2): ++ stxv 32+v0,0(r11) ++ sub r5,r5,r8 ++ add r4,r4,r8 ++ add r11,r11,r8 ++ .p2align 5 ++L(tail1): ++ sldi r6,r5,56 ++ lxvl v4,r4,r6 ++ stxvl v4,r11,r6 ++ blr ++ ++/* If dest and src overlap, we should copy backwards. */ ++L(memmove_bwd): ++ add r11,r3,r5 ++ add r4,r4,r5 ++ ++ /* Optimization for length smaller than 16 bytes. */ ++ cmpldi cr5,r5,15 ++ ble cr5,L(tail1_bwd) ++ ++ /* For shorter lengths the alignment either slows down or is irrelevant. ++ The forward copy uses a already need 256 comparison for that. Here ++ it's using 128 as it will reduce code and improve readability. */ ++ cmpldi cr7,r5,128 ++ blt cr7,L(bwd_loop_tail) ++ ++ /* Align dest address to 16 bytes. */ ++ .p2align 5 ++ clrldi. r9,r11,60 ++ beq L(bwd_loop_head) ++ sub r4,r4,r9 ++ sub r11,r11,r9 ++ lxv 32+v0,0(r4) ++ sldi r6,r9,56 ++ stxvl 32+v0,r11,r6 ++ sub r5,r5,r9 ++ ++L(bwd_loop_head): ++ srdi. r7,r5,7 ++ beq L(bwd_loop_tail) ++ ++ mtctr r7 ++ ++/* Main loop that copies 128 bytes every iteration. 
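The choice between the forward and the backward path is the single unsigned compare at the top of the function (subf/cmpld/blt). A C sketch of that condition, with a hypothetical helper name:

#include <stddef.h>
#include <stdint.h>

/* dest - src, as an unsigned value, is below len exactly when dest
   lies inside [src, src + len): a forward copy would then overwrite
   source bytes before reading them, so the backward path is taken.  */
static inline int
must_copy_backward (const void *dest, const void *src, size_t len)
{
  return (uintptr_t) dest - (uintptr_t) src < (uintptr_t) len;
}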
*/ ++ .p2align 5 ++L(bwd_loop): ++ addi r9,r4,-64 ++ addi r10,r11,-64 ++ ++ lxv 32+v0,-16(r4) ++ lxv 32+v1,-32(r4) ++ lxv 32+v2,-48(r4) ++ lxv 32+v3,-64(r4) ++ ++ stxv 32+v0,-16(r11) ++ stxv 32+v1,-32(r11) ++ stxv 32+v2,-48(r11) ++ stxv 32+v3,-64(r11) ++ ++ addi r4,r4,-128 ++ addi r11,r11,-128 ++ ++ lxv 32+v0,-16(r9) ++ lxv 32+v1,-32(r9) ++ lxv 32+v2,-48(r9) ++ lxv 32+v3,-64(r9) ++ ++ stxv 32+v0,-16(r10) ++ stxv 32+v1,-32(r10) ++ stxv 32+v2,-48(r10) ++ stxv 32+v3,-64(r10) ++ ++ bdnz L(bwd_loop) ++ clrldi. r5,r5,57 ++ beqlr ++ ++/* Copy 64 bytes. */ ++ .p2align 5 ++L(bwd_loop_tail): ++ cmpldi cr5,r5,63 ++ ble cr5,L(bwd_final_64) ++ ++ addi r4,r4,-64 ++ addi r11,r11,-64 ++ ++ lxv 32+v0,0(r4) ++ lxv 32+v1,16(r4) ++ lxv 32+v2,32(r4) ++ lxv 32+v3,48(r4) ++ ++ stxv 32+v0,0(r11) ++ stxv 32+v1,16(r11) ++ stxv 32+v2,32(r11) ++ stxv 32+v3,48(r11) ++ ++ subi r5,r5,64 ++ ++/* Copies the last 1-63 bytes. */ ++ .p2align 5 ++L(bwd_final_64): ++ /* r8 holds the number of bytes that will be copied with lxv/stxv. */ ++ clrrdi. r8,r5,4 ++ beq L(tail1_bwd) ++ ++ cmpldi cr5,r5,32 ++ lxv 32+v2,-16(r4) ++ blt cr5,L(tail2_bwd) ++ ++ cmpldi cr6,r5,48 ++ lxv 32+v1,-32(r4) ++ blt cr6,L(tail3_bwd) ++ ++ .p2align 5 ++ lxv 32+v0,-48(r4) ++ stxv 32+v0,-48(r11) ++L(tail3_bwd): ++ stxv 32+v1,-32(r11) ++L(tail2_bwd): ++ stxv 32+v2,-16(r11) ++ sub r4,r4,r5 ++ sub r11,r11,r5 ++ sub r5,r5,r8 ++ sldi r6,r5,56 ++ lxvl v4,r4,r6 ++ stxvl v4,r11,r6 ++ blr ++ ++/* Copy last 16 bytes. */ ++ .p2align 5 ++L(tail1_bwd): ++ sub r4,r4,r5 ++ sub r11,r11,r5 ++ sldi r6,r5,56 ++ lxvl v4,r4,r6 ++ stxvl v4,r11,r6 ++ blr ++ ++END_GEN_TB (MEMMOVE,TB_TOCLESS) ++libc_hidden_builtin_def (memmove) ++ ++/* void bcopy(const void *src [r3], void *dest [r4], size_t n [r5]) ++ Implemented in this file to avoid linker create a stub function call ++ in the branch to '_memmove'. */ ++ENTRY_TOCLESS (__bcopy) ++ mr r6,r3 ++ mr r3,r4 ++ mr r4,r6 ++ b L(_memmove) ++END (__bcopy) ++#ifndef __bcopy ++weak_alias (__bcopy, bcopy) ++#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index 61652b65dd223018..66f8c6ace9824d4a 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -32,7 +32,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + strncase-power8 + + ifneq (,$(filter %le,$(config-machine))) +-sysdep_routines += strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ ++sysdep_routines += memmove-power10 \ ++ strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ + strlen-power10 + endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c +index 1c4a229b1fc5654a..705fef33d4e57557 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c +@@ -22,8 +22,17 @@ + extern __typeof (bcopy) __bcopy_ppc attribute_hidden; + /* __bcopy_power7 symbol is implemented at memmove-power7.S */ + extern __typeof (bcopy) __bcopy_power7 attribute_hidden; ++#ifdef __LITTLE_ENDIAN__ ++extern __typeof (bcopy) __bcopy_power10 attribute_hidden; ++#endif + + libc_ifunc (bcopy, ++#ifdef __LITTLE_ENDIAN__ ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && (hwcap & PPC_FEATURE_HAS_VSX) ++ ? __bcopy_power10 : ++#endif + (hwcap & PPC_FEATURE_HAS_VSX) + ? 
__bcopy_power7 + : __bcopy_ppc); +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 46d5956adda72b86..4ce04bc51574cca1 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -67,6 +67,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/memmove.c. */ + IFUNC_IMPL (i, name, memmove, ++#ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, memmove, ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && (hwcap & PPC_FEATURE_HAS_VSX), ++ __memmove_power10) ++#endif + IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX, + __memmove_power7) + IFUNC_IMPL_ADD (array, i, memmove, 1, __memmove_ppc)) +@@ -186,6 +193,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/bcopy.c. */ + IFUNC_IMPL (i, name, bcopy, ++#ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, bcopy, ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && (hwcap & PPC_FEATURE_HAS_VSX), ++ __bcopy_power10) ++#endif + IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX, + __bcopy_power7) + IFUNC_IMPL_ADD (array, i, bcopy, 1, __bcopy_ppc)) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S +new file mode 100644 +index 0000000000000000..171b32921a0a4d47 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power10.S +@@ -0,0 +1,27 @@ ++/* Optimized memmove implementation for POWER10. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
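The ifunc plumbing in bcopy.c boils down to a runtime dispatch on the AT_HWCAP/AT_HWCAP2 bits. A simplified standalone sketch of the selection logic, not glibc's actual libc_ifunc expansion; it assumes Linux's <asm/cputable.h> for the PPC_FEATURE* constants. Note that the OR-ed hwcap2 mask added here accepts either feature bit alone; glibc-rh1956357-7.patch below tightens it to require both bits, which is what the sketch shows:

#include <stddef.h>
#include <sys/auxv.h>       /* getauxval, AT_HWCAP, AT_HWCAP2.  */
#include <asm/cputable.h>   /* PPC_FEATURE_HAS_VSX, PPC_FEATURE2_*.  */

extern void __bcopy_power10 (const void *, void *, size_t);
extern void __bcopy_power7 (const void *, void *, size_t);
extern void __bcopy_ppc (const void *, void *, size_t);

typedef void bcopy_fn (const void *, void *, size_t);

/* Pick the best bcopy for the running CPU, most capable first.  */
static bcopy_fn *
select_bcopy (void)
{
  unsigned long hwcap = getauxval (AT_HWCAP);
  unsigned long hwcap2 = getauxval (AT_HWCAP2);

  if ((hwcap2 & PPC_FEATURE2_ARCH_3_1)      /* ISA 3.1 (POWER10).  */
      && (hwcap2 & PPC_FEATURE2_HAS_ISEL)
      && (hwcap & PPC_FEATURE_HAS_VSX))
    return __bcopy_power10;
  if (hwcap & PPC_FEATURE_HAS_VSX)
    return __bcopy_power7;
  return __bcopy_ppc;
}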
*/ ++ ++#define MEMMOVE __memmove_power10 ++ ++#undef libc_hidden_builtin_def ++#define libc_hidden_builtin_def(name) ++ ++#undef __bcopy ++#define __bcopy __bcopy_power10 ++ ++#include +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S +index 0b251d0f5f087874..fb5261ecda64d061 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S ++++ b/sysdeps/powerpc/powerpc64/multiarch/memmove-power7.S +@@ -21,7 +21,7 @@ + #undef libc_hidden_builtin_def + #define libc_hidden_builtin_def(name) + +-#undef bcopy +-#define bcopy __bcopy_power7 ++#undef __bcopy ++#define __bcopy __bcopy_power7 + + #include +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c +index 39987155cc7d3624..2fd7b6d309e4bedd 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c +@@ -28,14 +28,22 @@ + # include "init-arch.h" + + extern __typeof (__redirect_memmove) __libc_memmove; +- + extern __typeof (__redirect_memmove) __memmove_ppc attribute_hidden; + extern __typeof (__redirect_memmove) __memmove_power7 attribute_hidden; ++#ifdef __LITTLE_ENDIAN__ ++extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden; ++#endif + + libc_ifunc (__libc_memmove, +- (hwcap & PPC_FEATURE_HAS_VSX) +- ? __memmove_power7 +- : __memmove_ppc); ++#ifdef __LITTLE_ENDIAN__ ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && (hwcap & PPC_FEATURE_HAS_VSX) ++ ? __memmove_power10 : ++#endif ++ (hwcap & PPC_FEATURE_HAS_VSX) ++ ? __memmove_power7 ++ : __memmove_ppc); + + #undef memmove + strong_alias (__libc_memmove, memmove); +diff --git a/sysdeps/powerpc/powerpc64/power7/memmove.S b/sysdeps/powerpc/powerpc64/power7/memmove.S +index b7f3dc28d1a8eac3..9e4cabb07ef9b732 100644 +--- a/sysdeps/powerpc/powerpc64/power7/memmove.S ++++ b/sysdeps/powerpc/powerpc64/power7/memmove.S +@@ -832,4 +832,6 @@ ENTRY_TOCLESS (__bcopy) + mr r4,r6 + b L(_memmove) + END (__bcopy) ++#ifndef __bcopy + weak_alias (__bcopy, bcopy) ++#endif diff --git a/SOURCES/glibc-rh1956357-5.patch b/SOURCES/glibc-rh1956357-5.patch new file mode 100644 index 0000000..11f1597 --- /dev/null +++ b/SOURCES/glibc-rh1956357-5.patch @@ -0,0 +1,308 @@ +commit e941e0ae80626b7661c1db8953a673cafd3b8b19 +Author: Tulio Magno Quites Machado Filho +Date: Fri Apr 30 18:12:08 2021 -0300 + + powerpc64le: Optimize memcpy for POWER10 + + This implementation is based on __memcpy_power8_cached and integrates + suggestions from Anton Blanchard. + It benefits from loads and stores with length for short lengths and for + tail code, simplifying the code. + + All unaligned memory accesses use instructions that do not generate + alignment interrupts on POWER10, making it safe to use on + caching-inhibited memory. + + The main loop has also been modified in order to increase instruction + throughput by reducing the dependency on updates from previous iterations. + + On average, this implementation provides around 30% improvement when + compared to __memcpy_power7 and 10% improvement in comparison to + __memcpy_power8_cached. + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/memcpy.S b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S +new file mode 100644 +index 0000000000000000..ad1414db4a3a8b9f +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power10/memcpy.S +@@ -0,0 +1,198 @@ ++/* Optimized memcpy implementation for POWER10. ++ Copyright (C) 2021 Free Software Foundation, Inc. 
++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++ ++#ifndef MEMCPY ++# define MEMCPY memcpy ++#endif ++ ++/* __ptr_t [r3] memcpy (__ptr_t dst [r3], __ptr_t src [r4], size_t len [r5]); ++ Returns 'dst'. */ ++ ++ .machine power9 ++ENTRY_TOCLESS (MEMCPY, 5) ++ CALL_MCOUNT 3 ++ ++ /* Copy up to 16 bytes. */ ++ sldi r6,r5,56 /* Prepare [l|st]xvl counter. */ ++ lxvl v10,r4,r6 ++ stxvl v10,r3,r6 ++ subic. r6,r5,16 /* Return if len <= 16. */ ++ blelr ++ ++ /* If len >= 256, assume nothing got copied before and copy ++ again. This might cause issues with overlapped memory, but memcpy ++ is not expected to treat overlapped memory. */ ++ cmpdi r5,256 ++ bge L(copy_ge_256) ++ /* 16 < len < 256 and the first 16 bytes have already been copied. */ ++ addi r10,r3,16 /* Keep r3 intact as return value. */ ++ addi r4,r4,16 ++ subi r5,r5,16 ++ b L(copy_lt_256) /* Avoid the main loop if len < 256. */ ++ ++ .p2align 5 ++L(copy_ge_256): ++ mr r10,r3 /* Keep r3 intact as return value. */ ++ /* Align dst to 16 bytes. */ ++ andi. r9,r10,0xf ++ beq L(dst_is_align_16) ++ lxv v10,0(r4) ++ subfic r12,r9,16 ++ subf r5,r12,r5 ++ add r4,r4,r12 ++ stxv v10,0(r3) ++ add r10,r3,r12 ++ ++L(dst_is_align_16): ++ srdi r9,r5,7 /* Divide by 128. */ ++ mtctr r9 ++ addi r6,r4,64 ++ addi r7,r10,64 ++ ++ ++ /* Main loop, copy 128 bytes per iteration. ++ Use r6=src+64 and r7=dest+64 in order to reduce the dependency on ++ r4 and r10. */ ++ .p2align 5 ++L(copy_128): ++ ++ lxv v10, 0(r4) ++ lxv v11, 16(r4) ++ lxv v12, 32(r4) ++ lxv v13, 48(r4) ++ ++ addi r4,r4,128 ++ ++ stxv v10, 0(r10) ++ stxv v11, 16(r10) ++ stxv v12, 32(r10) ++ stxv v13, 48(r10) ++ ++ addi r10,r10,128 ++ ++ lxv v10, 0(r6) ++ lxv v11, 16(r6) ++ lxv v12, 32(r6) ++ lxv v13, 48(r6) ++ ++ addi r6,r6,128 ++ ++ stxv v10, 0(r7) ++ stxv v11, 16(r7) ++ stxv v12, 32(r7) ++ stxv v13, 48(r7) ++ ++ addi r7,r7,128 ++ ++ bdnz L(copy_128) ++ ++ clrldi. r5,r5,64-7 /* Have we copied everything? */ ++ beqlr ++ ++ .p2align 5 ++L(copy_lt_256): ++ cmpdi r5,16 ++ ble L(copy_le_16) ++ srdi. r9,r5,5 /* Divide by 32. */ ++ beq L(copy_lt_32) ++ mtctr r9 ++ /* Use r6=src+32, r7=dest+32, r8=src+64, r9=dest+64 in order to reduce ++ the dependency on r4 and r10. */ ++ addi r6,r4,32 ++ addi r7,r10,32 ++ addi r8,r4,64 ++ addi r9,r10,64 ++ ++ .p2align 5 ++ /* Copy 32 bytes at a time, unaligned. ++ The loop is unrolled 3 times in order to reduce the dependency on ++ r4 and r10, copying up-to 96 bytes per iteration. */ ++L(copy_32): ++ lxv v10, 0(r4) ++ lxv v11, 16(r4) ++ stxv v10, 0(r10) ++ stxv v11, 16(r10) ++ bdz L(end_copy_32a) ++ addi r4,r4,96 ++ addi r10,r10,96 ++ ++ lxv v10, 0(r6) ++ lxv v11, 16(r6) ++ addi r6,r6,96 ++ stxv v10, 0(r7) ++ stxv v11, 16(r7) ++ bdz L(end_copy_32b) ++ addi r7,r7,96 ++ ++ lxv v12, 0(r8) ++ lxv v13, 16(r8) ++ addi r8,r8,96 ++ stxv v12, 0(r9) ++ stxv v13, 16(r9) ++ addi r9,r9,96 ++ bdnz L(copy_32) ++ ++ clrldi. 
r5,r5,64-5 /* Have we copied everything? */ ++ beqlr ++ cmpdi r5,16 ++ ble L(copy_le_16) ++ b L(copy_lt_32) ++ ++ .p2align 5 ++L(end_copy_32a): ++ clrldi. r5,r5,64-5 /* Have we copied everything? */ ++ beqlr ++ /* 32 bytes have been copied since the last update of r4 and r10. */ ++ addi r4,r4,32 ++ addi r10,r10,32 ++ cmpdi r5,16 ++ ble L(copy_le_16) ++ b L(copy_lt_32) ++ ++ .p2align 5 ++L(end_copy_32b): ++ clrldi. r5,r5,64-5 /* Have we copied everything? */ ++ beqlr ++ /* The last iteration of the loop copied 64 bytes. Update r4 and r10 ++ accordingly. */ ++ addi r4,r4,-32 ++ addi r10,r10,-32 ++ cmpdi r5,16 ++ ble L(copy_le_16) ++ ++ .p2align 5 ++L(copy_lt_32): ++ lxv v10, 0(r4) ++ stxv v10, 0(r10) ++ addi r4,r4,16 ++ addi r10,r10,16 ++ subi r5,r5,16 ++ ++ .p2align 5 ++L(copy_le_16): ++ sldi r6,r5,56 ++ lxvl v10,r4,r6 ++ stxvl v10,r10,r6 ++ blr ++ ++ ++END_GEN_TB (MEMCPY,TB_TOCLESS) ++libc_hidden_builtin_def (memcpy) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index 66f8c6ace9824d4a..2e3c8f2e8a81cda4 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + strncase-power8 + + ifneq (,$(filter %le,$(config-machine))) +-sysdep_routines += memmove-power10 \ ++sysdep_routines += memcpy-power10 memmove-power10 \ + strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ + strlen-power10 +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 4ce04bc51574cca1..9d5a14e480c02171 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -51,6 +51,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + #ifdef SHARED + /* Support sysdeps/powerpc/powerpc64/multiarch/memcpy.c. */ + IFUNC_IMPL (i, name, memcpy, ++#ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, memcpy, ++ hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap & PPC_FEATURE_HAS_VSX, ++ __memcpy_power10) ++#endif + IFUNC_IMPL_ADD (array, i, memcpy, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __memcpy_power8_cached) + IFUNC_IMPL_ADD (array, i, memcpy, hwcap & PPC_FEATURE_HAS_VSX, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S +new file mode 100644 +index 0000000000000000..70e0fc3ed610cdc3 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy-power10.S +@@ -0,0 +1,26 @@ ++/* Optimized memcpy implementation for POWER10. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . 
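The main loop's shape, two source and two destination cursors held 64B apart with 128B copied per iteration, exists only to shorten the dependency chains on the address registers. A scalar stand-in sketch (hypothetical helper; memcpy stands in for the four lxv/stxv pairs of each stream):

#include <string.h>
#include <stddef.h>

/* Rough equivalent of L(copy_128): the second stream (s2/d2) advances
   independently of the first, so the second half of an iteration
   never stalls waiting for the addi that updates the first half's
   pointers.  */
static void
copy_128_per_iter (unsigned char *dst, const unsigned char *src,
                   size_t iters)
{
  const unsigned char *s2 = src + 64;
  unsigned char *d2 = dst + 64;
  while (iters--)
    {
      memcpy (dst, src, 64);    /* first stream: lxv/stxv x4 */
      memcpy (d2, s2, 64);      /* second stream: lxv/stxv x4 */
      src += 128; dst += 128;
      s2 += 128; d2 += 128;
    }
}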
*/ ++ ++#if defined __LITTLE_ENDIAN__ && IS_IN (libc) ++#define MEMCPY __memcpy_power10 ++ ++#undef libc_hidden_builtin_def ++#define libc_hidden_builtin_def(name) ++ ++#include ++#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c +index 44dea594f3770673..be0e47f32dde2ccf 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memcpy.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memcpy.c +@@ -36,8 +36,15 @@ extern __typeof (__redirect_memcpy) __memcpy_power6 attribute_hidden; + extern __typeof (__redirect_memcpy) __memcpy_a2 attribute_hidden; + extern __typeof (__redirect_memcpy) __memcpy_power7 attribute_hidden; + extern __typeof (__redirect_memcpy) __memcpy_power8_cached attribute_hidden; ++# if defined __LITTLE_ENDIAN__ ++extern __typeof (__redirect_memcpy) __memcpy_power10 attribute_hidden; ++# endif + + libc_ifunc (__libc_memcpy, ++# if defined __LITTLE_ENDIAN__ ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1 && hwcap & PPC_FEATURE_HAS_VSX) ++ ? __memcpy_power10 : ++# endif + ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && use_cached_memopt) + ? __memcpy_power8_cached : + (hwcap & PPC_FEATURE_HAS_VSX) diff --git a/SOURCES/glibc-rh1956357-6.patch b/SOURCES/glibc-rh1956357-6.patch new file mode 100644 index 0000000..cc92e31 --- /dev/null +++ b/SOURCES/glibc-rh1956357-6.patch @@ -0,0 +1,420 @@ +commit 23fdf8178cce3c2ec320dd5eca8b544245bcaef0 +Author: Raoni Fassina Firmino +Date: Fri Apr 30 18:12:08 2021 -0300 + + powerpc64le: Optimize memset for POWER10 + + This implementation is based on __memset_power8 and integrates a lot + of suggestions from Anton Blanchard. + + The biggest difference is that it makes extensive use of stxvl to + alignment and tail code to avoid branches and small stores. It has + three main execution paths: + + a) "Short lengths" for lengths up to 64 bytes, avoiding as many + branches as possible. + + b) "General case" for larger lengths, it has an alignment section + using stxvl to avoid branches, a 128 bytes loop and then a tail + code, again using stxvl with few branches. + + c) "Zeroing cache blocks" for lengths from 256 bytes upwards and set + value being zero. It is mostly the __memset_power8 code but the + alignment phase was simplified because, at this point, address is + already 16-bytes aligned and also changed to use vector stores. + The tail code was also simplified to reuse the general case tail. + + All unaligned stores use stxvl instructions that do not generate + alignment interrupts on POWER10, making it safe to use on + caching-inhibited memory. + + On average, this implementation provides something around 30% + improvement when compared to __memset_power8. + + Reviewed-by: Matheus Castanho + Reviewed-by: Tulio Magno Quites Machado Filho + +diff --git a/sysdeps/powerpc/powerpc64/le/power10/memset.S b/sysdeps/powerpc/powerpc64/le/power10/memset.S +new file mode 100644 +index 0000000000000000..6b8e2cfdaf25fd30 +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/le/power10/memset.S +@@ -0,0 +1,256 @@ ++/* Optimized memset implementation for POWER10 LE. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#include ++ ++/* void * [r3] memset (void *s [r3], int c [r4], size_t n [r5])); ++ Returns 's'. */ ++ ++#ifndef MEMSET ++# define MEMSET memset ++#endif ++ ++ .machine power9 ++ENTRY_TOCLESS (MEMSET, 5) ++ CALL_MCOUNT 3 ++ ++L(_memset): ++ /* Assume memset of zero length is uncommon, and just let it go ++ through the small path below. */ ++ cmpldi r5,64 ++ ++ /* Replicate byte to quad word. */ ++ mtvsrd v0+32,r4 ++ vspltb v0,v0,7 ++ ++ li r7,16 ++ sldi r8,r7,56 ++ ++ bgt L(large) ++ ++ /* For short lengths we want to avoid as many branches as possible. ++ We use store VSX vector with length instructions to do this. ++ It takes advantage of the fact that if the length passed to stxvl ++ is zero nothing is done, effectively a no-op. */ ++ sldi r5,r5,56 ++ ++ addi r10,r3,16 ++ ++ sub. r11,r5,r8 ++ isellt r11,0,r11 /* Saturate the subtraction to zero. */ ++ ++ stxvl v0+32,r3,r5 ++ stxvl v0+32,r10,r11 ++ ++ addi r9,r3,32 ++ addi r10,r3,48 ++ ++ sub. r11,r11,r8 ++ isellt r11,0,r11 ++ ++ sub. r5,r11,r8 ++ isellt r5,0,r5 ++ ++ stxvl v0+32,r9,r11 ++ stxvl v0+32,r10,r5 ++ ++ blr ++ ++ .balign 16 ++L(large): ++ mr r6,r3 /* Don't modify r3 since we need to return it. */ ++ ++ /* Get dest 16B aligned. */ ++ neg r0,r3 ++ clrldi. r7,r0,(64-4) ++ beq L(aligned) ++ rldic r9,r0,56,4 /* (~X & 0xf)<<56 "clrlsldi r9,r0,64-4,56". */ ++ ++ stxvl v0+32,r6,r9 /* Store up to 15B until aligned address. */ ++ ++ add r6,r6,r7 ++ sub r5,r5,r7 ++ ++ /* Go to tail if there is less than 64B left after alignment. */ ++ cmpldi r5,64 ++ blt L(tail_64) ++ ++ .balign 16 ++L(aligned): ++ /* Go to tail if there is less than 128B left after alignment. */ ++ srdi. r0,r5,7 ++ beq L(tail_128) ++ ++ /* If c == 0 && n >= 256 use dcbz to zero out full cache blocks. */ ++ cmpldi cr5,r5,255 ++ cmpldi cr6,r4,0 ++ crand 27,26,21 ++ bt 27,L(dcbz) ++ ++ mtctr r0 ++ ++ .balign 32 ++L(loop): ++ stxv v0+32,0(r6) ++ stxv v0+32,16(r6) ++ stxv v0+32,32(r6) ++ stxv v0+32,48(r6) ++ stxv v0+32,64(r6) ++ stxv v0+32,80(r6) ++ stxv v0+32,96(r6) ++ stxv v0+32,112(r6) ++ addi r6,r6,128 ++ bdnz L(loop) ++ ++ .balign 16 ++L(tail): ++ /* 127B or less left, finish the tail or return. */ ++ andi. r5,r5,127 ++ beqlr ++ ++ cmpldi r5,64 ++ blt L(tail_64) ++ ++ .balign 16 ++L(tail_128): ++ /* Stores a minimum of 64B and up to 128B and return. */ ++ stxv v0+32,0(r6) ++ stxv v0+32,16(r6) ++ stxv v0+32,32(r6) ++ stxv v0+32,48(r6) ++ addi r6,r6,64 ++ andi. r5,r5,63 ++ beqlr ++ ++ .balign 16 ++L(tail_64): ++ /* Stores up to 64B and return. */ ++ sldi r5,r5,56 ++ ++ addi r10,r6,16 ++ ++ sub. r11,r5,r8 ++ isellt r11,0,r11 ++ ++ stxvl v0+32,r6,r5 ++ stxvl v0+32,r10,r11 ++ ++ sub. r11,r11,r8 ++ blelr ++ ++ addi r9,r6,32 ++ addi r10,r6,48 ++ ++ isellt r11,0,r11 ++ ++ sub. r5,r11,r8 ++ isellt r5,0,r5 ++ ++ stxvl v0+32,r9,r11 ++ stxvl v0+32,r10,r5 ++ ++ blr ++ ++ .balign 16 ++L(dcbz): ++ /* Special case when value is 0 and we have a long length to deal ++ with. Use dcbz to zero out a full cacheline of 128 bytes at a time. ++ Before using dcbz though, we need to get the destination 128-byte ++ aligned. */ ++ neg r0,r6 ++ clrldi. 
r0,r0,(64-7) ++ beq L(dcbz_aligned) ++ ++ sub r5,r5,r0 ++ mtocrf 0x2,r0 /* copying bits 57..59 to cr6. The ones for sizes 64, ++ 32 and 16 which need to be checked. */ ++ ++ /* Write 16-128 bytes until DST is aligned to 128 bytes. */ ++64: bf 25,32f ++ stxv v0+32,0(r6) ++ stxv v0+32,16(r6) ++ stxv v0+32,32(r6) ++ stxv v0+32,48(r6) ++ addi r6,r6,64 ++ ++32: bf 26,16f ++ stxv v0+32,0(r6) ++ stxv v0+32,16(r6) ++ addi r6,r6,32 ++ ++16: bf 27,L(dcbz_aligned) ++ stxv v0+32,0(r6) ++ addi r6,r6,16 ++ ++ .balign 16 ++L(dcbz_aligned): ++ /* Setup dcbz unroll offsets and count numbers. */ ++ srdi. r0,r5,9 ++ li r9,128 ++ beq L(bcdz_tail) ++ li r10,256 ++ li r11,384 ++ mtctr r0 ++ ++ .balign 16 ++L(dcbz_loop): ++ /* Sets 512 bytes to zero in each iteration, the loop unrolling shows ++ a throughput boost for large sizes (2048 bytes or higher). */ ++ dcbz 0,r6 ++ dcbz r9,r6 ++ dcbz r10,r6 ++ dcbz r11,r6 ++ addi r6,r6,512 ++ bdnz L(dcbz_loop) ++ ++ andi. r5,r5,511 ++ beqlr ++ ++ .balign 16 ++L(bcdz_tail): ++ /* We have 1-511 bytes remaining. */ ++ srdi. r0,r5,7 ++ beq L(tail) ++ ++ mtocrf 0x1,r0 ++ ++256: bf 30,128f ++ dcbz 0,r6 ++ dcbz r9,r6 ++ addi r6,r6,256 ++ ++128: bf 31,L(tail) ++ dcbz 0,r6 ++ addi r6,r6,128 ++ ++ b L(tail) ++ ++END_GEN_TB (MEMSET,TB_TOCLESS) ++libc_hidden_builtin_def (memset) ++ ++/* Copied from bzero.S to prevent the linker from inserting a stub ++ between bzero and memset. */ ++ENTRY_TOCLESS (__bzero) ++ CALL_MCOUNT 2 ++ mr r5,r4 ++ li r4,0 ++ b L(_memset) ++END (__bzero) ++#ifndef __bzero ++weak_alias (__bzero, bzero) ++#endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile +index 2e3c8f2e8a81cda4..1d517698429e1230 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile ++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile +@@ -32,7 +32,7 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \ + strncase-power8 + + ifneq (,$(filter %le,$(config-machine))) +-sysdep_routines += memcpy-power10 memmove-power10 \ ++sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \ + strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \ + rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \ + strlen-power10 +diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c +index f8cb05bea8a3505b..4ce98e324d12a31e 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c +@@ -27,8 +27,16 @@ extern __typeof (bzero) __bzero_power4 attribute_hidden; + extern __typeof (bzero) __bzero_power6 attribute_hidden; + extern __typeof (bzero) __bzero_power7 attribute_hidden; + extern __typeof (bzero) __bzero_power8 attribute_hidden; ++# ifdef __LITTLE_ENDIAN__ ++extern __typeof (bzero) __bzero_power10 attribute_hidden; ++# endif + + libc_ifunc (__bzero, ++# ifdef __LITTLE_ENDIAN__ ++ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL) ++ && hwcap & PPC_FEATURE_HAS_VSX) ++ ? __bzero_power10 : ++# endif + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? 
__bzero_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 9d5a14e480c02171..11532f77d4d03b2a 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -86,6 +86,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/memset.c. */ + IFUNC_IMPL (i, name, memset, ++#ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, memset, ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && hwcap & PPC_FEATURE_HAS_VSX, ++ __memset_power10) ++#endif + IFUNC_IMPL_ADD (array, i, memset, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __memset_power8) + IFUNC_IMPL_ADD (array, i, memset, hwcap & PPC_FEATURE_HAS_VSX, +@@ -187,6 +194,13 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + + /* Support sysdeps/powerpc/powerpc64/multiarch/bzero.c. */ + IFUNC_IMPL (i, name, bzero, ++#ifdef __LITTLE_ENDIAN__ ++ IFUNC_IMPL_ADD (array, i, bzero, ++ hwcap2 & (PPC_FEATURE2_ARCH_3_1 | ++ PPC_FEATURE2_HAS_ISEL) ++ && hwcap & PPC_FEATURE_HAS_VSX, ++ __bzero_power10) ++#endif + IFUNC_IMPL_ADD (array, i, bzero, hwcap2 & PPC_FEATURE2_ARCH_2_07, + __bzero_power8) + IFUNC_IMPL_ADD (array, i, bzero, hwcap & PPC_FEATURE_HAS_VSX, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S +new file mode 100644 +index 0000000000000000..548e99789735296c +--- /dev/null ++++ b/sysdeps/powerpc/powerpc64/multiarch/memset-power10.S +@@ -0,0 +1,27 @@ ++/* Optimized memset implementation for POWER10 LE. ++ Copyright (C) 2021 Free Software Foundation, Inc. ++ This file is part of the GNU C Library. ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Lesser General Public ++ License as published by the Free Software Foundation; either ++ version 2.1 of the License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Lesser General Public License for more details. ++ ++ You should have received a copy of the GNU Lesser General Public ++ License along with the GNU C Library; if not, see ++ . */ ++ ++#define MEMSET __memset_power10 ++ ++#undef libc_hidden_builtin_def ++#define libc_hidden_builtin_def(name) ++ ++#undef __bzero ++#define __bzero __bzero_power10 ++ ++#include +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c +index 1a7c46fecf78ab1f..4c97622c7d7eb8aa 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c +@@ -33,10 +33,18 @@ extern __typeof (__redirect_memset) __memset_power4 attribute_hidden; + extern __typeof (__redirect_memset) __memset_power6 attribute_hidden; + extern __typeof (__redirect_memset) __memset_power7 attribute_hidden; + extern __typeof (__redirect_memset) __memset_power8 attribute_hidden; ++# ifdef __LITTLE_ENDIAN__ ++extern __typeof (__redirect_memset) __memset_power10 attribute_hidden; ++# endif + + /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle + ifunc symbol properly. 
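The short-length path of __memset_power10 stores 0-64 bytes with four length-capped vector stores; subic./isellt exist only to saturate each remaining length at zero. A C sketch with the equivalent ISA 3.0 builtin (-mcpu=power9; helper name hypothetical):

#include <altivec.h>
#include <stddef.h>

/* Each vec_xst_len (stxvl) writes min (len, 16) bytes and does
   nothing for len == 0, so four stores with saturated lengths cover
   any n in 0..64.  The ternaries mirror the asm's isellt saturation
   and typically compile to conditional moves rather than branches.  */
static inline void
memset_up_to_64 (unsigned char *p, int c, size_t n)
{
  vector unsigned char v = vec_splats ((unsigned char) c);
  vec_xst_len (v, p, n);
  vec_xst_len (v, p + 16, n > 16 ? n - 16 : 0);
  vec_xst_len (v, p + 32, n > 32 ? n - 32 : 0);
  vec_xst_len (v, p + 48, n > 48 ? n - 48 : 0);
}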
*/ + libc_ifunc (__libc_memset, ++# ifdef __LITTLE_ENDIAN__ ++ (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL) ++ && hwcap & PPC_FEATURE_HAS_VSX) ++ ? __memset_power10 : ++# endif + (hwcap2 & PPC_FEATURE2_ARCH_2_07) + ? __memset_power8 : + (hwcap & PPC_FEATURE_HAS_VSX) diff --git a/SOURCES/glibc-rh1956357-7.patch b/SOURCES/glibc-rh1956357-7.patch new file mode 100644 index 0000000..dca81f2 --- /dev/null +++ b/SOURCES/glibc-rh1956357-7.patch @@ -0,0 +1,131 @@ +commit 17a73a6d8b4c46f3e87fc53c7c25fa7cec01d707 +Author: Raoni Fassina Firmino +Date: Mon May 3 16:59:35 2021 -0300 + + powerpc64le: Fix ifunc selection for memset, memmove, bzero and bcopy + + The hwcap2 check for the aforementioned functions should check for + both PPC_FEATURE2_ARCH_3_1 and PPC_FEATURE2_HAS_ISEL but was + mistakenly checking for any one of them, enabling isa 3.1 version of + the functions in incompatible processors, like POWER8. + + Reviewed-by: Tulio Magno Quites Machado Filho + +diff --git a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c +index 705fef33d4e57557..3c6528e5dbccfdbd 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/bcopy.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/bcopy.c +@@ -28,10 +28,10 @@ extern __typeof (bcopy) __bcopy_power10 attribute_hidden; + + libc_ifunc (bcopy, + #ifdef __LITTLE_ENDIAN__ +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) +- && (hwcap & PPC_FEATURE_HAS_VSX) +- ? __bcopy_power10 : ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL ++ && hwcap & PPC_FEATURE_HAS_VSX) ++ ? __bcopy_power10 : + #endif + (hwcap & PPC_FEATURE_HAS_VSX) + ? __bcopy_power7 +diff --git a/sysdeps/powerpc/powerpc64/multiarch/bzero.c b/sysdeps/powerpc/powerpc64/multiarch/bzero.c +index 4ce98e324d12a31e..b08b381b4a3999f1 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/bzero.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/bzero.c +@@ -33,7 +33,8 @@ extern __typeof (bzero) __bzero_power10 attribute_hidden; + + libc_ifunc (__bzero, + # ifdef __LITTLE_ENDIAN__ +- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL) ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL + && hwcap & PPC_FEATURE_HAS_VSX) + ? 
__bzero_power10 : + # endif +diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +index 11532f77d4d03b2a..6e36659d1903448a 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c +@@ -75,9 +75,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, memmove, + #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, memmove, +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) +- && (hwcap & PPC_FEATURE_HAS_VSX), ++ hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL ++ && hwcap & PPC_FEATURE_HAS_VSX, + __memmove_power10) + #endif + IFUNC_IMPL_ADD (array, i, memmove, hwcap & PPC_FEATURE_HAS_VSX, +@@ -88,8 +88,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, memset, + #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, memset, +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) ++ hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL + && hwcap & PPC_FEATURE_HAS_VSX, + __memset_power10) + #endif +@@ -196,8 +196,8 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, bzero, + #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, bzero, +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) ++ hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL + && hwcap & PPC_FEATURE_HAS_VSX, + __bzero_power10) + #endif +@@ -215,9 +215,9 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array, + IFUNC_IMPL (i, name, bcopy, + #ifdef __LITTLE_ENDIAN__ + IFUNC_IMPL_ADD (array, i, bcopy, +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) +- && (hwcap & PPC_FEATURE_HAS_VSX), ++ hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL ++ && hwcap & PPC_FEATURE_HAS_VSX, + __bcopy_power10) + #endif + IFUNC_IMPL_ADD (array, i, bcopy, hwcap & PPC_FEATURE_HAS_VSX, +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memmove.c b/sysdeps/powerpc/powerpc64/multiarch/memmove.c +index 2fd7b6d309e4bedd..27895faad0cab40e 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memmove.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memmove.c +@@ -36,10 +36,10 @@ extern __typeof (__redirect_memmove) __memmove_power10 attribute_hidden; + + libc_ifunc (__libc_memmove, + #ifdef __LITTLE_ENDIAN__ +- hwcap2 & (PPC_FEATURE2_ARCH_3_1 | +- PPC_FEATURE2_HAS_ISEL) +- && (hwcap & PPC_FEATURE_HAS_VSX) +- ? __memmove_power10 : ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL ++ && hwcap & PPC_FEATURE_HAS_VSX) ++ ? __memmove_power10 : + #endif + (hwcap & PPC_FEATURE_HAS_VSX) + ? __memmove_power7 +diff --git a/sysdeps/powerpc/powerpc64/multiarch/memset.c b/sysdeps/powerpc/powerpc64/multiarch/memset.c +index 4c97622c7d7eb8aa..685623ae870a0725 100644 +--- a/sysdeps/powerpc/powerpc64/multiarch/memset.c ++++ b/sysdeps/powerpc/powerpc64/multiarch/memset.c +@@ -41,7 +41,8 @@ extern __typeof (__redirect_memset) __memset_power10 attribute_hidden; + ifunc symbol properly. */ + libc_ifunc (__libc_memset, + # ifdef __LITTLE_ENDIAN__ +- (hwcap2 & (PPC_FEATURE2_ARCH_3_1 | PPC_FEATURE2_HAS_ISEL) ++ (hwcap2 & PPC_FEATURE2_ARCH_3_1 ++ && hwcap2 & PPC_FEATURE2_HAS_ISEL + && hwcap & PPC_FEATURE_HAS_VSX) + ? 
diff --git a/SOURCES/glibc-rh1956357-8.patch b/SOURCES/glibc-rh1956357-8.patch
new file mode 100644
index 0000000..beee664
--- /dev/null
+++ b/SOURCES/glibc-rh1956357-8.patch
@@ -0,0 +1,387 @@
+commit 1a594aa986ffe28657a03baa5c53c0a0e7dc2ecd
+Author: Matheus Castanho <msc@linux.ibm.com>
+Date:   Tue May 11 17:53:07 2021 -0300
+
+    powerpc: Add optimized rawmemchr for POWER10
+
+    Reuse code for optimized strlen to implement a faster version of rawmemchr.
+    This takes advantage of the same benefits provided by the strlen implementation,
+    but needs some extra steps. __strlen_power10 code should be unchanged after this
+    change.
+
+    rawmemchr returns a pointer to the char found, while strlen returns only the
+    length, so we have to take that into account when preparing the return value.
+
+    To quickly check 64B, the loop on __strlen_power10 merges the whole block into
+    16B by using unsigned minimum vector operations (vminub) and checks if there are
+    any \0 on the resulting vector. The same code is used by rawmemchr if the char c
+    is 0. However, this approach does not work when c != 0. We first need to
+    subtract each byte by c, so that the value we are looking for is converted to a
+    0, then taking the minimum and checking for nulls works again.
+
+    The new code branches after it has compared ~256 bytes and chooses which of the
+    two strategies above will be used in the main loop, based on the char c. This
+    extra branch adds some overhead (~5%) for length ~256, but is quickly amortized
+    by the faster loop for larger sizes.
+
+    Compared to __rawmemchr_power9, this version is ~20% faster for length < 256.
+    Because of the optimized main loop, the improvement becomes ~35% for c != 0
+    and ~50% for c = 0 for strings longer than 256.
+
+    Reviewed-by: Lucas A. M. Magalhaes <lamm@linux.ibm.com>
+    Reviewed-by: Raphael M Zinsly <rzinsly@linux.ibm.com>
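The byte-subtraction trick described in the commit message is easy to model in scalar C: subtracting c from every byte (mod 256) turns the byte being searched for into 0, so a zero-scanning loop like strlen's keeps working. A minimal sketch, not part of the patch; find_byte is a hypothetical helper and, like rawmemchr, it assumes the byte occurs in the buffer.

#include <stdio.h>

/* Scalar analogue of the vsububm + vminub/vcmpequb sequence: transform
   each byte so the target becomes 0, then scan for 0.  */
static const char *
find_byte (const char *s, unsigned char c)
{
  const unsigned char *p = (const unsigned char *) s;
  while ((unsigned char) (*p - c) != 0)  /* *p == c iff (*p - c) mod 256 == 0 */
    p++;
  return (const char *) p;               /* return a pointer, as rawmemchr does */
}

int
main (void)
{
  const char buf[] = "powerpc64le";
  printf ("'6' found at offset %td\n", find_byte (buf, '6') - buf); /* 7 */
  printf ("length via c = 0:  %td\n", find_byte (buf, 0) - buf);    /* 11 */
  return 0;
}

With c = 0 the transform is the identity, which is why the patched loop can skip RAWMEMCHR_SUBTRACT_VECTORS entirely in that case and reuse the strlen loop unchanged.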
+
+diff --git a/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
+new file mode 100644
+index 0000000000000000..5351c2634f6086bf
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S
+@@ -0,0 +1,22 @@
++/* Optimized rawmemchr implementation for POWER10 LE.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#include <sysdep.h>
++
++#define USE_AS_RAWMEMCHR 1
++#include <strlen.S>
+diff --git a/sysdeps/powerpc/powerpc64/le/power10/strlen.S b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+index ca7e9eb3d84c9b00..dda5282f1b9a07cf 100644
+--- a/sysdeps/powerpc/powerpc64/le/power10/strlen.S
++++ b/sysdeps/powerpc/powerpc64/le/power10/strlen.S
+@@ -18,10 +18,50 @@
+ 
+ #include <sysdep.h>
+ 
+-#ifndef STRLEN
+-# define STRLEN __strlen
+-# define DEFINE_STRLEN_HIDDEN_DEF 1
+-#endif
++/* To reuse the code for rawmemchr, we have some extra steps compared to the
++   strlen implementation:
++     - Sum the initial value of r3 with the position at which the char was
++       found, to guarantee we return a pointer and not the length.
++     - In the main loop, subtract each byte by the char we are looking for,
++       so we can keep using vminub to quickly check 64B at once.  */
++#ifdef USE_AS_RAWMEMCHR
++# ifndef RAWMEMCHR
++#  define FUNCNAME __rawmemchr
++# else
++#  define FUNCNAME RAWMEMCHR
++# endif
++# define MCOUNT_NARGS 2
++# define VREG_ZERO v20
++# define OFF_START_LOOP 256
++# define RAWMEMCHR_SUBTRACT_VECTORS \
++	vsububm   v4,v4,v18; \
++	vsububm   v5,v5,v18; \
++	vsububm   v6,v6,v18; \
++	vsububm   v7,v7,v18;
++# define TAIL(vreg,increment) \
++	vctzlsbb  r4,vreg; \
++	addi	  r4,r4,increment; \
++	add	  r3,r5,r4; \
++	blr
++
++#else /* strlen */
++
++# ifndef STRLEN
++#  define FUNCNAME __strlen
++#  define DEFINE_STRLEN_HIDDEN_DEF 1
++# else
++#  define FUNCNAME STRLEN
++# endif
++# define MCOUNT_NARGS 1
++# define VREG_ZERO v18
++# define OFF_START_LOOP 192
++# define TAIL(vreg,increment) \
++	vctzlsbb  r4,vreg; \
++	subf	  r3,r3,r5; \
++	addi	  r4,r4,increment; \
++	add	  r3,r3,r4; \
++	blr
++#endif /* USE_AS_RAWMEMCHR */
+ 
+ /* TODO: Replace macros by the actual instructions when minimum binutils becomes
+    >= 2.35.  This is used to keep compatibility with older versions.  */
+@@ -50,33 +90,41 @@
+ 	li	  r6,offset; \
+ 	LXVP(v4+32,offset,addr); \
+ 	LXVP(v6+32,offset+32,addr); \
++	RAWMEMCHR_SUBTRACT_VECTORS; \
+ 	vminub	  v14,v4,v5; \
+ 	vminub	  v15,v6,v7; \
+ 	vminub	  v16,v14,v15; \
+-	vcmpequb. v0,v16,v18; \
++	vcmpequb. v0,v16,VREG_ZERO; \
+ 	bne	  cr6,L(label)
+ 
+-#define TAIL(vreg,increment) \
+-	vctzlsbb  r4,vreg; \
+-	subf	  r3,r3,r5; \
+-	addi	  r4,r4,increment; \
+-	add	  r3,r3,r4; \
+-	blr
+-
+ /* Implements the function
+ 
+    int [r3] strlen (const void *s [r3])
+ 
++   but when USE_AS_RAWMEMCHR is set, implements the function
++
++   void* [r3] rawmemchr (const void *s [r3], int c [r4])
++
+    The implementation can load bytes past a matching byte, but only
+    up to the next 64B boundary, so it never crosses a page.  */
+ 
+ 	.machine power9
+ 
+-ENTRY_TOCLESS (STRLEN, 4)
+-	CALL_MCOUNT 1
++ENTRY_TOCLESS (FUNCNAME, 4)
++	CALL_MCOUNT MCOUNT_NARGS
+ 
+-	vspltisb  v18,0
++#ifdef USE_AS_RAWMEMCHR
++	xori	  r5,r4,0xff
++
++	mtvsrd	  v18+32,r4	/* matching char in v18  */
++	mtvsrd	  v19+32,r5	/* non matching char in v19  */
++
++	vspltb	  v18,v18,7	/* replicate  */
++	vspltb	  v19,v19,7	/* replicate  */
++#else
+ 	vspltisb  v19,-1
++#endif
++	vspltisb  VREG_ZERO,0
+ 
+ 	/* Next 16B-aligned address. Prepare address for L(aligned).  */
+ 	addi	  r5,r3,16
+@@ -90,16 +138,25 @@ ENTRY_TOCLESS (STRLEN, 4)
+ 	vcmpequb. v6,v0,v18
+ 	beq	  cr6,L(aligned)
+ 
++#ifdef USE_AS_RAWMEMCHR
++	vctzlsbb  r6,v6
++	add	  r3,r3,r6
++#else
+ 	vctzlsbb  r3,v6
++#endif
+ 	blr
+ 
+-	/* Test next 176B, 16B at a time.  The main loop is optimized for longer
+-	   strings, so checking the first bytes in 16B chunks benefits a lot
+-	   small strings.  */
++	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
++	   optimized for longer strings, so checking the first bytes in 16B
++	   chunks benefits a lot small strings.  */
+ 	.p2align 5
+ L(aligned):
++#ifdef USE_AS_RAWMEMCHR
++	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
++				   choose how we will perform the main loop.  */
++#endif
+ 	/* Prepare address for the loop.  */
+-	addi	  r4,r3,192
++	addi	  r4,r3,OFF_START_LOOP
+ 	clrrdi	  r4,r4,6
+ 
+ 	CHECK16(v0,0,r5,tail1)
+@@ -113,15 +170,43 @@ L(aligned):
+ 	CHECK16(v8,128,r5,tail9)
+ 	CHECK16(v9,144,r5,tail10)
+ 	CHECK16(v10,160,r5,tail11)
++#ifdef USE_AS_RAWMEMCHR
++	CHECK16(v0,176,r5,tail12)
++	CHECK16(v1,192,r5,tail13)
++	CHECK16(v2,208,r5,tail14)
++	CHECK16(v3,224,r5,tail15)
++#endif
+ 
+ 	addi	  r5,r4,128
+ 
++#ifdef USE_AS_RAWMEMCHR
++	/* If c == 0, use the same loop as strlen, without the vsububm.  */
++	beq	cr5,L(loop)
++
++	/* This is very similar to the block after L(loop), the difference is
++	   that here RAWMEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
++	   each byte loaded by the char we are looking for, this way we can keep
++	   using vminub to merge the results and checking for nulls.  */
++	.p2align 5
++L(rawmemchr_loop):
++	CHECK64(0,r4,pre_tail_64b)
++	CHECK64(64,r4,pre_tail_64b)
++	addi	  r4,r4,256
++
++	CHECK64(0,r5,tail_64b)
++	CHECK64(64,r5,tail_64b)
++	addi	  r5,r5,256
++
++	b	  L(rawmemchr_loop)
++#endif
+ 	/* Switch to a more aggressive approach checking 64B each time.  Use 2
+ 	   pointers 128B apart and unroll the loop once to make the pointer
+ 	   updates and usages separated enough to avoid stalls waiting for
+ 	   address calculation.  */
+ 	.p2align 5
+ L(loop):
++#undef RAWMEMCHR_SUBTRACT_VECTORS
++#define RAWMEMCHR_SUBTRACT_VECTORS /* nothing */
+ 	CHECK64(0,r4,pre_tail_64b)
+ 	CHECK64(64,r4,pre_tail_64b)
+ 	addi	  r4,r4,256
+ 
+@@ -140,10 +225,10 @@ L(tail_64b):
+ 	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
+ 	   low 16B bytes into vx+1, and the high into vx, so the order here is
+ 	   v5, v4, v7, v6.  */
+-	vcmpequb  v1,v5,v18
+-	vcmpequb  v2,v4,v18
+-	vcmpequb  v3,v7,v18
+-	vcmpequb  v4,v6,v18
++	vcmpequb  v1,v5,VREG_ZERO
++	vcmpequb  v2,v4,VREG_ZERO
++	vcmpequb  v3,v7,VREG_ZERO
++	vcmpequb  v4,v6,VREG_ZERO
+ 
+ 	/* Take into account the other 64B blocks we had already checked.  */
+ 	add	r5,r5,r6
+ 
+@@ -165,7 +250,9 @@ L(tail_64b):
+ 	or	  r10,r8,r7
+ 
+ 	cnttzd	  r0,r10	/* Count trailing zeros before the match.  */
++#ifndef USE_AS_RAWMEMCHR
+ 	subf	  r5,r3,r5
++#endif
+ 	add	  r3,r5,r0	/* Compute final length.  */
+ 	blr
+ 
+@@ -213,9 +300,32 @@ L(tail10):
+ L(tail11):
+ 	TAIL(v10,160)
+ 
+-END (STRLEN)
++#ifdef USE_AS_RAWMEMCHR
++	.p2align 5
++L(tail12):
++	TAIL(v0,176)
++
++	.p2align 5
++L(tail13):
++	TAIL(v1,192)
++
++	.p2align 5
++L(tail14):
++	TAIL(v2,208)
++
++	.p2align 5
++L(tail15):
++	TAIL(v3,224)
++#endif
++
++END (FUNCNAME)
+ 
+-#ifdef DEFINE_STRLEN_HIDDEN_DEF
++#ifdef USE_AS_RAWMEMCHR
++weak_alias (__rawmemchr,rawmemchr)
++libc_hidden_builtin_def (__rawmemchr)
++#else
++# ifdef DEFINE_STRLEN_HIDDEN_DEF
+ weak_alias (__strlen, strlen)
+ libc_hidden_builtin_def (strlen)
++# endif
+ #endif
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
+index 1d517698429e1230..ac2446aca62cc4ab 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
++++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
+@@ -33,9 +33,9 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
+ 
+ ifneq (,$(filter %le,$(config-machine)))
+ sysdep_routines += memcpy-power10 memmove-power10 memset-power10 \
++		   rawmemchr-power9 rawmemchr-power10 \
+ 		   strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
+-		   rawmemchr-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
+-		   strlen-power10
++		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
+ endif
+ CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
+ CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+index 6e36659d1903448a..127af84b32a8196f 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
++++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+@@ -257,6 +257,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
+   /* Support sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c.  */
+   IFUNC_IMPL (i, name, rawmemchr,
+ #ifdef __LITTLE_ENDIAN__
++	      IFUNC_IMPL_ADD (array, i, rawmemchr,
++			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
++			      && (hwcap & PPC_FEATURE_HAS_VSX),
++			      __rawmemchr_power10)
+ 	      IFUNC_IMPL_ADD (array, i, rawmemchr,
+ 			      hwcap2 & PPC_FEATURE2_ARCH_3_00,
+ 			      __rawmemchr_power9)
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
+new file mode 100644
+index 0000000000000000..bf1ed7e1941f922d
+--- /dev/null
++++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr-power10.S
+@@ -0,0 +1,21 @@
++/* Optimized rawmemchr implementation for PowerPC64/POWER10.
++   Copyright (C) 2021 Free Software Foundation, Inc.
++   This file is part of the GNU C Library.
++
++   The GNU C Library is free software; you can redistribute it and/or
++   modify it under the terms of the GNU Lesser General Public
++   License as published by the Free Software Foundation; either
++   version 2.1 of the License, or (at your option) any later version.
++
++   The GNU C Library is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   Lesser General Public License for more details.
++
++   You should have received a copy of the GNU Lesser General Public
++   License along with the GNU C Library; if not, see
++   <https://www.gnu.org/licenses/>.  */
++
++#define RAWMEMCHR __rawmemchr_power10
++
++#include <sysdeps/powerpc/powerpc64/le/power10/rawmemchr.S>
+diff --git a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+index 2a7ae5a1ed02e556..369d6359e8987052 100644
+--- a/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
++++ b/sysdeps/powerpc/powerpc64/multiarch/rawmemchr.c
+@@ -26,6 +26,7 @@ extern __typeof (__rawmemchr) __rawmemchr_ppc attribute_hidden;
+ extern __typeof (__rawmemchr) __rawmemchr_power7 attribute_hidden;
+ # ifdef __LITTLE_ENDIAN__
+ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
++extern __typeof (__rawmemchr) __rawmemchr_power10 attribute_hidden;
+ # endif
+ 
+ # undef __rawmemchr
+@@ -34,6 +35,9 @@ extern __typeof (__rawmemchr) __rawmemchr_power9 attribute_hidden;
+    ifunc symbol properly.  */
+ libc_ifunc_redirected (__redirect___rawmemchr, __rawmemchr,
+ # ifdef __LITTLE_ENDIAN__
++		       (hwcap2 & PPC_FEATURE2_ARCH_3_1)
++		       && (hwcap & PPC_FEATURE_HAS_VSX)
++		       ? __rawmemchr_power10 :
+ 		       (hwcap2 & PPC_FEATURE2_ARCH_3_00)
+ 		       ? __rawmemchr_power9 :
+ # endif
diff --git a/SPECS/glibc.spec b/SPECS/glibc.spec
index 4e46638..5cc2271 100644
--- a/SPECS/glibc.spec
+++ b/SPECS/glibc.spec
@@ -1,6 +1,6 @@
 %define glibcsrcdir glibc-2.28
 %define glibcversion 2.28
-%define glibcrelease 158%{?dist}
+%define glibcrelease 160%{?dist}
 # Pre-release tarballs are pulled in from git using a command that is
 # effectively:
 #
@@ -706,6 +706,14 @@
 Patch569: glibc-rh1934155-3.patch
 Patch570: glibc-rh1934155-4.patch
 Patch571: glibc-rh1934155-5.patch
 Patch572: glibc-rh1934155-6.patch
+Patch573: glibc-rh1956357-1.patch
+Patch574: glibc-rh1956357-2.patch
+Patch575: glibc-rh1956357-3.patch
+Patch576: glibc-rh1956357-4.patch
+Patch577: glibc-rh1956357-5.patch
+Patch578: glibc-rh1956357-6.patch
+Patch579: glibc-rh1956357-7.patch
+Patch580: glibc-rh1956357-8.patch
 
 ##############################################################################
 # Continued list of core "glibc" package information:
 ##############################################################################
@@ -2617,6 +2625,12 @@ fi
 %files -f compat-libpthread-nonshared.filelist -n compat-libpthread-nonshared
 
 %changelog
+* Mon May 31 2021 Arjun Shankar <arjun@redhat.com> - 2.28-160
+- Backport POWER10 optimized rawmemchr for ppc64le (#1956357)
+
+* Thu May 27 2021 Arjun Shankar <arjun@redhat.com> - 2.28-159
+- Backport additional ifunc optimizations for ppc64le (#1956357)
+
 * Thu Apr 22 2021 Florian Weimer <fweimer@redhat.com> - 2.28-158
 - Rebuild with new binutils (#1946518)