POWER10 string function optimizations (RHEL-24740)

Resolves: RHEL-24740
Florian Weimer 2025-02-13 16:42:58 +01:00
parent 1ed4da461e
commit 2ca0b0c57e
4 changed files with 975 additions and 1 deletion

glibc-RHEL-24740-1.patch (new file, 315 lines)

@@ -0,0 +1,315 @@
commit 3367d8e180848030d1646f088759f02b8dfe0d6f
Author: Amrita H S <amritahs@linux.vnet.ibm.com>
Date: Wed Dec 6 11:43:11 2023 -0500
powerpc: Optimized strcmp for power10
This patch is based on __strcmp_power9 and __strlen_power10.
Improvements from __strcmp_power9:
1. Uses new POWER10 instructions
- This code uses lxvp to decrease contention on load
by loading 32 bytes per instruction.
2. Performance implications
- This version has around 30% better performance on average.
- Performance regressions are seen for specific combinations
of sizes and alignments. Some of them are observed even
without this change, while the rest may be induced by the patch.
Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
Reviewed-by: Paul E. Murphy <murphyp@linux.ibm.com>
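To make the block-compare idea concrete, here is a hedged C sketch of what one COMPARE_16 step does, in scalar form (illustrative only, not the glibc code; vcmpnezb performs the whole inner loop in a single instruction, and lxvp lets COMPARE_32 feed two such blocks from a single pair of loads):

#include <stddef.h>

/* Scalar model of one 16-byte compare step.  Returns the byte
   difference when this block decides the result (mismatch or NUL),
   or a sentinel > 255 when the caller must keep scanning.  */
static int
compare_block16 (const unsigned char *s1, const unsigned char *s2)
{
  for (size_t i = 0; i < 16; i++)
    if (s1[i] != s2[i] || s1[i] == '\0')
      return s1[i] - s2[i];    /* 0 if both strings end here.  */
  return 256;                  /* Block fully equal: continue.  */
}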
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
new file mode 100644
index 0000000000000000..a3c1adad539978e0
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
@@ -0,0 +1,204 @@
+/* Optimized strcmp implementation for PowerPC64/POWER10.
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+#include <sysdep.h>
+
+#ifndef STRCMP
+# define STRCMP strcmp
+#endif
+
+/* Implements the function
+ int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]). */
+
+/* TODO: Change this to actual instructions when minimum binutils is upgraded
+ to 2.27. Macros are defined below for these newer instructions in order
+ to maintain compatibility. */
+
+#define LXVP(xtp,dq,ra) \
+ .long(((6)<<(32-6)) \
+ | ((((xtp)-32)>>1)<<(32-10)) \
+ | ((1)<<(32-11)) \
+ | ((ra)<<(32-16)) \
+ | dq)
+
+#define COMPARE_16(vreg1,vreg2,offset) \
+ lxv vreg1+32,offset(r3); \
+ lxv vreg2+32,offset(r4); \
+ vcmpnezb. v7,vreg1,vreg2; \
+ bne cr6,L(different); \
+
+#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
+ LXVP(vreg1+32,offset,r3); \
+ LXVP(vreg2+32,offset,r4); \
+ vcmpnezb. v7,vreg1+1,vreg2+1; \
+ bne cr6,L(label1); \
+ vcmpnezb. v7,vreg1,vreg2; \
+ bne cr6,L(label2); \
+
+#define TAIL(vreg1,vreg2) \
+ vctzlsbb r6,v7; \
+ vextubrx r5,r6,vreg1; \
+ vextubrx r4,r6,vreg2; \
+ subf r3,r4,r5; \
+ blr; \
+
+#define CHECK_N_BYTES(reg1,reg2,len_reg) \
+ sldi r0,len_reg,56; \
+ lxvl 32+v4,reg1,r0; \
+ lxvl 32+v5,reg2,r0; \
+ add reg1,reg1,len_reg; \
+ add reg2,reg2,len_reg; \
+ vcmpnezb. v7,v4,v5; \
+ vctzlsbb r6,v7; \
+ cmpld cr7,r6,len_reg; \
+ blt cr7,L(different); \
+
+ /* TODO: change this to .machine power10 when the minimum required
+ binutils allows it. */
+
+ .machine power9
+ENTRY_TOCLESS (STRCMP, 4)
+ li r11,16
+ /* eq bit of cr1 used as swap status flag to indicate if
+ source pointers were swapped. */
+ crclr 4*cr1+eq
+ vspltisb v19,-1
+ andi. r7,r3,15
+ sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
+ andi. r9,r4,15
+ sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
+ cmpld cr7,r7,r5
+ beq cr7,L(same_aligned)
+ blt cr7,L(nalign1_min)
+ /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
+ pointer which is closer to the next 16B boundary so that only
+ one CHECK_N_BYTES is needed before entering the loop below. */
+ mr r8,r4
+ mr r4,r3
+ mr r3,r8
+ mr r12,r7
+ mr r7,r5
+ mr r5,r12
+ crset 4*cr1+eq /* Set bit on swapping source pointers. */
+
+ .p2align 5
+L(nalign1_min):
+ CHECK_N_BYTES(r3,r4,r7)
+
+ .p2align 5
+L(s1_aligned):
+ /* r9 and r5 is number of bytes to be read after and before
+ page boundary correspondingly. */
+ sub r5,r5,r7
+ subfic r9,r5,16
+ /* Now let r7 hold the count of quadwords which can be
+ checked without crossing a page boundary. quadword offset is
+ (str2>>4)&0xFF. */
+ rlwinm r7,r4,28,0xFF
+ /* Below check is required only for first iteration. For second
+ iteration and beyond, the new loop counter is always 255. */
+ cmpldi r7,255
+ beq L(L3)
+ /* Get the initial loop count by 255-((str2>>4)&0xFF). */
+ subfic r11,r7,255
+
+ .p2align 5
+L(L1):
+ mtctr r11
+
+ .p2align 5
+L(L2):
+ COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
+ addi r3,r3,16
+ addi r4,r4,16
+ bdnz L(L2)
+ /* Cross the page boundary of s2, carefully. */
+
+ .p2align 5
+L(L3):
+ CHECK_N_BYTES(r3,r4,r5)
+ CHECK_N_BYTES(r3,r4,r9)
+ li r11,255 /* Load the new loop counter. */
+ b L(L1)
+
+ .p2align 5
+L(same_aligned):
+ CHECK_N_BYTES(r3,r4,r7)
+ /* Align s1 to 32B and adjust s2 address.
+ Use lxvp only if both s1 and s2 are 32B aligned. */
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ COMPARE_16(v4,v5,48)
+ addi r3,r3,64
+ addi r4,r4,64
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+
+ clrldi r6,r3,59
+ subfic r5,r6,32
+ add r3,r3,r5
+ add r4,r4,r5
+ andi. r5,r4,0x1F
+ beq cr0,L(32B_aligned_loop)
+
+ .p2align 5
+L(16B_aligned_loop):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ COMPARE_16(v4,v5,48)
+ addi r3,r3,64
+ addi r4,r4,64
+ b L(16B_aligned_loop)
+
+ /* Calculate and return the difference. */
+L(different):
+ vctzlsbb r6,v7
+ vextubrx r5,r6,v4
+ vextubrx r4,r6,v5
+ bt 4*cr1+eq,L(swapped)
+ subf r3,r4,r5
+ blr
+
+ /* If src pointers were swapped, then swap the
+ indices and calculate the return value. */
+L(swapped):
+ subf r3,r5,r4
+ blr
+
+ .p2align 5
+L(32B_aligned_loop):
+ COMPARE_32(v14,v16,0,tail1,tail2)
+ COMPARE_32(v18,v20,32,tail3,tail4)
+ COMPARE_32(v22,v24,64,tail5,tail6)
+ COMPARE_32(v26,v28,96,tail7,tail8)
+ addi r3,r3,128
+ addi r4,r4,128
+ b L(32B_aligned_loop)
+
+L(tail1): TAIL(v15,v17)
+L(tail2): TAIL(v14,v16)
+L(tail3): TAIL(v19,v21)
+L(tail4): TAIL(v18,v20)
+L(tail5): TAIL(v23,v25)
+L(tail6): TAIL(v22,v24)
+L(tail7): TAIL(v27,v29)
+L(tail8): TAIL(v26,v28)
+
+END (STRCMP)
+libc_hidden_builtin_def (strcmp)
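The page-boundary bookkeeping in L(s1_aligned) is compact; the following hedged scalar sketch (editorial, with a hypothetical helper name) models the loop-count computation:

#include <stdint.h>

/* How many full 16-byte loads from p fit before the load that would
   touch the next 4 KiB page -- the rlwinm/subfic pair above.  */
static unsigned
quadwords_before_page_cross (uintptr_t p)
{
  unsigned qw = (p >> 4) & 0xFF;   /* quadword index within the page */
  return 255 - qw;
}

/* Example: if p ends in 0xF50, qw = 0xF5 = 245, so 10 full compares
   run before the boundary block is split into two length-limited
   lxvl accesses, which never touch bytes beyond the given length.  */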
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 0ee7ce39d6470d80..91ed88a9c716800d 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -33,7 +33,8 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
ifneq (,$(filter %le,$(config-machine)))
sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
rawmemchr-power9 rawmemchr-power10 \
- strcmp-power9 strncmp-power9 strcpy-power9 stpcpy-power9 \
+ strcmp-power9 strcmp-power10 strncmp-power9 \
+ strcpy-power9 stpcpy-power9 \
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 2c84d287ee76a7ea..caec2047ab10d209 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -416,6 +416,10 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c. */
IFUNC_IMPL (i, name, strcmp,
#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, strcmp,
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1)
+ && (hwcap & PPC_FEATURE_HAS_VSX),
+ __strcmp_power10)
IFUNC_IMPL_ADD (array, i, strcmp,
hwcap2 & PPC_FEATURE2_ARCH_3_00
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
new file mode 100644
index 0000000000000000..c80067ce3305de81
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
@@ -0,0 +1,26 @@
+/* Optimized strcmp implementation for POWER10/PPC64.
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define STRCMP __strcmp_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+
+#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
+#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 8132682a992edb7a..4e77005117525edb 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -29,12 +29,16 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
# ifdef __LITTLE_ENDIAN__
extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
+extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
# endif
# undef strcmp
libc_ifunc_redirected (__redirect_strcmp, strcmp,
# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __strcmp_power10 :
(hwcap2 & PPC_FEATURE2_ARCH_3_00
&& hwcap & PPC_FEATURE_HAS_ALTIVEC)
? __strcmp_power9 :

glibc-RHEL-24740-2.patch (new file, 435 lines)

@@ -0,0 +1,435 @@
commit b9182c793caa05df5d697427c0538936e6396d4b
Author: MAHESH BODAPATI <bmahi496@linux.ibm.com>
Date: Tue Dec 12 08:52:45 2023 -0600
powerpc : Add optimized memchr for POWER10
Optimized memchr for POWER10, based on the existing rawmemchr and strlen.
Reordering instructions and unrolling the loops helped achieve better performance.
Reviewed-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
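A hedged scalar sketch of the vector strategy may help (illustrative only, not the glibc code): the matching char is splatted across a vector, and when c != 0 every loaded byte is first subtracted by c (vsububm) so that matches become zeros, letting vminub merge four vectors and a single compare against zero test 64 bytes at once. One 16-byte step looks roughly like:

#include <stddef.h>

/* Scalar model of one 16-byte memchr step (hypothetical helper).  */
static const unsigned char *
find_in_block16 (const unsigned char *p, unsigned char c, size_t remaining)
{
  size_t limit = remaining < 16 ? remaining : 16;
  for (size_t i = 0; i < limit; i++)   /* vcmpequb. + vctzlsbb */
    if (p[i] == c)
      return p + i;
  return NULL;                         /* no match in this block */
}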
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
new file mode 100644
index 0000000000000000..faf293f3447e6fc6
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
@@ -0,0 +1,315 @@
+/* Optimized memchr implementation for POWER10 LE.
+ Copyright (C) 2021-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#include <sysdep.h>
+
+# ifndef MEMCHR
+# define MEMCHR __memchr
+# endif
+# define M_VREG_ZERO v20
+# define M_OFF_START_LOOP 256
+# define MEMCHR_SUBTRACT_VECTORS \
+ vsububm v4,v4,v18; \
+ vsububm v5,v5,v18; \
+ vsububm v6,v6,v18; \
+ vsububm v7,v7,v18;
+# define M_TAIL(vreg,increment) \
+ vctzlsbb r4,vreg; \
+ cmpld r5,r4; \
+ ble L(null); \
+ addi r4,r4,increment; \
+ add r3,r6,r4; \
+ blr
+
+/* TODO: Replace macros by the actual instructions when minimum binutils becomes
+ >= 2.35. This is used to keep compatibility with older versions. */
+#define M_VEXTRACTBM(rt,vrb) \
+ .long(((4)<<(32-6)) \
+ | ((rt)<<(32-11)) \
+ | ((8)<<(32-16)) \
+ | ((vrb)<<(32-21)) \
+ | 1602)
+
+#define M_LXVP(xtp,dq,ra) \
+ .long(((6)<<(32-6)) \
+ | ((((xtp)-32)>>1)<<(32-10)) \
+ | ((1)<<(32-11)) \
+ | ((ra)<<(32-16)) \
+ | dq)
+
+#define CHECK16B(vreg,offset,addr,label) \
+ lxv vreg+32,offset(addr); \
+ vcmpequb. vreg,vreg,v18; \
+ bne cr6,L(label); \
+ cmpldi r5,16; \
+ ble L(null); \
+ addi r5,r5,-16;
+
+/* Load 4 quadwords, merge into one VR for speed and check for NULLs. r6 has #
+ of bytes already checked. */
+#define CHECK64B(offset,addr,label) \
+ M_LXVP(v4+32,offset,addr); \
+ M_LXVP(v6+32,offset+32,addr); \
+ MEMCHR_SUBTRACT_VECTORS; \
+ vminub v14,v4,v5; \
+ vminub v15,v6,v7; \
+ vminub v16,v14,v15; \
+ vcmpequb. v0,v16,M_VREG_ZERO; \
+ beq cr6,$+12; \
+ li r7,offset; \
+ b L(label); \
+ cmpldi r5,64; \
+ ble L(null); \
+ addi r5,r5,-64
+
+/* Implements the function
+ void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]). */
+
+ .machine power9
+
+ENTRY_TOCLESS (MEMCHR)
+ CALL_MCOUNT 3
+
+ cmpldi r5,0
+ beq L(null)
+ mr r0,r5
+ xori r6,r4,0xff
+
+ mtvsrd v18+32,r4 /* matching char in v18 */
+ mtvsrd v19+32,r6 /* non matching char in v19 */
+
+ vspltb v18,v18,7 /* replicate */
+ vspltb v19,v19,7 /* replicate */
+ vspltisb M_VREG_ZERO,0
+
+ /* Next 16B-aligned address. Prepare address for L(aligned). */
+ addi r6,r3,16
+ clrrdi r6,r6,4
+
+ /* Align data and fill bytes not loaded with non matching char. */
+ lvx v0,0,r3
+ lvsr v1,0,r3
+ vperm v0,v19,v0,v1
+
+ vcmpequb. v6,v0,v18
+ bne cr6,L(found)
+ sub r4,r6,r3
+ cmpld r5,r4
+ ble L(null)
+ sub r5,r5,r4
+
+ /* Test up to OFF_START_LOOP-16 bytes in 16B chunks. The main loop is
+ optimized for longer strings, so checking the first bytes in 16B
+ chunks greatly benefits small strings. */
+ .p2align 5
+L(aligned):
+ cmpldi r5,0
+ beq L(null)
+
+ CHECK16B(v0,0,r6,tail1)
+ CHECK16B(v1,16,r6,tail2)
+ CHECK16B(v2,32,r6,tail3)
+ CHECK16B(v3,48,r6,tail4)
+ CHECK16B(v4,64,r6,tail5)
+ CHECK16B(v5,80,r6,tail6)
+ CHECK16B(v6,96,r6,tail7)
+ CHECK16B(v7,112,r6,tail8)
+ CHECK16B(v8,128,r6,tail9)
+ CHECK16B(v9,144,r6,tail10)
+ CHECK16B(v10,160,r6,tail11)
+ CHECK16B(v0,176,r6,tail12)
+ CHECK16B(v1,192,r6,tail13)
+ CHECK16B(v2,208,r6,tail14)
+ CHECK16B(v3,224,r6,tail15)
+
+ cmpdi cr5,r4,0 /* Check if c == 0. This will be useful to
+ choose how we will perform the main loop. */
+
+ /* Prepare address for the loop. */
+ addi r4,r3,M_OFF_START_LOOP
+ clrrdi r4,r4,6
+ sub r6,r4,r3
+ sub r5,r0,r6
+ addi r6,r4,128
+
+ /* If c == 0, use the loop without the vsububm. */
+ beq cr5,L(loop)
+
+ /* This is very similar to the block after L(loop), the difference is
+ that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
+ each byte loaded by the char we are looking for, this way we can keep
+ using vminub to merge the results and checking for nulls. */
+ .p2align 5
+L(memchr_loop):
+ CHECK64B(0,r4,pre_tail_64b)
+ CHECK64B(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64B(0,r6,tail_64b)
+ CHECK64B(64,r6,tail_64b)
+ addi r6,r6,256
+
+ CHECK64B(0,r4,pre_tail_64b)
+ CHECK64B(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64B(0,r6,tail_64b)
+ CHECK64B(64,r6,tail_64b)
+ addi r6,r6,256
+
+ b L(memchr_loop)
+ /* Switch to a more aggressive approach checking 64B each time. Use 2
+ pointers 128B apart and unroll the loop once to make the pointer
+ updates and usages separated enough to avoid stalls waiting for
+ address calculation. */
+ .p2align 5
+L(loop):
+#undef MEMCHR_SUBTRACT_VECTORS
+#define MEMCHR_SUBTRACT_VECTORS /* nothing */
+ CHECK64B(0,r4,pre_tail_64b)
+ CHECK64B(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64B(0,r6,tail_64b)
+ CHECK64B(64,r6,tail_64b)
+ addi r6,r6,256
+
+ CHECK64B(0,r4,pre_tail_64b)
+ CHECK64B(64,r4,pre_tail_64b)
+ addi r4,r4,256
+
+ CHECK64B(0,r6,tail_64b)
+ CHECK64B(64,r6,tail_64b)
+ addi r6,r6,256
+
+ b L(loop)
+
+ .p2align 5
+L(pre_tail_64b):
+ mr r6,r4
+L(tail_64b):
+ /* OK, we found a null byte. Let's look for it in the current 64-byte
+ block and mark it in its corresponding VR. lxvp vx,0(ry) puts the
+ low 16B bytes into vx+1, and the high into vx, so the order here is
+ v5, v4, v7, v6. */
+ vcmpequb v1,v5,M_VREG_ZERO
+ vcmpequb v2,v4,M_VREG_ZERO
+ vcmpequb v3,v7,M_VREG_ZERO
+ vcmpequb v4,v6,M_VREG_ZERO
+
+ /* Take into account the other 64B blocks we had already checked. */
+ add r6,r6,r7
+ /* Extract first bit of each byte. */
+ M_VEXTRACTBM(r8,v1)
+ M_VEXTRACTBM(r9,v2)
+ M_VEXTRACTBM(r10,v3)
+ M_VEXTRACTBM(r11,v4)
+
+ /* Shift each value into their corresponding position. */
+ sldi r9,r9,16
+ sldi r10,r10,32
+ sldi r11,r11,48
+
+ /* Merge the results. */
+ or r8,r8,r9
+ or r9,r10,r11
+ or r11,r9,r8
+
+ cnttzd r0,r11 /* Count trailing zeros before the match. */
+ cmpld r5,r0
+ ble L(null)
+ add r3,r6,r0 /* Compute final address. */
+ blr
+
+ .p2align 5
+L(tail1):
+ M_TAIL(v0,0)
+
+ .p2align 5
+L(tail2):
+ M_TAIL(v1,16)
+
+ .p2align 5
+L(tail3):
+ M_TAIL(v2,32)
+
+ .p2align 5
+L(tail4):
+ M_TAIL(v3,48)
+
+ .p2align 5
+L(tail5):
+ M_TAIL(v4,64)
+
+ .p2align 5
+L(tail6):
+ M_TAIL(v5,80)
+
+ .p2align 5
+L(tail7):
+ M_TAIL(v6,96)
+
+ .p2align 5
+L(tail8):
+ M_TAIL(v7,112)
+
+ .p2align 5
+L(tail9):
+ M_TAIL(v8,128)
+
+ .p2align 5
+L(tail10):
+ M_TAIL(v9,144)
+
+ .p2align 5
+L(tail11):
+ M_TAIL(v10,160)
+
+ .p2align 5
+L(tail12):
+ M_TAIL(v0,176)
+
+ .p2align 5
+L(tail13):
+ M_TAIL(v1,192)
+
+ .p2align 5
+L(tail14):
+ M_TAIL(v2,208)
+
+ .p2align 5
+L(tail15):
+ M_TAIL(v3,224)
+
+ .p2align 5
+L(found):
+ vctzlsbb r7,v6
+ cmpld r5,r7
+ ble L(null)
+ add r3,r3,r7
+ blr
+
+ .p2align 5
+L(null):
+ li r3,0
+ blr
+
+END (MEMCHR)
+
+weak_alias (__memchr, memchr)
+libc_hidden_builtin_def (memchr)
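The tail sequence after the 64B loops merges four per-vector byte masks into one word so that a single trailing-zero count locates the match; a hedged C model (using the GCC builtin as a stand-in for cnttzd):

#include <stdint.h>

/* Four 16-bit masks, one bit per byte (as produced by vextractbm),
   shifted into a single 64-bit word.  Only reached when a match is
   known to exist, so merged is nonzero.  */
static unsigned
first_match_offset (uint16_t m0, uint16_t m1, uint16_t m2, uint16_t m3)
{
  uint64_t merged = (uint64_t) m0
                    | ((uint64_t) m1 << 16)
                    | ((uint64_t) m2 << 32)
                    | ((uint64_t) m3 << 48);
  return __builtin_ctzll (merged);     /* cnttzd in the assembly */
}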
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index 91ed88a9c716800d..b4251932de1854c2 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -31,10 +31,10 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
strncase-power8
ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
- rawmemchr-power9 rawmemchr-power10 \
- strcmp-power9 strcmp-power10 strncmp-power9 \
- strcpy-power9 stpcpy-power9 \
+sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
+ memmove-power10 memset-power10 rawmemchr-power9 \
+ rawmemchr-power10 strcmp-power9 strcmp-power10 \
+ strncmp-power9 strcpy-power9 stpcpy-power9 \
strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
endif
CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index caec2047ab10d209..e8a38fd4d5e1357e 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -265,6 +265,12 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
/* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c. */
IFUNC_IMPL (i, name, memchr,
+#ifdef __LITTLE_ENDIAN__
+ IFUNC_IMPL_ADD (array, i, memchr,
+ hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX,
+ __memchr_power10)
+#endif
IFUNC_IMPL_ADD (array, i, memchr,
hwcap2 & PPC_FEATURE2_ARCH_2_07
&& hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
new file mode 100644
index 0000000000000000..b9ed7926762e2b6f
--- /dev/null
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
@@ -0,0 +1,28 @@
+/* Optimized memchr implementation for POWER10/PPC64.
+ Copyright (C) 2016-2023 Free Software Foundation, Inc.
+ This file is part of the GNU C Library.
+
+ The GNU C Library is free software; you can redistribute it and/or
+ modify it under the terms of the GNU Lesser General Public
+ License as published by the Free Software Foundation; either
+ version 2.1 of the License, or (at your option) any later version.
+
+ The GNU C Library is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ Lesser General Public License for more details.
+
+ You should have received a copy of the GNU Lesser General Public
+ License along with the GNU C Library; if not, see
+ <https://www.gnu.org/licenses/>. */
+
+#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
+#define MEMCHR __memchr_power10
+
+#undef libc_hidden_builtin_def
+#define libc_hidden_builtin_def(name)
+#undef weak_alias
+#define weak_alias(name,alias)
+
+#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S>
+#endif
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
index f40013e06113096f..389d5f18683c2dfc 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -25,15 +25,23 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden;
extern __typeof (__memchr) __memchr_power7 attribute_hidden;
extern __typeof (__memchr) __memchr_power8 attribute_hidden;
+# ifdef __LITTLE_ENDIAN__
+extern __typeof (__memchr) __memchr_power10 attribute_hidden;
+# endif
/* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
ifunc symbol properly. */
libc_ifunc (__memchr,
- (hwcap2 & PPC_FEATURE2_ARCH_2_07
- && hwcap & PPC_FEATURE_HAS_ALTIVEC)
- ? __memchr_power8 :
- (hwcap & PPC_FEATURE_ARCH_2_06)
- ? __memchr_power7
- : __memchr_ppc);
+# ifdef __LITTLE_ENDIAN__
+ (hwcap2 & PPC_FEATURE2_ARCH_3_1
+ && hwcap & PPC_FEATURE_HAS_VSX)
+ ? __memchr_power10 :
+# endif
+ (hwcap2 & PPC_FEATURE2_ARCH_2_07
+ && hwcap & PPC_FEATURE_HAS_ALTIVEC)
+ ? __memchr_power8 :
+ (hwcap & PPC_FEATURE_ARCH_2_06)
+ ? __memchr_power7
+ : __memchr_ppc);
weak_alias (__memchr, memchr)
libc_hidden_builtin_def (memchr)
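For readers unfamiliar with ifunc dispatch, the nested conditionals above amount to a priority chain; a hedged sketch with a hypothetical helper (the real selection happens inside the libc_ifunc macro at load time):

/* Assumes the extern declarations and hwcap macros from the file above.  */
static __typeof (__memchr_ppc) *
select_memchr (unsigned long hwcap, unsigned long hwcap2)
{
#ifdef __LITTLE_ENDIAN__
  if ((hwcap2 & PPC_FEATURE2_ARCH_3_1) && (hwcap & PPC_FEATURE_HAS_VSX))
    return __memchr_power10;   /* POWER10 path added by this patch */
#endif
  if ((hwcap2 & PPC_FEATURE2_ARCH_2_07) && (hwcap & PPC_FEATURE_HAS_ALTIVEC))
    return __memchr_power8;
  if (hwcap & PPC_FEATURE_ARCH_2_06)
    return __memchr_power7;
  return __memchr_ppc;         /* generic fallback */
}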

glibc-RHEL-24740-3.patch (new file, 218 lines)

@@ -0,0 +1,218 @@
commit 90bcc8721ef82b7378d2b080141228660e862d56
Author: Amrita H S <amritahs@linux.vnet.ibm.com>
Date: Fri Dec 15 11:48:17 2023 -0500
powerpc: Fix performance issues of strcmp power10
The current implementation of strcmp for POWER10 has
performance regressions for multiple combinations of
small sizes and alignments.
Most of these performance issues are fixed by this
patch. The compare loop is unrolled, and page crossings
within the unrolled loop are handled.
Thanks to Paul E. Murphy for helping to fix the
performance issues.
Signed-off-by: Amrita H S <amritahs@linux.vnet.ibm.com>
Co-Authored-By: Paul E. Murphy <murphyp@linux.ibm.com>
Reviewed-by: Rajalakshmi Srinivasaraghavan <rajis@linux.ibm.com>
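The key new fast path is in the first hunk below: if neither string begins within the last 16 bytes of its 4 KiB page, one unaligned 16-byte compare is issued before any alignment work, settling most short strings in a handful of instructions. A hedged C model of that guard:

#include <stdint.h>

/* Safe to load 16 bytes starting at p without faulting on the next
   page?  Mirrors the andi./cmpldi pair at the new function entry.  */
static int
safe_for_16byte_load (uintptr_t p)
{
  return (p & 4095) <= 4096 - 16;
}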
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
index a3c1adad539978e0..3406f4f26a214270 100644
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
+++ b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
@@ -62,7 +62,7 @@
lxvl 32+v5,reg2,r0; \
add reg1,reg1,len_reg; \
add reg2,reg2,len_reg; \
- vcmpnezb. v7,v4,v5; \
+ vcmpnezb v7,v4,v5; \
vctzlsbb r6,v7; \
cmpld cr7,r6,len_reg; \
blt cr7,L(different); \
@@ -72,70 +72,110 @@
.machine power9
ENTRY_TOCLESS (STRCMP, 4)
- li r11,16
- /* eq bit of cr1 used as swap status flag to indicate if
- source pointers were swapped. */
- crclr 4*cr1+eq
- vspltisb v19,-1
- andi. r7,r3,15
- sub r7,r11,r7 /* r7(nalign1) = 16 - (str1 & 15). */
- andi. r9,r4,15
- sub r5,r11,r9 /* r5(nalign2) = 16 - (str2 & 15). */
- cmpld cr7,r7,r5
- beq cr7,L(same_aligned)
- blt cr7,L(nalign1_min)
- /* Swap r3 and r4, and r7 and r5 such that r3 and r7 hold the
- pointer which is closer to the next 16B boundary so that only
- one CHECK_N_BYTES is needed before entering the loop below. */
- mr r8,r4
- mr r4,r3
- mr r3,r8
- mr r12,r7
- mr r7,r5
- mr r5,r12
- crset 4*cr1+eq /* Set bit on swapping source pointers. */
+ andi. r7,r3,4095
+ andi. r8,r4,4095
+ cmpldi cr0,r7,4096-16
+ cmpldi cr1,r8,4096-16
+ bgt cr0,L(crosses)
+ bgt cr1,L(crosses)
+ COMPARE_16(v4,v5,0)
- .p2align 5
+L(crosses):
+ andi. r7,r3,15
+ subfic r7,r7,16 /* r7(nalign1) = 16 - (str1 & 15). */
+ andi. r9,r4,15
+ subfic r5,r9,16 /* r5(nalign2) = 16 - (str2 & 15). */
+ cmpld cr7,r7,r5
+ beq cr7,L(same_aligned)
+ blt cr7,L(nalign1_min)
+
+ /* nalign2 is minimum and s2 pointer is aligned. */
+ CHECK_N_BYTES(r3,r4,r5)
+ /* Are we on the 64B hunk which crosses a page? */
+ andi. r10,r3,63 /* Determine offset into 64B hunk. */
+ andi. r8,r3,15 /* The offset into the 16B hunk. */
+ neg r7,r3
+ andi. r9,r7,15 /* Number of bytes after a 16B cross. */
+ rlwinm. r7,r7,26,0x3F /* ((r3-4096))>>6&63. */
+ beq L(compare_64_pagecross)
+ mtctr r7
+ b L(compare_64B_unaligned)
+
+ /* nalign1 is minimum and s1 pointer is aligned. */
L(nalign1_min):
CHECK_N_BYTES(r3,r4,r7)
+ /* Are we on the 64B hunk which crosses a page? */
+ andi. r10,r4,63 /* Determine offset into 64B hunk. */
+ andi. r8,r4,15 /* The offset into the 16B hunk. */
+ neg r7,r4
+ andi. r9,r7,15 /* Number of bytes after a 16B cross. */
+ rlwinm. r7,r7,26,0x3F /* ((r4-4096))>>6&63. */
+ beq L(compare_64_pagecross)
+ mtctr r7
.p2align 5
-L(s1_aligned):
- /* r9 and r5 is number of bytes to be read after and before
- page boundary correspondingly. */
- sub r5,r5,r7
- subfic r9,r5,16
- /* Now let r7 hold the count of quadwords which can be
- checked without crossing a page boundary. quadword offset is
- (str2>>4)&0xFF. */
- rlwinm r7,r4,28,0xFF
- /* Below check is required only for first iteration. For second
- iteration and beyond, the new loop counter is always 255. */
- cmpldi r7,255
- beq L(L3)
- /* Get the initial loop count by 255-((str2>>4)&0xFF). */
- subfic r11,r7,255
+L(compare_64B_unaligned):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ COMPARE_16(v4,v5,48)
+ addi r3,r3,64
+ addi r4,r4,64
+ bdnz L(compare_64B_unaligned)
- .p2align 5
-L(L1):
+ /* Cross the page boundary of s2, carefully. Only for the first
+ iteration do we have to get the count of 64B blocks to be checked.
+ From the second iteration and beyond, the loop counter is always 63. */
+L(compare_64_pagecross):
+ li r11, 63
mtctr r11
-
- .p2align 5
-L(L2):
- COMPARE_16(v4,v5,0) /* Load 16B blocks using lxv. */
+ cmpldi r10,16
+ ble L(cross_4)
+ cmpldi r10,32
+ ble L(cross_3)
+ cmpldi r10,48
+ ble L(cross_2)
+L(cross_1):
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ addi r3,r3,48
+ addi r4,r4,48
+ b L(compare_64B_unaligned)
+L(cross_2):
+ COMPARE_16(v4,v5,0)
addi r3,r3,16
addi r4,r4,16
- bdnz L(L2)
- /* Cross the page boundary of s2, carefully. */
-
- .p2align 5
-L(L3):
- CHECK_N_BYTES(r3,r4,r5)
CHECK_N_BYTES(r3,r4,r9)
- li r11,255 /* Load the new loop counter. */
- b L(L1)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ addi r3,r3,32
+ addi r4,r4,32
+ b L(compare_64B_unaligned)
+L(cross_3):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ addi r3,r3,32
+ addi r4,r4,32
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ COMPARE_16(v4,v5,0)
+ addi r3,r3,16
+ addi r4,r4,16
+ b L(compare_64B_unaligned)
+L(cross_4):
+ COMPARE_16(v4,v5,0)
+ COMPARE_16(v4,v5,16)
+ COMPARE_16(v4,v5,32)
+ addi r3,r3,48
+ addi r4,r4,48
+ CHECK_N_BYTES(r3,r4,r9)
+ CHECK_N_BYTES(r3,r4,r8)
+ b L(compare_64B_unaligned)
- .p2align 5
L(same_aligned):
CHECK_N_BYTES(r3,r4,r7)
/* Align s1 to 32B and adjust s2 address.
@@ -168,18 +208,7 @@ L(16B_aligned_loop):
/* Calculate and return the difference. */
L(different):
- vctzlsbb r6,v7
- vextubrx r5,r6,v4
- vextubrx r4,r6,v5
- bt 4*cr1+eq,L(swapped)
- subf r3,r4,r5
- blr
-
- /* If src pointers were swapped, then swap the
- indices and calculate the return value. */
-L(swapped):
- subf r3,r5,r4
- blr
+ TAIL(v4,v5)
.p2align 5
L(32B_aligned_loop):

glibc.spec

@@ -157,7 +157,7 @@ end \
Summary: The GNU libc libraries
Name: glibc
Version: %{glibcversion}
-Release: 166%{?dist}
+Release: 167%{?dist}
# In general, GPLv2+ is used by programs, LGPLv2+ is used for
# libraries.
@@ -1110,6 +1110,9 @@ Patch802: glibc-RHEL-2419-9.patch
Patch803: glibc-RHEL-2419-10.patch
Patch804: glibc-RHEL-46738-5.patch
Patch805: glibc-RHEL-46761-6.patch
+Patch806: glibc-RHEL-24740-1.patch
+Patch807: glibc-RHEL-24740-2.patch
+Patch808: glibc-RHEL-24740-3.patch
##############################################################################
# Continued list of core "glibc" package information:
@@ -3103,6 +3106,9 @@ update_gconv_modules_cache ()
%endif
%changelog
+* Thu Feb 13 2025 Florian Weimer <fweimer@redhat.com> - 2.34-167
+- POWER10 string function optimizations (RHEL-24740)
+
* Tue Feb 11 2025 Arjun Shankar <arjun@redhat.com> - 2.34-166
- Revert: Backport: debug: Add regression tests for BZ 30932 (RHEL-46761)