279 lines
7.2 KiB
Diff
279 lines
7.2 KiB
Diff
commit 813c6ec808556553be9d39e900a3fc97ceb32330
|
|
Author: Pedro Franco de Carvalho <pedromfc@linux.ibm.com>
|
|
Date: Wed Jun 30 12:36:07 2021 -0300
|
|
|
|
powerpc: optimize strcpy/stpcpy for POWER9/10
|
|
|
|
This patch modifies the current POWER9 implementation of strcpy and
|
|
stpcpy to optimize it for POWER9/10.
|
|
|
|
Since no new POWER10 instructions are used, the original POWER9 strcpy is
|
|
modified instead of creating a new implementation for POWER10. This
|
|
implementation is based on both the original POWER9 implementation of
|
|
strcpy and the preamble of the new POWER10 implementation of strlen.
|
|
|
|
The changes also affect stpcpy, which uses the same implementation with
|
|
some additional code before returning.
|
|
|
|
On POWER9, averaging improvements across the benchmark
|
|
inputs (length/source alignment/destination alignment), for an
|
|
experiment that ran the benchmark five times, bench-strcpy showed an
|
|
improvement of 5.23%, and bench-stpcpy showed an improvement of 6.59%.
|
|
|
|
On POWER10, bench-strcpy showed an improvement of 13.16%, and bench-stpcpy showed an improvement of 13.59%.
|
|
|
|
The changes are:
|
|
|
|
1. Removed the null string optimization.
|
|
|
|
Although this results in a few extra cycles for the null string, in
|
|
combination with the second change, this resulted in improvements for
|
|
other cases.
|
|
|
|
2. Adapted the preamble from strlen for POWER10.
|
|
|
|
This is the part of the function that handles up to the first 16 bytes
|
|
of the string.
|
|
|
|
3. Increased number of unrolled iterations in the main loop to 6.
|
|
|
|
Reviewed-by: Matheus Castanho <msc@linux.ibm.com>
|
|
Tested-by: Matheus Castanho <msc@linux.ibm.com>
|
|
|
|
diff --git a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
|
index ce8f50329177fd06..9845a1d4cf0e1e5d 100644
|
|
--- a/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
|
+++ b/sysdeps/powerpc/powerpc64/le/power9/strcpy.S
|
|
@@ -45,91 +45,78 @@
|
|
The implementation can load bytes past a null terminator, but only
|
|
up to the next 16B boundary, so it never crosses a page. */
|
|
|
|
+/* Load quadword at addr+offset to vreg, check for null bytes,
|
|
+ and branch to label if any are found. */
|
|
+#define CHECK16(vreg,offset,addr,label) \
|
|
+ lxv vreg+32,offset(addr); \
|
|
+ vcmpequb. v6,vreg,v18; \
|
|
+ bne cr6,L(label);
|
|
+
|
|
.machine power9
|
|
ENTRY_TOCLESS (FUNC_NAME, 4)
|
|
CALL_MCOUNT 2
|
|
|
|
- /* NULL string optimisation */
|
|
- lbz r0,0(r4)
|
|
- stb r0,0(r3)
|
|
- cmpwi r0,0
|
|
- beqlr
|
|
-
|
|
- addi r4,r4,1
|
|
- addi r11,r3,1
|
|
-
|
|
vspltisb v18,0 /* Zeroes in v18 */
|
|
+ vspltisb v19,-1 /* 0xFF bytes in v19 */
|
|
|
|
- neg r5,r4
|
|
- rldicl r9,r5,0,60 /* How many bytes to get source 16B aligned? */
|
|
+ /* Next 16B-aligned address. Prepare address for L(loop). */
|
|
+ addi r5,r4,16
|
|
+ clrrdi r5,r5,4
|
|
+ subf r8,r4,r5
|
|
+ add r11,r3,r8
|
|
|
|
- /* Get source 16B aligned */
|
|
+ /* Align data and fill bytes not loaded with non matching char. */
|
|
lvx v0,0,r4
|
|
lvsr v1,0,r4
|
|
- vperm v0,v18,v0,v1
|
|
-
|
|
- vcmpequb v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
|
|
- vctzlsbb r7,v6 /* Number of trailing zeroes */
|
|
- addi r8,r7,1 /* Add null terminator */
|
|
+ vperm v0,v19,v0,v1
|
|
|
|
- /* r8 = bytes including null
|
|
- r9 = bytes to get source 16B aligned
|
|
- if r8 > r9
|
|
- no null, copy r9 bytes
|
|
- else
|
|
- there is a null, copy r8 bytes and return. */
|
|
- cmpd r8,r9
|
|
- bgt L(no_null)
|
|
+ vcmpequb. v6,v0,v18 /* 0xff if byte is NULL, 0x00 otherwise */
|
|
+ beq cr6,L(no_null)
|
|
|
|
- sldi r10,r8,56 /* stxvl wants size in top 8 bits */
|
|
- stxvl 32+v0,r11,r10 /* Partial store */
|
|
+ /* There's a null byte. */
|
|
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
|
|
+ addi r9,r8,1 /* Add null byte. */
|
|
+ sldi r10,r9,56 /* stxvl wants size in top 8 bits. */
|
|
+ stxvl 32+v0,r3,r10 /* Partial store */
|
|
|
|
#ifdef USE_AS_STPCPY
|
|
/* stpcpy returns the dest address plus the size not counting the
|
|
final '\0'. */
|
|
- add r3,r11,r7
|
|
+ add r3,r3,r8
|
|
#endif
|
|
blr
|
|
|
|
L(no_null):
|
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
|
- stxvl 32+v0,r11,r10 /* Partial store */
|
|
-
|
|
- add r4,r4,r9
|
|
- add r11,r11,r9
|
|
+ sldi r10,r8,56 /* stxvl wants size in top 8 bits */
|
|
+ stxvl 32+v0,r3,r10 /* Partial store */
|
|
|
|
+ .p2align 4
|
|
L(loop):
|
|
- lxv 32+v0,0(r4)
|
|
- vcmpequb. v6,v0,v18 /* Any zero bytes? */
|
|
- bne cr6,L(tail1)
|
|
-
|
|
- lxv 32+v1,16(r4)
|
|
- vcmpequb. v6,v1,v18 /* Any zero bytes? */
|
|
- bne cr6,L(tail2)
|
|
-
|
|
- lxv 32+v2,32(r4)
|
|
- vcmpequb. v6,v2,v18 /* Any zero bytes? */
|
|
- bne cr6,L(tail3)
|
|
-
|
|
- lxv 32+v3,48(r4)
|
|
- vcmpequb. v6,v3,v18 /* Any zero bytes? */
|
|
- bne cr6,L(tail4)
|
|
+ CHECK16(v0,0,r5,tail1)
|
|
+ CHECK16(v1,16,r5,tail2)
|
|
+ CHECK16(v2,32,r5,tail3)
|
|
+ CHECK16(v3,48,r5,tail4)
|
|
+ CHECK16(v4,64,r5,tail5)
|
|
+ CHECK16(v5,80,r5,tail6)
|
|
|
|
stxv 32+v0,0(r11)
|
|
stxv 32+v1,16(r11)
|
|
stxv 32+v2,32(r11)
|
|
stxv 32+v3,48(r11)
|
|
+ stxv 32+v4,64(r11)
|
|
+ stxv 32+v5,80(r11)
|
|
|
|
- addi r4,r4,64
|
|
- addi r11,r11,64
|
|
+ addi r5,r5,96
|
|
+ addi r11,r11,96
|
|
|
|
b L(loop)
|
|
|
|
+ .p2align 4
|
|
L(tail1):
|
|
- vctzlsbb r8,v6
|
|
- addi r9,r8,1
|
|
+ vctzlsbb r8,v6 /* Number of trailing zeroes */
|
|
+ addi r9,r8,1 /* Add null terminator */
|
|
sldi r9,r9,56 /* stxvl wants size in top 8 bits */
|
|
- stxvl 32+v0,r11,r9
|
|
+ stxvl 32+v0,r11,r9 /* Partial store */
|
|
#ifdef USE_AS_STPCPY
|
|
/* stpcpy returns the dest address plus the size not counting the
|
|
final '\0'. */
|
|
@@ -137,50 +124,81 @@ L(tail1):
|
|
#endif
|
|
blr
|
|
|
|
+ .p2align 4
|
|
L(tail2):
|
|
stxv 32+v0,0(r11)
|
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
|
- addi r9,r8,1 /* Add null terminator */
|
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
|
+ vctzlsbb r8,v6
|
|
+ addi r9,r8,1
|
|
+ sldi r9,r9,56
|
|
addi r11,r11,16
|
|
- stxvl 32+v1,r11,r10 /* Partial store */
|
|
+ stxvl 32+v1,r11,r9
|
|
#ifdef USE_AS_STPCPY
|
|
- /* stpcpy returns the dest address plus the size not counting the
|
|
- final '\0'. */
|
|
add r3,r11,r8
|
|
#endif
|
|
blr
|
|
|
|
+ .p2align 4
|
|
L(tail3):
|
|
stxv 32+v0,0(r11)
|
|
stxv 32+v1,16(r11)
|
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
|
- addi r9,r8,1 /* Add null terminator */
|
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
|
+ vctzlsbb r8,v6
|
|
+ addi r9,r8,1
|
|
+ sldi r9,r9,56
|
|
addi r11,r11,32
|
|
- stxvl 32+v2,r11,r10 /* Partial store */
|
|
+ stxvl 32+v2,r11,r9
|
|
#ifdef USE_AS_STPCPY
|
|
- /* stpcpy returns the dest address plus the size not counting the
|
|
- final '\0'. */
|
|
add r3,r11,r8
|
|
#endif
|
|
blr
|
|
|
|
+ .p2align 4
|
|
L(tail4):
|
|
stxv 32+v0,0(r11)
|
|
stxv 32+v1,16(r11)
|
|
stxv 32+v2,32(r11)
|
|
- vctzlsbb r8,v6 /* Number of trailing zeroes */
|
|
- addi r9,r8,1 /* Add null terminator */
|
|
- sldi r10,r9,56 /* stxvl wants size in top 8 bits */
|
|
+ vctzlsbb r8,v6
|
|
+ addi r9,r8,1
|
|
+ sldi r9,r9,56
|
|
addi r11,r11,48
|
|
- stxvl 32+v3,r11,r10 /* Partial store */
|
|
+ stxvl 32+v3,r11,r9
|
|
#ifdef USE_AS_STPCPY
|
|
- /* stpcpy returns the dest address plus the size not counting the
|
|
- final '\0'. */
|
|
add r3,r11,r8
|
|
#endif
|
|
blr
|
|
+
|
|
+ .p2align 4
|
|
+L(tail5):
|
|
+ stxv 32+v0,0(r11)
|
|
+ stxv 32+v1,16(r11)
|
|
+ stxv 32+v2,32(r11)
|
|
+ stxv 32+v3,48(r11)
|
|
+ vctzlsbb r8,v6
|
|
+ addi r9,r8,1
|
|
+ sldi r9,r9,56
|
|
+ addi r11,r11,64
|
|
+ stxvl 32+v4,r11,r9
|
|
+#ifdef USE_AS_STPCPY
|
|
+ add r3,r11,r8
|
|
+#endif
|
|
+ blr
|
|
+
|
|
+ .p2align 4
|
|
+L(tail6):
|
|
+ stxv 32+v0,0(r11)
|
|
+ stxv 32+v1,16(r11)
|
|
+ stxv 32+v2,32(r11)
|
|
+ stxv 32+v3,48(r11)
|
|
+ stxv 32+v4,64(r11)
|
|
+ vctzlsbb r8,v6
|
|
+ addi r9,r8,1
|
|
+ sldi r9,r9,56
|
|
+ addi r11,r11,80
|
|
+ stxvl 32+v5,r11,r9
|
|
+#ifdef USE_AS_STPCPY
|
|
+ add r3,r11,r8
|
|
+#endif
|
|
+ blr
|
|
+
|
|
END (FUNC_NAME)
|
|
#ifndef USE_AS_STPCPY
|
|
libc_hidden_builtin_def (strcpy)
|