631 lines
17 KiB
Diff
From b31bd11454fade731e5158b1aea40b133ae19926 Mon Sep 17 00:00:00 2001
|
|
From: Wilco Dijkstra <wdijkstr@arm.com>
|
|
Date: Thu, 2 Dec 2021 18:33:26 +0000
|
|
Subject: [PATCH] AArch64: Improve A64FX memcpy
|
|
|
|
v2 is a complete rewrite of the A64FX memcpy. Performance is improved
|
|
by streamlining the code, aligning all large copies and using a single
|
|
unrolled loop for all sizes. The code size for memcpy and memmove goes
|
|
down from 1796 bytes to 868 bytes. Performance is better in all cases:
|
|
bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33%
|
|
faster for large sizes, bench-memcpy-walk is 25% faster for small sizes
|
|
and 20% for the largest sizes. The geomean of all tests in bench-memcpy
|
|
is 5.1% faster, and total time is reduced by 4%.
|
|
|
|
Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
|
|
---
|
|
sysdeps/aarch64/multiarch/memcpy_a64fx.S | 546 ++++++++++-------------
|
|
1 file changed, 225 insertions(+), 321 deletions(-)
|
|
|
|
diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
|
|
index ae7464e09f..0b306925e6 100644
|
|
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
|
|
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
|
|
@@ -28,20 +28,15 @@
|
|
*
|
|
*/
|
|
|
|
-#define L2_SIZE (8*1024*1024)/2 // L2 8MB/2
|
|
-#define CACHE_LINE_SIZE 256
|
|
-#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance
|
|
-#define dest x0
|
|
-#define src x1
|
|
-#define n x2 // size
|
|
-#define tmp1 x3
|
|
-#define tmp2 x4
|
|
-#define tmp3 x5
|
|
-#define rest x6
|
|
-#define dest_ptr x7
|
|
-#define src_ptr x8
|
|
-#define vector_length x9
|
|
-#define cl_remainder x10 // CACHE_LINE_SIZE remainder
|
|
+#define dstin x0
|
|
+#define src x1
|
|
+#define n x2
|
|
+#define dst x3
|
|
+#define dstend x4
|
|
+#define srcend x5
|
|
+#define tmp x6
|
|
+#define vlen x7
|
|
+#define vlen8 x8
|
|
|
|
#if HAVE_AARCH64_SVE_ASM
|
|
# if IS_IN (libc)
|
|
@@ -50,45 +45,37 @@
|
|
|
|
.arch armv8.2-a+sve
|
|
|
|
- .macro dc_zva times
|
|
- dc zva, tmp1
|
|
- add tmp1, tmp1, CACHE_LINE_SIZE
|
|
- .if \times-1
|
|
- dc_zva "(\times-1)"
|
|
- .endif
|
|
- .endm
|
|
-
|
|
.macro ld1b_unroll8
|
|
- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
|
|
- ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
|
|
- ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
|
|
- ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
|
|
- ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
|
|
- ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
|
|
- ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
|
|
- ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p0/z, [src, 1, mul vl]
|
|
+ ld1b z2.b, p0/z, [src, 2, mul vl]
|
|
+ ld1b z3.b, p0/z, [src, 3, mul vl]
|
|
+ ld1b z4.b, p0/z, [src, 4, mul vl]
|
|
+ ld1b z5.b, p0/z, [src, 5, mul vl]
|
|
+ ld1b z6.b, p0/z, [src, 6, mul vl]
|
|
+ ld1b z7.b, p0/z, [src, 7, mul vl]
|
|
.endm
|
|
|
|
.macro stld1b_unroll4a
|
|
- st1b z0.b, p0, [dest_ptr, #0, mul vl]
|
|
- st1b z1.b, p0, [dest_ptr, #1, mul vl]
|
|
- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
|
|
- ld1b z1.b, p0/z, [src_ptr, #1, mul vl]
|
|
- st1b z2.b, p0, [dest_ptr, #2, mul vl]
|
|
- st1b z3.b, p0, [dest_ptr, #3, mul vl]
|
|
- ld1b z2.b, p0/z, [src_ptr, #2, mul vl]
|
|
- ld1b z3.b, p0/z, [src_ptr, #3, mul vl]
|
|
+ st1b z0.b, p0, [dst, 0, mul vl]
|
|
+ st1b z1.b, p0, [dst, 1, mul vl]
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p0/z, [src, 1, mul vl]
|
|
+ st1b z2.b, p0, [dst, 2, mul vl]
|
|
+ st1b z3.b, p0, [dst, 3, mul vl]
|
|
+ ld1b z2.b, p0/z, [src, 2, mul vl]
|
|
+ ld1b z3.b, p0/z, [src, 3, mul vl]
|
|
.endm
|
|
|
|
.macro stld1b_unroll4b
|
|
- st1b z4.b, p0, [dest_ptr, #4, mul vl]
|
|
- st1b z5.b, p0, [dest_ptr, #5, mul vl]
|
|
- ld1b z4.b, p0/z, [src_ptr, #4, mul vl]
|
|
- ld1b z5.b, p0/z, [src_ptr, #5, mul vl]
|
|
- st1b z6.b, p0, [dest_ptr, #6, mul vl]
|
|
- st1b z7.b, p0, [dest_ptr, #7, mul vl]
|
|
- ld1b z6.b, p0/z, [src_ptr, #6, mul vl]
|
|
- ld1b z7.b, p0/z, [src_ptr, #7, mul vl]
|
|
+ st1b z4.b, p0, [dst, 4, mul vl]
|
|
+ st1b z5.b, p0, [dst, 5, mul vl]
|
|
+ ld1b z4.b, p0/z, [src, 4, mul vl]
|
|
+ ld1b z5.b, p0/z, [src, 5, mul vl]
|
|
+ st1b z6.b, p0, [dst, 6, mul vl]
|
|
+ st1b z7.b, p0, [dst, 7, mul vl]
|
|
+ ld1b z6.b, p0/z, [src, 6, mul vl]
|
|
+ ld1b z7.b, p0/z, [src, 7, mul vl]
|
|
.endm
|
|
|
|
.macro stld1b_unroll8
|
|
@@ -97,87 +84,18 @@
|
|
.endm
|
|
|
|
.macro st1b_unroll8
|
|
- st1b z0.b, p0, [dest_ptr, #0, mul vl]
|
|
- st1b z1.b, p0, [dest_ptr, #1, mul vl]
|
|
- st1b z2.b, p0, [dest_ptr, #2, mul vl]
|
|
- st1b z3.b, p0, [dest_ptr, #3, mul vl]
|
|
- st1b z4.b, p0, [dest_ptr, #4, mul vl]
|
|
- st1b z5.b, p0, [dest_ptr, #5, mul vl]
|
|
- st1b z6.b, p0, [dest_ptr, #6, mul vl]
|
|
- st1b z7.b, p0, [dest_ptr, #7, mul vl]
|
|
+ st1b z0.b, p0, [dst, 0, mul vl]
|
|
+ st1b z1.b, p0, [dst, 1, mul vl]
|
|
+ st1b z2.b, p0, [dst, 2, mul vl]
|
|
+ st1b z3.b, p0, [dst, 3, mul vl]
|
|
+ st1b z4.b, p0, [dst, 4, mul vl]
|
|
+ st1b z5.b, p0, [dst, 5, mul vl]
|
|
+ st1b z6.b, p0, [dst, 6, mul vl]
|
|
+ st1b z7.b, p0, [dst, 7, mul vl]
|
|
.endm
|
|
|
|
- .macro shortcut_for_small_size exit
|
|
- // if rest <= vector_length * 2
|
|
- whilelo p0.b, xzr, n
|
|
- whilelo p1.b, vector_length, n
|
|
- b.last 1f
|
|
- ld1b z0.b, p0/z, [src, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src, #1, mul vl]
|
|
- st1b z0.b, p0, [dest, #0, mul vl]
|
|
- st1b z1.b, p1, [dest, #1, mul vl]
|
|
- ret
|
|
-1: // if rest > vector_length * 8
|
|
- cmp n, vector_length, lsl 3 // vector_length * 8
|
|
- b.hi \exit
|
|
- // if rest <= vector_length * 4
|
|
- lsl tmp1, vector_length, 1 // vector_length * 2
|
|
- whilelo p2.b, tmp1, n
|
|
- incb tmp1
|
|
- whilelo p3.b, tmp1, n
|
|
- b.last 1f
|
|
- ld1b z0.b, p0/z, [src, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src, #1, mul vl]
|
|
- ld1b z2.b, p2/z, [src, #2, mul vl]
|
|
- ld1b z3.b, p3/z, [src, #3, mul vl]
|
|
- st1b z0.b, p0, [dest, #0, mul vl]
|
|
- st1b z1.b, p1, [dest, #1, mul vl]
|
|
- st1b z2.b, p2, [dest, #2, mul vl]
|
|
- st1b z3.b, p3, [dest, #3, mul vl]
|
|
- ret
|
|
-1: // if rest <= vector_length * 8
|
|
- lsl tmp1, vector_length, 2 // vector_length * 4
|
|
- whilelo p4.b, tmp1, n
|
|
- incb tmp1
|
|
- whilelo p5.b, tmp1, n
|
|
- b.last 1f
|
|
- ld1b z0.b, p0/z, [src, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src, #1, mul vl]
|
|
- ld1b z2.b, p2/z, [src, #2, mul vl]
|
|
- ld1b z3.b, p3/z, [src, #3, mul vl]
|
|
- ld1b z4.b, p4/z, [src, #4, mul vl]
|
|
- ld1b z5.b, p5/z, [src, #5, mul vl]
|
|
- st1b z0.b, p0, [dest, #0, mul vl]
|
|
- st1b z1.b, p1, [dest, #1, mul vl]
|
|
- st1b z2.b, p2, [dest, #2, mul vl]
|
|
- st1b z3.b, p3, [dest, #3, mul vl]
|
|
- st1b z4.b, p4, [dest, #4, mul vl]
|
|
- st1b z5.b, p5, [dest, #5, mul vl]
|
|
- ret
|
|
-1: lsl tmp1, vector_length, 2 // vector_length * 4
|
|
- incb tmp1 // vector_length * 5
|
|
- incb tmp1 // vector_length * 6
|
|
- whilelo p6.b, tmp1, n
|
|
- incb tmp1
|
|
- whilelo p7.b, tmp1, n
|
|
- ld1b z0.b, p0/z, [src, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src, #1, mul vl]
|
|
- ld1b z2.b, p2/z, [src, #2, mul vl]
|
|
- ld1b z3.b, p3/z, [src, #3, mul vl]
|
|
- ld1b z4.b, p4/z, [src, #4, mul vl]
|
|
- ld1b z5.b, p5/z, [src, #5, mul vl]
|
|
- ld1b z6.b, p6/z, [src, #6, mul vl]
|
|
- ld1b z7.b, p7/z, [src, #7, mul vl]
|
|
- st1b z0.b, p0, [dest, #0, mul vl]
|
|
- st1b z1.b, p1, [dest, #1, mul vl]
|
|
- st1b z2.b, p2, [dest, #2, mul vl]
|
|
- st1b z3.b, p3, [dest, #3, mul vl]
|
|
- st1b z4.b, p4, [dest, #4, mul vl]
|
|
- st1b z5.b, p5, [dest, #5, mul vl]
|
|
- st1b z6.b, p6, [dest, #6, mul vl]
|
|
- st1b z7.b, p7, [dest, #7, mul vl]
|
|
- ret
|
|
- .endm
|
|
+#undef BTI_C
|
|
+#define BTI_C
|
|
|
|
ENTRY (MEMCPY)
|
|
|
|
@@ -185,223 +103,209 @@ ENTRY (MEMCPY)
|
|
PTR_ARG (1)
|
|
SIZE_ARG (2)
|
|
|
|
-L(memcpy):
|
|
- cntb vector_length
|
|
- // shortcut for less than vector_length * 8
|
|
- // gives a free ptrue to p0.b for n >= vector_length
|
|
- shortcut_for_small_size L(vl_agnostic)
|
|
- // end of shortcut
|
|
-
|
|
-L(vl_agnostic): // VL Agnostic
|
|
- mov rest, n
|
|
- mov dest_ptr, dest
|
|
- mov src_ptr, src
|
|
- // if rest >= L2_SIZE && vector_length == 64 then L(L2)
|
|
- mov tmp1, 64
|
|
- cmp rest, L2_SIZE
|
|
- ccmp vector_length, tmp1, 0, cs
|
|
- b.eq L(L2)
|
|
-
|
|
-L(unroll8): // unrolling and software pipeline
|
|
- lsl tmp1, vector_length, 3 // vector_length * 8
|
|
- .p2align 3
|
|
- cmp rest, tmp1
|
|
- b.cc L(last)
|
|
+ cntb vlen
|
|
+ cmp n, vlen, lsl 1
|
|
+ b.hi L(copy_small)
|
|
+ whilelo p1.b, vlen, n
|
|
+ whilelo p0.b, xzr, n
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p1/z, [src, 1, mul vl]
|
|
+ st1b z0.b, p0, [dstin, 0, mul vl]
|
|
+ st1b z1.b, p1, [dstin, 1, mul vl]
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+
|
|
+L(copy_small):
|
|
+ cmp n, vlen, lsl 3
|
|
+ b.hi L(copy_large)
|
|
+ add dstend, dstin, n
|
|
+ add srcend, src, n
|
|
+ cmp n, vlen, lsl 2
|
|
+ b.hi 1f
|
|
+
|
|
+ /* Copy 2-4 vectors. */
|
|
+ ptrue p0.b
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p0/z, [src, 1, mul vl]
|
|
+ ld1b z2.b, p0/z, [srcend, -2, mul vl]
|
|
+ ld1b z3.b, p0/z, [srcend, -1, mul vl]
|
|
+ st1b z0.b, p0, [dstin, 0, mul vl]
|
|
+ st1b z1.b, p0, [dstin, 1, mul vl]
|
|
+ st1b z2.b, p0, [dstend, -2, mul vl]
|
|
+ st1b z3.b, p0, [dstend, -1, mul vl]
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+ /* Copy 4-8 vectors. */
|
|
+1: ptrue p0.b
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p0/z, [src, 1, mul vl]
|
|
+ ld1b z2.b, p0/z, [src, 2, mul vl]
|
|
+ ld1b z3.b, p0/z, [src, 3, mul vl]
|
|
+ ld1b z4.b, p0/z, [srcend, -4, mul vl]
|
|
+ ld1b z5.b, p0/z, [srcend, -3, mul vl]
|
|
+ ld1b z6.b, p0/z, [srcend, -2, mul vl]
|
|
+ ld1b z7.b, p0/z, [srcend, -1, mul vl]
|
|
+ st1b z0.b, p0, [dstin, 0, mul vl]
|
|
+ st1b z1.b, p0, [dstin, 1, mul vl]
|
|
+ st1b z2.b, p0, [dstin, 2, mul vl]
|
|
+ st1b z3.b, p0, [dstin, 3, mul vl]
|
|
+ st1b z4.b, p0, [dstend, -4, mul vl]
|
|
+ st1b z5.b, p0, [dstend, -3, mul vl]
|
|
+ st1b z6.b, p0, [dstend, -2, mul vl]
|
|
+ st1b z7.b, p0, [dstend, -1, mul vl]
|
|
+ ret
|
|
+
|
|
+ .p2align 4
|
|
+ /* At least 8 vectors - always align to vector length for
|
|
+ higher and consistent write performance. */
|
|
+L(copy_large):
|
|
+ sub tmp, vlen, 1
|
|
+ and tmp, dstin, tmp
|
|
+ sub tmp, vlen, tmp
|
|
+ whilelo p1.b, xzr, tmp
|
|
+ ld1b z1.b, p1/z, [src]
|
|
+ st1b z1.b, p1, [dstin]
|
|
+ add dst, dstin, tmp
|
|
+ add src, src, tmp
|
|
+ sub n, n, tmp
|
|
+ ptrue p0.b
|
|
+
|
|
+ lsl vlen8, vlen, 3
|
|
+ subs n, n, vlen8
|
|
+ b.ls 3f
|
|
ld1b_unroll8
|
|
- add src_ptr, src_ptr, tmp1
|
|
- sub rest, rest, tmp1
|
|
- cmp rest, tmp1
|
|
- b.cc 2f
|
|
- .p2align 3
|
|
+ add src, src, vlen8
|
|
+ subs n, n, vlen8
|
|
+ b.ls 2f
|
|
+
|
|
+ .p2align 4
|
|
+ /* 8x unrolled and software pipelined loop. */
|
|
1: stld1b_unroll8
|
|
- add dest_ptr, dest_ptr, tmp1
|
|
- add src_ptr, src_ptr, tmp1
|
|
- sub rest, rest, tmp1
|
|
- cmp rest, tmp1
|
|
- b.ge 1b
|
|
+ add dst, dst, vlen8
|
|
+ add src, src, vlen8
|
|
+ subs n, n, vlen8
|
|
+ b.hi 1b
|
|
2: st1b_unroll8
|
|
- add dest_ptr, dest_ptr, tmp1
|
|
-
|
|
- .p2align 3
|
|
-L(last):
|
|
- whilelo p0.b, xzr, rest
|
|
- whilelo p1.b, vector_length, rest
|
|
- b.last 1f
|
|
- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]
|
|
- st1b z0.b, p0, [dest_ptr, #0, mul vl]
|
|
- st1b z1.b, p1, [dest_ptr, #1, mul vl]
|
|
- ret
|
|
-1: lsl tmp1, vector_length, 1 // vector_length * 2
|
|
- whilelo p2.b, tmp1, rest
|
|
- incb tmp1
|
|
- whilelo p3.b, tmp1, rest
|
|
- b.last 1f
|
|
- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]
|
|
- ld1b z2.b, p2/z, [src_ptr, #2, mul vl]
|
|
- ld1b z3.b, p3/z, [src_ptr, #3, mul vl]
|
|
- st1b z0.b, p0, [dest_ptr, #0, mul vl]
|
|
- st1b z1.b, p1, [dest_ptr, #1, mul vl]
|
|
- st1b z2.b, p2, [dest_ptr, #2, mul vl]
|
|
- st1b z3.b, p3, [dest_ptr, #3, mul vl]
|
|
+ add dst, dst, vlen8
|
|
+3: add n, n, vlen8
|
|
+
|
|
+ /* Move last 0-8 vectors. */
|
|
+L(last_bytes):
|
|
+ cmp n, vlen, lsl 1
|
|
+ b.hi 1f
|
|
+ whilelo p0.b, xzr, n
|
|
+ whilelo p1.b, vlen, n
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p1/z, [src, 1, mul vl]
|
|
+ st1b z0.b, p0, [dst, 0, mul vl]
|
|
+ st1b z1.b, p1, [dst, 1, mul vl]
|
|
ret
|
|
-1: lsl tmp1, vector_length, 2 // vector_length * 4
|
|
- whilelo p4.b, tmp1, rest
|
|
- incb tmp1
|
|
- whilelo p5.b, tmp1, rest
|
|
- incb tmp1
|
|
- whilelo p6.b, tmp1, rest
|
|
- incb tmp1
|
|
- whilelo p7.b, tmp1, rest
|
|
- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]
|
|
- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]
|
|
- ld1b z2.b, p2/z, [src_ptr, #2, mul vl]
|
|
- ld1b z3.b, p3/z, [src_ptr, #3, mul vl]
|
|
- ld1b z4.b, p4/z, [src_ptr, #4, mul vl]
|
|
- ld1b z5.b, p5/z, [src_ptr, #5, mul vl]
|
|
- ld1b z6.b, p6/z, [src_ptr, #6, mul vl]
|
|
- ld1b z7.b, p7/z, [src_ptr, #7, mul vl]
|
|
- st1b z0.b, p0, [dest_ptr, #0, mul vl]
|
|
- st1b z1.b, p1, [dest_ptr, #1, mul vl]
|
|
- st1b z2.b, p2, [dest_ptr, #2, mul vl]
|
|
- st1b z3.b, p3, [dest_ptr, #3, mul vl]
|
|
- st1b z4.b, p4, [dest_ptr, #4, mul vl]
|
|
- st1b z5.b, p5, [dest_ptr, #5, mul vl]
|
|
- st1b z6.b, p6, [dest_ptr, #6, mul vl]
|
|
- st1b z7.b, p7, [dest_ptr, #7, mul vl]
|
|
+
|
|
+ .p2align 4
|
|
+
|
|
+1: add srcend, src, n
|
|
+ add dstend, dst, n
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p0/z, [src, 1, mul vl]
|
|
+ ld1b z2.b, p0/z, [srcend, -2, mul vl]
|
|
+ ld1b z3.b, p0/z, [srcend, -1, mul vl]
|
|
+ cmp n, vlen, lsl 2
|
|
+ b.hi 1f
|
|
+
|
|
+ st1b z0.b, p0, [dst, 0, mul vl]
|
|
+ st1b z1.b, p0, [dst, 1, mul vl]
|
|
+ st1b z2.b, p0, [dstend, -2, mul vl]
|
|
+ st1b z3.b, p0, [dstend, -1, mul vl]
|
|
ret
|
|
|
|
-L(L2):
|
|
- // align dest address at CACHE_LINE_SIZE byte boundary
|
|
- mov tmp1, CACHE_LINE_SIZE
|
|
- ands tmp2, dest_ptr, CACHE_LINE_SIZE - 1
|
|
- // if cl_remainder == 0
|
|
- b.eq L(L2_dc_zva)
|
|
- sub cl_remainder, tmp1, tmp2
|
|
- // process remainder until the first CACHE_LINE_SIZE boundary
|
|
- whilelo p1.b, xzr, cl_remainder // keep p0.b all true
|
|
- whilelo p2.b, vector_length, cl_remainder
|
|
- b.last 1f
|
|
- ld1b z1.b, p1/z, [src_ptr, #0, mul vl]
|
|
- ld1b z2.b, p2/z, [src_ptr, #1, mul vl]
|
|
- st1b z1.b, p1, [dest_ptr, #0, mul vl]
|
|
- st1b z2.b, p2, [dest_ptr, #1, mul vl]
|
|
- b 2f
|
|
-1: lsl tmp1, vector_length, 1 // vector_length * 2
|
|
- whilelo p3.b, tmp1, cl_remainder
|
|
- incb tmp1
|
|
- whilelo p4.b, tmp1, cl_remainder
|
|
- ld1b z1.b, p1/z, [src_ptr, #0, mul vl]
|
|
- ld1b z2.b, p2/z, [src_ptr, #1, mul vl]
|
|
- ld1b z3.b, p3/z, [src_ptr, #2, mul vl]
|
|
- ld1b z4.b, p4/z, [src_ptr, #3, mul vl]
|
|
- st1b z1.b, p1, [dest_ptr, #0, mul vl]
|
|
- st1b z2.b, p2, [dest_ptr, #1, mul vl]
|
|
- st1b z3.b, p3, [dest_ptr, #2, mul vl]
|
|
- st1b z4.b, p4, [dest_ptr, #3, mul vl]
|
|
-2: add dest_ptr, dest_ptr, cl_remainder
|
|
- add src_ptr, src_ptr, cl_remainder
|
|
- sub rest, rest, cl_remainder
|
|
-
|
|
-L(L2_dc_zva):
|
|
- // zero fill
|
|
- and tmp1, dest, 0xffffffffffffff
|
|
- and tmp2, src, 0xffffffffffffff
|
|
- subs tmp1, tmp1, tmp2 // diff
|
|
- b.ge 1f
|
|
- neg tmp1, tmp1
|
|
-1: mov tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
|
|
- cmp tmp1, tmp3
|
|
- b.lo L(unroll8)
|
|
- mov tmp1, dest_ptr
|
|
- dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1
|
|
- // unroll
|
|
- ld1b_unroll8 // this line has to be after "b.lo L(unroll8)"
|
|
- add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
|
|
- sub rest, rest, CACHE_LINE_SIZE * 2
|
|
- mov tmp1, ZF_DIST
|
|
- .p2align 3
|
|
-1: stld1b_unroll4a
|
|
- add tmp2, dest_ptr, tmp1 // dest_ptr + ZF_DIST
|
|
- dc zva, tmp2
|
|
- stld1b_unroll4b
|
|
- add tmp2, tmp2, CACHE_LINE_SIZE
|
|
- dc zva, tmp2
|
|
- add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
|
|
- add src_ptr, src_ptr, CACHE_LINE_SIZE * 2
|
|
- sub rest, rest, CACHE_LINE_SIZE * 2
|
|
- cmp rest, tmp3 // ZF_DIST + CACHE_LINE_SIZE * 2
|
|
- b.ge 1b
|
|
- st1b_unroll8
|
|
- add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
|
|
- b L(unroll8)
|
|
+1: ld1b z4.b, p0/z, [src, 2, mul vl]
|
|
+ ld1b z5.b, p0/z, [src, 3, mul vl]
|
|
+ ld1b z6.b, p0/z, [srcend, -4, mul vl]
|
|
+ ld1b z7.b, p0/z, [srcend, -3, mul vl]
|
|
+ st1b z0.b, p0, [dst, 0, mul vl]
|
|
+ st1b z1.b, p0, [dst, 1, mul vl]
|
|
+ st1b z4.b, p0, [dst, 2, mul vl]
|
|
+ st1b z5.b, p0, [dst, 3, mul vl]
|
|
+ st1b z6.b, p0, [dstend, -4, mul vl]
|
|
+ st1b z7.b, p0, [dstend, -3, mul vl]
|
|
+ st1b z2.b, p0, [dstend, -2, mul vl]
|
|
+ st1b z3.b, p0, [dstend, -1, mul vl]
|
|
+ ret
|
|
|
|
END (MEMCPY)
|
|
libc_hidden_builtin_def (MEMCPY)
|
|
|
|
|
|
-ENTRY (MEMMOVE)
|
|
+ENTRY_ALIGN (MEMMOVE, 4)
|
|
|
|
PTR_ARG (0)
|
|
PTR_ARG (1)
|
|
SIZE_ARG (2)
|
|
|
|
- // remove tag address
|
|
- // dest has to be immutable because it is the return value
|
|
- // src has to be immutable because it is used in L(bwd_last)
|
|
- and tmp2, dest, 0xffffffffffffff // save dest_notag into tmp2
|
|
- and tmp3, src, 0xffffffffffffff // save src_notag intp tmp3
|
|
- cmp n, 0
|
|
- ccmp tmp2, tmp3, 4, ne
|
|
- b.ne 1f
|
|
+ /* Fast case for up to 2 vectors. */
|
|
+ cntb vlen
|
|
+ cmp n, vlen, lsl 1
|
|
+ b.hi 1f
|
|
+ whilelo p0.b, xzr, n
|
|
+ whilelo p1.b, vlen, n
|
|
+ ld1b z0.b, p0/z, [src, 0, mul vl]
|
|
+ ld1b z1.b, p1/z, [src, 1, mul vl]
|
|
+ st1b z0.b, p0, [dstin, 0, mul vl]
|
|
+ st1b z1.b, p1, [dstin, 1, mul vl]
|
|
+L(full_overlap):
|
|
ret
|
|
-1: cntb vector_length
|
|
- // shortcut for less than vector_length * 8
|
|
- // gives a free ptrue to p0.b for n >= vector_length
|
|
- // tmp2 and tmp3 should not be used in this macro to keep
|
|
- // notag addresses
|
|
- shortcut_for_small_size L(dispatch)
|
|
- // end of shortcut
|
|
-
|
|
-L(dispatch):
|
|
- // tmp2 = dest_notag, tmp3 = src_notag
|
|
- // diff = dest_notag - src_notag
|
|
- sub tmp1, tmp2, tmp3
|
|
- // if diff <= 0 || diff >= n then memcpy
|
|
- cmp tmp1, 0
|
|
- ccmp tmp1, n, 2, gt
|
|
- b.cs L(vl_agnostic)
|
|
-
|
|
-L(bwd_start):
|
|
- mov rest, n
|
|
- add dest_ptr, dest, n // dest_end
|
|
- add src_ptr, src, n // src_end
|
|
-
|
|
-L(bwd_unroll8): // unrolling and software pipeline
|
|
- lsl tmp1, vector_length, 3 // vector_length * 8
|
|
- .p2align 3
|
|
- cmp rest, tmp1
|
|
- b.cc L(bwd_last)
|
|
- sub src_ptr, src_ptr, tmp1
|
|
+
|
|
+ .p2align 4
|
|
+ /* Check for overlapping moves. Return if there is a full overlap.
|
|
+ Small moves up to 8 vectors use the overlap-safe copy_small code.
|
|
+ Non-overlapping or overlapping moves with dst < src use memcpy.
|
|
+ Overlapping moves with dst > src use a backward copy loop. */
|
|
+1: sub tmp, dstin, src
|
|
+ ands tmp, tmp, 0xffffffffffffff /* Clear special tag bits. */
|
|
+ b.eq L(full_overlap)
|
|
+ cmp n, vlen, lsl 3
|
|
+ b.ls L(copy_small)
|
|
+ cmp tmp, n
|
|
+ b.hs L(copy_large)
|
|
+
|
|
+ /* Align to vector length. */
|
|
+ add dst, dstin, n
|
|
+ sub tmp, vlen, 1
|
|
+ ands tmp, dst, tmp
|
|
+ csel tmp, tmp, vlen, ne
|
|
+ whilelo p1.b, xzr, tmp
|
|
+ sub n, n, tmp
|
|
+ ld1b z1.b, p1/z, [src, n]
|
|
+ st1b z1.b, p1, [dstin, n]
|
|
+ add src, src, n
|
|
+ add dst, dstin, n
|
|
+
|
|
+ ptrue p0.b
|
|
+ lsl vlen8, vlen, 3
|
|
+ subs n, n, vlen8
|
|
+ b.ls 3f
|
|
+ sub src, src, vlen8
|
|
ld1b_unroll8
|
|
- sub rest, rest, tmp1
|
|
- cmp rest, tmp1
|
|
- b.cc 2f
|
|
- .p2align 3
|
|
-1: sub src_ptr, src_ptr, tmp1
|
|
- sub dest_ptr, dest_ptr, tmp1
|
|
+ subs n, n, vlen8
|
|
+ b.ls 2f
|
|
+
|
|
+ .p2align 4
|
|
+ /* 8x unrolled and software pipelined backward copy loop. */
|
|
+1: sub src, src, vlen8
|
|
+ sub dst, dst, vlen8
|
|
stld1b_unroll8
|
|
- sub rest, rest, tmp1
|
|
- cmp rest, tmp1
|
|
- b.ge 1b
|
|
-2: sub dest_ptr, dest_ptr, tmp1
|
|
+ subs n, n, vlen8
|
|
+ b.hi 1b
|
|
+2: sub dst, dst, vlen8
|
|
st1b_unroll8
|
|
+3: add n, n, vlen8
|
|
|
|
-L(bwd_last):
|
|
- mov dest_ptr, dest
|
|
- mov src_ptr, src
|
|
- b L(last)
|
|
+ /* Adjust src/dst for last 0-8 vectors. */
|
|
+ sub src, src, n
|
|
+ mov dst, dstin
|
|
+ b L(last_bytes)
|
|
|
|
END (MEMMOVE)
|
|
libc_hidden_builtin_def (MEMMOVE)
|
|
--
|
|
2.31.1
|
|
|