glibc/SOURCES/glibc-rh2037416-8.patch

From b31bd11454fade731e5158b1aea40b133ae19926 Mon Sep 17 00:00:00 2001
From: Wilco Dijkstra <wdijkstr@arm.com>
Date: Thu, 2 Dec 2021 18:33:26 +0000
Subject: [PATCH] AArch64: Improve A64FX memcpy

v2 is a complete rewrite of the A64FX memcpy. Performance is improved
by streamlining the code, aligning all large copies and using a single
unrolled loop for all sizes. The code size for memcpy and memmove goes
down from 1796 bytes to 868 bytes. Performance is better in all cases:
bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33%
faster for large sizes, bench-memcpy-walk is 25% faster for small sizes
and 20% for the largest sizes. The geomean of all tests in bench-memcpy
is 5.1% faster, and total time is reduced by 4%.

Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>
---
 sysdeps/aarch64/multiarch/memcpy_a64fx.S | 546 ++++++++++-------------
 1 file changed, 225 insertions(+), 321 deletions(-)

diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
index ae7464e09f..0b306925e6 100644
--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S
+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S
@@ -28,20 +28,15 @@
  *
  */
 
-#define L2_SIZE		(8*1024*1024)/2	// L2 8MB/2
-#define CACHE_LINE_SIZE	256
-#define ZF_DIST		(CACHE_LINE_SIZE * 21)	// Zerofill distance
-#define dest		x0
-#define src		x1
-#define n		x2	// size
-#define tmp1		x3
-#define tmp2		x4
-#define tmp3		x5
-#define rest		x6
-#define dest_ptr	x7
-#define src_ptr		x8
-#define vector_length	x9
-#define cl_remainder	x10	// CACHE_LINE_SIZE remainder
+#define dstin	x0
+#define src	x1
+#define n	x2
+#define dst	x3
+#define dstend	x4
+#define srcend	x5
+#define tmp	x6
+#define vlen	x7
+#define vlen8	x8
 
 #if HAVE_AARCH64_SVE_ASM
 # if IS_IN (libc)
@@ -50,45 +45,37 @@
 
 	.arch armv8.2-a+sve
 
-	.macro dc_zva times
-	dc	zva, tmp1
-	add	tmp1, tmp1, CACHE_LINE_SIZE
-	.if \times-1
-	dc_zva "(\times-1)"
-	.endif
-	.endm
-
 	.macro ld1b_unroll8
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr, #7, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll4a
-	st1b	z0.b, p0,   [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0,   [dest_ptr, #1, mul vl]
-	ld1b	z0.b, p0/z, [src_ptr,  #0, mul vl]
-	ld1b	z1.b, p0/z, [src_ptr,  #1, mul vl]
-	st1b	z2.b, p0,   [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0,   [dest_ptr, #3, mul vl]
-	ld1b	z2.b, p0/z, [src_ptr,  #2, mul vl]
-	ld1b	z3.b, p0/z, [src_ptr,  #3, mul vl]
+	st1b	z0.b, p0,   [dst, 0, mul vl]
+	st1b	z1.b, p0,   [dst, 1, mul vl]
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	st1b	z2.b, p0,   [dst, 2, mul vl]
+	st1b	z3.b, p0,   [dst, 3, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
 	.endm
 
 	.macro stld1b_unroll4b
-	st1b	z4.b, p0,   [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0,   [dest_ptr, #5, mul vl]
-	ld1b	z4.b, p0/z, [src_ptr,  #4, mul vl]
-	ld1b	z5.b, p0/z, [src_ptr,  #5, mul vl]
-	st1b	z6.b, p0,   [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0,   [dest_ptr, #7, mul vl]
-	ld1b	z6.b, p0/z, [src_ptr,  #6, mul vl]
-	ld1b	z7.b, p0/z, [src_ptr,  #7, mul vl]
+	st1b	z4.b, p0,   [dst, 4, mul vl]
+	st1b	z5.b, p0,   [dst, 5, mul vl]
+	ld1b	z4.b, p0/z, [src, 4, mul vl]
+	ld1b	z5.b, p0/z, [src, 5, mul vl]
+	st1b	z6.b, p0,   [dst, 6, mul vl]
+	st1b	z7.b, p0,   [dst, 7, mul vl]
+	ld1b	z6.b, p0/z, [src, 6, mul vl]
+	ld1b	z7.b, p0/z, [src, 7, mul vl]
 	.endm
 
 	.macro stld1b_unroll8
@@ -97,87 +84,18 @@
 	.endm
 
 	.macro st1b_unroll8
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p0, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p0, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p0, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p0, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p0, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p0, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p0, [dest_ptr, #7, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dst, 2, mul vl]
+	st1b	z3.b, p0, [dst, 3, mul vl]
+	st1b	z4.b, p0, [dst, 4, mul vl]
+	st1b	z5.b, p0, [dst, 5, mul vl]
+	st1b	z6.b, p0, [dst, 6, mul vl]
+	st1b	z7.b, p0, [dst, 7, mul vl]
 	.endm
 
-	.macro shortcut_for_small_size exit
-	// if rest <= vector_length * 2
-	whilelo	p0.b, xzr, n
-	whilelo	p1.b, vector_length, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	ret
-1:	// if rest > vector_length * 8
-	cmp	n, vector_length, lsl 3 // vector_length * 8
-	b.hi	\exit
-	// if rest <= vector_length * 4
-	lsl	tmp1, vector_length, 1  // vector_length * 2
-	whilelo	p2.b, tmp1, n
-	incb	tmp1
-	whilelo	p3.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	ret
-1:	// if rest <= vector_length * 8
-	lsl	tmp1, vector_length, 2  // vector_length * 4
-	whilelo	p4.b, tmp1, n
-	incb	tmp1
-	whilelo	p5.b, tmp1, n
-	b.last	1f
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	incb	tmp1			// vector_length * 5
-	incb	tmp1			// vector_length * 6
-	whilelo	p6.b, tmp1, n
-	incb	tmp1
-	whilelo	p7.b, tmp1, n
-	ld1b	z0.b, p0/z, [src, #0, mul vl]
-	ld1b	z1.b, p1/z, [src, #1, mul vl]
-	ld1b	z2.b, p2/z, [src, #2, mul vl]
-	ld1b	z3.b, p3/z, [src, #3, mul vl]
-	ld1b	z4.b, p4/z, [src, #4, mul vl]
-	ld1b	z5.b, p5/z, [src, #5, mul vl]
-	ld1b	z6.b, p6/z, [src, #6, mul vl]
-	ld1b	z7.b, p7/z, [src, #7, mul vl]
-	st1b	z0.b, p0, [dest, #0, mul vl]
-	st1b	z1.b, p1, [dest, #1, mul vl]
-	st1b	z2.b, p2, [dest, #2, mul vl]
-	st1b	z3.b, p3, [dest, #3, mul vl]
-	st1b	z4.b, p4, [dest, #4, mul vl]
-	st1b	z5.b, p5, [dest, #5, mul vl]
-	st1b	z6.b, p6, [dest, #6, mul vl]
-	st1b	z7.b, p7, [dest, #7, mul vl]
-	ret
-	.endm
+#undef BTI_C
+#define BTI_C
 
 ENTRY (MEMCPY)
 
@@ -185,223 +103,209 @@ ENTRY (MEMCPY)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-L(memcpy):
-	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	shortcut_for_small_size L(vl_agnostic)
-	// end of shortcut
-
-L(vl_agnostic): // VL Agnostic
-	mov	rest, n
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	// if rest >= L2_SIZE && vector_length == 64 then L(L2)
-	mov	tmp1, 64
-	cmp	rest, L2_SIZE
-	ccmp	vector_length, tmp1, 0, cs
-	b.eq	L(L2)
-
-L(unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	 rest, tmp1
-	b.cc	L(last)
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	L(copy_small)
+	whilelo	p1.b, vlen, n
+	whilelo	p0.b, xzr, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+	ret
+
+	.p2align 4
+
+L(copy_small):
+	cmp	n, vlen, lsl 3
+	b.hi	L(copy_large)
+	add	dstend, dstin, n
+	add	srcend, src, n
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	/* Copy 2-4 vectors.  */
+	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* Copy 4-8 vectors.  */
+1:	ptrue	p0.b
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [src, 2, mul vl]
+	ld1b	z3.b, p0/z, [src, 3, mul vl]
+	ld1b	z4.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z5.b, p0/z, [srcend, -3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p0, [dstin, 1, mul vl]
+	st1b	z2.b, p0, [dstin, 2, mul vl]
+	st1b	z3.b, p0, [dstin, 3, mul vl]
+	st1b	z4.b, p0, [dstend, -4, mul vl]
+	st1b	z5.b, p0, [dstend, -3, mul vl]
+	st1b	z6.b, p0, [dstend, -2, mul vl]
+	st1b	z7.b, p0, [dstend, -1, mul vl]
+	ret
+
+	.p2align 4
+	/* At least 8 vectors - always align to vector length for
+	   higher and consistent write performance.  */
+L(copy_large):
+	sub	tmp, vlen, 1
+	and	tmp, dstin, tmp
+	sub	tmp, vlen, tmp
+	whilelo	p1.b, xzr, tmp
+	ld1b	z1.b, p1/z, [src]
+	st1b	z1.b, p1, [dstin]
+	add	dst, dstin, tmp
+	add	src, src, tmp
+	sub	n, n, tmp
+	ptrue	p0.b
+
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
 	ld1b_unroll8
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined loop.  */
 1:	stld1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-	add	src_ptr, src_ptr, tmp1
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
+	add	dst, dst, vlen8
+	add	src, src, vlen8
+	subs	n, n, vlen8
+	b.hi	1b
 2:	st1b_unroll8
-	add	dest_ptr, dest_ptr, tmp1
-
-	.p2align 3
-L(last):
-	whilelo	p0.b, xzr, rest
-	whilelo	p1.b, vector_length, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	ret
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p2.b, tmp1, rest
-	incb	tmp1
-	whilelo	p3.b, tmp1, rest
-	b.last	1f
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
+	add	dst, dst, vlen8
+3:	add	n, n, vlen8
+
+	/* Move last 0-8 vectors.  */
+L(last_bytes):
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p1, [dst, 1, mul vl]
 	ret
-1:	lsl	tmp1, vector_length, 2	// vector_length * 4
-	whilelo	p4.b, tmp1, rest
-	incb	tmp1
-	whilelo	p5.b, tmp1, rest
-	incb	tmp1
-	whilelo	p6.b, tmp1, rest
-	incb	tmp1
-	whilelo	p7.b, tmp1, rest
-	ld1b	z0.b, p0/z, [src_ptr, #0, mul vl]
-	ld1b	z1.b, p1/z, [src_ptr, #1, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #2, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #3, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #4, mul vl]
-	ld1b	z5.b, p5/z, [src_ptr, #5, mul vl]
-	ld1b	z6.b, p6/z, [src_ptr, #6, mul vl]
-	ld1b	z7.b, p7/z, [src_ptr, #7, mul vl]
-	st1b	z0.b, p0, [dest_ptr, #0, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #1, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #2, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #3, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #4, mul vl]
-	st1b	z5.b, p5, [dest_ptr, #5, mul vl]
-	st1b	z6.b, p6, [dest_ptr, #6, mul vl]
-	st1b	z7.b, p7, [dest_ptr, #7, mul vl]
+
+	.p2align 4
+
+1:	add	srcend, src, n
+	add	dstend, dst, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p0/z, [src, 1, mul vl]
+	ld1b	z2.b, p0/z, [srcend, -2, mul vl]
+	ld1b	z3.b, p0/z, [srcend, -1, mul vl]
+	cmp	n, vlen, lsl 2
+	b.hi	1f
+
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
 	ret
 
-L(L2):
-	// align dest address at CACHE_LINE_SIZE byte boundary
-	mov	tmp1, CACHE_LINE_SIZE
-	ands	tmp2, dest_ptr, CACHE_LINE_SIZE - 1
-	// if cl_remainder == 0
-	b.eq	L(L2_dc_zva)
-	sub	cl_remainder, tmp1, tmp2
-	// process remainder until the first CACHE_LINE_SIZE boundary
-	whilelo	p1.b, xzr, cl_remainder	// keep p0.b all true
-	whilelo	p2.b, vector_length, cl_remainder
-	b.last	1f
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	b	2f
-1:	lsl	tmp1, vector_length, 1	// vector_length * 2
-	whilelo	p3.b, tmp1, cl_remainder
-	incb	tmp1
-	whilelo	p4.b, tmp1, cl_remainder
-	ld1b	z1.b, p1/z, [src_ptr, #0, mul vl]
-	ld1b	z2.b, p2/z, [src_ptr, #1, mul vl]
-	ld1b	z3.b, p3/z, [src_ptr, #2, mul vl]
-	ld1b	z4.b, p4/z, [src_ptr, #3, mul vl]
-	st1b	z1.b, p1, [dest_ptr, #0, mul vl]
-	st1b	z2.b, p2, [dest_ptr, #1, mul vl]
-	st1b	z3.b, p3, [dest_ptr, #2, mul vl]
-	st1b	z4.b, p4, [dest_ptr, #3, mul vl]
-2:	add	dest_ptr, dest_ptr, cl_remainder
-	add	src_ptr, src_ptr, cl_remainder
-	sub	rest, rest, cl_remainder
-
-L(L2_dc_zva):
-	// zero fill
-	and	tmp1, dest, 0xffffffffffffff
-	and	tmp2, src, 0xffffffffffffff
-	subs	tmp1, tmp1, tmp2	// diff
-	b.ge	1f
-	neg	tmp1, tmp1
-1:	mov	tmp3, ZF_DIST + CACHE_LINE_SIZE * 2
-	cmp	tmp1, tmp3
-	b.lo	L(unroll8)
-	mov	tmp1, dest_ptr
-	dc_zva	(ZF_DIST / CACHE_LINE_SIZE) - 1
-	// unroll
-	ld1b_unroll8	// this line has to be after "b.lo L(unroll8)"
-	add	 src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	 rest, rest, CACHE_LINE_SIZE * 2
-	mov	 tmp1, ZF_DIST
-	.p2align 3
-1:	stld1b_unroll4a
-	add	tmp2, dest_ptr, tmp1	// dest_ptr + ZF_DIST
-	dc	zva, tmp2
-	stld1b_unroll4b
-	add	tmp2, tmp2, CACHE_LINE_SIZE
-	dc	zva, tmp2
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	add	src_ptr, src_ptr, CACHE_LINE_SIZE * 2
-	sub	rest, rest, CACHE_LINE_SIZE * 2
-	cmp	rest, tmp3	// ZF_DIST + CACHE_LINE_SIZE * 2
-	b.ge	1b
-	st1b_unroll8
-	add	dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2
-	b	L(unroll8)
+1:	ld1b	z4.b, p0/z, [src, 2, mul vl]
+	ld1b	z5.b, p0/z, [src, 3, mul vl]
+	ld1b	z6.b, p0/z, [srcend, -4, mul vl]
+	ld1b	z7.b, p0/z, [srcend, -3, mul vl]
+	st1b	z0.b, p0, [dst, 0, mul vl]
+	st1b	z1.b, p0, [dst, 1, mul vl]
+	st1b	z4.b, p0, [dst, 2, mul vl]
+	st1b	z5.b, p0, [dst, 3, mul vl]
+	st1b	z6.b, p0, [dstend, -4, mul vl]
+	st1b	z7.b, p0, [dstend, -3, mul vl]
+	st1b	z2.b, p0, [dstend, -2, mul vl]
+	st1b	z3.b, p0, [dstend, -1, mul vl]
+	ret
 
 END (MEMCPY)
 libc_hidden_builtin_def (MEMCPY)
 
 
-ENTRY (MEMMOVE)
+ENTRY_ALIGN (MEMMOVE, 4)
 
 	PTR_ARG (0)
 	PTR_ARG (1)
 	SIZE_ARG (2)
 
-	// remove tag address
-	// dest has to be immutable because it is the return value
-	// src has to be immutable because it is used in L(bwd_last)
-	and	tmp2, dest, 0xffffffffffffff	// save dest_notag into tmp2
-	and	tmp3, src, 0xffffffffffffff	// save src_notag intp tmp3
-	cmp	n, 0
-	ccmp	tmp2, tmp3, 4, ne
-	b.ne	1f
+	/* Fast case for up to 2 vectors.  */
+	cntb	vlen
+	cmp	n, vlen, lsl 1
+	b.hi	1f
+	whilelo	p0.b, xzr, n
+	whilelo	p1.b, vlen, n
+	ld1b	z0.b, p0/z, [src, 0, mul vl]
+	ld1b	z1.b, p1/z, [src, 1, mul vl]
+	st1b	z0.b, p0, [dstin, 0, mul vl]
+	st1b	z1.b, p1, [dstin, 1, mul vl]
+L(full_overlap):
 	ret
-1:	cntb	vector_length
-	// shortcut for less than vector_length * 8
-	// gives a free ptrue to p0.b for n >= vector_length
-	// tmp2 and tmp3 should not be used in this macro to keep
-	// notag addresses
-	shortcut_for_small_size L(dispatch)
-	// end of shortcut
-
-L(dispatch):
-	// tmp2 = dest_notag, tmp3 = src_notag
-	// diff = dest_notag - src_notag
-	sub	tmp1, tmp2, tmp3
-	// if diff <= 0 || diff >= n then memcpy
-	cmp	tmp1, 0
-	ccmp	tmp1, n, 2, gt
-	b.cs	L(vl_agnostic)
-
-L(bwd_start):
-	mov	rest, n
-	add	dest_ptr, dest, n	// dest_end
-	add	src_ptr, src, n		// src_end
-
-L(bwd_unroll8): // unrolling and software pipeline
-	lsl	tmp1, vector_length, 3	// vector_length * 8
-	.p2align 3
-	cmp	rest, tmp1
-	b.cc	L(bwd_last)
-	sub	src_ptr, src_ptr, tmp1
+
+	.p2align 4
+	/* Check for overlapping moves. Return if there is a full overlap.
+	   Small moves up to 8 vectors use the overlap-safe copy_small code.
+	   Non-overlapping or overlapping moves with dst < src use memcpy.
+	   Overlapping moves with dst > src use a backward copy loop.  */
+1:	sub	tmp, dstin, src
+	ands	tmp, tmp, 0xffffffffffffff	/* Clear special tag bits.  */
+	b.eq	L(full_overlap)
+	cmp	n, vlen, lsl 3
+	b.ls	L(copy_small)
+	cmp	tmp, n
+	b.hs	L(copy_large)
+
+	/* Align to vector length.  */
+	add	dst, dstin, n
+	sub	tmp, vlen, 1
+	ands	tmp, dst, tmp
+	csel	tmp, tmp, vlen, ne
+	whilelo	p1.b, xzr, tmp
+	sub	n, n, tmp
+	ld1b	z1.b, p1/z, [src, n]
+	st1b	z1.b, p1, [dstin, n]
+	add	src, src, n
+	add	dst, dstin, n
+
+	ptrue	p0.b
+	lsl	vlen8, vlen, 3
+	subs	n, n, vlen8
+	b.ls	3f
+	sub	src, src, vlen8
 	ld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.cc	2f
-	.p2align 3
-1:	sub	src_ptr, src_ptr, tmp1
-	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.ls	2f
+
+	.p2align 4
+	/* 8x unrolled and software pipelined backward copy loop.  */
+1:	sub	src, src, vlen8
+	sub	dst, dst, vlen8
 	stld1b_unroll8
-	sub	rest, rest, tmp1
-	cmp	rest, tmp1
-	b.ge	1b
-2:	sub	dest_ptr, dest_ptr, tmp1
+	subs	n, n, vlen8
+	b.hi	1b
+2:	sub	dst, dst, vlen8
 	st1b_unroll8
+3:	add	n, n, vlen8
 
-L(bwd_last):
-	mov	dest_ptr, dest
-	mov	src_ptr, src
-	b	L(last)
+	/* Adjust src/dst for last 0-8 vectors.  */
+	sub	src, src, n
+	mov	dst, dstin
+	b	L(last_bytes)
 
 END (MEMMOVE)
 libc_hidden_builtin_def (MEMMOVE)
-- 
2.31.1
import glibc-2.28-211.el8 2022-11-08 06:58:46 +00:00
			`From b31bd11454fade731e5158b1aea40b133ae19926 Mon Sep 17 00:00:00 2001`
			`From: Wilco Dijkstra <wdijkstr@arm.com>`
			`Date: Thu, 2 Dec 2021 18:33:26 +0000`
			`Subject: [PATCH] AArch64: Improve A64FX memcpy`

			`v2 is a complete rewrite of the A64FX memcpy. Performance is improved`
			`by streamlining the code, aligning all large copies and using a single`
			`unrolled loop for all sizes. The code size for memcpy and memmove goes`
			`down from 1796 bytes to 868 bytes. Performance is better in all cases:`
			`bench-memcpy-random is 2.3% faster overall, bench-memcpy-large is ~33%`
			`faster for large sizes, bench-memcpy-walk is 25% faster for small sizes`
			`and 20% for the largest sizes. The geomean of all tests in bench-memcpy`
			`is 5.1% faster, and total time is reduced by 4%.`

			`Reviewed-by: Szabolcs Nagy <szabolcs.nagy@arm.com>`
			`---`
			`sysdeps/aarch64/multiarch/memcpy_a64fx.S | 546 ++++++++++-------------`
			`1 file changed, 225 insertions(+), 321 deletions(-)`

			`diff --git a/sysdeps/aarch64/multiarch/memcpy_a64fx.S b/sysdeps/aarch64/multiarch/memcpy_a64fx.S`
			`index ae7464e09f..0b306925e6 100644`
			`--- a/sysdeps/aarch64/multiarch/memcpy_a64fx.S`
			`+++ b/sysdeps/aarch64/multiarch/memcpy_a64fx.S`
			`@@ -28,20 +28,15 @@`
			`*`
			`*/`

			`-#define L2_SIZE (8*1024*1024)/2 // L2 8MB/2`
			`-#define CACHE_LINE_SIZE 256`
			`-#define ZF_DIST (CACHE_LINE_SIZE * 21) // Zerofill distance`
			`-#define dest x0`
			`-#define src x1`
			`-#define n x2 // size`
			`-#define tmp1 x3`
			`-#define tmp2 x4`
			`-#define tmp3 x5`
			`-#define rest x6`
			`-#define dest_ptr x7`
			`-#define src_ptr x8`
			`-#define vector_length x9`
			`-#define cl_remainder x10 // CACHE_LINE_SIZE remainder`
			`+#define dstin x0`
			`+#define src x1`
			`+#define n x2`
			`+#define dst x3`
			`+#define dstend x4`
			`+#define srcend x5`
			`+#define tmp x6`
			`+#define vlen x7`
			`+#define vlen8 x8`

			`#if HAVE_AARCH64_SVE_ASM`
			`# if IS_IN (libc)`
			`@@ -50,45 +45,37 @@`

			`.arch armv8.2-a+sve`

			`- .macro dc_zva times`
			`- dc zva, tmp1`
			`- add tmp1, tmp1, CACHE_LINE_SIZE`
			`- .if \times-1`
			`- dc_zva "(\times-1)"`
			`- .endif`
			`- .endm`
			`-`
			`.macro ld1b_unroll8`
			`- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]`
			`- ld1b z1.b, p0/z, [src_ptr, #1, mul vl]`
			`- ld1b z2.b, p0/z, [src_ptr, #2, mul vl]`
			`- ld1b z3.b, p0/z, [src_ptr, #3, mul vl]`
			`- ld1b z4.b, p0/z, [src_ptr, #4, mul vl]`
			`- ld1b z5.b, p0/z, [src_ptr, #5, mul vl]`
			`- ld1b z6.b, p0/z, [src_ptr, #6, mul vl]`
			`- ld1b z7.b, p0/z, [src_ptr, #7, mul vl]`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p0/z, [src, 1, mul vl]`
			`+ ld1b z2.b, p0/z, [src, 2, mul vl]`
			`+ ld1b z3.b, p0/z, [src, 3, mul vl]`
			`+ ld1b z4.b, p0/z, [src, 4, mul vl]`
			`+ ld1b z5.b, p0/z, [src, 5, mul vl]`
			`+ ld1b z6.b, p0/z, [src, 6, mul vl]`
			`+ ld1b z7.b, p0/z, [src, 7, mul vl]`
			`.endm`

			`.macro stld1b_unroll4a`
			`- st1b z0.b, p0, [dest_ptr, #0, mul vl]`
			`- st1b z1.b, p0, [dest_ptr, #1, mul vl]`
			`- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]`
			`- ld1b z1.b, p0/z, [src_ptr, #1, mul vl]`
			`- st1b z2.b, p0, [dest_ptr, #2, mul vl]`
			`- st1b z3.b, p0, [dest_ptr, #3, mul vl]`
			`- ld1b z2.b, p0/z, [src_ptr, #2, mul vl]`
			`- ld1b z3.b, p0/z, [src_ptr, #3, mul vl]`
			`+ st1b z0.b, p0, [dst, 0, mul vl]`
			`+ st1b z1.b, p0, [dst, 1, mul vl]`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p0/z, [src, 1, mul vl]`
			`+ st1b z2.b, p0, [dst, 2, mul vl]`
			`+ st1b z3.b, p0, [dst, 3, mul vl]`
			`+ ld1b z2.b, p0/z, [src, 2, mul vl]`
			`+ ld1b z3.b, p0/z, [src, 3, mul vl]`
			`.endm`

			`.macro stld1b_unroll4b`
			`- st1b z4.b, p0, [dest_ptr, #4, mul vl]`
			`- st1b z5.b, p0, [dest_ptr, #5, mul vl]`
			`- ld1b z4.b, p0/z, [src_ptr, #4, mul vl]`
			`- ld1b z5.b, p0/z, [src_ptr, #5, mul vl]`
			`- st1b z6.b, p0, [dest_ptr, #6, mul vl]`
			`- st1b z7.b, p0, [dest_ptr, #7, mul vl]`
			`- ld1b z6.b, p0/z, [src_ptr, #6, mul vl]`
			`- ld1b z7.b, p0/z, [src_ptr, #7, mul vl]`
			`+ st1b z4.b, p0, [dst, 4, mul vl]`
			`+ st1b z5.b, p0, [dst, 5, mul vl]`
			`+ ld1b z4.b, p0/z, [src, 4, mul vl]`
			`+ ld1b z5.b, p0/z, [src, 5, mul vl]`
			`+ st1b z6.b, p0, [dst, 6, mul vl]`
			`+ st1b z7.b, p0, [dst, 7, mul vl]`
			`+ ld1b z6.b, p0/z, [src, 6, mul vl]`
			`+ ld1b z7.b, p0/z, [src, 7, mul vl]`
			`.endm`

			`.macro stld1b_unroll8`
			`@@ -97,87 +84,18 @@`
			`.endm`

			`.macro st1b_unroll8`
			`- st1b z0.b, p0, [dest_ptr, #0, mul vl]`
			`- st1b z1.b, p0, [dest_ptr, #1, mul vl]`
			`- st1b z2.b, p0, [dest_ptr, #2, mul vl]`
			`- st1b z3.b, p0, [dest_ptr, #3, mul vl]`
			`- st1b z4.b, p0, [dest_ptr, #4, mul vl]`
			`- st1b z5.b, p0, [dest_ptr, #5, mul vl]`
			`- st1b z6.b, p0, [dest_ptr, #6, mul vl]`
			`- st1b z7.b, p0, [dest_ptr, #7, mul vl]`
			`+ st1b z0.b, p0, [dst, 0, mul vl]`
			`+ st1b z1.b, p0, [dst, 1, mul vl]`
			`+ st1b z2.b, p0, [dst, 2, mul vl]`
			`+ st1b z3.b, p0, [dst, 3, mul vl]`
			`+ st1b z4.b, p0, [dst, 4, mul vl]`
			`+ st1b z5.b, p0, [dst, 5, mul vl]`
			`+ st1b z6.b, p0, [dst, 6, mul vl]`
			`+ st1b z7.b, p0, [dst, 7, mul vl]`
			`.endm`

			`- .macro shortcut_for_small_size exit`
			`- // if rest <= vector_length * 2`
			`- whilelo p0.b, xzr, n`
			`- whilelo p1.b, vector_length, n`
			`- b.last 1f`
			`- ld1b z0.b, p0/z, [src, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src, #1, mul vl]`
			`- st1b z0.b, p0, [dest, #0, mul vl]`
			`- st1b z1.b, p1, [dest, #1, mul vl]`
			`- ret`
			`-1: // if rest > vector_length * 8`
			`- cmp n, vector_length, lsl 3 // vector_length * 8`
			`- b.hi \exit`
			`- // if rest <= vector_length * 4`
			`- lsl tmp1, vector_length, 1 // vector_length * 2`
			`- whilelo p2.b, tmp1, n`
			`- incb tmp1`
			`- whilelo p3.b, tmp1, n`
			`- b.last 1f`
			`- ld1b z0.b, p0/z, [src, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src, #1, mul vl]`
			`- ld1b z2.b, p2/z, [src, #2, mul vl]`
			`- ld1b z3.b, p3/z, [src, #3, mul vl]`
			`- st1b z0.b, p0, [dest, #0, mul vl]`
			`- st1b z1.b, p1, [dest, #1, mul vl]`
			`- st1b z2.b, p2, [dest, #2, mul vl]`
			`- st1b z3.b, p3, [dest, #3, mul vl]`
			`- ret`
			`-1: // if rest <= vector_length * 8`
			`- lsl tmp1, vector_length, 2 // vector_length * 4`
			`- whilelo p4.b, tmp1, n`
			`- incb tmp1`
			`- whilelo p5.b, tmp1, n`
			`- b.last 1f`
			`- ld1b z0.b, p0/z, [src, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src, #1, mul vl]`
			`- ld1b z2.b, p2/z, [src, #2, mul vl]`
			`- ld1b z3.b, p3/z, [src, #3, mul vl]`
			`- ld1b z4.b, p4/z, [src, #4, mul vl]`
			`- ld1b z5.b, p5/z, [src, #5, mul vl]`
			`- st1b z0.b, p0, [dest, #0, mul vl]`
			`- st1b z1.b, p1, [dest, #1, mul vl]`
			`- st1b z2.b, p2, [dest, #2, mul vl]`
			`- st1b z3.b, p3, [dest, #3, mul vl]`
			`- st1b z4.b, p4, [dest, #4, mul vl]`
			`- st1b z5.b, p5, [dest, #5, mul vl]`
			`- ret`
			`-1: lsl tmp1, vector_length, 2 // vector_length * 4`
			`- incb tmp1 // vector_length * 5`
			`- incb tmp1 // vector_length * 6`
			`- whilelo p6.b, tmp1, n`
			`- incb tmp1`
			`- whilelo p7.b, tmp1, n`
			`- ld1b z0.b, p0/z, [src, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src, #1, mul vl]`
			`- ld1b z2.b, p2/z, [src, #2, mul vl]`
			`- ld1b z3.b, p3/z, [src, #3, mul vl]`
			`- ld1b z4.b, p4/z, [src, #4, mul vl]`
			`- ld1b z5.b, p5/z, [src, #5, mul vl]`
			`- ld1b z6.b, p6/z, [src, #6, mul vl]`
			`- ld1b z7.b, p7/z, [src, #7, mul vl]`
			`- st1b z0.b, p0, [dest, #0, mul vl]`
			`- st1b z1.b, p1, [dest, #1, mul vl]`
			`- st1b z2.b, p2, [dest, #2, mul vl]`
			`- st1b z3.b, p3, [dest, #3, mul vl]`
			`- st1b z4.b, p4, [dest, #4, mul vl]`
			`- st1b z5.b, p5, [dest, #5, mul vl]`
			`- st1b z6.b, p6, [dest, #6, mul vl]`
			`- st1b z7.b, p7, [dest, #7, mul vl]`
			`- ret`
			`- .endm`
			`+#undef BTI_C`
			`+#define BTI_C`

			`ENTRY (MEMCPY)`

			`@@ -185,223 +103,209 @@ ENTRY (MEMCPY)`
			`PTR_ARG (1)`
			`SIZE_ARG (2)`

			`-L(memcpy):`
			`- cntb vector_length`
			`- // shortcut for less than vector_length * 8`
			`- // gives a free ptrue to p0.b for n >= vector_length`
			`- shortcut_for_small_size L(vl_agnostic)`
			`- // end of shortcut`
			`-`
			`-L(vl_agnostic): // VL Agnostic`
			`- mov rest, n`
			`- mov dest_ptr, dest`
			`- mov src_ptr, src`
			`- // if rest >= L2_SIZE && vector_length == 64 then L(L2)`
			`- mov tmp1, 64`
			`- cmp rest, L2_SIZE`
			`- ccmp vector_length, tmp1, 0, cs`
			`- b.eq L(L2)`
			`-`
			`-L(unroll8): // unrolling and software pipeline`
			`- lsl tmp1, vector_length, 3 // vector_length * 8`
			`- .p2align 3`
			`- cmp rest, tmp1`
			`- b.cc L(last)`
			`+ cntb vlen`
			`+ cmp n, vlen, lsl 1`
			`+ b.hi L(copy_small)`
			`+ whilelo p1.b, vlen, n`
			`+ whilelo p0.b, xzr, n`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p1/z, [src, 1, mul vl]`
			`+ st1b z0.b, p0, [dstin, 0, mul vl]`
			`+ st1b z1.b, p1, [dstin, 1, mul vl]`
			`+ ret`
			`+`
			`+ .p2align 4`
			`+`
			`+L(copy_small):`
			`+ cmp n, vlen, lsl 3`
			`+ b.hi L(copy_large)`
			`+ add dstend, dstin, n`
			`+ add srcend, src, n`
			`+ cmp n, vlen, lsl 2`
			`+ b.hi 1f`
			`+`
			`+ /* Copy 2-4 vectors. */`
			`+ ptrue p0.b`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p0/z, [src, 1, mul vl]`
			`+ ld1b z2.b, p0/z, [srcend, -2, mul vl]`
			`+ ld1b z3.b, p0/z, [srcend, -1, mul vl]`
			`+ st1b z0.b, p0, [dstin, 0, mul vl]`
			`+ st1b z1.b, p0, [dstin, 1, mul vl]`
			`+ st1b z2.b, p0, [dstend, -2, mul vl]`
			`+ st1b z3.b, p0, [dstend, -1, mul vl]`
			`+ ret`
			`+`
			`+ .p2align 4`
			`+ /* Copy 4-8 vectors. */`
			`+1: ptrue p0.b`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p0/z, [src, 1, mul vl]`
			`+ ld1b z2.b, p0/z, [src, 2, mul vl]`
			`+ ld1b z3.b, p0/z, [src, 3, mul vl]`
			`+ ld1b z4.b, p0/z, [srcend, -4, mul vl]`
			`+ ld1b z5.b, p0/z, [srcend, -3, mul vl]`
			`+ ld1b z6.b, p0/z, [srcend, -2, mul vl]`
			`+ ld1b z7.b, p0/z, [srcend, -1, mul vl]`
			`+ st1b z0.b, p0, [dstin, 0, mul vl]`
			`+ st1b z1.b, p0, [dstin, 1, mul vl]`
			`+ st1b z2.b, p0, [dstin, 2, mul vl]`
			`+ st1b z3.b, p0, [dstin, 3, mul vl]`
			`+ st1b z4.b, p0, [dstend, -4, mul vl]`
			`+ st1b z5.b, p0, [dstend, -3, mul vl]`
			`+ st1b z6.b, p0, [dstend, -2, mul vl]`
			`+ st1b z7.b, p0, [dstend, -1, mul vl]`
			`+ ret`
			`+`
			`+ .p2align 4`
			`+ /* At least 8 vectors - always align to vector length for`
			`+ higher and consistent write performance. */`
			`+L(copy_large):`
			`+ sub tmp, vlen, 1`
			`+ and tmp, dstin, tmp`
			`+ sub tmp, vlen, tmp`
			`+ whilelo p1.b, xzr, tmp`
			`+ ld1b z1.b, p1/z, [src]`
			`+ st1b z1.b, p1, [dstin]`
			`+ add dst, dstin, tmp`
			`+ add src, src, tmp`
			`+ sub n, n, tmp`
			`+ ptrue p0.b`
			`+`
			`+ lsl vlen8, vlen, 3`
			`+ subs n, n, vlen8`
			`+ b.ls 3f`
			`ld1b_unroll8`
			`- add src_ptr, src_ptr, tmp1`
			`- sub rest, rest, tmp1`
			`- cmp rest, tmp1`
			`- b.cc 2f`
			`- .p2align 3`
			`+ add src, src, vlen8`
			`+ subs n, n, vlen8`
			`+ b.ls 2f`
			`+`
			`+ .p2align 4`
			`+ /* 8x unrolled and software pipelined loop. */`
			`1: stld1b_unroll8`
			`- add dest_ptr, dest_ptr, tmp1`
			`- add src_ptr, src_ptr, tmp1`
			`- sub rest, rest, tmp1`
			`- cmp rest, tmp1`
			`- b.ge 1b`
			`+ add dst, dst, vlen8`
			`+ add src, src, vlen8`
			`+ subs n, n, vlen8`
			`+ b.hi 1b`
			`2: st1b_unroll8`
			`- add dest_ptr, dest_ptr, tmp1`
			`-`
			`- .p2align 3`
			`-L(last):`
			`- whilelo p0.b, xzr, rest`
			`- whilelo p1.b, vector_length, rest`
			`- b.last 1f`
			`- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]`
			`- st1b z0.b, p0, [dest_ptr, #0, mul vl]`
			`- st1b z1.b, p1, [dest_ptr, #1, mul vl]`
			`- ret`
			`-1: lsl tmp1, vector_length, 1 // vector_length * 2`
			`- whilelo p2.b, tmp1, rest`
			`- incb tmp1`
			`- whilelo p3.b, tmp1, rest`
			`- b.last 1f`
			`- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]`
			`- ld1b z2.b, p2/z, [src_ptr, #2, mul vl]`
			`- ld1b z3.b, p3/z, [src_ptr, #3, mul vl]`
			`- st1b z0.b, p0, [dest_ptr, #0, mul vl]`
			`- st1b z1.b, p1, [dest_ptr, #1, mul vl]`
			`- st1b z2.b, p2, [dest_ptr, #2, mul vl]`
			`- st1b z3.b, p3, [dest_ptr, #3, mul vl]`
			`+ add dst, dst, vlen8`
			`+3: add n, n, vlen8`
			`+`
			`+ /* Move last 0-8 vectors. */`
			`+L(last_bytes):`
			`+ cmp n, vlen, lsl 1`
			`+ b.hi 1f`
			`+ whilelo p0.b, xzr, n`
			`+ whilelo p1.b, vlen, n`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p1/z, [src, 1, mul vl]`
			`+ st1b z0.b, p0, [dst, 0, mul vl]`
			`+ st1b z1.b, p1, [dst, 1, mul vl]`
			`ret`
			`-1: lsl tmp1, vector_length, 2 // vector_length * 4`
			`- whilelo p4.b, tmp1, rest`
			`- incb tmp1`
			`- whilelo p5.b, tmp1, rest`
			`- incb tmp1`
			`- whilelo p6.b, tmp1, rest`
			`- incb tmp1`
			`- whilelo p7.b, tmp1, rest`
			`- ld1b z0.b, p0/z, [src_ptr, #0, mul vl]`
			`- ld1b z1.b, p1/z, [src_ptr, #1, mul vl]`
			`- ld1b z2.b, p2/z, [src_ptr, #2, mul vl]`
			`- ld1b z3.b, p3/z, [src_ptr, #3, mul vl]`
			`- ld1b z4.b, p4/z, [src_ptr, #4, mul vl]`
			`- ld1b z5.b, p5/z, [src_ptr, #5, mul vl]`
			`- ld1b z6.b, p6/z, [src_ptr, #6, mul vl]`
			`- ld1b z7.b, p7/z, [src_ptr, #7, mul vl]`
			`- st1b z0.b, p0, [dest_ptr, #0, mul vl]`
			`- st1b z1.b, p1, [dest_ptr, #1, mul vl]`
			`- st1b z2.b, p2, [dest_ptr, #2, mul vl]`
			`- st1b z3.b, p3, [dest_ptr, #3, mul vl]`
			`- st1b z4.b, p4, [dest_ptr, #4, mul vl]`
			`- st1b z5.b, p5, [dest_ptr, #5, mul vl]`
			`- st1b z6.b, p6, [dest_ptr, #6, mul vl]`
			`- st1b z7.b, p7, [dest_ptr, #7, mul vl]`
			`+`
			`+ .p2align 4`
			`+`
			`+1: add srcend, src, n`
			`+ add dstend, dst, n`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p0/z, [src, 1, mul vl]`
			`+ ld1b z2.b, p0/z, [srcend, -2, mul vl]`
			`+ ld1b z3.b, p0/z, [srcend, -1, mul vl]`
			`+ cmp n, vlen, lsl 2`
			`+ b.hi 1f`
			`+`
			`+ st1b z0.b, p0, [dst, 0, mul vl]`
			`+ st1b z1.b, p0, [dst, 1, mul vl]`
			`+ st1b z2.b, p0, [dstend, -2, mul vl]`
			`+ st1b z3.b, p0, [dstend, -1, mul vl]`
			`ret`

			`-L(L2):`
			`- // align dest address at CACHE_LINE_SIZE byte boundary`
			`- mov tmp1, CACHE_LINE_SIZE`
			`- ands tmp2, dest_ptr, CACHE_LINE_SIZE - 1`
			`- // if cl_remainder == 0`
			`- b.eq L(L2_dc_zva)`
			`- sub cl_remainder, tmp1, tmp2`
			`- // process remainder until the first CACHE_LINE_SIZE boundary`
			`- whilelo p1.b, xzr, cl_remainder // keep p0.b all true`
			`- whilelo p2.b, vector_length, cl_remainder`
			`- b.last 1f`
			`- ld1b z1.b, p1/z, [src_ptr, #0, mul vl]`
			`- ld1b z2.b, p2/z, [src_ptr, #1, mul vl]`
			`- st1b z1.b, p1, [dest_ptr, #0, mul vl]`
			`- st1b z2.b, p2, [dest_ptr, #1, mul vl]`
			`- b 2f`
			`-1: lsl tmp1, vector_length, 1 // vector_length * 2`
			`- whilelo p3.b, tmp1, cl_remainder`
			`- incb tmp1`
			`- whilelo p4.b, tmp1, cl_remainder`
			`- ld1b z1.b, p1/z, [src_ptr, #0, mul vl]`
			`- ld1b z2.b, p2/z, [src_ptr, #1, mul vl]`
			`- ld1b z3.b, p3/z, [src_ptr, #2, mul vl]`
			`- ld1b z4.b, p4/z, [src_ptr, #3, mul vl]`
			`- st1b z1.b, p1, [dest_ptr, #0, mul vl]`
			`- st1b z2.b, p2, [dest_ptr, #1, mul vl]`
			`- st1b z3.b, p3, [dest_ptr, #2, mul vl]`
			`- st1b z4.b, p4, [dest_ptr, #3, mul vl]`
			`-2: add dest_ptr, dest_ptr, cl_remainder`
			`- add src_ptr, src_ptr, cl_remainder`
			`- sub rest, rest, cl_remainder`
			`-`
			`-L(L2_dc_zva):`
			`- // zero fill`
			`- and tmp1, dest, 0xffffffffffffff`
			`- and tmp2, src, 0xffffffffffffff`
			`- subs tmp1, tmp1, tmp2 // diff`
			`- b.ge 1f`
			`- neg tmp1, tmp1`
			`-1: mov tmp3, ZF_DIST + CACHE_LINE_SIZE * 2`
			`- cmp tmp1, tmp3`
			`- b.lo L(unroll8)`
			`- mov tmp1, dest_ptr`
			`- dc_zva (ZF_DIST / CACHE_LINE_SIZE) - 1`
			`- // unroll`
			`- ld1b_unroll8 // this line has to be after "b.lo L(unroll8)"`
			`- add src_ptr, src_ptr, CACHE_LINE_SIZE * 2`
			`- sub rest, rest, CACHE_LINE_SIZE * 2`
			`- mov tmp1, ZF_DIST`
			`- .p2align 3`
			`-1: stld1b_unroll4a`
			`- add tmp2, dest_ptr, tmp1 // dest_ptr + ZF_DIST`
			`- dc zva, tmp2`
			`- stld1b_unroll4b`
			`- add tmp2, tmp2, CACHE_LINE_SIZE`
			`- dc zva, tmp2`
			`- add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2`
			`- add src_ptr, src_ptr, CACHE_LINE_SIZE * 2`
			`- sub rest, rest, CACHE_LINE_SIZE * 2`
			`- cmp rest, tmp3 // ZF_DIST + CACHE_LINE_SIZE * 2`
			`- b.ge 1b`
			`- st1b_unroll8`
			`- add dest_ptr, dest_ptr, CACHE_LINE_SIZE * 2`
			`- b L(unroll8)`
			`+1: ld1b z4.b, p0/z, [src, 2, mul vl]`
			`+ ld1b z5.b, p0/z, [src, 3, mul vl]`
			`+ ld1b z6.b, p0/z, [srcend, -4, mul vl]`
			`+ ld1b z7.b, p0/z, [srcend, -3, mul vl]`
			`+ st1b z0.b, p0, [dst, 0, mul vl]`
			`+ st1b z1.b, p0, [dst, 1, mul vl]`
			`+ st1b z4.b, p0, [dst, 2, mul vl]`
			`+ st1b z5.b, p0, [dst, 3, mul vl]`
			`+ st1b z6.b, p0, [dstend, -4, mul vl]`
			`+ st1b z7.b, p0, [dstend, -3, mul vl]`
			`+ st1b z2.b, p0, [dstend, -2, mul vl]`
			`+ st1b z3.b, p0, [dstend, -1, mul vl]`
			`+ ret`

			`END (MEMCPY)`
			`libc_hidden_builtin_def (MEMCPY)`


			`-ENTRY (MEMMOVE)`
			`+ENTRY_ALIGN (MEMMOVE, 4)`

			`PTR_ARG (0)`
			`PTR_ARG (1)`
			`SIZE_ARG (2)`

			`- // remove tag address`
			`- // dest has to be immutable because it is the return value`
			`- // src has to be immutable because it is used in L(bwd_last)`
			`- and tmp2, dest, 0xffffffffffffff // save dest_notag into tmp2`
			`- and tmp3, src, 0xffffffffffffff // save src_notag intp tmp3`
			`- cmp n, 0`
			`- ccmp tmp2, tmp3, 4, ne`
			`- b.ne 1f`
			`+ /* Fast case for up to 2 vectors. */`
			`+ cntb vlen`
			`+ cmp n, vlen, lsl 1`
			`+ b.hi 1f`
			`+ whilelo p0.b, xzr, n`
			`+ whilelo p1.b, vlen, n`
			`+ ld1b z0.b, p0/z, [src, 0, mul vl]`
			`+ ld1b z1.b, p1/z, [src, 1, mul vl]`
			`+ st1b z0.b, p0, [dstin, 0, mul vl]`
			`+ st1b z1.b, p1, [dstin, 1, mul vl]`
			`+L(full_overlap):`
			`ret`
			`-1: cntb vector_length`
			`- // shortcut for less than vector_length * 8`
			`- // gives a free ptrue to p0.b for n >= vector_length`
			`- // tmp2 and tmp3 should not be used in this macro to keep`
			`- // notag addresses`
			`- shortcut_for_small_size L(dispatch)`
			`- // end of shortcut`
			`-`
			`-L(dispatch):`
			`- // tmp2 = dest_notag, tmp3 = src_notag`
			`- // diff = dest_notag - src_notag`
			`- sub tmp1, tmp2, tmp3`
			`- // if diff <= 0 \|\| diff >= n then memcpy`
			`- cmp tmp1, 0`
			`- ccmp tmp1, n, 2, gt`
			`- b.cs L(vl_agnostic)`
			`-`
			`-L(bwd_start):`
			`- mov rest, n`
			`- add dest_ptr, dest, n // dest_end`
			`- add src_ptr, src, n // src_end`
			`-`
			`-L(bwd_unroll8): // unrolling and software pipeline`
			`- lsl tmp1, vector_length, 3 // vector_length * 8`
			`- .p2align 3`
			`- cmp rest, tmp1`
			`- b.cc L(bwd_last)`
			`- sub src_ptr, src_ptr, tmp1`
			`+`
			`+ .p2align 4`
			`+ /* Check for overlapping moves. Return if there is a full overlap.`
			`+ Small moves up to 8 vectors use the overlap-safe copy_small code.`
			`+ Non-overlapping or overlapping moves with dst < src use memcpy.`
			`+ Overlapping moves with dst > src use a backward copy loop. */`
			`+1: sub tmp, dstin, src`
			`+ ands tmp, tmp, 0xffffffffffffff /* Clear special tag bits. */`
			`+ b.eq L(full_overlap)`
			`+ cmp n, vlen, lsl 3`
			`+ b.ls L(copy_small)`
			`+ cmp tmp, n`
			`+ b.hs L(copy_large)`
			`+`
			`+ /* Align to vector length. */`
			`+ add dst, dstin, n`
			`+ sub tmp, vlen, 1`
			`+ ands tmp, dst, tmp`
			`+ csel tmp, tmp, vlen, ne`
			`+ whilelo p1.b, xzr, tmp`
			`+ sub n, n, tmp`
			`+ ld1b z1.b, p1/z, [src, n]`
			`+ st1b z1.b, p1, [dstin, n]`
			`+ add src, src, n`
			`+ add dst, dstin, n`
			`+`
			`+ ptrue p0.b`
			`+ lsl vlen8, vlen, 3`
			`+ subs n, n, vlen8`
			`+ b.ls 3f`
			`+ sub src, src, vlen8`
			`ld1b_unroll8`
			`- sub rest, rest, tmp1`
			`- cmp rest, tmp1`
			`- b.cc 2f`
			`- .p2align 3`
			`-1: sub src_ptr, src_ptr, tmp1`
			`- sub dest_ptr, dest_ptr, tmp1`
			`+ subs n, n, vlen8`
			`+ b.ls 2f`
			`+`
			`+ .p2align 4`
			`+ /* 8x unrolled and software pipelined backward copy loop. */`
			`+1: sub src, src, vlen8`
			`+ sub dst, dst, vlen8`
			`stld1b_unroll8`
			`- sub rest, rest, tmp1`
			`- cmp rest, tmp1`
			`- b.ge 1b`
			`-2: sub dest_ptr, dest_ptr, tmp1`
			`+ subs n, n, vlen8`
			`+ b.hi 1b`
			`+2: sub dst, dst, vlen8`
			`st1b_unroll8`
			`+3: add n, n, vlen8`

			`-L(bwd_last):`
			`- mov dest_ptr, dest`
			`- mov src_ptr, src`
			`- b L(last)`
			`+ /* Adjust src/dst for last 0-8 vectors. */`
			`+ sub src, src, n`
			`+ mov dst, dstin`
			`+ b L(last_bytes)`

			`END (MEMMOVE)`
			`libc_hidden_builtin_def (MEMMOVE)`
			`--`
			`2.31.1`