glibc/glibc-RHEL-15696-39.patch

From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001
From: noah <goldstein.w.n@gmail.com>
Date: Sat, 3 Apr 2021 04:12:15 -0400
Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S
Content-type: text/plain; charset=UTF-8

No Bug. This commit updates the large memcpy case (no overlap). The
update is to perform memcpy on either 2 or 4 contiguous pages at
once. This 1) helps to alleviate the effects of false memory aliasing
when destination and source have a close 4k alignment and 2) in most
cases and for most DRAM units is a modestly more efficient access
pattern. These changes are a clear performance improvement for
VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,
test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all
pass.

Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>
---
 .../multiarch/memmove-vec-unaligned-erms.S    | 338 ++++++++++++++----
 1 file changed, 265 insertions(+), 73 deletions(-)

Conflicts:
	sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
	(different number of sections)

diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
index c475fed4..3e2dd6bc 100644
--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S
@@ -32,7 +32,16 @@
       overlapping addresses.
    6. If size >= __x86_shared_non_temporal_threshold and there is no
       overlap between destination and source, use non-temporal store
-      instead of aligned store.  */
+      instead of aligned store copying from either 2 or 4 pages at
+      once.
+   7. For point 6) if size < 16 * __x86_shared_non_temporal_threshold
+      and source and destination do not page alias, copy from 2 pages
+      at once using non-temporal stores. Page aliasing in this case is
+      considered true if destination's page alignment - source's page
+      alignment is less than 8 * VEC_SIZE.
+   8. If size >= 16 * __x86_shared_non_temporal_threshold or source
+      and destination do page alias, copy from 4 pages at once using
+      non-temporal stores.  */
 
 #include <sysdep.h>
 
@@ -64,6 +73,34 @@
 # endif
 #endif
 
+#ifndef PAGE_SIZE
+# define PAGE_SIZE 4096
+#endif
+
+#if PAGE_SIZE != 4096
+# error Unsupported PAGE_SIZE
+#endif
+
+#ifndef LOG_PAGE_SIZE
+# define LOG_PAGE_SIZE 12
+#endif
+
+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)
+# error Invalid LOG_PAGE_SIZE
+#endif
+
+/* Bytes per page for large_memcpy inner loop.  */
+#if VEC_SIZE == 64
+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)
+#else
+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)
+#endif
+
+/* Amount to shift rdx by to compare for large_memcpy_4x.  */
+#ifndef LOG_4X_MEMCPY_THRESH
+# define LOG_4X_MEMCPY_THRESH 4
+#endif
+
 /* Avoid short distance rep movsb only with non-SSE vector.  */
 #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB
 # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)
@@ -103,6 +140,28 @@
 # error Unsupported PREFETCH_SIZE!
 #endif
 
+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1;
+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \
+	VMOVNT  vec0, (offset)base; \
+	VMOVNT  vec1, ((offset) + VEC_SIZE)base;
+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)
+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVU	(offset)base, vec0; \
+	VMOVU	((offset) + VEC_SIZE)base, vec1; \
+	VMOVU	((offset) + VEC_SIZE * 2)base, vec2; \
+	VMOVU	((offset) + VEC_SIZE * 3)base, vec3;
+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \
+	VMOVNT	vec0, (offset)base; \
+	VMOVNT	vec1, ((offset) + VEC_SIZE)base; \
+	VMOVNT	vec2, ((offset) + VEC_SIZE * 2)base; \
+	VMOVNT	vec3, ((offset) + VEC_SIZE * 3)base;
+#else
+# error Invalid LARGE_LOAD_SIZE
+#endif
+
 #ifndef SECTION
 # error SECTION is not defined!
 #endif
@@ -390,6 +449,15 @@ L(last_4x_vec):
 	VZEROUPPER_RETURN
 
 L(more_8x_vec):
+	/* Check if non-temporal move candidate.  */
+#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
+	/* Check non-temporal store threshold.  */
+	cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP
+	ja	L(large_memcpy_2x)
+#endif
+	/* Entry point if rdx is greater than the non-temporal threshold
+	   but there is overlap.  */
+L(more_8x_vec_check):
 	cmpq	%rsi, %rdi
 	ja	L(more_8x_vec_backward)
 	/* Source == destination is less common.  */
@@ -416,24 +484,21 @@ L(more_8x_vec):
 	subq	%r8, %rdi
 	/* Adjust length.  */
 	addq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_forward)
-#endif
+
+	.p2align 4
 L(loop_4x_vec_forward):
 	/* Copy 4 * VEC a time forward.  */
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$(VEC_SIZE * 4), %rsi
-	subq	$(VEC_SIZE * 4), %rdx
+	subq	$-(VEC_SIZE * 4), %rsi
+	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%rdi)
 	VMOVA	%VEC(1), VEC_SIZE(%rdi)
 	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
 	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$(VEC_SIZE * 4), %rdi
+	subq	$-(VEC_SIZE * 4), %rdi
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_forward)
 	/* Store the last 4 * VEC.  */
@@ -467,24 +532,21 @@ L(more_8x_vec_backward):
 	subq	%r8, %r9
 	/* Adjust length.  */
 	subq	%r8, %rdx
-#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-	/* Check non-temporal store threshold.  */
-	cmp	__x86_shared_non_temporal_threshold(%rip), %RDX_LP
-	ja	L(large_backward)
-#endif
+
+	.p2align 4
 L(loop_4x_vec_backward):
 	/* Copy 4 * VEC a time backward.  */
 	VMOVU	(%rcx), %VEC(0)
 	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
 	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
 	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$(VEC_SIZE * 4), %rcx
-	subq	$(VEC_SIZE * 4), %rdx
+	addq	$-(VEC_SIZE * 4), %rcx
+	addq	$-(VEC_SIZE * 4), %rdx
 	VMOVA	%VEC(0), (%r9)
 	VMOVA	%VEC(1), -VEC_SIZE(%r9)
 	VMOVA	%VEC(2), -(VEC_SIZE * 2)(%r9)
 	VMOVA	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$(VEC_SIZE * 4), %r9
+	addq	$-(VEC_SIZE * 4), %r9
 	cmpq	$(VEC_SIZE * 4), %rdx
 	ja	L(loop_4x_vec_backward)
 	/* Store the first 4 * VEC.  */
@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):
 	VZEROUPPER_RETURN
 
 #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc)
-L(large_forward):
+	.p2align 4
+L(large_memcpy_2x):
+	/* Compute absolute value of difference between source and
+	   destination.  */
+	movq	%rdi, %r9
+	subq	%rsi, %r9
+	movq	%r9, %r8
+	leaq	-1(%r9), %rcx
+	sarq	$63, %r8
+	xorq	%r8, %r9
+	subq	%r8, %r9
 	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rdi, %rdx), %r10
-	cmpq    %r10, %rsi
-	jb	L(loop_4x_vec_forward)
-L(loop_large_forward):
+	   destination and source since destination may be in cache when
+	   source is loaded.  */
+	cmpq	%r9, %rdx
+	ja	L(more_8x_vec_check)
+
+	/* Cache align destination. First store the first 64 bytes then
+	   adjust alignments.  */
+	VMOVU	(%rsi), %VEC(8)
+#if VEC_SIZE < 64
+	VMOVU	VEC_SIZE(%rsi), %VEC(9)
+#if VEC_SIZE < 32
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(10)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(11)
+#endif
+#endif
+	VMOVU	%VEC(8), (%rdi)
+#if VEC_SIZE < 64
+	VMOVU	%VEC(9), VEC_SIZE(%rdi)
+#if VEC_SIZE < 32
+	VMOVU	%VEC(10), (VEC_SIZE * 2)(%rdi)
+	VMOVU	%VEC(11), (VEC_SIZE * 3)(%rdi)
+#endif
+#endif
+	/* Adjust source, destination, and size.  */
+	movq	%rdi, %r8
+	andq	$63, %r8
+	/* Get the negative of offset for alignment.  */
+	subq	$64, %r8
+	/* Adjust source.  */
+	subq	%r8, %rsi
+	/* Adjust destination which should be aligned now.  */
+	subq	%r8, %rdi
+	/* Adjust length.  */
+	addq	%r8, %rdx
+
+	/* Test if source and destination addresses will alias. If they do,
+	   the larger pipeline in large_memcpy_4x alleviates the
+	   performance drop.  */
+	testl	$(PAGE_SIZE - VEC_SIZE * 8), %ecx
+	jz	L(large_memcpy_4x)
+
+	movq	%rdx, %r10
+	shrq	$LOG_4X_MEMCPY_THRESH, %r10
+	cmp	__x86_shared_non_temporal_threshold(%rip), %r10
+	jae	L(large_memcpy_4x)
+
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 2 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10
+	/* Copy 4x VEC at a time from 2 pages.  */
+	.p2align 4
+L(loop_large_memcpy_2x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_2x_inner):
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_2x_inner)
+	addq	$PAGE_SIZE, %rdi
+	addq	$PAGE_SIZE, %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_2x_outer)
+	sfence
+
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_2x_end)
+
+	/* Handle the last 2 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_2x_tail):
 	/* Copy 4 * VEC a time forward with non-temporal stores.  */
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
 	VMOVU	(%rsi), %VEC(0)
 	VMOVU	VEC_SIZE(%rsi), %VEC(1)
 	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
 	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
-	addq	$PREFETCHED_LOAD_SIZE, %rsi
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%rdi)
-	VMOVNT	%VEC(1), VEC_SIZE(%rdi)
-	VMOVNT	%VEC(2), (VEC_SIZE * 2)(%rdi)
-	VMOVNT	%VEC(3), (VEC_SIZE * 3)(%rdi)
-	addq	$PREFETCHED_LOAD_SIZE, %rdi
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_forward)
-	sfence
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_2x_tail)
+
+L(large_memcpy_2x_end):
 	/* Store the last 4 * VEC.  */
-	VMOVU	%VEC(5), (%rcx)
-	VMOVU	%VEC(6), -VEC_SIZE(%rcx)
-	VMOVU	%VEC(7), -(VEC_SIZE * 2)(%rcx)
-	VMOVU	%VEC(8), -(VEC_SIZE * 3)(%rcx)
-	/* Store the first VEC.  */
-	VMOVU	%VEC(4), (%r11)
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 
-L(large_backward):
-	/* Don't use non-temporal store if there is overlap between
-	   destination and source since destination may be in cache
-	   when source is loaded.  */
-	leaq    (%rcx, %rdx), %r10
-	cmpq    %r10, %r9
-	jb	L(loop_4x_vec_backward)
-L(loop_large_backward):
-	/* Copy 4 * VEC a time backward with non-temporal stores.  */
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)
-	PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)
-	VMOVU	(%rcx), %VEC(0)
-	VMOVU	-VEC_SIZE(%rcx), %VEC(1)
-	VMOVU	-(VEC_SIZE * 2)(%rcx), %VEC(2)
-	VMOVU	-(VEC_SIZE * 3)(%rcx), %VEC(3)
-	subq	$PREFETCHED_LOAD_SIZE, %rcx
-	subq	$PREFETCHED_LOAD_SIZE, %rdx
-	VMOVNT	%VEC(0), (%r9)
-	VMOVNT	%VEC(1), -VEC_SIZE(%r9)
-	VMOVNT	%VEC(2), -(VEC_SIZE * 2)(%r9)
-	VMOVNT	%VEC(3), -(VEC_SIZE * 3)(%r9)
-	subq	$PREFETCHED_LOAD_SIZE, %r9
-	cmpq	$PREFETCHED_LOAD_SIZE, %rdx
-	ja	L(loop_large_backward)
+	.p2align 4
+L(large_memcpy_4x):
+	movq	%rdx, %r10
+	/* edx will store remainder size for copying tail.  */
+	andl	$(PAGE_SIZE * 4 - 1), %edx
+	/* r10 stores outer loop counter.  */
+	shrq	$(LOG_PAGE_SIZE + 2), %r10
+	/* Copy 4x VEC at a time from 4 pages.  */
+	.p2align 4
+L(loop_large_memcpy_4x_outer):
+	/* ecx stores inner loop counter.  */
+	movl	$(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx
+L(loop_large_memcpy_4x_inner):
+	/* Only one prefetch set per page as doing 4 pages give more time
+	   for prefetcher to keep up.  */
+	PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)
+	/* Load vectors from rsi.  */
+	LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rsi
+	/* Non-temporal store vectors to rdi.  */
+	STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))
+	STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))
+	STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))
+	subq	$-LARGE_LOAD_SIZE, %rdi
+	decl	%ecx
+	jnz	L(loop_large_memcpy_4x_inner)
+	addq	$(PAGE_SIZE * 3), %rdi
+	addq	$(PAGE_SIZE * 3), %rsi
+	decq	%r10
+	jne	L(loop_large_memcpy_4x_outer)
 	sfence
-	/* Store the first 4 * VEC.  */
-	VMOVU	%VEC(4), (%rdi)
-	VMOVU	%VEC(5), VEC_SIZE(%rdi)
-	VMOVU	%VEC(6), (VEC_SIZE * 2)(%rdi)
-	VMOVU	%VEC(7), (VEC_SIZE * 3)(%rdi)
-	/* Store the last VEC.  */
-	VMOVU	%VEC(8), (%r11)
+	/* Check if only last 4 loads are needed.  */
+	cmpl	$(VEC_SIZE * 4), %edx
+	jbe	L(large_memcpy_4x_end)
+
+	/* Handle the last 4 * PAGE_SIZE bytes.  */
+L(loop_large_memcpy_4x_tail):
+	/* Copy 4 * VEC at a time forward with aligned stores.  */
+	PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)
+	PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)
+	VMOVU	(%rsi), %VEC(0)
+	VMOVU	VEC_SIZE(%rsi), %VEC(1)
+	VMOVU	(VEC_SIZE * 2)(%rsi), %VEC(2)
+	VMOVU	(VEC_SIZE * 3)(%rsi), %VEC(3)
+	subq	$-(VEC_SIZE * 4), %rsi
+	addl	$-(VEC_SIZE * 4), %edx
+	VMOVA	%VEC(0), (%rdi)
+	VMOVA	%VEC(1), VEC_SIZE(%rdi)
+	VMOVA	%VEC(2), (VEC_SIZE * 2)(%rdi)
+	VMOVA	%VEC(3), (VEC_SIZE * 3)(%rdi)
+	subq	$-(VEC_SIZE * 4), %rdi
+	cmpl	$(VEC_SIZE * 4), %edx
+	ja	L(loop_large_memcpy_4x_tail)
+
+L(large_memcpy_4x_end):
+	/* Store the last 4 * VEC.  */
+	VMOVU	-(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)
+	VMOVU	-(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)
+	VMOVU	-(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)
+	VMOVU	-VEC_SIZE(%rsi, %rdx), %VEC(3)
+
+	VMOVU	%VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)
+	VMOVU	%VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)
+	VMOVU	%VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)
+	VMOVU	%VEC(3), -VEC_SIZE(%rdi, %rdx)
 	VZEROUPPER_RETURN
 #endif
 END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))
-- 
GitLab
Import Intel hyperscale improvements (RHEL-15696) Resolves: RHEL-15696 Includes two additional (well, 1.5) upstream patches to resolve roundeven redirects. 2023-12-14 22:33:45 +00:00			`From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001`
			`From: noah <goldstein.w.n@gmail.com>`
			`Date: Sat, 3 Apr 2021 04:12:15 -0400`
			`Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S`
			`Content-type: text/plain; charset=UTF-8`

			`No Bug. This commit updates the large memcpy case (no overlap). The`
			`update is to perform memcpy on either 2 or 4 contiguous pages at`
			`once. This 1) helps to alleviate the effects of false memory aliasing`
			`when destination and source have a close 4k alignment and 2) in most`
			`cases and for most DRAM units is a modestly more efficient access`
			`pattern. These changes are a clear performance improvement for`
			`VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy,`
			`test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all`
			`pass.`

			`Signed-off-by: Noah Goldstein <goldstein.w.n@gmail.com>`
			`---`
			`.../multiarch/memmove-vec-unaligned-erms.S \| 338 ++++++++++++++----`
			`1 file changed, 265 insertions(+), 73 deletions(-)`

			`Conflicts:`
			`sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
			`(different number of sections)`

			`diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
			`index c475fed4..3e2dd6bc 100644`
			`--- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
			`+++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S`
			`@@ -32,7 +32,16 @@`
			`overlapping addresses.`
			`6. If size >= __x86_shared_non_temporal_threshold and there is no`
			`overlap between destination and source, use non-temporal store`
			`- instead of aligned store. */`
			`+ instead of aligned store copying from either 2 or 4 pages at`
			`+ once.`
			`+ 8. For point 7) if size < 16 * __x86_shared_non_temporal_threshold`
			`+ and source and destination do not page alias, copy from 2 pages`
			`+ at once using non-temporal stores. Page aliasing in this case is`
			`+ considered true if destination's page alignment - sources' page`
			`+ alignment is less than 8 * VEC_SIZE.`
			`+ 9. If size >= 16 * __x86_shared_non_temporal_threshold or source`
			`+ and destination do page alias copy from 4 pages at once using`
			`+ non-temporal stores. */`

			`#include <sysdep.h>`

			`@@ -64,6 +73,34 @@`
			`# endif`
			`#endif`

			`+#ifndef PAGE_SIZE`
			`+# define PAGE_SIZE 4096`
			`+#endif`
			`+`
			`+#if PAGE_SIZE != 4096`
			`+# error Unsupported PAGE_SIZE`
			`+#endif`
			`+`
			`+#ifndef LOG_PAGE_SIZE`
			`+# define LOG_PAGE_SIZE 12`
			`+#endif`
			`+`
			`+#if PAGE_SIZE != (1 << LOG_PAGE_SIZE)`
			`+# error Invalid LOG_PAGE_SIZE`
			`+#endif`
			`+`
			`+/* Byte per page for large_memcpy inner loop. */`
			`+#if VEC_SIZE == 64`
			`+# define LARGE_LOAD_SIZE (VEC_SIZE * 2)`
			`+#else`
			`+# define LARGE_LOAD_SIZE (VEC_SIZE * 4)`
			`+#endif`
			`+`
			`+/* Amount to shift rdx by to compare for memcpy_large_4x. */`
			`+#ifndef LOG_4X_MEMCPY_THRESH`
			`+# define LOG_4X_MEMCPY_THRESH 4`
			`+#endif`
			`+`
			`/* Avoid short distance rep movsb only with non-SSE vector. */`
			`#ifndef AVOID_SHORT_DISTANCE_REP_MOVSB`
			`# define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16)`
			`@@ -103,6 +140,28 @@`
			`# error Unsupported PREFETCH_SIZE!`
			`#endif`

			`+#if LARGE_LOAD_SIZE == (VEC_SIZE * 2)`
			`+# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \`
			`+ VMOVU (offset)base, vec0; \`
			`+ VMOVU ((offset) + VEC_SIZE)base, vec1;`
			`+# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \`
			`+ VMOVNT vec0, (offset)base; \`
			`+ VMOVNT vec1, ((offset) + VEC_SIZE)base;`
			`+#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4)`
			`+# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \`
			`+ VMOVU (offset)base, vec0; \`
			`+ VMOVU ((offset) + VEC_SIZE)base, vec1; \`
			`+ VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \`
			`+ VMOVU ((offset) + VEC_SIZE * 3)base, vec3;`
			`+# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \`
			`+ VMOVNT vec0, (offset)base; \`
			`+ VMOVNT vec1, ((offset) + VEC_SIZE)base; \`
			`+ VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \`
			`+ VMOVNT vec3, ((offset) + VEC_SIZE * 3)base;`
			`+#else`
			`+# error Invalid LARGE_LOAD_SIZE`
			`+#endif`
			`+`
			`#ifndef SECTION`
			`# error SECTION is not defined!`
			`#endif`
			`@@ -390,6 +449,15 @@ L(last_4x_vec):`
			`VZEROUPPER_RETURN`

			`L(more_8x_vec):`
			`+ /* Check if non-temporal move candidate. */`
			`+#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)`
			`+ /* Check non-temporal store threshold. */`
			`+ cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP`
			`+ ja L(large_memcpy_2x)`
			`+#endif`
			`+ /* Entry if rdx is greater than non-temporal threshold but there`
			`+ is overlap. */`
			`+L(more_8x_vec_check):`
			`cmpq %rsi, %rdi`
			`ja L(more_8x_vec_backward)`
			`/* Source == destination is less common. */`
			`@@ -416,24 +484,21 @@ L(more_8x_vec):`
			`subq %r8, %rdi`
			`/* Adjust length. */`
			`addq %r8, %rdx`
			`-#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)`
			`- /* Check non-temporal store threshold. */`
			`- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP`
			`- ja L(large_forward)`
			`-#endif`
			`+`
			`+ .p2align 4`
			`L(loop_4x_vec_forward):`
			`/* Copy 4 * VEC a time forward. */`
			`VMOVU (%rsi), %VEC(0)`
			`VMOVU VEC_SIZE(%rsi), %VEC(1)`
			`VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
			`VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
			`- addq $(VEC_SIZE * 4), %rsi`
			`- subq $(VEC_SIZE * 4), %rdx`
			`+ subq $-(VEC_SIZE * 4), %rsi`
			`+ addq $-(VEC_SIZE * 4), %rdx`
			`VMOVA %VEC(0), (%rdi)`
			`VMOVA %VEC(1), VEC_SIZE(%rdi)`
			`VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)`
			`VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)`
			`- addq $(VEC_SIZE * 4), %rdi`
			`+ subq $-(VEC_SIZE * 4), %rdi`
			`cmpq $(VEC_SIZE * 4), %rdx`
			`ja L(loop_4x_vec_forward)`
			`/* Store the last 4 * VEC. */`
			`@@ -467,24 +532,21 @@ L(more_8x_vec_backward):`
			`subq %r8, %r9`
			`/* Adjust length. */`
			`subq %r8, %rdx`
			`-#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)`
			`- /* Check non-temporal store threshold. */`
			`- cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP`
			`- ja L(large_backward)`
			`-#endif`
			`+`
			`+ .p2align 4`
			`L(loop_4x_vec_backward):`
			`/* Copy 4 * VEC a time backward. */`
			`VMOVU (%rcx), %VEC(0)`
			`VMOVU -VEC_SIZE(%rcx), %VEC(1)`
			`VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)`
			`VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)`
			`- subq $(VEC_SIZE * 4), %rcx`
			`- subq $(VEC_SIZE * 4), %rdx`
			`+ addq $-(VEC_SIZE * 4), %rcx`
			`+ addq $-(VEC_SIZE * 4), %rdx`
			`VMOVA %VEC(0), (%r9)`
			`VMOVA %VEC(1), -VEC_SIZE(%r9)`
			`VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9)`
			`VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9)`
			`- subq $(VEC_SIZE * 4), %r9`
			`+ addq $-(VEC_SIZE * 4), %r9`
			`cmpq $(VEC_SIZE * 4), %rdx`
			`ja L(loop_4x_vec_backward)`
			`/* Store the first 4 * VEC. */`
			`@@ -497,72 +559,202 @@ L(loop_4x_vec_backward):`
			`VZEROUPPER_RETURN`

			`#if (defined USE_MULTIARCH \|\| VEC_SIZE == 16) && IS_IN (libc)`
			`-L(large_forward):`
			`+ .p2align 4`
			`+L(large_memcpy_2x):`
			`+ /* Compute absolute value of difference between source and`
			`+ destination. */`
			`+ movq %rdi, %r9`
			`+ subq %rsi, %r9`
			`+ movq %r9, %r8`
			`+ leaq -1(%r9), %rcx`
			`+ sarq $63, %r8`
			`+ xorq %r8, %r9`
			`+ subq %r8, %r9`
			`/* Don't use non-temporal store if there is overlap between`
			`- destination and source since destination may be in cache`
			`- when source is loaded. */`
			`- leaq (%rdi, %rdx), %r10`
			`- cmpq %r10, %rsi`
			`- jb L(loop_4x_vec_forward)`
			`-L(loop_large_forward):`
			`+ destination and source since destination may be in cache when`
			`+ source is loaded. */`
			`+ cmpq %r9, %rdx`
			`+ ja L(more_8x_vec_check)`
			`+`
			`+ /* Cache align destination. First store the first 64 bytes then`
			`+ adjust alignments. */`
			`+ VMOVU (%rsi), %VEC(8)`
			`+#if VEC_SIZE < 64`
			`+ VMOVU VEC_SIZE(%rsi), %VEC(9)`
			`+#if VEC_SIZE < 32`
			`+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10)`
			`+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11)`
			`+#endif`
			`+#endif`
			`+ VMOVU %VEC(8), (%rdi)`
			`+#if VEC_SIZE < 64`
			`+ VMOVU %VEC(9), VEC_SIZE(%rdi)`
			`+#if VEC_SIZE < 32`
			`+ VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi)`
			`+ VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi)`
			`+#endif`
			`+#endif`
			`+ /* Adjust source, destination, and size. */`
			`+ movq %rdi, %r8`
			`+ andq $63, %r8`
			`+ /* Get the negative of offset for alignment. */`
			`+ subq $64, %r8`
			`+ /* Adjust source. */`
			`+ subq %r8, %rsi`
			`+ /* Adjust destination which should be aligned now. */`
			`+ subq %r8, %rdi`
			`+ /* Adjust length. */`
			`+ addq %r8, %rdx`
			`+`
			`+ /* Test if source and destination addresses will alias. If they do`
			`+ the larger pipeline in large_memcpy_4x alleviated the`
			`+ performance drop. */`
			`+ testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx`
			`+ jz L(large_memcpy_4x)`
			`+`
			`+ movq %rdx, %r10`
			`+ shrq $LOG_4X_MEMCPY_THRESH, %r10`
			`+ cmp __x86_shared_non_temporal_threshold(%rip), %r10`
			`+ jae L(large_memcpy_4x)`
			`+`
			`+ /* edx will store remainder size for copying tail. */`
			`+ andl $(PAGE_SIZE * 2 - 1), %edx`
			`+ /* r10 stores outer loop counter. */`
			`+ shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10`
			`+ /* Copy 4x VEC at a time from 2 pages. */`
			`+ .p2align 4`
			`+L(loop_large_memcpy_2x_outer):`
			`+ /* ecx stores inner loop counter. */`
			`+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx`
			`+L(loop_large_memcpy_2x_inner):`
			`+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2)`
			`+ /* Load vectors from rsi. */`
			`+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))`
			`+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))`
			`+ subq $-LARGE_LOAD_SIZE, %rsi`
			`+ /* Non-temporal store vectors to rdi. */`
			`+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))`
			`+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))`
			`+ subq $-LARGE_LOAD_SIZE, %rdi`
			`+ decl %ecx`
			`+ jnz L(loop_large_memcpy_2x_inner)`
			`+ addq $PAGE_SIZE, %rdi`
			`+ addq $PAGE_SIZE, %rsi`
			`+ decq %r10`
			`+ jne L(loop_large_memcpy_2x_outer)`
			`+ sfence`
			`+`
			`+ /* Check if only last 4 loads are needed. */`
			`+ cmpl $(VEC_SIZE * 4), %edx`
			`+ jbe L(large_memcpy_2x_end)`
			`+`
			`+ /* Handle the last 2 * PAGE_SIZE bytes. */`
			`+L(loop_large_memcpy_2x_tail):`
			`/* Copy 4 * VEC a time forward with non-temporal stores. */`
			`- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2)`
			`- PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3)`
			`+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)`
			`VMOVU (%rsi), %VEC(0)`
			`VMOVU VEC_SIZE(%rsi), %VEC(1)`
			`VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
			`VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
			`- addq $PREFETCHED_LOAD_SIZE, %rsi`
			`- subq $PREFETCHED_LOAD_SIZE, %rdx`
			`- VMOVNT %VEC(0), (%rdi)`
			`- VMOVNT %VEC(1), VEC_SIZE(%rdi)`
			`- VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi)`
			`- VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi)`
			`- addq $PREFETCHED_LOAD_SIZE, %rdi`
			`- cmpq $PREFETCHED_LOAD_SIZE, %rdx`
			`- ja L(loop_large_forward)`
			`- sfence`
			`+ subq $-(VEC_SIZE * 4), %rsi`
			`+ addl $-(VEC_SIZE * 4), %edx`
			`+ VMOVA %VEC(0), (%rdi)`
			`+ VMOVA %VEC(1), VEC_SIZE(%rdi)`
			`+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)`
			`+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)`
			`+ subq $-(VEC_SIZE * 4), %rdi`
			`+ cmpl $(VEC_SIZE * 4), %edx`
			`+ ja L(loop_large_memcpy_2x_tail)`
			`+`
			`+L(large_memcpy_2x_end):`
			`/* Store the last 4 * VEC. */`
			`- VMOVU %VEC(5), (%rcx)`
			`- VMOVU %VEC(6), -VEC_SIZE(%rcx)`
			`- VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx)`
			`- VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx)`
			`- /* Store the first VEC. */`
			`- VMOVU %VEC(4), (%r11)`
			`+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)`
			`+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)`
			`+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)`
			`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)`
			`+`
			`+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)`
			`+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)`
			`+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)`
			`+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)`
			`VZEROUPPER_RETURN`

			`-L(large_backward):`
			`- /* Don't use non-temporal store if there is overlap between`
			`- destination and source since destination may be in cache`
			`- when source is loaded. */`
			`- leaq (%rcx, %rdx), %r10`
			`- cmpq %r10, %r9`
			`- jb L(loop_4x_vec_backward)`
			`-L(loop_large_backward):`
			`- /* Copy 4 * VEC a time backward with non-temporal stores. */`
			`- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2)`
			`- PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3)`
			`- VMOVU (%rcx), %VEC(0)`
			`- VMOVU -VEC_SIZE(%rcx), %VEC(1)`
			`- VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2)`
			`- VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3)`
			`- subq $PREFETCHED_LOAD_SIZE, %rcx`
			`- subq $PREFETCHED_LOAD_SIZE, %rdx`
			`- VMOVNT %VEC(0), (%r9)`
			`- VMOVNT %VEC(1), -VEC_SIZE(%r9)`
			`- VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9)`
			`- VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9)`
			`- subq $PREFETCHED_LOAD_SIZE, %r9`
			`- cmpq $PREFETCHED_LOAD_SIZE, %rdx`
			`- ja L(loop_large_backward)`
			`+ .p2align 4`
			`+L(large_memcpy_4x):`
			`+ movq %rdx, %r10`
			`+ /* edx will store remainder size for copying tail. */`
			`+ andl $(PAGE_SIZE * 4 - 1), %edx`
			`+ /* r10 stores outer loop counter. */`
			`+ shrq $(LOG_PAGE_SIZE + 2), %r10`
			`+ /* Copy 4x VEC at a time from 4 pages. */`
			`+ .p2align 4`
			`+L(loop_large_memcpy_4x_outer):`
			`+ /* ecx stores inner loop counter. */`
			`+ movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx`
			`+L(loop_large_memcpy_4x_inner):`
			`+ /* Only one prefetch set per page as doing 4 pages give more time`
			`+ for prefetcher to keep up. */`
			`+ PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE)`
			`+ /* Load vectors from rsi. */`
			`+ LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))`
			`+ LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))`
			`+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))`
			`+ LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))`
			`+ subq $-LARGE_LOAD_SIZE, %rsi`
			`+ /* Non-temporal store vectors to rdi. */`
			`+ STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3))`
			`+ STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7))`
			`+ STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11))`
			`+ STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15))`
			`+ subq $-LARGE_LOAD_SIZE, %rdi`
			`+ decl %ecx`
			`+ jnz L(loop_large_memcpy_4x_inner)`
			`+ addq $(PAGE_SIZE * 3), %rdi`
			`+ addq $(PAGE_SIZE * 3), %rsi`
			`+ decq %r10`
			`+ jne L(loop_large_memcpy_4x_outer)`
			`sfence`
			`- /* Store the first 4 * VEC. */`
			`- VMOVU %VEC(4), (%rdi)`
			`- VMOVU %VEC(5), VEC_SIZE(%rdi)`
			`- VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi)`
			`- VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi)`
			`- /* Store the last VEC. */`
			`- VMOVU %VEC(8), (%r11)`
			`+ /* Check if only last 4 loads are needed. */`
			`+ cmpl $(VEC_SIZE * 4), %edx`
			`+ jbe L(large_memcpy_4x_end)`
			`+`
			`+ /* Handle the last 4 * PAGE_SIZE bytes. */`
			`+L(loop_large_memcpy_4x_tail):`
			`+ /* Copy 4 * VEC a time forward with non-temporal stores. */`
			`+ PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE)`
			`+ PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE)`
			`+ VMOVU (%rsi), %VEC(0)`
			`+ VMOVU VEC_SIZE(%rsi), %VEC(1)`
			`+ VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2)`
			`+ VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3)`
			`+ subq $-(VEC_SIZE * 4), %rsi`
			`+ addl $-(VEC_SIZE * 4), %edx`
			`+ VMOVA %VEC(0), (%rdi)`
			`+ VMOVA %VEC(1), VEC_SIZE(%rdi)`
			`+ VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi)`
			`+ VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi)`
			`+ subq $-(VEC_SIZE * 4), %rdi`
			`+ cmpl $(VEC_SIZE * 4), %edx`
			`+ ja L(loop_large_memcpy_4x_tail)`
			`+`
			`+L(large_memcpy_4x_end):`
			`+ /* Store the last 4 * VEC. */`
			`+ VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0)`
			`+ VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1)`
			`+ VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2)`
			`+ VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3)`
			`+`
			`+ VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx)`
			`+ VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx)`
			`+ VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx)`
			`+ VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx)`
			`VZEROUPPER_RETURN`
			`#endif`
			`END (MEMMOVE_SYMBOL (__memmove, unaligned_erms))`
			`--`
			`GitLab`