From 1a8605b6cd257e8a74e29b5b71c057211f5fb847 Mon Sep 17 00:00:00 2001 From: noah Date: Sat, 3 Apr 2021 04:12:15 -0400 Subject: [PATCH] x86: Update large memcpy case in memmove-vec-unaligned-erms.S Content-type: text/plain; charset=UTF-8 No Bug. This commit updates the large memcpy case (no overlap). The update is to perform memcpy on either 2 or 4 contiguous pages at once. This 1) helps to alleviate the effects of false memory aliasing when destination and source have a close 4k alignment and 2) in most cases and for most DRAM units is a modestly more efficient access pattern. These changes are a clear performance improvement for VEC_SIZE=16/32, though more ambiguous for VEC_SIZE=64. test-memcpy, test-memccpy, test-mempcpy, test-memmove, and tst-memmove-overflow all pass. Signed-off-by: Noah Goldstein --- .../multiarch/memmove-vec-unaligned-erms.S | 338 ++++++++++++++---- 1 file changed, 265 insertions(+), 73 deletions(-) Conflicts: sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S (different number of sections) diff --git a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S index c475fed4..3e2dd6bc 100644 --- a/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S +++ b/sysdeps/x86_64/multiarch/memmove-vec-unaligned-erms.S @@ -32,7 +32,16 @@ overlapping addresses. 6. If size >= __x86_shared_non_temporal_threshold and there is no overlap between destination and source, use non-temporal store - instead of aligned store. */ + instead of aligned store copying from either 2 or 4 pages at + once. + 7. For point 6) if size < 16 * __x86_shared_non_temporal_threshold + and source and destination do not page alias, copy from 2 pages + at once using non-temporal stores. Page aliasing in this case is + considered true if destination's page alignment - source's page + alignment is less than 8 * VEC_SIZE. + 8. 
If size >= 16 * __x86_shared_non_temporal_threshold or source + and destination do page alias copy from 4 pages at once using + non-temporal stores. */ #include @@ -64,6 +73,34 @@ # endif #endif +#ifndef PAGE_SIZE +# define PAGE_SIZE 4096 +#endif + +#if PAGE_SIZE != 4096 +# error Unsupported PAGE_SIZE +#endif + +#ifndef LOG_PAGE_SIZE +# define LOG_PAGE_SIZE 12 +#endif + +#if PAGE_SIZE != (1 << LOG_PAGE_SIZE) +# error Invalid LOG_PAGE_SIZE +#endif + +/* Byte per page for large_memcpy inner loop. */ +#if VEC_SIZE == 64 +# define LARGE_LOAD_SIZE (VEC_SIZE * 2) +#else +# define LARGE_LOAD_SIZE (VEC_SIZE * 4) +#endif + +/* Amount to shift rdx by to compare for memcpy_large_4x. */ +#ifndef LOG_4X_MEMCPY_THRESH +# define LOG_4X_MEMCPY_THRESH 4 +#endif + /* Avoid short distance rep movsb only with non-SSE vector. */ #ifndef AVOID_SHORT_DISTANCE_REP_MOVSB # define AVOID_SHORT_DISTANCE_REP_MOVSB (VEC_SIZE > 16) @@ -103,6 +140,28 @@ # error Unsupported PREFETCH_SIZE! #endif +#if LARGE_LOAD_SIZE == (VEC_SIZE * 2) +# define LOAD_ONE_SET(base, offset, vec0, vec1, ...) \ + VMOVU (offset)base, vec0; \ + VMOVU ((offset) + VEC_SIZE)base, vec1; +# define STORE_ONE_SET(base, offset, vec0, vec1, ...) \ + VMOVNT vec0, (offset)base; \ + VMOVNT vec1, ((offset) + VEC_SIZE)base; +#elif LARGE_LOAD_SIZE == (VEC_SIZE * 4) +# define LOAD_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ + VMOVU (offset)base, vec0; \ + VMOVU ((offset) + VEC_SIZE)base, vec1; \ + VMOVU ((offset) + VEC_SIZE * 2)base, vec2; \ + VMOVU ((offset) + VEC_SIZE * 3)base, vec3; +# define STORE_ONE_SET(base, offset, vec0, vec1, vec2, vec3) \ + VMOVNT vec0, (offset)base; \ + VMOVNT vec1, ((offset) + VEC_SIZE)base; \ + VMOVNT vec2, ((offset) + VEC_SIZE * 2)base; \ + VMOVNT vec3, ((offset) + VEC_SIZE * 3)base; +#else +# error Invalid LARGE_LOAD_SIZE +#endif + #ifndef SECTION # error SECTION is not defined! #endif @@ -390,6 +449,15 @@ L(last_4x_vec): VZEROUPPER_RETURN L(more_8x_vec): + /* Check if non-temporal move candidate. 
*/ +#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) + /* Check non-temporal store threshold. */ + cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP + ja L(large_memcpy_2x) +#endif + /* Entry if rdx is greater than non-temporal threshold but there + is overlap. */ +L(more_8x_vec_check): cmpq %rsi, %rdi ja L(more_8x_vec_backward) /* Source == destination is less common. */ @@ -416,24 +484,21 @@ L(more_8x_vec): subq %r8, %rdi /* Adjust length. */ addq %r8, %rdx -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) - /* Check non-temporal store threshold. */ - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP - ja L(large_forward) -#endif + + .p2align 4 L(loop_4x_vec_forward): /* Copy 4 * VEC a time forward. */ VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $(VEC_SIZE * 4), %rsi - subq $(VEC_SIZE * 4), %rdx + subq $-(VEC_SIZE * 4), %rsi + addq $-(VEC_SIZE * 4), %rdx VMOVA %VEC(0), (%rdi) VMOVA %VEC(1), VEC_SIZE(%rdi) VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $(VEC_SIZE * 4), %rdi + subq $-(VEC_SIZE * 4), %rdi cmpq $(VEC_SIZE * 4), %rdx ja L(loop_4x_vec_forward) /* Store the last 4 * VEC. */ @@ -467,24 +532,21 @@ L(more_8x_vec_backward): subq %r8, %r9 /* Adjust length. */ subq %r8, %rdx -#if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) - /* Check non-temporal store threshold. */ - cmp __x86_shared_non_temporal_threshold(%rip), %RDX_LP - ja L(large_backward) -#endif + + .p2align 4 L(loop_4x_vec_backward): /* Copy 4 * VEC a time backward. 
*/ VMOVU (%rcx), %VEC(0) VMOVU -VEC_SIZE(%rcx), %VEC(1) VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $(VEC_SIZE * 4), %rcx - subq $(VEC_SIZE * 4), %rdx + addq $-(VEC_SIZE * 4), %rcx + addq $-(VEC_SIZE * 4), %rdx VMOVA %VEC(0), (%r9) VMOVA %VEC(1), -VEC_SIZE(%r9) VMOVA %VEC(2), -(VEC_SIZE * 2)(%r9) VMOVA %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $(VEC_SIZE * 4), %r9 + addq $-(VEC_SIZE * 4), %r9 cmpq $(VEC_SIZE * 4), %rdx ja L(loop_4x_vec_backward) /* Store the first 4 * VEC. */ @@ -497,72 +559,202 @@ L(loop_4x_vec_backward): VZEROUPPER_RETURN #if (defined USE_MULTIARCH || VEC_SIZE == 16) && IS_IN (libc) -L(large_forward): + .p2align 4 +L(large_memcpy_2x): + /* Compute absolute value of difference between source and + destination. */ + movq %rdi, %r9 + subq %rsi, %r9 + movq %r9, %r8 + leaq -1(%r9), %rcx + sarq $63, %r8 + xorq %r8, %r9 + subq %r8, %r9 /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rdi, %rdx), %r10 - cmpq %r10, %rsi - jb L(loop_4x_vec_forward) -L(loop_large_forward): + destination and source since destination may be in cache when + source is loaded. */ + cmpq %r9, %rdx + ja L(more_8x_vec_check) + + /* Cache align destination. First store the first 64 bytes then + adjust alignments. */ + VMOVU (%rsi), %VEC(8) +#if VEC_SIZE < 64 + VMOVU VEC_SIZE(%rsi), %VEC(9) +#if VEC_SIZE < 32 + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(10) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(11) +#endif +#endif + VMOVU %VEC(8), (%rdi) +#if VEC_SIZE < 64 + VMOVU %VEC(9), VEC_SIZE(%rdi) +#if VEC_SIZE < 32 + VMOVU %VEC(10), (VEC_SIZE * 2)(%rdi) + VMOVU %VEC(11), (VEC_SIZE * 3)(%rdi) +#endif +#endif + /* Adjust source, destination, and size. */ + movq %rdi, %r8 + andq $63, %r8 + /* Get the negative of offset for alignment. */ + subq $64, %r8 + /* Adjust source. */ + subq %r8, %rsi + /* Adjust destination which should be aligned now. 
*/ + subq %r8, %rdi + /* Adjust length. */ + addq %r8, %rdx + + /* Test if source and destination addresses will alias. If they do, + the larger pipeline in large_memcpy_4x alleviates the + performance drop. */ + testl $(PAGE_SIZE - VEC_SIZE * 8), %ecx + jz L(large_memcpy_4x) + + movq %rdx, %r10 + shrq $LOG_4X_MEMCPY_THRESH, %r10 + cmp __x86_shared_non_temporal_threshold(%rip), %r10 + jae L(large_memcpy_4x) + + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 2 - 1), %edx + /* r10 stores outer loop counter. */ + shrq $((LOG_PAGE_SIZE + 1) - LOG_4X_MEMCPY_THRESH), %r10 + /* Copy 4x VEC at a time from 2 pages. */ + .p2align 4 +L(loop_large_memcpy_2x_outer): + /* ecx stores inner loop counter. */ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx +L(loop_large_memcpy_2x_inner): + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE * 2) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE * 2) + /* Load vectors from rsi. */ + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + subq $-LARGE_LOAD_SIZE, %rsi + /* Non-temporal store vectors to rdi. */ + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + subq $-LARGE_LOAD_SIZE, %rdi + decl %ecx + jnz L(loop_large_memcpy_2x_inner) + addq $PAGE_SIZE, %rdi + addq $PAGE_SIZE, %rsi + decq %r10 + jne L(loop_large_memcpy_2x_outer) + sfence + + /* Check if only last 4 loads are needed. */ + cmpl $(VEC_SIZE * 4), %edx + jbe L(large_memcpy_2x_end) + + /* Handle the last 2 * PAGE_SIZE bytes. */ +L(loop_large_memcpy_2x_tail): /* Copy 4 * VEC a time forward with non-temporal stores. 
*/ - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE * 3) + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) VMOVU (%rsi), %VEC(0) VMOVU VEC_SIZE(%rsi), %VEC(1) VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) - addq $PREFETCHED_LOAD_SIZE, %rsi - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%rdi) - VMOVNT %VEC(1), VEC_SIZE(%rdi) - VMOVNT %VEC(2), (VEC_SIZE * 2)(%rdi) - VMOVNT %VEC(3), (VEC_SIZE * 3)(%rdi) - addq $PREFETCHED_LOAD_SIZE, %rdi - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_forward) - sfence + subq $-(VEC_SIZE * 4), %rsi + addl $-(VEC_SIZE * 4), %edx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpl $(VEC_SIZE * 4), %edx + ja L(loop_large_memcpy_2x_tail) + +L(large_memcpy_2x_end): /* Store the last 4 * VEC. */ - VMOVU %VEC(5), (%rcx) - VMOVU %VEC(6), -VEC_SIZE(%rcx) - VMOVU %VEC(7), -(VEC_SIZE * 2)(%rcx) - VMOVU %VEC(8), -(VEC_SIZE * 3)(%rcx) - /* Store the first VEC. */ - VMOVU %VEC(4), (%r11) + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) + + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) VZEROUPPER_RETURN -L(large_backward): - /* Don't use non-temporal store if there is overlap between - destination and source since destination may be in cache - when source is loaded. */ - leaq (%rcx, %rdx), %r10 - cmpq %r10, %r9 - jb L(loop_4x_vec_backward) -L(loop_large_backward): - /* Copy 4 * VEC a time backward with non-temporal stores. 
*/ - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 2) - PREFETCH_ONE_SET (-1, (%rcx), -PREFETCHED_LOAD_SIZE * 3) - VMOVU (%rcx), %VEC(0) - VMOVU -VEC_SIZE(%rcx), %VEC(1) - VMOVU -(VEC_SIZE * 2)(%rcx), %VEC(2) - VMOVU -(VEC_SIZE * 3)(%rcx), %VEC(3) - subq $PREFETCHED_LOAD_SIZE, %rcx - subq $PREFETCHED_LOAD_SIZE, %rdx - VMOVNT %VEC(0), (%r9) - VMOVNT %VEC(1), -VEC_SIZE(%r9) - VMOVNT %VEC(2), -(VEC_SIZE * 2)(%r9) - VMOVNT %VEC(3), -(VEC_SIZE * 3)(%r9) - subq $PREFETCHED_LOAD_SIZE, %r9 - cmpq $PREFETCHED_LOAD_SIZE, %rdx - ja L(loop_large_backward) + .p2align 4 +L(large_memcpy_4x): + movq %rdx, %r10 + /* edx will store remainder size for copying tail. */ + andl $(PAGE_SIZE * 4 - 1), %edx + /* r10 stores outer loop counter. */ + shrq $(LOG_PAGE_SIZE + 2), %r10 + /* Copy 4x VEC at a time from 4 pages. */ + .p2align 4 +L(loop_large_memcpy_4x_outer): + /* ecx stores inner loop counter. */ + movl $(PAGE_SIZE / LARGE_LOAD_SIZE), %ecx +L(loop_large_memcpy_4x_inner): + /* Only one prefetch set per page as doing 4 pages give more time + for prefetcher to keep up. */ + PREFETCH_ONE_SET(1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 2 + PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET(1, (%rsi), PAGE_SIZE * 3 + PREFETCHED_LOAD_SIZE) + /* Load vectors from rsi. */ + LOAD_ONE_SET((%rsi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + LOAD_ONE_SET((%rsi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + LOAD_ONE_SET((%rsi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) + LOAD_ONE_SET((%rsi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) + subq $-LARGE_LOAD_SIZE, %rsi + /* Non-temporal store vectors to rdi. 
*/ + STORE_ONE_SET((%rdi), 0, %VEC(0), %VEC(1), %VEC(2), %VEC(3)) + STORE_ONE_SET((%rdi), PAGE_SIZE, %VEC(4), %VEC(5), %VEC(6), %VEC(7)) + STORE_ONE_SET((%rdi), PAGE_SIZE * 2, %VEC(8), %VEC(9), %VEC(10), %VEC(11)) + STORE_ONE_SET((%rdi), PAGE_SIZE * 3, %VEC(12), %VEC(13), %VEC(14), %VEC(15)) + subq $-LARGE_LOAD_SIZE, %rdi + decl %ecx + jnz L(loop_large_memcpy_4x_inner) + addq $(PAGE_SIZE * 3), %rdi + addq $(PAGE_SIZE * 3), %rsi + decq %r10 + jne L(loop_large_memcpy_4x_outer) sfence - /* Store the first 4 * VEC. */ - VMOVU %VEC(4), (%rdi) - VMOVU %VEC(5), VEC_SIZE(%rdi) - VMOVU %VEC(6), (VEC_SIZE * 2)(%rdi) - VMOVU %VEC(7), (VEC_SIZE * 3)(%rdi) - /* Store the last VEC. */ - VMOVU %VEC(8), (%r11) + /* Check if only last 4 loads are needed. */ + cmpl $(VEC_SIZE * 4), %edx + jbe L(large_memcpy_4x_end) + + /* Handle the last 4 * PAGE_SIZE bytes. */ +L(loop_large_memcpy_4x_tail): + /* Copy 4 * VEC at a time forward with aligned stores. */ + PREFETCH_ONE_SET (1, (%rsi), PREFETCHED_LOAD_SIZE) + PREFETCH_ONE_SET (1, (%rdi), PREFETCHED_LOAD_SIZE) + VMOVU (%rsi), %VEC(0) + VMOVU VEC_SIZE(%rsi), %VEC(1) + VMOVU (VEC_SIZE * 2)(%rsi), %VEC(2) + VMOVU (VEC_SIZE * 3)(%rsi), %VEC(3) + subq $-(VEC_SIZE * 4), %rsi + addl $-(VEC_SIZE * 4), %edx + VMOVA %VEC(0), (%rdi) + VMOVA %VEC(1), VEC_SIZE(%rdi) + VMOVA %VEC(2), (VEC_SIZE * 2)(%rdi) + VMOVA %VEC(3), (VEC_SIZE * 3)(%rdi) + subq $-(VEC_SIZE * 4), %rdi + cmpl $(VEC_SIZE * 4), %edx + ja L(loop_large_memcpy_4x_tail) + +L(large_memcpy_4x_end): + /* Store the last 4 * VEC. */ + VMOVU -(VEC_SIZE * 4)(%rsi, %rdx), %VEC(0) + VMOVU -(VEC_SIZE * 3)(%rsi, %rdx), %VEC(1) + VMOVU -(VEC_SIZE * 2)(%rsi, %rdx), %VEC(2) + VMOVU -VEC_SIZE(%rsi, %rdx), %VEC(3) + + VMOVU %VEC(0), -(VEC_SIZE * 4)(%rdi, %rdx) + VMOVU %VEC(1), -(VEC_SIZE * 3)(%rdi, %rdx) + VMOVU %VEC(2), -(VEC_SIZE * 2)(%rdi, %rdx) + VMOVU %VEC(3), -VEC_SIZE(%rdi, %rdx) VZEROUPPER_RETURN #endif END (MEMMOVE_SYMBOL (__memmove, unaligned_erms)) -- GitLab