/* SPDX-License-Identifier: GPL-2.0 */
/*
 * arch/alpha/lib/ev6-copy_page.S
 *
 * Copy an entire page.
 */

/* The following comparison of this routine vs the normal copy_page.S
   was written by an unnamed ev6 hardware designer and forwarded to me
   via Steven Hobbs <hobbs@steven.zko.dec.com>.

   First Problem: STQ overflows.
   -----------------------------

	It would be nice if EV6 handled every resource overflow
	efficiently, but for some, including store queue overflows, it
	doesn't: those cause a trap and a restart of the pipe.

	To get around this we sometimes use (to borrow a term from a VSSAD
	researcher) "aeration".  The idea is to slow the rate at which the
	processor receives valid instructions by inserting nops in the fetch
	path.  In doing so, you can prevent the overflow and actually make
	the code run faster.  You can, of course, take advantage of the fact
	that the processor can fetch at most 4 aligned instructions per cycle.
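
	(A sketch of the mechanism: fetch delivers at most one aligned
	4-instruction block per cycle, so a fully padded block

		unop
		unop
		unop
		unop

	costs one full fetch cycle while issuing no real work.  The unop
	runs in the loop below are built from such blocks.)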

	I inserted enough nops to force it to take 10 cycles to fetch the
	loop code.  In theory, EV6 should be able to execute this loop in
	9 cycles but I was not able to get it to run that fast -- the initial
	conditions were such that I could not reach this optimum rate on
	(chaotic) EV6.  I wrote the code such that everything would issue
	in order.
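
	(With the read prefetch described under the third problem, the
	loop at 1: below is 44 instructions -- 11 aligned 4-instruction
	fetch blocks -- i.e. the 11 cycles per iteration quoted there.)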

   Second Problem: Dcache index matches.
   -------------------------------------

	If you are going to use this routine on random aligned pages, there
	is a 25% chance that the pages will be at the same dcache indices.
	Without care, this results in many nasty memory traps.
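
	(Assuming EV6's 64KB, two-way data cache: each 32KB way is indexed
	by address bits <14:6>, and with 8KB pages bits <14:13> come from
	the page frame number, so two random pages share a cache index one
	time in four.)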

	The solution is to schedule the prefetches to avoid the memory
	conflicts.  I schedule the wh64 prefetches farther ahead of the
	read prefetches to avoid this problem.
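
	(Concretely, in the loop below the read stream is prefetched 5
	lines ahead via ldl $31,320($17), while the write hints run 10
	lines ahead via wh64 ($19), with $19 starting at 10*64($16).)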

   Third Problem: Needs more prefetching.
   --------------------------------------

	In order to improve the code I added deeper prefetching to take
	full advantage of EV6's bandwidth.

	I also prefetched the read stream.  Note that adding the read
	prefetch forced me to add another cycle to the inner-most kernel --
	up to 11 from the original 8 cycles per iteration.  We could improve
	performance further by unrolling the loop and doing multiple
	prefetches per cycle.

   I think that the code below will be very robust and fast for the
   purposes of copying aligned pages.  It is slower when both source and
   destination pages are in the dcache, but it is my guess that this is
   less important than the dcache miss case.  */

#include <asm/export.h>
	.text
	.align 4
	.global copy_page
	.ent copy_page
copy_page:
	.prologue 0
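
	/* $16 = destination page, $17 = source page (the standard Alpha
	   a0/a1 argument registers).  Both are page-aligned; an Alpha
	   page is 8K, i.e. 128 cache lines of 64 bytes.  */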
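	/* wh64 hints that an aligned 64-byte block will be written in
	   its entirety, letting the chip claim the line without first
	   reading it from memory.  */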
	/* Prefetch 5 read cachelines; write-hint 10 cache lines.  */
	wh64	($16)
	ldl	$31,0($17)
	ldl	$31,64($17)
	lda	$1,1*64($16)

	wh64	($1)
	ldl	$31,128($17)
	ldl	$31,192($17)
	lda	$1,2*64($16)

	wh64	($1)
	ldl	$31,256($17)
	lda	$18,118
	lda	$1,3*64($16)
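	/* $18 = 118 main-loop iterations: 128 lines per page, minus the
	   10 handled by the cleanup loop at 2: below.  */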

	wh64	($1)
	nop
	lda	$1,4*64($16)
	lda	$2,5*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,6*64($16)
	lda	$2,7*64($16)

	wh64	($1)
	wh64	($2)
	lda	$1,8*64($16)
	lda	$2,9*64($16)

	wh64	($1)
	wh64	($2)
	lda	$19,10*64($16)
	nop

	/* Main prefetching/write-hinting loop.  */
1:	ldq	$0,0($17)
	ldq	$1,8($17)
	unop
	unop

	unop
	unop
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	unop
	unop

	unop
	unop
	ldq	$6,48($17)
	ldq	$7,56($17)

	ldl	$31,320($17)
	unop
	unop
	unop

	/* This gives the extra cycle of aeration above the minimum.  */
	unop
	unop
	unop
	unop

	wh64	($19)
	unop
	unop
	unop

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	unop

	unop
	stq	$2,16($16)
	addq	$17,64,$17
	stq	$3,24($16)

	stq	$4,32($16)
	stq	$5,40($16)
	addq	$19,64,$19
	unop

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 1b

	/* Prefetch the final 5 cache lines of the read stream.  */
	lda	$18,10
	ldl	$31,320($17)
	ldl	$31,384($17)
	ldl	$31,448($17)

	ldl	$31,512($17)
	ldl	$31,576($17)
	nop
	nop

	/* Non-prefetching, non-write-hinting cleanup loop for the
	   final 10 cache lines.  */
2:	ldq	$0,0($17)
	ldq	$1,8($17)
	ldq	$2,16($17)
	ldq	$3,24($17)

	ldq	$4,32($17)
	ldq	$5,40($17)
	ldq	$6,48($17)
	ldq	$7,56($17)

	stq	$0,0($16)
	subq	$18,1,$18
	stq	$1,8($16)
	addq	$17,64,$17

	stq	$2,16($16)
	stq	$3,24($16)
	stq	$4,32($16)
	stq	$5,40($16)

	stq	$6,48($16)
	stq	$7,56($16)
	addq	$16,64,$16
	bne	$18, 2b

	ret
	nop
	unop
	nop
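	/* (Presumably padding out the final aligned fetch block; these
	   nops sit after the unconditional ret and never execute.)  */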

	.end copy_page
	EXPORT_SYMBOL(copy_page)