145 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			145 lines
		
	
	
		
			2.7 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
| /* SPDX-License-Identifier: GPL-2.0-only */
 | |
| /*
 | |
|  * Copyright (C) 2014-15 Synopsys, Inc. (www.synopsys.com)
 | |
|  */
 | |
| 
 | |
| #include <linux/linkage.h>
 | |
| #include <asm/cache.h>
 | |
| 
 | |
/*
 * The memset implementation below is optimized to use the prefetchw and
 * prealloc instructions on CPUs with a 64B L1 data cache line
 * (L1_CACHE_SHIFT == 6). If you want to implement an optimized memset for
 * the other possible L1 data cache line lengths (32B and 128B), you should
 * rewrite the code carefully, checking that we never issue a
 * prefetchw/prealloc instruction for an L1 cache line which doesn't belong
 * to the memset area.
 */
 | |
| 
 | |
/*
 * Cache-hint wrappers used by the memset code in this file.
 *
 *   PREALLOC_INSTR  reg, off  ->  prealloc  [reg, off]
 *   PREFETCHW_INSTR reg, off  ->  prefetchw [reg, off]
 *
 * The instructions are emitted only when L1_CACHE_SHIFT is 6; for any
 * other value both macros expand to nothing.
 */
#if L1_CACHE_SHIFT != 6

.macro PREALLOC_INSTR	reg, off
.endm

.macro PREFETCHW_INSTR	reg, off
.endm

#else

.macro PREALLOC_INSTR	reg, off
	prealloc	[\reg, \off]
.endm

.macro PREFETCHW_INSTR	reg, off
	prefetchw	[\reg, \off]
.endm

#endif
 | |
| 
 | |
/*
 * memset - fill a memory region with a byte value
 *
 * In:   r0 = destination, r1 = fill value (only the low 8 bits are
 *       stored), r2 = number of bytes
 * Out:  r0 is never written here, so the destination pointer is what
 *       the caller gets back
 * Uses: r1-r5, lp_count and the status flags
 *
 * Stages: byte stores up to a 4-byte boundary, a loop writing 64 bytes
 * per pass, a loop writing 32 bytes per pass, and a byte loop for what
 * is left.  Requests of 8 bytes or fewer go through the byte loop only.
 */
ENTRY_CFI(memset)
	mov.f	0, r2			; set flags from the length, result discarded
	jz.d	[blink]			; zero length: return at once
	mov	r3, r0			; (delay slot) r3 = write cursor, r0 kept intact

	PREFETCHW_INSTR	r0, 0		; hint the first location to be written

	brls.d.nt	r2, 8, @.Lbyte_tail	; length <= 8: byte loop only
	mov.f	lp_count, r2		; (delay slot) trip count = length

	;; Head: single byte stores until the cursor is 4-byte aligned
	and.f	r4, r0, 0x03		; r4 = dst & 3, Z set when already aligned
	rsub	lp_count, r4, 4		; lp_count = 4 - r4
	lpnz	@.Ldst_aligned		; loop is skipped when Z is set
	stb.ab	r1, [r3, 1]
	sub	r2, r2, 1		; one byte less to go
.Ldst_aligned:

	;; Replicate the fill byte into every byte of r4 and of r5
	and	r1, r1, 0xFF
	asl	r4, r1, 8
	or	r4, r4, r1		; fill byte in the two low bytes
	asl	r5, r4, 16
	or	r5, r5, r4		; fill byte in all four bytes
	mov	r4, r5

	;; Split the remaining length held in r2:
	;;   r2 >  64: lp_count = r2 - 64, r2 = (r2 & 63) + 64
	;;   r2 <= 64: lp_count = 0,       r2 unchanged
	;; NOTE(review): the 64 bytes held back presumably keep the prealloc
	;; of [r3, 64] inside the area being set - confirm.
	sub3	lp_count, r2, 8		; r2 - (8 << 3)
	cmp	r2, 64
	bmsk.hi	r2, r2, 5		; keep bits 5..0
	mov.ls	lp_count, 0
	add3.hi	r2, r2, 8		; r2 + (8 << 3)

	lsr.f	lp_count, lp_count, 6	; number of 64-byte passes

	lpnz	@.Lloop64_end
	PREALLOC_INSTR	r3, 64		; alloc next line w/o fetching
#ifdef CONFIG_ARC_HAS_LL64
	.rept	8			; 8 stores, cursor += 8 each
	std.ab	r4, [r3, 8]
	.endr
#else
	.rept	16			; 16 stores, cursor += 4 each
	st.ab	r4, [r3, 4]
	.endr
#endif
.Lloop64_end:

	lsr.f	lp_count, r2, 5		; number of 32-byte passes
	lpnz	@.Lloop32_end
#ifdef CONFIG_ARC_HAS_LL64
	.rept	4			; 4 stores, cursor += 8 each
	std.ab	r4, [r3, 8]
	.endr
#else
	.rept	8			; 8 stores, cursor += 4 each
	st.ab	r4, [r3, 4]
	.endr
#endif
.Lloop32_end:

	and.f	lp_count, r2, 0x1F	; bytes left over after the 32-byte passes
.Lbyte_tail:
	lpnz	@.Lbyte_tail_end	; flags come from the and.f or the mov.f above
	stb.ab	r1, [r3, 1]
.Lbyte_tail_end:

	j	[blink]

END_CFI(memset)
 | |
| 
 | |
/*
 * memzero - zero a memory region
 *
 * In: r0 = destination, r1 = number of bytes
 *
 * Rearranges the registers into memset form (r1 = 0, r2 = count) and
 * branches to memset without linking.  blink is left untouched, so the
 * "j [blink]" that ends memset returns straight to our caller.
 */
ENTRY_CFI(memzero)
	mov	r2, r1		; count moves to the third argument register
	b.d	memset		; tail call, so no need to tinker with blink
	mov	r1, 0		; (delay slot) fill value = 0
END_CFI(memzero)
 |