99 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
			
		
		
	
	
			99 lines
		
	
	
		
			2.3 KiB
		
	
	
	
		
			ArmAsm
		
	
	
	
	
	
/*
 * OpenRISC memset.S
 *
 * Hand-optimized assembler version of memset for OpenRISC.
 * Algorithm inspired by several other arch-specific memset routines
 * in the kernel tree.
 *
 * Copyright (C) 2015 Olof Kindgren <olof.kindgren@gmail.com>
 *
 *      This program is free software; you can redistribute it and/or
 *      modify it under the terms of the GNU General Public License
 *      as published by the Free Software Foundation; either version
 *      2 of the License, or (at your option) any later version.
 */

/*
 * void *memset(void *s, int c, size_t n)
 *
 * Fill the n bytes starting at s with the low 8 bits of c and return s.
 *
 * In:
 *   r3  = s   destination pointer (never modified by this routine)
 *   r4  = c   fill value, only bits 7:0 are used
 *   r5  = n   number of bytes to fill
 *   r9  = return address
 * Out:
 *   r11 = s
 * Scratch:
 *   r13 = fill pattern (c & 0xff, widened to cccc unless it is 0)
 *   r15 = temporary while widening, then s & 3
 *   r17 = bytes still to be written
 *   r19 = address of the next store
 *
 * Strategy: n <= 7 is written one byte at a time.  Otherwise up to three
 * leading bytes are stored to reach 4-byte alignment, the bulk is written
 * with word stores, and the 0..3 trailing bytes are written bytewise.
 *
 * This code assumes branch delay slots: the instruction that follows each
 * l.bf / l.jr executes whether or not the branch is taken.  Delay-slot
 * instructions that do deliberate work are indented by one extra space.
 */
	.global memset
	.type	memset, @function
memset:
	/*
	 * Exit if n == 0.  The l.andi below sits in the delay slot of this
	 * branch; on the exit path it only writes scratch register r13.
	 */
	l.sfeqi		r5, 0
	l.bf		4f

	/* Truncate c to char: r13 = c & 0xff */
	l.andi		r13, r4, 0xff

	/* Skip the widening if c is 0 (r13 is already the word pattern) */
	l.sfeqi		r13, 0
	l.bf		1f
	/*
	 * Delay slot, executed on both paths: flag = (n <= 7), i.e. fewer
	 * than two whole words.  The flag is consumed by the l.bf after
	 * label 1; no instruction in between is a set-flag instruction.
	 */
	 l.sfleui	r5, 7

	/* Replicate the byte c into all four bytes of r13 */
	l.slli		r15, r13, 16	/* r13 = 000c, r15 = 0c00 */
	l.or		r13, r13, r15	/* r13 = 0c0c, r15 = 0c00 */
	l.slli		r15, r13, 8	/* r13 = 0c0c, r15 = c0c0 */
	l.or		r13, r13, r15	/* r13 = cccc, r15 = c0c0 */

1:	l.addi		r19, r3, 0	/* r19 = s */
	/* n <= 7: too short for the word path, go to the byte loop */
	l.bf		3f
	 l.or		r17, r5, r0	/* r17 = n */

	/* r15 = s & 3 (misalignment of the destination) */
	l.andi		r15, r3, 0x3

	/* s & 3 == 0: already word aligned, go to the word loop */
	l.sfeqi		r15, 0
	l.bf		2f
	 l.addi		r19, r3, 0	/* r19 = s (unchanged from label 1) */

	/* s & 3 == 1, 2 or 3: at least one leading byte is needed */
	l.sb		0(r3), r13	/* s[0] = c */
	l.addi		r17, r17, -1	/* one byte less to write */

	l.sfeqi		r15, 3
	l.bf		2f
	 l.addi		r19, r3, 1	/* r19 = s + 1 */

	/* s & 3 == 1 or 2: a second leading byte is needed */
	l.sb		1(r3), r13	/* s[1] = c */
	l.addi		r17, r17, -1	/* one byte less to write */

	l.sfeqi		r15, 2
	l.bf		2f
	 l.addi		r19, r3, 2	/* r19 = s + 2 */

	/* s & 3 == 1: a third leading byte is needed */
	l.sb		2(r3), r13	/* s[2] = c */
	l.addi		r17, r17, -1	/* one byte less to write */
	l.addi		r19, r3, 3	/* r19 = s + 3 */

	/*
	 * Word fill loop.  On entry r19 is word aligned and r17 >= 4
	 * (n was at least 8 and at most 3 bytes have been written).
	 */
2:	l.sw		0(r19), r13	/* *(u32 *)r19 = cccc */
	l.addi		r17, r17, -4	/* four bytes less to write */
	l.sfgeui	r17, 4		/* another whole word left? */
	l.bf		2b
	 l.addi		r19, r19, 4	/* advance; also runs on loop exit */

	/*
	 * Exit if no trailing bytes remain, otherwise fall into the byte
	 * loop.  The l.addi at label 3 is the delay slot of this branch;
	 * when the branch is taken it only decrements r17, which is not
	 * read again.
	 */
	l.sfeqi		r17, 0
	l.bf		4f

	/* Byte fill loop.  On entry r17 >= 1. */
3:	l.addi		r17, r17, -1	/* one byte less to write */
	l.sb		0(r19), r13	/* *r19 = c (low byte of r13) */
	l.sfnei		r17, 0
	l.bf		3b
	 l.addi		r19, r19, 1	/* advance; also runs on loop exit */

4:	l.jr		r9
	 l.ori		r11, r3, 0	/* return value = s */
	.size	memset, . - memset