openssl/openssl-1.1.1-arm-update.patch
DistroBaker e33651f416 Merged update from upstream sources
This is an automated DistroBaker update from upstream sources.
If you do not know what this is about or would like to opt out,
contact the OSCI team.

Source: https://src.fedoraproject.org/rpms/openssl.git#a07706cf0e50b02a61d3cb10ecad554d4ac4240c
2020-12-10 01:32:28 +01:00


diff -up openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl
--- openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl 2020-12-09 10:39:50.645705385 +0100
@@ -27,44 +27,72 @@
# CBC encrypt case. On Cortex-A57 parallelizable mode performance
# seems to be limited by sheer amount of NEON instructions...
#
+# April 2019
+#
+# Key to performance of parallelizable modes is round-instruction
+# interleaving. But which factor to use? There is an optimal one for
+# each combination of instruction latency and issue rate, beyond
+# which increasing the interleave factor doesn't pay off. On the cons
+# side we have code size increase and resource waste on platforms for
+# which the interleave factor is too high. In other words, you want
+# it to be just right. So far an interleave factor of 3x has served
+# all platforms well, but for ThunderX2 the optimal interleave factor
+# was measured to be 5x...
+#
# Performance in cycles per byte processed with 128-bit key:
#
#                  CBC enc      CBC dec        CTR
# Apple A7         2.39         1.20           1.20
-# Cortex-A53      1.32         1.29           1.46
-# Cortex-A57(*)   1.95         0.85           0.93
-# Denver          1.96         0.86           0.80
-# Mongoose        1.33         1.20           1.20
-# Kryo            1.26         0.94           1.00
+# Cortex-A53      1.32         1.17/1.29(**)  1.36/1.46
+# Cortex-A57(*)   1.95         0.82/0.85      0.89/0.93
+# Cortex-A72      1.33         0.85/0.88      0.92/0.96
+# Denver          1.96         0.65/0.86      0.76/0.80
+# Mongoose        1.33         1.23/1.20      1.30/1.20
+# Kryo            1.26         0.87/0.94      1.00/1.00
+# ThunderX2       5.95         1.25           1.30
#
# (*) original 3.64/1.34/1.32 results were for r0p0 revision
# and are still same even for updated module;
+# (**) numbers after slash are for 32-bit code, which is 3x-
+# interleaved;
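
A minimal sketch (Python, for the reader; not part of the patch) of the
interleaving idea described above: with N independent blocks in flight,
each block's value is needed again only one full pass later, which hides
the latency of the round instruction. `aes_round` is a hypothetical
stand-in for an AESE+AESMC pair.

    def interleaved_rounds(blocks, round_keys, aes_round):
        # One pass per round key; within a pass the N rounds are
        # independent, so they can issue back to back.
        for rk in round_keys:
            for i in range(len(blocks)):
                blocks[i] = aes_round(blocks[i], rk)
        return blocks
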
-$flavour = shift;
-$output = shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
$prefix="aes_v8";
+$_byte = ($flavour =~ /win/ ? "DCB" : ".byte");
+
$code=<<___;
#include "arm_arch.h"
#if __ARM_MAX_ARCH__>=7
-.text
___
-$code.=".arch armv8-a+crypto\n" if ($flavour =~ /64/);
+$code.=".arch armv8-a+crypto\n.text\n" if ($flavour =~ /64/);
$code.=<<___ if ($flavour !~ /64/);
.arch armv7-a // don't confuse not-so-latest binutils with armv8 :-)
.fpu neon
+#ifdef __thumb2__
+.syntax unified
+.thumb
+# define INST(a,b,c,d) $_byte c,d|0xc,a,b
+#else
.code 32
-#undef __thumb2__
+# define INST(a,b,c,d) $_byte a,b,c,d
+#endif
+
+.text
___
# Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
@@ -361,6 +389,836 @@ ___
&gen_block("en");
&gen_block("de");
}}}
+
+# Performance in cycles per byte.
+# Processed with AES-ECB for different key sizes.
+# It shows the value before and after optimization as below
+# (before/after):
+#
+#                  AES-128-ECB   AES-192-ECB   AES-256-ECB
+# Cortex-A57       1.85/0.82     2.16/0.96     2.47/1.10
+# Cortex-A72       1.64/0.85     1.82/0.99     2.13/1.14
+
+# Optimization is implemented by loop unrolling and interleaving.
+# Commonly, we choose the unrolling factor as 5; if the input
+# data size is smaller than 5 blocks but not smaller than 3 blocks,
+# we choose 3 as the unrolling factor.
+# If the input data size dsize >= 5*16 bytes, take 5 blocks
+# as one iteration; on every loop the remaining size lsize -= 5*16.
+# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration;
+# on every loop lsize -= 3*16.
+# If lsize < 3*16 bytes, treat them as the tail and interleave the
+# AES instructions for the two blocks.
+# There is one special case: if the original input data size dsize
+# = 16 bytes, we treat it separately to improve performance: one
+# independent code block without LR and FP load and store, just
+# like what the original ECB implementation does.
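
A minimal Python sketch (illustrative only, not part of the patch) of
the dispatch just described; `dsize` is the input length in bytes and
is assumed to be a multiple of 16:

    def ecb_unroll_plan(dsize):
        # Returns the number of blocks handled by each iteration,
        # following the comment above.
        if dsize == 16:
            return [1]                  # special-cased single block
        plan, lsize = [], dsize
        while lsize >= 5 * 16:          # 5x main loop
            plan.append(5)
            lsize -= 5 * 16
        if lsize >= 3 * 16:             # one 3x iteration
            plan.append(3)
            lsize -= 3 * 16
        if lsize:                       # tail: one or two blocks
            plan.append(lsize // 16)
        return plan
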
+
+{{{
+my ($inp,$out,$len,$key)=map("x$_",(0..3));
+my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+### q7 last round key
+### q10-q15 q7 Last 7 round keys
+### q8-q9 preloaded round keys except last 7 keys for big size
+### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte
+
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+$code.=<<___;
+.globl ${prefix}_ecb_encrypt
+.type ${prefix}_ecb_encrypt,%function
+.align 5
+${prefix}_ecb_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ subs $len,$len,#16
+ // If the original input data size is bigger than 16, jump to big-size processing.
+ b.ne .Lecb_big_size
+ vld1.8 {$dat0},[$inp]
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ vld1.32 {q5-q6},[$key],#32 // load key schedule...
+
+ b.eq .Lecb_small_dec
+ aese $dat0,q5
+ aesmc $dat0,$dat0
+ vld1.32 {q8-q9},[$key],#32 // load key schedule...
+ aese $dat0,q6
+ aesmc $dat0,$dat0
+ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing
+ b.eq .Lecb_128_enc
+.Lecb_round_loop:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ vld1.32 {q8},[$key],#16 // load key schedule...
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ vld1.32 {q9},[$key],#16 // load key schedule...
+ subs $rounds,$rounds,#2 // bias
+ b.gt .Lecb_round_loop
+.Lecb_128_enc:
+ vld1.32 {q10-q11},[$key],#32 // load key schedule...
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ vld1.32 {q12-q13},[$key],#32 // load key schedule...
+ aese $dat0,q10
+ aesmc $dat0,$dat0
+ aese $dat0,q11
+ aesmc $dat0,$dat0
+ vld1.32 {q14-q15},[$key],#32 // load key schedule...
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ vld1.32 {$rndlast},[$key]
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat0,q15
+ veor $dat0,$dat0,$rndlast
+ vst1.8 {$dat0},[$out]
+ b .Lecb_Final_abort
+.Lecb_small_dec:
+ aesd $dat0,q5
+ aesimc $dat0,$dat0
+ vld1.32 {q8-q9},[$key],#32 // load key schedule...
+ aesd $dat0,q6
+ aesimc $dat0,$dat0
+ subs $rounds,$rounds,#10 // bias
+ b.eq .Lecb_128_dec
+.Lecb_dec_round_loop:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ vld1.32 {q8},[$key],#16 // load key schedule...
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ vld1.32 {q9},[$key],#16 // load key schedule...
+ subs $rounds,$rounds,#2 // bias
+ b.gt .Lecb_dec_round_loop
+.Lecb_128_dec:
+ vld1.32 {q10-q11},[$key],#32 // load key schedule...
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ vld1.32 {q12-q13},[$key],#32 // load key schedule...
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ vld1.32 {q14-q15},[$key],#32 // load key schedule...
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ vld1.32 {$rndlast},[$key]
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat0,q15
+ veor $dat0,$dat0,$rndlast
+ vst1.8 {$dat0},[$out]
+ b .Lecb_Final_abort
+.Lecb_big_size:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp x29,x30,[sp,#-16]!
+ add x29,sp,#0
+___
+$code.=<<___ if ($flavour !~ /64/);
+ mov ip,sp
+ stmdb sp!,{r4-r8,lr}
+ vstmdb sp!,{d8-d15} @ ABI specification says so
+ ldmia ip,{r4-r5} @ load remaining args
+ subs $len,$len,#16
+___
+$code.=<<___;
+ mov $step,#16
+ b.lo .Lecb_done
+ cclr $step,eq
+
+ cmp $enc,#0 // en- or decrypting?
+ ldr $rounds,[$key,#240]
+ and $len,$len,#-16
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key] // load key schedule...
+ sub $rounds,$rounds,#6
+ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys
+ sub $rounds,$rounds,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key,#32
+ mov $cnt,$rounds
+ b.eq .Lecb_dec
+
+ vld1.8 {$dat1},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat1,$dat1
+ vorr $dat2,$dat1,$dat1
+ vorr $dat1,$dat,$dat
+ b.lo .Lecb_enc_tail
+
+ vorr $dat1,$in1,$in1
+ vld1.8 {$dat2},[$inp],#16
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#32
+ b.lo .Loop3x_ecb_enc
+
+ vld1.8 {$dat3},[$inp],#16
+ vld1.8 {$dat4},[$inp],#16
+ sub $len,$len,#32 // bias
+ mov $cnt,$rounds
+
+.Loop5x_ecb_enc:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_ecb_enc
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ cmp $len,#0x40 // because .Lecb_enc_tail4x
+ sub $len,$len,#0x50
+
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not a typo
+ mov $key_,$key
+
+ aese $dat0,q10
+ aesmc $dat0,$dat0
+ aese $dat1,q10
+ aesmc $dat1,$dat1
+ aese $dat2,q10
+ aesmc $dat2,$dat2
+ aese $dat3,q10
+ aesmc $dat3,$dat3
+ aese $dat4,q10
+ aesmc $dat4,$dat4
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat4
+ // are loaded with last "words"
+ add x6,$len,#0x60 // because .Lecb_enc_tail4x
+
+ aese $dat0,q11
+ aesmc $dat0,$dat0
+ aese $dat1,q11
+ aesmc $dat1,$dat1
+ aese $dat2,q11
+ aesmc $dat2,$dat2
+ aese $dat3,q11
+ aesmc $dat3,$dat3
+ aese $dat4,q11
+ aesmc $dat4,$dat4
+
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ aese $dat3,q12
+ aesmc $dat3,$dat3
+ aese $dat4,q12
+ aesmc $dat4,$dat4
+
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat3,q13
+ aesmc $dat3,$dat3
+ aese $dat4,q13
+ aesmc $dat4,$dat4
+
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ aese $dat3,q14
+ aesmc $dat3,$dat3
+ aese $dat4,q14
+ aesmc $dat4,$dat4
+
+ aese $dat0,q15
+ vld1.8 {$in0},[$inp],#16
+ aese $dat1,q15
+ vld1.8 {$in1},[$inp],#16
+ aese $dat2,q15
+ vld1.8 {$in2},[$inp],#16
+ aese $dat3,q15
+ vld1.8 {$in3},[$inp],#16
+ aese $dat4,q15
+ vld1.8 {$in4},[$inp],#16
+ cbz x6,.Lecb_enc_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$rndlast,$dat0
+ vorr $dat0,$in0,$in0
+ veor $tmp1,$rndlast,$dat1
+ vorr $dat1,$in1,$in1
+ veor $tmp2,$rndlast,$dat2
+ vorr $dat2,$in2,$in2
+ veor $tmp3,$rndlast,$dat3
+ vorr $dat3,$in3,$in3
+ veor $tmp4,$rndlast,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat4,$in4,$in4
+ vst1.8 {$tmp1},[$out],#16
+ mov $cnt,$rounds
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_ecb_enc
+
+ add $len,$len,#0x50
+ cbz $len,.Lecb_done
+
+ add $cnt,$rounds,#2
+ subs $len,$len,#0x30
+ vorr $dat0,$in2,$in2
+ vorr $dat1,$in3,$in3
+ vorr $dat2,$in4,$in4
+ b.lo .Lecb_enc_tail
+
+ b .Loop3x_ecb_enc
+
+.align 4
+.Lecb_enc_tail4x:
+ veor $tmp1,$rndlast,$dat1
+ veor $tmp2,$rndlast,$dat2
+ veor $tmp3,$rndlast,$dat3
+ veor $tmp4,$rndlast,$dat4
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+
+ b .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_enc:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_ecb_enc
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ subs $len,$len,#0x30
+ mov.lo x6,$len // x6, $cnt, is zero at this point
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat2
+ // are loaded with last "words"
+ mov $key_,$key
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ vld1.8 {$in0},[$inp],#16
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aese $dat0,q15
+ aese $dat1,q15
+ aese $dat2,q15
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ add $cnt,$rounds,#2
+ veor $tmp0,$rndlast,$dat0
+ veor $tmp1,$rndlast,$dat1
+ veor $dat2,$dat2,$rndlast
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp1},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_ecb_enc
+
+ cmn $len,#0x30
+ b.eq .Lecb_done
+ nop
+
+.Lecb_enc_tail:
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lecb_enc_tail
+
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ cmn $len,#0x20
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ aese $dat1,q15
+ aese $dat2,q15
+ b.eq .Lecb_enc_one
+ veor $tmp1,$rndlast,$dat1
+ veor $tmp2,$rndlast,$dat2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lecb_done
+
+.Lecb_enc_one:
+ veor $tmp1,$rndlast,$dat2
+ vst1.8 {$tmp1},[$out],#16
+ b .Lecb_done
+___
+
+$code.=<<___;
+.align 5
+.Lecb_dec:
+ vld1.8 {$dat1},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $cnt,$rounds,#2
+ vorr $in1,$dat1,$dat1
+ vorr $dat2,$dat1,$dat1
+ vorr $dat1,$dat,$dat
+ b.lo .Lecb_dec_tail
+
+ vorr $dat1,$in1,$in1
+ vld1.8 {$dat2},[$inp],#16
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#32
+ b.lo .Loop3x_ecb_dec
+
+ vld1.8 {$dat3},[$inp],#16
+ vld1.8 {$dat4},[$inp],#16
+ sub $len,$len,#32 // bias
+ mov $cnt,$rounds
+
+.Loop5x_ecb_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_ecb_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ cmp $len,#0x40 // because .Lecb_tail4x
+ sub $len,$len,#0x50
+
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not a typo
+ mov $key_,$key
+
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat1,q10
+ aesimc $dat1,$dat1
+ aesd $dat2,q10
+ aesimc $dat2,$dat2
+ aesd $dat3,q10
+ aesimc $dat3,$dat3
+ aesd $dat4,q10
+ aesimc $dat4,$dat4
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat4
+ // are loaded with last "words"
+ add x6,$len,#0x60 // because .Lecb_tail4x
+
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ aesd $dat1,q11
+ aesimc $dat1,$dat1
+ aesd $dat2,q11
+ aesimc $dat2,$dat2
+ aesd $dat3,q11
+ aesimc $dat3,$dat3
+ aesd $dat4,q11
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ aesd $dat3,q12
+ aesimc $dat3,$dat3
+ aesd $dat4,q12
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat3,q13
+ aesimc $dat3,$dat3
+ aesd $dat4,q13
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ aesd $dat3,q14
+ aesimc $dat3,$dat3
+ aesd $dat4,q14
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q15
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat1,q15
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat2,q15
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat3,q15
+ vld1.8 {$in3},[$inp],#16
+ aesd $dat4,q15
+ vld1.8 {$in4},[$inp],#16
+ cbz x6,.Lecb_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$rndlast,$dat0
+ vorr $dat0,$in0,$in0
+ veor $tmp1,$rndlast,$dat1
+ vorr $dat1,$in1,$in1
+ veor $tmp2,$rndlast,$dat2
+ vorr $dat2,$in2,$in2
+ veor $tmp3,$rndlast,$dat3
+ vorr $dat3,$in3,$in3
+ veor $tmp4,$rndlast,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat4,$in4,$in4
+ vst1.8 {$tmp1},[$out],#16
+ mov $cnt,$rounds
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_ecb_dec
+
+ add $len,$len,#0x50
+ cbz $len,.Lecb_done
+
+ add $cnt,$rounds,#2
+ subs $len,$len,#0x30
+ vorr $dat0,$in2,$in2
+ vorr $dat1,$in3,$in3
+ vorr $dat2,$in4,$in4
+ b.lo .Lecb_dec_tail
+
+ b .Loop3x_ecb_dec
+
+.align 4
+.Lecb_tail4x:
+ veor $tmp1,$rndlast,$dat1
+ veor $tmp2,$rndlast,$dat2
+ veor $tmp3,$rndlast,$dat3
+ veor $tmp4,$rndlast,$dat4
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+
+ b .Lecb_done
+.align 4
+___
+$code.=<<___;
+.Loop3x_ecb_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop3x_ecb_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ subs $len,$len,#0x30
+ mov.lo x6,$len // x6, $cnt, is zero at this point
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat2
+ // are loaded with last "words"
+ mov $key_,$key
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat0,q15
+ aesd $dat1,q15
+ aesd $dat2,q15
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ add $cnt,$rounds,#2
+ veor $tmp0,$rndlast,$dat0
+ veor $tmp1,$rndlast,$dat1
+ veor $dat2,$dat2,$rndlast
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat0,$in0,$in0
+ vst1.8 {$tmp1},[$out],#16
+ vorr $dat1,$in1,$in1
+ vst1.8 {$dat2},[$out],#16
+ vorr $dat2,$in2,$in2
+ b.hs .Loop3x_ecb_dec
+
+ cmn $len,#0x30
+ b.eq .Lecb_done
+ nop
+
+.Lecb_dec_tail:
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lecb_dec_tail
+
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lecb_dec_one
+ veor $tmp1,$rndlast,$dat1
+ veor $tmp2,$rndlast,$dat2
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ b .Lecb_done
+
+.Lecb_dec_one:
+ veor $tmp1,$rndlast,$dat2
+ vst1.8 {$tmp1},[$out],#16
+
+.Lecb_done:
+___
+}
+$code.=<<___ if ($flavour !~ /64/);
+ vldmia sp!,{d8-d15}
+ ldmia sp!,{r4-r8,pc}
+___
+$code.=<<___ if ($flavour =~ /64/);
+ ldr x29,[sp],#16
+___
+$code.=<<___ if ($flavour =~ /64/);
+.Lecb_Final_abort:
+ ret
+___
+$code.=<<___;
+.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt
+___
+}}}
{{{
my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5";
my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12");
@@ -519,6 +1377,13 @@ $code.=<<___;
___
{
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
$code.=<<___;
.align 5
.Lcbc_dec:
@@ -535,7 +1400,196 @@ $code.=<<___;
vorr $in0,$dat,$dat
vorr $in1,$dat1,$dat1
vorr $in2,$dat2,$dat2
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#32
+ b.lo .Loop3x_cbc_dec
+
+ vld1.8 {$dat3},[$inp],#16
+ vld1.8 {$dat4},[$inp],#16
+ sub $len,$len,#32 // bias
+ mov $cnt,$rounds
+ vorr $in3,$dat3,$dat3
+ vorr $in4,$dat4,$dat4
+
+.Loop5x_cbc_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_cbc_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ cmp $len,#0x40 // because .Lcbc_tail4x
+ sub $len,$len,#0x50
+
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not a typo
+ mov $key_,$key
+
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat1,q10
+ aesimc $dat1,$dat1
+ aesd $dat2,q10
+ aesimc $dat2,$dat2
+ aesd $dat3,q10
+ aesimc $dat3,$dat3
+ aesd $dat4,q10
+ aesimc $dat4,$dat4
+ add $inp,$inp,x6 // $inp is adjusted in such way that
+ // at exit from the loop $dat1-$dat4
+ // are loaded with last "words"
+ add x6,$len,#0x60 // because .Lcbc_tail4x
+
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ aesd $dat1,q11
+ aesimc $dat1,$dat1
+ aesd $dat2,q11
+ aesimc $dat2,$dat2
+ aesd $dat3,q11
+ aesimc $dat3,$dat3
+ aesd $dat4,q11
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ aesd $dat3,q12
+ aesimc $dat3,$dat3
+ aesd $dat4,q12
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat3,q13
+ aesimc $dat3,$dat3
+ aesd $dat4,q13
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ aesd $dat3,q14
+ aesimc $dat3,$dat3
+ aesd $dat4,q14
+ aesimc $dat4,$dat4
+ veor $tmp0,$ivec,$rndlast
+ aesd $dat0,q15
+ veor $tmp1,$in0,$rndlast
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat1,q15
+ veor $tmp2,$in1,$rndlast
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat2,q15
+ veor $tmp3,$in2,$rndlast
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat3,q15
+ veor $tmp4,$in3,$rndlast
+ vld1.8 {$in3},[$inp],#16
+ aesd $dat4,q15
+ vorr $ivec,$in4,$in4
+ vld1.8 {$in4},[$inp],#16
+ cbz x6,.Lcbc_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$tmp0,$dat0
+ vorr $dat0,$in0,$in0
+ veor $tmp1,$tmp1,$dat1
+ vorr $dat1,$in1,$in1
+ veor $tmp2,$tmp2,$dat2
+ vorr $dat2,$in2,$in2
+ veor $tmp3,$tmp3,$dat3
+ vorr $dat3,$in3,$in3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ vorr $dat4,$in4,$in4
+ vst1.8 {$tmp1},[$out],#16
+ mov $cnt,$rounds
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_cbc_dec
+
+ add $len,$len,#0x50
+ cbz $len,.Lcbc_done
+
+ add $cnt,$rounds,#2
+ subs $len,$len,#0x30
+ vorr $dat0,$in2,$in2
+ vorr $in0,$in2,$in2
+ vorr $dat1,$in3,$in3
+ vorr $in1,$in3,$in3
+ vorr $dat2,$in4,$in4
+ vorr $in2,$in4,$in4
+ b.lo .Lcbc_dec_tail
+
+ b .Loop3x_cbc_dec
+
+.align 4
+.Lcbc_tail4x:
+ veor $tmp1,$tmp0,$dat1
+ veor $tmp2,$tmp2,$dat2
+ veor $tmp3,$tmp3,$dat3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+
+ b .Lcbc_done
+.align 4
+___
+$code.=<<___;
.Loop3x_cbc_dec:
aesd $dat0,q8
aesimc $dat0,$dat0
@@ -696,6 +1750,9 @@ my $step="x12"; # aliases with $tctr2
my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7));
my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+# used only in 64-bit mode...
+my ($dat3,$dat4,$in3,$in4)=map("q$_",(16..23));
+
my ($dat,$tmp)=($dat0,$tmp0);
### q8-q15 preloaded key schedule
@@ -751,6 +1808,175 @@ $code.=<<___;
vmov.32 ${ivec}[3],$tctr2
sub $len,$len,#3 // bias
vorr $dat2,$ivec,$ivec
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#2
+ b.lo .Loop3x_ctr32
+
+ add w13,$ctr,#1
+ add w14,$ctr,#2
+ vorr $dat3,$dat0,$dat0
+ rev w13,w13
+ vorr $dat4,$dat0,$dat0
+ rev w14,w14
+ vmov.32 ${dat3}[3],w13
+ sub $len,$len,#2 // bias
+ vmov.32 ${dat4}[3],w14
+ add $ctr,$ctr,#2
+ b .Loop5x_ctr32
+
+.align 4
+.Loop5x_ctr32:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $cnt,$cnt,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_ctr32
+
+ mov $key_,$key
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ add $tctr0,$ctr,#1
+ add $tctr1,$ctr,#2
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ add $tctr2,$ctr,#3
+ add w13,$ctr,#4
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ add w14,$ctr,#5
+ rev $tctr0,$tctr0
+ aese $dat3,q12
+ aesmc $dat3,$dat3
+ rev $tctr1,$tctr1
+ rev $tctr2,$tctr2
+ aese $dat4,q12
+ aesmc $dat4,$dat4
+ rev w13,w13
+ rev w14,w14
+
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat3,q13
+ aesmc $dat3,$dat3
+ aese $dat4,q13
+ aesmc $dat4,$dat4
+
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ vld1.8 {$in0},[$inp],#16
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ vld1.8 {$in1},[$inp],#16
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aese $dat3,q14
+ aesmc $dat3,$dat3
+ vld1.8 {$in3},[$inp],#16
+ aese $dat4,q14
+ aesmc $dat4,$dat4
+ vld1.8 {$in4},[$inp],#16
+
+ aese $dat0,q15
+ veor $in0,$in0,$rndlast
+ aese $dat1,q15
+ veor $in1,$in1,$rndlast
+ aese $dat2,q15
+ veor $in2,$in2,$rndlast
+ aese $dat3,q15
+ veor $in3,$in3,$rndlast
+ aese $dat4,q15
+ veor $in4,$in4,$rndlast
+
+ veor $in0,$in0,$dat0
+ vorr $dat0,$ivec,$ivec
+ veor $in1,$in1,$dat1
+ vorr $dat1,$ivec,$ivec
+ veor $in2,$in2,$dat2
+ vorr $dat2,$ivec,$ivec
+ veor $in3,$in3,$dat3
+ vorr $dat3,$ivec,$ivec
+ veor $in4,$in4,$dat4
+ vorr $dat4,$ivec,$ivec
+
+ vst1.8 {$in0},[$out],#16
+ vmov.32 ${dat0}[3],$tctr0
+ vst1.8 {$in1},[$out],#16
+ vmov.32 ${dat1}[3],$tctr1
+ vst1.8 {$in2},[$out],#16
+ vmov.32 ${dat2}[3],$tctr2
+ vst1.8 {$in3},[$out],#16
+ vmov.32 ${dat3}[3],w13
+ vst1.8 {$in4},[$out],#16
+ vmov.32 ${dat4}[3],w14
+
+ mov $cnt,$rounds
+ cbz $len,.Lctr32_done
+
+ add $ctr,$ctr,#5
+ subs $len,$len,#5
+ b.hs .Loop5x_ctr32
+
+ add $len,$len,#5
+ sub $ctr,$ctr,#5
+
+ cmp $len,#2
+ mov $step,#16
+ cclr $step,lo
+ b.ls .Lctr32_tail
+
+ sub $len,$len,#3 // bias
+ add $ctr,$ctr,#3
+___
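
For reference, a minimal Python sketch (not part of the patch) of the
counter handling in the 5x loop above: the low 32 bits of the IV are a
big-endian counter, kept host-order in $ctr/w13/w14 and byte-swapped
with `rev` before being inserted into lane 3 of each block.

    import struct

    def ctr32_blocks(iv, nblocks):
        # iv is the 16-byte initial counter block.
        prefix = iv[:12]
        (ctr,) = struct.unpack(">I", iv[12:16])
        for i in range(nblocks):
            yield prefix + struct.pack(">I", (ctr + i) & 0xffffffff)
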
+$code.=<<___;
b .Loop3x_ctr32
.align 4
@@ -905,6 +2131,1432 @@ $code.=<<___;
.size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks
___
}}}
+# Performance in cycles per byte.
+# Processed with AES-XTS for different key sizes.
+# It shows the value before and after optimization as below
+# (before/after):
+#
+#                  AES-128-XTS   AES-256-XTS
+# Cortex-A57       3.36/1.09     4.02/1.37
+# Cortex-A72       3.03/1.02     3.28/1.33
+
+# Optimization is implemented by loop unrolling and interleaving.
+# Commonly, we choose the unrolling factor as 5; if the input
+# data size is smaller than 5 blocks but not smaller than 3 blocks,
+# we choose 3 as the unrolling factor.
+# If the input data size dsize >= 5*16 bytes, take 5 blocks
+# as one iteration; on every loop the remaining size lsize -= 5*16.
+# If lsize < 5*16 bytes, treat it as the tail. Note: a remainder of
+# 4*16 bytes is processed specially, integrated into the 5*16-byte
+# loop to improve efficiency.
+# There is one special case: if the original input data size dsize
+# = 16 bytes, we treat it separately to improve performance: one
+# independent code block without LR and FP load and store.
+# Encryption processes the (length -tailcnt -1) bytes as described
+# above, then encrypts the composite block as the second-to-last
+# cipher block.
+# Decryption processes the (length -tailcnt -1) bytes as described
+# above, then decrypts the second-to-last cipher block to get the
+# last plain block (the tail), and decrypts the composite block as
+# the second-to-last plain-text block.
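
The iv updates scattered through the code below (the extr/and/eor
sequences) all compute the next XTS tweak. A minimal Python sketch
(illustrative only), with the tweak held as two 64-bit halves
(ivh:ivl) just as the assembly keeps it in $ivh/$ivl:

    def xts_next_tweak(ivl, ivh):
        # Multiply the 128-bit tweak by x in GF(2^128), reducing by
        # the XTS polynomial (the constant 0x87).
        mask = 0xffffffffffffffff
        carry = ivh >> 63                       # bit shifted out on top
        ivh = ((ivh << 1) | (ivl >> 63)) & mask
        ivl = ((ivl << 1) ^ (0x87 if carry else 0)) & mask
        return ivl, ivh
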
+
+{{{
+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
+my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b");
+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
+
+my ($tmpin)=("v26.16b");
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+# q7 last round key
+# q10-q15, q7 Last 7 round keys
+# q8-q9 preloaded round keys except last 7 keys for big size
+# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
+
+
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+$code.=<<___ if ($flavour =~ /64/);
+.globl ${prefix}_xts_encrypt
+.type ${prefix}_xts_encrypt,%function
+.align 5
+${prefix}_xts_encrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#16
+ // If the original input data size is bigger than 16, jump to big-size processing.
+ b.ne .Lxts_enc_big_size
+ // Encrypt the iv with key2, as the first XEX iv.
+ ldr $rounds,[$key2,#240]
+ vld1.8 {$dat},[$key2],#16
+ vld1.8 {$iv0},[$ivp]
+ sub $rounds,$rounds,#2
+ vld1.8 {$dat1},[$key2],#16
+
+.Loop_enc_iv_enc:
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2],#16
+ subs $rounds,$rounds,#2
+ aese $iv0,$dat1
+ aesmc $iv0,$iv0
+ vld1.32 {$dat1},[$key2],#16
+ b.gt .Loop_enc_iv_enc
+
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2]
+ aese $iv0,$dat1
+ veor $iv0,$iv0,$dat
+
+ vld1.8 {$dat0},[$inp]
+ veor $dat0,$iv0,$dat0
+
+ ldr $rounds,[$key1,#240]
+ vld1.32 {q20-q21},[$key1],#32 // load key schedule...
+
+ aese $dat0,q20
+ aesmc $dat0,$dat0
+ vld1.32 {q8-q9},[$key1],#32 // load key schedule...
+ aese $dat0,q21
+ aesmc $dat0,$dat0
+ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing
+ b.eq .Lxts_128_enc
+.Lxts_enc_round_loop:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ vld1.32 {q8},[$key1],#16 // load key schedule...
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ vld1.32 {q9},[$key1],#16 // load key schedule...
+ subs $rounds,$rounds,#2 // bias
+ b.gt .Lxts_enc_round_loop
+.Lxts_128_enc:
+ vld1.32 {q10-q11},[$key1],#32 // load key schedule...
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ vld1.32 {q12-q13},[$key1],#32 // load key schedule...
+ aese $dat0,q10
+ aesmc $dat0,$dat0
+ aese $dat0,q11
+ aesmc $dat0,$dat0
+ vld1.32 {q14-q15},[$key1],#32 // load key schedule...
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ vld1.32 {$rndlast},[$key1]
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat0,q15
+ veor $dat0,$dat0,$rndlast
+ veor $dat0,$dat0,$iv0
+ vst1.8 {$dat0},[$out]
+ b .Lxts_enc_final_abort
+
+.align 4
+.Lxts_enc_big_size:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp $constnumx,$tmpinp,[sp,#-64]!
+ stp $tailcnt,$midnumx,[sp,#48]
+ stp $ivd10,$ivd20,[sp,#32]
+ stp $ivd30,$ivd40,[sp,#16]
+
+ // tailcnt store the tail value of length%16.
+ and $tailcnt,$len,#0xf
+ and $len,$len,#-16
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lxts_abort
+ csel $step,xzr,$step,eq
+
+ // Firstly, encrypt the iv with key2, as the first iv of XEX.
+ ldr $rounds,[$key2,#240]
+ vld1.32 {$dat},[$key2],#16
+ vld1.8 {$iv0},[$ivp]
+ sub $rounds,$rounds,#2
+ vld1.32 {$dat1},[$key2],#16
+
+.Loop_iv_enc:
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2],#16
+ subs $rounds,$rounds,#2
+ aese $iv0,$dat1
+ aesmc $iv0,$iv0
+ vld1.32 {$dat1},[$key2],#16
+ b.gt .Loop_iv_enc
+
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2]
+ aese $iv0,$dat1
+ veor $iv0,$iv0,$dat
+
+ // The iv for second block
+ // $ivl- iv(low), $ivh - iv(high)
+ // the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
+ fmov $ivl,$ivd00
+ fmov $ivh,$ivd01
+ mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd10,$ivl
+ fmov $ivd11,$ivh
+
+ ldr $rounds0,[$key1,#240] // next starting point
+ vld1.8 {$dat},[$inp],$step
+
+ vld1.32 {q8-q9},[$key1] // load key schedule...
+ sub $rounds0,$rounds0,#6
+ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
+ sub $rounds0,$rounds0,#2
+ vld1.32 {q10-q11},[$key_],#32
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ add $key_,$key1,#32
+ mov $rounds,$rounds0
+
+ // Encryption
+.Lxts_enc:
+ vld1.8 {$dat2},[$inp],#16
+ subs $len,$len,#32 // bias
+ add $rounds,$rounds0,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in3,$dat,$dat
+ vorr $in2,$dat2,$dat2
+ vorr $in4,$dat2,$dat2
+ b.lo .Lxts_inner_enc_tail
+ veor $dat,$dat,$iv0 // before encryption, xor with iv
+ veor $dat2,$dat2,$iv1
+
+ // The iv for third block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd20,$ivl
+ fmov $ivd21,$ivh
+
+
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
+ vorr $in1,$dat1,$dat1
+ veor $in2,$dat2,$iv2 // the third block
+ veor $dat2,$dat2,$iv2
+ cmp $len,#32
+ b.lo .Lxts_outer_enc_tail
+
+ // The iv for fourth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd30,$ivl
+ fmov $ivd31,$ivh
+
+ vld1.8 {$dat3},[$inp],#16
+ // The iv for fifth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd40,$ivl
+ fmov $ivd41,$ivh
+
+ vld1.8 {$dat4},[$inp],#16
+ veor $dat3,$dat3,$iv3 // the fourth block
+ veor $dat4,$dat4,$iv4
+ sub $len,$len,#32 // bias
+ mov $rounds,$rounds0
+ b .Loop5x_xts_enc
+
+.align 4
+.Loop5x_xts_enc:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16
+ subs $rounds,$rounds,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16
+ b.gt .Loop5x_xts_enc
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat3,q8
+ aesmc $dat3,$dat3
+ aese $dat4,q8
+ aesmc $dat4,$dat4
+ subs $len,$len,#0x50 // because .Lxts_enc_tail4x
+
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat3,q9
+ aesmc $dat3,$dat3
+ aese $dat4,q9
+ aesmc $dat4,$dat4
+ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not a typo
+ mov $key_,$key1
+
+ aese $dat0,q10
+ aesmc $dat0,$dat0
+ aese $dat1,q10
+ aesmc $dat1,$dat1
+ aese $dat2,q10
+ aesmc $dat2,$dat2
+ aese $dat3,q10
+ aesmc $dat3,$dat3
+ aese $dat4,q10
+ aesmc $dat4,$dat4
+ add $inp,$inp,$xoffset // x0 is adjusted in such way that
+ // at exit from the loop v1.16b-v26.16b
+ // are loaded with last "words"
+ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x
+
+ aese $dat0,q11
+ aesmc $dat0,$dat0
+ aese $dat1,q11
+ aesmc $dat1,$dat1
+ aese $dat2,q11
+ aesmc $dat2,$dat2
+ aese $dat3,q11
+ aesmc $dat3,$dat3
+ aese $dat4,q11
+ aesmc $dat4,$dat4
+
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ aese $dat3,q12
+ aesmc $dat3,$dat3
+ aese $dat4,q12
+ aesmc $dat4,$dat4
+
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat3,q13
+ aesmc $dat3,$dat3
+ aese $dat4,q13
+ aesmc $dat4,$dat4
+
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ aese $dat3,q14
+ aesmc $dat3,$dat3
+ aese $dat4,q14
+ aesmc $dat4,$dat4
+
+ veor $tmp0,$rndlast,$iv0
+ aese $dat0,q15
+ // The iv for first block of one iteration
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ veor $tmp1,$rndlast,$iv1
+ vld1.8 {$in0},[$inp],#16
+ aese $dat1,q15
+ // The iv for second block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd10,$ivl
+ fmov $ivd11,$ivh
+ veor $tmp2,$rndlast,$iv2
+ vld1.8 {$in1},[$inp],#16
+ aese $dat2,q15
+ // The iv for third block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd20,$ivl
+ fmov $ivd21,$ivh
+ veor $tmp3,$rndlast,$iv3
+ vld1.8 {$in2},[$inp],#16
+ aese $dat3,q15
+ // The iv for fourth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd30,$ivl
+ fmov $ivd31,$ivh
+ veor $tmp4,$rndlast,$iv4
+ vld1.8 {$in3},[$inp],#16
+ aese $dat4,q15
+
+ // The iv for fifth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd40,$ivl
+ fmov $ivd41,$ivh
+
+ vld1.8 {$in4},[$inp],#16
+ cbz $xoffset,.Lxts_enc_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$tmp0,$dat0
+ veor $dat0,$in0,$iv0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat1,$in1,$iv1
+ veor $tmp2,$tmp2,$dat2
+ veor $dat2,$in2,$iv2
+ veor $tmp3,$tmp3,$dat3
+ veor $dat3,$in3,$iv3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ veor $dat4,$in4,$iv4
+ vst1.8 {$tmp1},[$out],#16
+ mov $rounds,$rounds0
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_xts_enc
+
+
+ // If 4 blocks are left, borrow the five-block processing.
+ cmn $len,#0x10
+ b.ne .Loop5x_enc_after
+ vorr $iv4,$iv3,$iv3
+ vorr $iv3,$iv2,$iv2
+ vorr $iv2,$iv1,$iv1
+ vorr $iv1,$iv0,$iv0
+ fmov $ivl,$ivd40
+ fmov $ivh,$ivd41
+ veor $dat0,$iv0,$in0
+ veor $dat1,$iv1,$in1
+ veor $dat2,$in2,$iv2
+ veor $dat3,$in3,$iv3
+ veor $dat4,$in4,$iv4
+ b.eq .Loop5x_xts_enc
+
+.Loop5x_enc_after:
+ add $len,$len,#0x50
+ cbz $len,.Lxts_enc_done
+
+ add $rounds,$rounds0,#2
+ subs $len,$len,#0x30
+ b.lo .Lxts_inner_enc_tail
+
+ veor $dat0,$iv0,$in2
+ veor $dat1,$iv1,$in3
+ veor $dat2,$in4,$iv2
+ b .Lxts_outer_enc_tail
+
+.align 4
+.Lxts_enc_tail4x:
+ add $inp,$inp,#16
+ veor $tmp1,$dat1,$tmp1
+ vst1.8 {$tmp1},[$out],#16
+ veor $tmp2,$dat2,$tmp2
+ vst1.8 {$tmp2},[$out],#16
+ veor $tmp3,$dat3,$tmp3
+ veor $tmp4,$dat4,$tmp4
+ vst1.8 {$tmp3-$tmp4},[$out],#32
+
+ b .Lxts_enc_done
+.align 4
+.Lxts_outer_enc_tail:
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $rounds,$rounds,#2
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lxts_outer_enc_tail
+
+ aese $dat0,q8
+ aesmc $dat0,$dat0
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ veor $tmp0,$iv0,$rndlast
+ subs $len,$len,#0x30
+ // The iv for first block
+ fmov $ivl,$ivd20
+ fmov $ivh,$ivd21
+ //mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr#31
+ eor $ivl,$tmpmx,$ivl,lsl#1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ veor $tmp1,$iv1,$rndlast
+ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point
+ aese $dat0,q9
+ aesmc $dat0,$dat0
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ veor $tmp2,$iv2,$rndlast
+
+ add $xoffset,$xoffset,#0x20
+ add $inp,$inp,$xoffset
+ mov $key_,$key1
+
+ aese $dat0,q12
+ aesmc $dat0,$dat0
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ aese $dat0,q13
+ aesmc $dat0,$dat0
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ aese $dat0,q14
+ aesmc $dat0,$dat0
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ aese $dat0,q15
+ aese $dat1,q15
+ aese $dat2,q15
+ vld1.8 {$in2},[$inp],#16
+ add $rounds,$rounds0,#2
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$dat2},[$out],#16
+ cmn $len,#0x30
+ b.eq .Lxts_enc_done
+.Lxts_encxor_one:
+ vorr $in3,$in1,$in1
+ vorr $in4,$in2,$in2
+ nop
+
+.Lxts_inner_enc_tail:
+ cmn $len,#0x10
+ veor $dat1,$in3,$iv0
+ veor $dat2,$in4,$iv1
+ b.eq .Lxts_enc_tail_loop
+ veor $dat2,$in4,$iv0
+.Lxts_enc_tail_loop:
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $rounds,$rounds,#2
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lxts_enc_tail_loop
+
+ aese $dat1,q8
+ aesmc $dat1,$dat1
+ aese $dat2,q8
+ aesmc $dat2,$dat2
+ aese $dat1,q9
+ aesmc $dat1,$dat1
+ aese $dat2,q9
+ aesmc $dat2,$dat2
+ aese $dat1,q12
+ aesmc $dat1,$dat1
+ aese $dat2,q12
+ aesmc $dat2,$dat2
+ cmn $len,#0x20
+ aese $dat1,q13
+ aesmc $dat1,$dat1
+ aese $dat2,q13
+ aesmc $dat2,$dat2
+ veor $tmp1,$iv0,$rndlast
+ aese $dat1,q14
+ aesmc $dat1,$dat1
+ aese $dat2,q14
+ aesmc $dat2,$dat2
+ veor $tmp2,$iv1,$rndlast
+ aese $dat1,q15
+ aese $dat2,q15
+ b.eq .Lxts_enc_one
+ veor $tmp1,$tmp1,$dat1
+ vst1.8 {$tmp1},[$out],#16
+ veor $tmp2,$tmp2,$dat2
+ vorr $iv0,$iv1,$iv1
+ vst1.8 {$tmp2},[$out],#16
+ fmov $ivl,$ivd10
+ fmov $ivh,$ivd11
+ mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ b .Lxts_enc_done
+
+.Lxts_enc_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $iv0,$iv0,$iv0
+ vst1.8 {$tmp1},[$out],#16
+ fmov $ivl,$ivd00
+ fmov $ivh,$ivd01
+ mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ b .Lxts_enc_done
+.align 5
+.Lxts_enc_done:
+ // Process the tail block with cipher stealing.
+ tst $tailcnt,#0xf
+ b.eq .Lxts_abort
+
+ mov $tmpinp,$inp
+ mov $tmpoutp,$out
+ sub $out,$out,#16
+.composite_enc_loop:
+ subs $tailcnt,$tailcnt,#1
+ ldrb $l2outp,[$out,$tailcnt]
+ ldrb $loutp,[$tmpinp,$tailcnt]
+ strb $l2outp,[$tmpoutp,$tailcnt]
+ strb $loutp,[$out,$tailcnt]
+ b.gt .composite_enc_loop
+.Lxts_enc_load_done:
+ vld1.8 {$tmpin},[$out]
+ veor $tmpin,$tmpin,$iv0
+
+ // Encrypt the composite block to get the second-to-last encrypted text block
+ ldr $rounds,[$key1,#240] // load key schedule...
+ vld1.8 {$dat},[$key1],#16
+ sub $rounds,$rounds,#2
+ vld1.8 {$dat1},[$key1],#16 // load key schedule...
+.Loop_final_enc:
+ aese $tmpin,$dat0
+ aesmc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key1],#16
+ subs $rounds,$rounds,#2
+ aese $tmpin,$dat1
+ aesmc $tmpin,$tmpin
+ vld1.32 {$dat1},[$key1],#16
+ b.gt .Loop_final_enc
+
+ aese $tmpin,$dat0
+ aesmc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key1]
+ aese $tmpin,$dat1
+ veor $tmpin,$tmpin,$dat0
+ veor $tmpin,$tmpin,$iv0
+ vst1.8 {$tmpin},[$out]
+
+.Lxts_abort:
+ ldp $tailcnt,$midnumx,[sp,#48]
+ ldp $ivd10,$ivd20,[sp,#32]
+ ldp $ivd30,$ivd40,[sp,#16]
+ ldp $constnumx,$tmpinp,[sp],#64
+.Lxts_enc_final_abort:
+ ret
+.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt
+___
+
+}}}
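
A minimal Python sketch (illustrative only) of the ciphertext stealing
performed by .composite_enc_loop above; `encrypt_block` is a
hypothetical stand-in for XEX encryption of one block with the last
tweak:

    def xts_steal_encrypt(last_cipher, tail_plain, encrypt_block):
        # last_cipher: ciphertext of the last full block (16 bytes)
        # tail_plain:  the final partial plaintext block (1..15 bytes)
        t = len(tail_plain)
        final_partial = last_cipher[:t]          # stolen bytes
        composite = bytes(tail_plain) + last_cipher[t:]
        new_last = encrypt_block(composite)      # replaces the last full block
        return new_last, final_partial
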
+{{{
+my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5));
+my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10");
+my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20");
+my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19");
+my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11");
+my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7));
+my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b");
+my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]");
+my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]");
+
+my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1);
+
+# q7 last round key
+# q10-q15, q7 Last 7 round keys
+# q8-q9 preloaded round keys except last 7 keys for big size
+# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte
+
+{
+my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9));
+
+my ($dat3,$in3,$tmp3); # used only in 64-bit mode
+my ($dat4,$in4,$tmp4);
+if ($flavour =~ /64/) {
+ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23));
+}
+
+$code.=<<___ if ($flavour =~ /64/);
+.globl ${prefix}_xts_decrypt
+.type ${prefix}_xts_decrypt,%function
+.align 5
+${prefix}_xts_decrypt:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ cmp $len,#16
+ // If the original input data size is bigger than 16, jump to big-size processing.
+ b.ne .Lxts_dec_big_size
+ // Encrypt the iv with key2, as the first XEX iv.
+ ldr $rounds,[$key2,#240]
+ vld1.8 {$dat},[$key2],#16
+ vld1.8 {$iv0},[$ivp]
+ sub $rounds,$rounds,#2
+ vld1.8 {$dat1},[$key2],#16
+
+.Loop_dec_small_iv_enc:
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2],#16
+ subs $rounds,$rounds,#2
+ aese $iv0,$dat1
+ aesmc $iv0,$iv0
+ vld1.32 {$dat1},[$key2],#16
+ b.gt .Loop_dec_small_iv_enc
+
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2]
+ aese $iv0,$dat1
+ veor $iv0,$iv0,$dat
+
+ vld1.8 {$dat0},[$inp]
+ veor $dat0,$iv0,$dat0
+
+ ldr $rounds,[$key1,#240]
+ vld1.32 {q20-q21},[$key1],#32 // load key schedule...
+
+ aesd $dat0,q20
+ aesimc $dat0,$dat0
+ vld1.32 {q8-q9},[$key1],#32 // load key schedule...
+ aesd $dat0,q21
+ aesimc $dat0,$dat0
+ subs $rounds,$rounds,#10 // bias
+ b.eq .Lxts_128_dec
+.Lxts_dec_round_loop:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ vld1.32 {q8},[$key1],#16 // load key schedule...
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ vld1.32 {q9},[$key1],#16 // load key schedule...
+ subs $rounds,$rounds,#2 // bias
+ b.gt .Lxts_dec_round_loop
+.Lxts_128_dec:
+ vld1.32 {q10-q11},[$key1],#32 // load key schedule...
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ vld1.32 {q12-q13},[$key1],#32 // load key schedule...
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ vld1.32 {q14-q15},[$key1],#32 // load key schedule...
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ vld1.32 {$rndlast},[$key1]
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat0,q15
+ veor $dat0,$dat0,$rndlast
+ veor $dat0,$iv0,$dat0
+ vst1.8 {$dat0},[$out]
+ b .Lxts_dec_final_abort
+.Lxts_dec_big_size:
+___
+$code.=<<___ if ($flavour =~ /64/);
+ stp $constnumx,$tmpinp,[sp,#-64]!
+ stp $tailcnt,$midnumx,[sp,#48]
+ stp $ivd10,$ivd20,[sp,#32]
+ stp $ivd30,$ivd40,[sp,#16]
+
+ and $tailcnt,$len,#0xf
+ and $len,$len,#-16
+ subs $len,$len,#16
+ mov $step,#16
+ b.lo .Lxts_dec_abort
+
+ // Encrypt the iv with key2, as the first XEX iv
+ ldr $rounds,[$key2,#240]
+ vld1.8 {$dat},[$key2],#16
+ vld1.8 {$iv0},[$ivp]
+ sub $rounds,$rounds,#2
+ vld1.8 {$dat1},[$key2],#16
+
+.Loop_dec_iv_enc:
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2],#16
+ subs $rounds,$rounds,#2
+ aese $iv0,$dat1
+ aesmc $iv0,$iv0
+ vld1.32 {$dat1},[$key2],#16
+ b.gt .Loop_dec_iv_enc
+
+ aese $iv0,$dat
+ aesmc $iv0,$iv0
+ vld1.32 {$dat},[$key2]
+ aese $iv0,$dat1
+ veor $iv0,$iv0,$dat
+
+ // The iv for second block
+ // $ivl- iv(low), $ivh - iv(high)
+ // the five ivs are stored in $iv0,$iv1,$iv2,$iv3,$iv4
+ fmov $ivl,$ivd00
+ fmov $ivh,$ivd01
+ mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd10,$ivl
+ fmov $ivd11,$ivh
+
+ ldr $rounds0,[$key1,#240] // load rounds number
+
+ // The iv for third block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd20,$ivl
+ fmov $ivd21,$ivh
+
+ vld1.32 {q8-q9},[$key1] // load key schedule...
+ sub $rounds0,$rounds0,#6
+ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys
+ sub $rounds0,$rounds0,#2
+ vld1.32 {q10-q11},[$key_],#32 // load key schedule...
+ vld1.32 {q12-q13},[$key_],#32
+ vld1.32 {q14-q15},[$key_],#32
+ vld1.32 {$rndlast},[$key_]
+
+ // The iv for fourth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd30,$ivl
+ fmov $ivd31,$ivh
+
+ add $key_,$key1,#32
+ mov $rounds,$rounds0
+ b .Lxts_dec
+
+ // Decryption
+.align 5
+.Lxts_dec:
+ tst $tailcnt,#0xf
+ b.eq .Lxts_dec_begin
+ subs $len,$len,#16
+ csel $step,xzr,$step,eq
+ vld1.8 {$dat},[$inp],#16
+ b.lo .Lxts_done
+ sub $inp,$inp,#16
+.Lxts_dec_begin:
+ vld1.8 {$dat},[$inp],$step
+ subs $len,$len,#32 // bias
+ add $rounds,$rounds0,#2
+ vorr $in1,$dat,$dat
+ vorr $dat1,$dat,$dat
+ vorr $in3,$dat,$dat
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in2,$dat2,$dat2
+ vorr $in4,$dat2,$dat2
+ b.lo .Lxts_inner_dec_tail
+ veor $dat,$dat,$iv0 // before decrypt, xor with iv
+ veor $dat2,$dat2,$iv1
+
+ vorr $dat1,$dat2,$dat2
+ vld1.8 {$dat2},[$inp],#16
+ vorr $in0,$dat,$dat
+ vorr $in1,$dat1,$dat1
+ veor $in2,$dat2,$iv2 // third block xor with third iv
+ veor $dat2,$dat2,$iv2
+ cmp $len,#32
+ b.lo .Lxts_outer_dec_tail
+
+ vld1.8 {$dat3},[$inp],#16
+
+ // The iv for fifth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd40,$ivl
+ fmov $ivd41,$ivh
+
+ vld1.8 {$dat4},[$inp],#16
+ veor $dat3,$dat3,$iv3 // the fourth block
+ veor $dat4,$dat4,$iv4
+ sub $len,$len,#32 // bias
+ mov $rounds,$rounds0
+ b .Loop5x_xts_dec
+
+.align 4
+.Loop5x_xts_dec:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ vld1.32 {q8},[$key_],#16 // load key schedule...
+ subs $rounds,$rounds,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ vld1.32 {q9},[$key_],#16 // load key schedule...
+ b.gt .Loop5x_xts_dec
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat3,q8
+ aesimc $dat3,$dat3
+ aesd $dat4,q8
+ aesimc $dat4,$dat4
+ subs $len,$len,#0x50 // because .Lxts_dec_tail4x
+
+ aesd $dat0,q9
+ aesimc $dat0,$dat
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat3,q9
+ aesimc $dat3,$dat3
+ aesd $dat4,q9
+ aesimc $dat4,$dat4
+ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not a typo
+ mov $key_,$key1
+
+ aesd $dat0,q10
+ aesimc $dat0,$dat0
+ aesd $dat1,q10
+ aesimc $dat1,$dat1
+ aesd $dat2,q10
+ aesimc $dat2,$dat2
+ aesd $dat3,q10
+ aesimc $dat3,$dat3
+ aesd $dat4,q10
+ aesimc $dat4,$dat4
+ add $inp,$inp,$xoffset // x0 is adjusted in such way that
+ // at exit from the loop v1.16b-v26.16b
+ // are loaded with last "words"
+ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x
+
+ aesd $dat0,q11
+ aesimc $dat0,$dat0
+ aesd $dat1,q11
+ aesimc $dat1,$dat1
+ aesd $dat2,q11
+ aesimc $dat2,$dat2
+ aesd $dat3,q11
+ aesimc $dat3,$dat3
+ aesd $dat4,q11
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ aesd $dat3,q12
+ aesimc $dat3,$dat3
+ aesd $dat4,q12
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat3,q13
+ aesimc $dat3,$dat3
+ aesd $dat4,q13
+ aesimc $dat4,$dat4
+
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ aesd $dat3,q14
+ aesimc $dat3,$dat3
+ aesd $dat4,q14
+ aesimc $dat4,$dat4
+
+ veor $tmp0,$rndlast,$iv0
+ aesd $dat0,q15
+ // The iv for first block of next iteration.
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ veor $tmp1,$rndlast,$iv1
+ vld1.8 {$in0},[$inp],#16
+ aesd $dat1,q15
+ // The iv for second block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd10,$ivl
+ fmov $ivd11,$ivh
+ veor $tmp2,$rndlast,$iv2
+ vld1.8 {$in1},[$inp],#16
+ aesd $dat2,q15
+ // The iv for third block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd20,$ivl
+ fmov $ivd21,$ivh
+ veor $tmp3,$rndlast,$iv3
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat3,q15
+ // The iv for fourth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd30,$ivl
+ fmov $ivd31,$ivh
+ veor $tmp4,$rndlast,$iv4
+ vld1.8 {$in3},[$inp],#16
+ aesd $dat4,q15
+
+ // The iv for fifth block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd40,$ivl
+ fmov $ivd41,$ivh
+
+ vld1.8 {$in4},[$inp],#16
+ cbz $xoffset,.Lxts_dec_tail4x
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ veor $tmp0,$tmp0,$dat0
+ veor $dat0,$in0,$iv0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat1,$in1,$iv1
+ veor $tmp2,$tmp2,$dat2
+ veor $dat2,$in2,$iv2
+ veor $tmp3,$tmp3,$dat3
+ veor $dat3,$in3,$iv3
+ veor $tmp4,$tmp4,$dat4
+ vst1.8 {$tmp0},[$out],#16
+ veor $dat4,$in4,$iv4
+ vst1.8 {$tmp1},[$out],#16
+ mov $rounds,$rounds0
+ vst1.8 {$tmp2},[$out],#16
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp3},[$out],#16
+ vst1.8 {$tmp4},[$out],#16
+ b.hs .Loop5x_xts_dec
+
+ cmn $len,#0x10
+ b.ne .Loop5x_dec_after
+ // If x2 ($len) equals -0x10, there are 4 blocks left.
+ // After this special handling, the five-block processing path is reused.
+ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3.
+ vorr $iv4,$iv3,$iv3
+ vorr $iv3,$iv2,$iv2
+ vorr $iv2,$iv1,$iv1
+ vorr $iv1,$iv0,$iv0
+ fmov $ivl,$ivd40
+ fmov $ivh,$ivd41
+ veor $dat0,$iv0,$in0
+ veor $dat1,$iv1,$in1
+ veor $dat2,$in2,$iv2
+ veor $dat3,$in3,$iv3
+ veor $dat4,$in4,$iv4
+ b.eq .Loop5x_xts_dec
+
+.Loop5x_dec_after:
+ add $len,$len,#0x50
+ cbz $len,.Lxts_done
+
+ add $rounds,$rounds0,#2
+ subs $len,$len,#0x30
+ b.lo .Lxts_inner_dec_tail
+
+ veor $dat0,$iv0,$in2
+ veor $dat1,$iv1,$in3
+ veor $dat2,$in4,$iv2
+ b .Lxts_outer_dec_tail
+
+.align 4
+.Lxts_dec_tail4x:
+ add $inp,$inp,#16
+ vld1.32 {$dat0},[$inp],#16
+ veor $tmp1,$dat1,$tmp0
+ vst1.8 {$tmp1},[$out],#16
+ veor $tmp2,$dat2,$tmp2
+ vst1.8 {$tmp2},[$out],#16
+ veor $tmp3,$dat3,$tmp3
+ veor $tmp4,$dat4,$tmp4
+ vst1.8 {$tmp3-$tmp4},[$out],#32
+
+ b .Lxts_done
+.align 4
+.Lxts_outer_dec_tail:
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $rounds,$rounds,#2
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lxts_outer_dec_tail
+
+ aesd $dat0,q8
+ aesimc $dat0,$dat0
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ veor $tmp0,$iv0,$rndlast
+ subs $len,$len,#0x30
+ // The iv for first block
+ fmov $ivl,$ivd20
+ fmov $ivh,$ivd21
+ mov $constnum,#0x87
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd00,$ivl
+ fmov $ivd01,$ivh
+ veor $tmp1,$iv1,$rndlast
+ csel $xoffset,$len,$xoffset,lo // x6/w6 is zero at this point
+ aesd $dat0,q9
+ aesimc $dat0,$dat0
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ veor $tmp2,$iv2,$rndlast
+ // The iv for second block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd10,$ivl
+ fmov $ivd11,$ivh
+
+ add $xoffset,$xoffset,#0x20
+ add $inp,$inp,$xoffset // $inp is adjusted to point at the last data
+
+ mov $key_,$key1
+
+ // The iv for third block
+ extr $midnumx,$ivh,$ivh,#32
+ extr $ivh,$ivh,$ivl,#63
+ and $tmpmw,$constnum,$midnum,asr #31
+ eor $ivl,$tmpmx,$ivl,lsl #1
+ fmov $ivd20,$ivl
+ fmov $ivd21,$ivh
+
+ aesd $dat0,q12
+ aesimc $dat0,$dat0
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ aesd $dat0,q13
+ aesimc $dat0,$dat0
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ aesd $dat0,q14
+ aesimc $dat0,$dat0
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ vld1.8 {$in2},[$inp],#16
+ aesd $dat0,q15
+ aesd $dat1,q15
+ aesd $dat2,q15
+ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0]
+ add $rounds,$rounds0,#2
+ veor $tmp0,$tmp0,$dat0
+ veor $tmp1,$tmp1,$dat1
+ veor $dat2,$dat2,$tmp2
+ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1]
+ vst1.8 {$tmp0},[$out],#16
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$dat2},[$out],#16
+
+ cmn $len,#0x30
+ add $len,$len,#0x30
+ b.eq .Lxts_done
+ sub $len,$len,#0x30
+ vorr $in3,$in1,$in1
+ vorr $in4,$in2,$in2
+ nop
+
+.Lxts_inner_dec_tail:
+ // $len == -0x10 means two blocks left.
+ cmn $len,#0x10
+ veor $dat1,$in3,$iv0
+ veor $dat2,$in4,$iv1
+ b.eq .Lxts_dec_tail_loop
+ veor $dat2,$in4,$iv0
+.Lxts_dec_tail_loop:
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ vld1.32 {q8},[$key_],#16
+ subs $rounds,$rounds,#2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ vld1.32 {q9},[$key_],#16
+ b.gt .Lxts_dec_tail_loop
+
+ aesd $dat1,q8
+ aesimc $dat1,$dat1
+ aesd $dat2,q8
+ aesimc $dat2,$dat2
+ aesd $dat1,q9
+ aesimc $dat1,$dat1
+ aesd $dat2,q9
+ aesimc $dat2,$dat2
+ aesd $dat1,q12
+ aesimc $dat1,$dat1
+ aesd $dat2,q12
+ aesimc $dat2,$dat2
+ cmn $len,#0x20
+ aesd $dat1,q13
+ aesimc $dat1,$dat1
+ aesd $dat2,q13
+ aesimc $dat2,$dat2
+ veor $tmp1,$iv0,$rndlast
+ aesd $dat1,q14
+ aesimc $dat1,$dat1
+ aesd $dat2,q14
+ aesimc $dat2,$dat2
+ veor $tmp2,$iv1,$rndlast
+ aesd $dat1,q15
+ aesd $dat2,q15
+ b.eq .Lxts_dec_one
+ veor $tmp1,$tmp1,$dat1
+ veor $tmp2,$tmp2,$dat2
+ vorr $iv0,$iv2,$iv2
+ vorr $iv1,$iv3,$iv3
+ vst1.8 {$tmp1},[$out],#16
+ vst1.8 {$tmp2},[$out],#16
+ add $len,$len,#16
+ b .Lxts_done
+
+.Lxts_dec_one:
+ veor $tmp1,$tmp1,$dat2
+ vorr $iv0,$iv1,$iv1
+ vorr $iv1,$iv2,$iv2
+ vst1.8 {$tmp1},[$out],#16
+ add $len,$len,#32
+
+.Lxts_done:
+ tst $tailcnt,#0xf
+ b.eq .Lxts_dec_abort
+ // Process the last two blocks with ciphertext stealing.
+ mov x7,x3
+ cbnz x2,.Lxts_dec_1st_done
+ vld1.32 {$dat0},[$inp],#16
+
+ // Decrypt the second-to-last block to get the last plaintext block
+.Lxts_dec_1st_done:
+ eor $tmpin,$dat0,$iv1
+ ldr $rounds,[$key1,#240]
+ vld1.32 {$dat0},[$key1],#16
+ sub $rounds,$rounds,#2
+ vld1.32 {$dat1},[$key1],#16
+.Loop_final_2nd_dec:
+ aesd $tmpin,$dat0
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key1],#16 // load key schedule...
+ subs $rounds,$rounds,#2
+ aesd $tmpin,$dat1
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat1},[$key1],#16 // load key schedule...
+ b.gt .Loop_final_2nd_dec
+
+ aesd $tmpin,$dat0
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key1]
+ aesd $tmpin,$dat1
+ veor $tmpin,$tmpin,$dat0
+ veor $tmpin,$tmpin,$iv1
+ vst1.8 {$tmpin},[$out]
+
+ mov $tmpinp,$inp
+ add $tmpoutp,$out,#16
+
+ // Splice the $tailcnt trailing bytes (the unaligned partial block) into the
+ // second-to-last plaintext block to form the last encrypted (composite) block.
+.composite_dec_loop:
+ subs $tailcnt,$tailcnt,#1
+ ldrb $l2outp,[$out,$tailcnt]
+ ldrb $loutp,[$tmpinp,$tailcnt]
+ strb $l2outp,[$tmpoutp,$tailcnt]
+ strb $loutp,[$out,$tailcnt]
+ b.gt .composite_dec_loop
+.Lxts_dec_load_done:
+ vld1.8 {$tmpin},[$out]
+ veor $tmpin,$tmpin,$iv0
+
+ // Decrypt the composite block to get the second-to-last plaintext block
+ ldr $rounds,[$key_,#240]
+ vld1.8 {$dat0},[$key_],#16
+ sub $rounds,$rounds,#2
+ vld1.8 {$dat1},[$key_],#16
+.Loop_final_dec:
+ aesd $tmpin,$dat0
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key_],#16 // load key schedule...
+ subs $rounds,$rounds,#2
+ aesd $tmpin,$dat1
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat1},[$key_],#16 // load key schedule...
+ b.gt .Loop_final_dec
+
+ aesd $tmpin,$dat0
+ aesimc $tmpin,$tmpin
+ vld1.32 {$dat0},[$key_]
+ aesd $tmpin,$dat1
+ veor $tmpin,$tmpin,$dat0
+ veor $tmpin,$tmpin,$iv0
+ vst1.8 {$tmpin},[$out]
+
+.Lxts_dec_abort:
+ ldp $tailcnt,$midnumx,[sp,#48]
+ ldp $ivd10,$ivd20,[sp,#32]
+ ldp $ivd30,$ivd40,[sp,#16]
+ ldp $constnumx,$tmpinp,[sp],#64
+
+.Lxts_dec_final_abort:
+ ret
+.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt
+___
+}
+}}}
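+
+# Editor's sketch (not part of the upstream change): scalar Perl
+# references for the two non-obvious pieces of the XTS code above,
+# useful when auditing the assembly. xts_double() is what each
+# extr/and/eor/fmov group computes: multiply the 128-bit tweak by x
+# in GF(2^128), folding the carry back in with the 0x87 reduction
+# constant. cts_swap() mirrors .composite_dec_loop: the trailing
+# input bytes replace the head of the last fully decrypted block,
+# and the displaced bytes become the final short plaintext. Both
+# subs are illustrative only, unused by the module, and assume a
+# 64-bit perl.
+sub xts_double {
+my ($ivl,$ivh)=@_;			# low/high 64-bit tweak halves
+my $carry=($ivh>>63)&1;		# bit shifted out at the top
+	$ivh=(($ivh<<1)|($ivl>>63))&0xffffffffffffffff;
+	$ivl=(($ivl<<1)^($carry?0x87:0))&0xffffffffffffffff;
+	($ivl,$ivh);
+}
+sub cts_swap {
+my ($block,$tail)=@_;			# 16-byte block, <16-byte tail
+my $n=length($tail);
+my $stolen=substr($block,0,$n);	# the final short plaintext
+	substr($block,0,$n)=$tail;	# composite block to decrypt
+	($block,$stolen);
+}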
$code.=<<___;
#endif
___
@@ -963,7 +3615,7 @@ if ($flavour =~ /64/) { ######## 64-bi
# since ARMv7 instructions are always encoded little-endian.
# correct solution is to use .inst directive, but older
# assemblers don't implement it:-(
- sprintf ".byte\t0x%02x,0x%02x,0x%02x,0x%02x\t@ %s %s",
+ sprintf "INST(0x%02x,0x%02x,0x%02x,0x%02x)\t@ %s %s",
$word&0xff,($word>>8)&0xff,
($word>>16)&0xff,($word>>24)&0xff,
$mnemonic,$arg;
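+	# Editor's note (illustrative, not upstream text): Thumb-2 stores
+	# a 32-bit NEON instruction as two little-endian halfwords, so the
+	# raw byte stream differs from ARM mode and a fixed .byte sequence
+	# cannot serve both. INST() emits the mode-appropriate order; e.g.
+	# AESE q0,q0, whose ARM-mode bytes are 0x00,0x03,0xb0,0xf3, is
+	# emitted as INST(0x00,0x03,0xb0,0xf3) and assembles correctly
+	# under either .code 32 or .thumb.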
@@ -1004,14 +3656,17 @@ if ($flavour =~ /64/) { ######## 64-bi
s/\],#[0-9]+/]!/o;
s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or
- s/cclr\s+([^,]+),\s*([a-z]+)/mov$2 $1,#0/o or
+ s/cclr\s+([^,]+),\s*([a-z]+)/mov.$2 $1,#0/o or
s/vtbl\.8\s+(.*)/unvtbl($1)/geo or
s/vdup\.32\s+(.*)/unvdup32($1)/geo or
s/vmov\.32\s+(.*)/unvmov32($1)/geo or
s/^(\s+)b\./$1b/o or
- s/^(\s+)mov\./$1mov/o or
s/^(\s+)ret/$1bx\tlr/o;
+ if (s/^(\s+)mov\.([a-z]+)/$1mov$2/) {
+ print " it $2\n";
+ }
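+	# Editor's note (assumption): unified Thumb-2 syntax only allows a
+	# conditional instruction inside an IT block, so each conditional
+	# mov that ARM mode encodes directly (e.g. "movlo") is preceded by
+	# a matching "it" when generating Thumb code.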
+
print $_,"\n";
}
}
diff -up openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl
--- openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl 2020-12-09 10:37:38.405558929 +0100
@@ -30,6 +30,7 @@
# Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ]
# Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ]
# Mongoose(***) 26.3(**) 21.0/25.0(**) [13.3/16.8 ]
+# ThunderX2(***) 39.4(**) 33.8/48.6(**)
#
# (*) ECB denotes approximate result for parallelizable modes
# such as CBC decrypt, CTR, etc.;
diff -up openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl
--- openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl 2020-12-09 10:40:57.922288627 +0100
@@ -18,32 +18,44 @@
#
# ChaCha20 for ARMv8.
#
+# April 2019
+#
+# Replace the 3xNEON+1xIALU code path with 4+1. 4+1 is actually the
+# fastest option on most(*), but not all, processors, yet 6+2 is
+# retained. This is because the penalties are considered tolerable in
+# comparison to the improvement on processors where 6+2 helps, most
+# notably +37% on ThunderX2. It's a server-oriented processor which
+# will have to serve as many requests as possible, while the others
+# are mostly clients, where performance doesn't have to be absolute
+# top-notch, just fast enough, as the majority of time is spent
+# "entertaining" a relatively slow human.
+#
# Performance in cycles per byte out of large buffer.
#
-# IALU/gcc-4.9 3xNEON+1xIALU 6xNEON+2xIALU
+# IALU/gcc-4.9 4xNEON+1xIALU 6xNEON+2xIALU
#
-# Apple A7 5.50/+49% 3.33 1.70
-# Cortex-A53 8.40/+80% 4.72 4.72(*)
-# Cortex-A57 8.06/+43% 4.90 4.43(**)
-# Denver 4.50/+82% 2.63 2.67(*)
-# X-Gene 9.50/+46% 8.82 8.89(*)
-# Mongoose 8.00/+44% 3.64 3.25
-# Kryo 8.17/+50% 4.83 4.65
+# Apple A7 5.50/+49% 2.72 1.60
+# Cortex-A53 8.40/+80% 4.06 4.45(*)
+# Cortex-A57 8.06/+43% 4.15 4.40(*)
+# Denver 4.50/+82% 2.30 2.70(*)
+# X-Gene 9.50/+46% 8.20 8.90(*)
+# Mongoose 8.00/+44% 2.74 3.12(*)
+# Kryo 8.17/+50% 4.47 4.65(*)
+# ThunderX2 7.22/+48% 5.64 4.10
#
-# (*) it's expected that doubling interleave factor doesn't help
-# all processors, only those with higher NEON latency and
-# higher instruction issue rate;
-# (**) expected improvement was actually higher;
+# (*) slower than 4+1:-(
-$flavour=shift;
-$output=shift;
+# $output is the last argument if it looks like a file (it has an extension)
+# $flavour is the first argument if it doesn't look like a file
+$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
+$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
-open OUT,"| \"$^X\" $xlate $flavour $output";
+open OUT,"| \"$^X\" $xlate $flavour \"$output\""
+ or die "can't call $xlate: $!";
*STDOUT=*OUT;
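+# Editor's usage note (not upstream): the script is still typically
+# invoked as "perl chacha-armv8.pl <flavour> <output.S>", e.g.
+#	perl chacha-armv8.pl linux64 chacha-armv8.S
+# the output file is simply recognised as the last argument that
+# carries an extension.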
sub AUTOLOAD() # thunk [simplified] x86-style perlasm
@@ -120,42 +132,37 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)
}
$code.=<<___;
-#include "arm_arch.h"
-
-.text
-
+#ifndef __KERNEL__
+# include "arm_arch.h"
.extern OPENSSL_armcap_P
.hidden OPENSSL_armcap_P
+#endif
+
+.text
.align 5
.Lsigma:
.quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral
.Lone:
-.long 1,0,0,0
-.LOPENSSL_armcap_P:
-#ifdef __ILP32__
-.long OPENSSL_armcap_P-.
-#else
-.quad OPENSSL_armcap_P-.
-#endif
-.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by <appro\@openssl.org>"
+.long 1,2,3,4
+.Lrot24:
+.long 0x02010003,0x06050407,0x0a09080b,0x0e0d0c0f
+.asciz "ChaCha20 for ARMv8, CRYPTOGAMS by \@dot-asm"
.globl ChaCha20_ctr32
.type ChaCha20_ctr32,%function
.align 5
ChaCha20_ctr32:
cbz $len,.Labort
- adr @x[0],.LOPENSSL_armcap_P
cmp $len,#192
b.lo .Lshort
-#ifdef __ILP32__
- ldrsw @x[1],[@x[0]]
-#else
- ldr @x[1],[@x[0]]
-#endif
- ldr w17,[@x[1],@x[0]]
+
+#ifndef __KERNEL__
+ adrp x17,OPENSSL_armcap_P
+ ldr w17,[x17,#:lo12:OPENSSL_armcap_P]
tst w17,#ARMV7_NEON
- b.ne ChaCha20_neon
+ b.ne .LChaCha20_neon
+#endif
.Lshort:
.inst 0xd503233f // paciasp
@@ -174,7 +181,7 @@ ChaCha20_ctr32:
ldp @d[2],@d[3],[$key] // load key
ldp @d[4],@d[5],[$key,#16]
ldp @d[6],@d[7],[$ctr] // load counter
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
ror @d[4],@d[4],#32
@@ -243,7 +250,7 @@ $code.=<<___;
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -300,7 +307,7 @@ $code.=<<___;
add @x[10],@x[10],@x[11],lsl#32
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -341,46 +348,91 @@ $code.=<<___;
___
{{{
-my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,$T0,$T1,$T2,$T3) =
- map("v$_.4s",(0..7,16..23));
-my (@K)=map("v$_.4s",(24..30));
-my $ONE="v31.4s";
+my @K = map("v$_.4s",(0..3));
+my ($xt0,$xt1,$xt2,$xt3, $CTR,$ROT24) = map("v$_.4s",(4..9));
+my @X = map("v$_.4s",(16,20,24,28, 17,21,25,29, 18,22,26,30, 19,23,27,31));
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = @X;
-sub NEONROUND {
-my $odd = pop;
-my ($a,$b,$c,$d,$t)=@_;
+sub NEON_lane_ROUND {
+my ($a0,$b0,$c0,$d0)=@_;
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("'$_'",@X);
(
- "&add ('$a','$a','$b')",
- "&eor ('$d','$d','$a')",
- "&rev32_16 ('$d','$d')", # vrot ($d,16)
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',20)",
- "&sli ('$b','$t',12)",
-
- "&add ('$a','$a','$b')",
- "&eor ('$t','$d','$a')",
- "&ushr ('$d','$t',24)",
- "&sli ('$d','$t',8)",
-
- "&add ('$c','$c','$d')",
- "&eor ('$t','$b','$c')",
- "&ushr ('$b','$t',25)",
- "&sli ('$b','$t',7)",
-
- "&ext ('$c','$c','$c',8)",
- "&ext ('$d','$d','$d',$odd?4:12)",
- "&ext ('$b','$b','$b',$odd?12:4)"
+ "&add (@x[$a0],@x[$a0],@x[$b0])", # Q1
+ "&add (@x[$a1],@x[$a1],@x[$b1])", # Q2
+ "&add (@x[$a2],@x[$a2],@x[$b2])", # Q3
+ "&add (@x[$a3],@x[$a3],@x[$b3])", # Q4
+ "&eor (@x[$d0],@x[$d0],@x[$a0])",
+ "&eor (@x[$d1],@x[$d1],@x[$a1])",
+ "&eor (@x[$d2],@x[$d2],@x[$a2])",
+ "&eor (@x[$d3],@x[$d3],@x[$a3])",
+ "&rev32_16 (@x[$d0],@x[$d0])",
+ "&rev32_16 (@x[$d1],@x[$d1])",
+ "&rev32_16 (@x[$d2],@x[$d2])",
+ "&rev32_16 (@x[$d3],@x[$d3])",
+
+ "&add (@x[$c0],@x[$c0],@x[$d0])",
+ "&add (@x[$c1],@x[$c1],@x[$d1])",
+ "&add (@x[$c2],@x[$c2],@x[$d2])",
+ "&add (@x[$c3],@x[$c3],@x[$d3])",
+ "&eor ('$xt0',@x[$b0],@x[$c0])",
+ "&eor ('$xt1',@x[$b1],@x[$c1])",
+ "&eor ('$xt2',@x[$b2],@x[$c2])",
+ "&eor ('$xt3',@x[$b3],@x[$c3])",
+ "&ushr (@x[$b0],'$xt0',20)",
+ "&ushr (@x[$b1],'$xt1',20)",
+ "&ushr (@x[$b2],'$xt2',20)",
+ "&ushr (@x[$b3],'$xt3',20)",
+ "&sli (@x[$b0],'$xt0',12)",
+ "&sli (@x[$b1],'$xt1',12)",
+ "&sli (@x[$b2],'$xt2',12)",
+ "&sli (@x[$b3],'$xt3',12)",
+
+ "&add (@x[$a0],@x[$a0],@x[$b0])",
+ "&add (@x[$a1],@x[$a1],@x[$b1])",
+ "&add (@x[$a2],@x[$a2],@x[$b2])",
+ "&add (@x[$a3],@x[$a3],@x[$b3])",
+ "&eor ('$xt0',@x[$d0],@x[$a0])",
+ "&eor ('$xt1',@x[$d1],@x[$a1])",
+ "&eor ('$xt2',@x[$d2],@x[$a2])",
+ "&eor ('$xt3',@x[$d3],@x[$a3])",
+ "&tbl (@x[$d0],'{$xt0}','$ROT24')",
+ "&tbl (@x[$d1],'{$xt1}','$ROT24')",
+ "&tbl (@x[$d2],'{$xt2}','$ROT24')",
+ "&tbl (@x[$d3],'{$xt3}','$ROT24')",
+
+ "&add (@x[$c0],@x[$c0],@x[$d0])",
+ "&add (@x[$c1],@x[$c1],@x[$d1])",
+ "&add (@x[$c2],@x[$c2],@x[$d2])",
+ "&add (@x[$c3],@x[$c3],@x[$d3])",
+ "&eor ('$xt0',@x[$b0],@x[$c0])",
+ "&eor ('$xt1',@x[$b1],@x[$c1])",
+ "&eor ('$xt2',@x[$b2],@x[$c2])",
+ "&eor ('$xt3',@x[$b3],@x[$c3])",
+ "&ushr (@x[$b0],'$xt0',25)",
+ "&ushr (@x[$b1],'$xt1',25)",
+ "&ushr (@x[$b2],'$xt2',25)",
+ "&ushr (@x[$b3],'$xt3',25)",
+ "&sli (@x[$b0],'$xt0',7)",
+ "&sli (@x[$b1],'$xt1',7)",
+ "&sli (@x[$b2],'$xt2',7)",
+ "&sli (@x[$b3],'$xt3',7)"
);
}
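+
+# Editor's reference (not upstream): a scalar quarter round equivalent
+# to what each NEON lane above computes, handy when checking the
+# interleaved schedule. The rotations are 16, 12, 8 and 7; in the
+# vector code 16 is rev32, 12 and 7 are ushr+sli pairs, and 8 is the
+# tbl with the .Lrot24 mask. Illustrative only, unused by the module.
+sub chacha_qr_ref {
+my ($ra,$rb,$rc,$rd)=@_;
+my $rotl=sub { my ($x,$n)=@_; (($x<<$n)&0xffffffff)|($x>>(32-$n)) };
+	$ra=($ra+$rb)&0xffffffff;	$rd=$rotl->($rd^$ra,16);
+	$rc=($rc+$rd)&0xffffffff;	$rb=$rotl->($rb^$rc,12);
+	$ra=($ra+$rb)&0xffffffff;	$rd=$rotl->($rd^$ra,8);
+	$rc=($rc+$rd)&0xffffffff;	$rb=$rotl->($rb^$rc,7);
+	($ra,$rb,$rc,$rd);
+}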
$code.=<<___;
+#ifdef __KERNEL__
+.globl ChaCha20_neon
+#endif
.type ChaCha20_neon,%function
.align 5
ChaCha20_neon:
+.LChaCha20_neon:
.inst 0xd503233f // paciasp
stp x29,x30,[sp,#-96]!
add x29,sp,#0
@@ -403,8 +455,9 @@ ChaCha20_neon:
ld1 {@K[1],@K[2]},[$key]
ldp @d[6],@d[7],[$ctr] // load counter
ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-#ifdef __ARMEB__
+ stp d8,d9,[sp] // meet ABI requirements
+ ld1 {$CTR,$ROT24},[@x[0]]
+#ifdef __AARCH64EB__
rev64 @K[0],@K[0]
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
@@ -413,115 +466,129 @@ ChaCha20_neon:
ror @d[6],@d[6],#32
ror @d[7],@d[7],#32
#endif
- add @K[3],@K[3],$ONE // += 1
- add @K[4],@K[3],$ONE
- add @K[5],@K[4],$ONE
- shl $ONE,$ONE,#2 // 1 -> 4
.Loop_outer_neon:
- mov.32 @x[0],@d[0] // unpack key block
- lsr @x[1],@d[0],#32
- mov $A0,@K[0]
- mov.32 @x[2],@d[1]
- lsr @x[3],@d[1],#32
- mov $A1,@K[0]
- mov.32 @x[4],@d[2]
- lsr @x[5],@d[2],#32
- mov $A2,@K[0]
- mov.32 @x[6],@d[3]
- mov $B0,@K[1]
- lsr @x[7],@d[3],#32
- mov $B1,@K[1]
- mov.32 @x[8],@d[4]
- mov $B2,@K[1]
- lsr @x[9],@d[4],#32
- mov $D0,@K[3]
- mov.32 @x[10],@d[5]
- mov $D1,@K[4]
- lsr @x[11],@d[5],#32
- mov $D2,@K[5]
- mov.32 @x[12],@d[6]
- mov $C0,@K[2]
- lsr @x[13],@d[6],#32
- mov $C1,@K[2]
- mov.32 @x[14],@d[7]
- mov $C2,@K[2]
- lsr @x[15],@d[7],#32
+ dup $xa0,@{K[0]}[0] // unpack key block
+ mov.32 @x[0],@d[0]
+ dup $xa1,@{K[0]}[1]
+ lsr @x[1],@d[0],#32
+ dup $xa2,@{K[0]}[2]
+ mov.32 @x[2],@d[1]
+ dup $xa3,@{K[0]}[3]
+ lsr @x[3],@d[1],#32
+ dup $xb0,@{K[1]}[0]
+ mov.32 @x[4],@d[2]
+ dup $xb1,@{K[1]}[1]
+ lsr @x[5],@d[2],#32
+ dup $xb2,@{K[1]}[2]
+ mov.32 @x[6],@d[3]
+ dup $xb3,@{K[1]}[3]
+ lsr @x[7],@d[3],#32
+ dup $xd0,@{K[3]}[0]
+ mov.32 @x[8],@d[4]
+ dup $xd1,@{K[3]}[1]
+ lsr @x[9],@d[4],#32
+ dup $xd2,@{K[3]}[2]
+ mov.32 @x[10],@d[5]
+ dup $xd3,@{K[3]}[3]
+ lsr @x[11],@d[5],#32
+ add $xd0,$xd0,$CTR
+ mov.32 @x[12],@d[6]
+ dup $xc0,@{K[2]}[0]
+ lsr @x[13],@d[6],#32
+ dup $xc1,@{K[2]}[1]
+ mov.32 @x[14],@d[7]
+ dup $xc2,@{K[2]}[2]
+ lsr @x[15],@d[7],#32
+ dup $xc3,@{K[2]}[3]
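+	// (editor's note, not upstream) each dup above spreads one key
+	// block word across the four NEON key-stream blocks, while the
+	// interleaved mov/lsr pairs unpack the same words into general
+	// registers for the fifth, scalar block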
mov $ctr,#10
- subs $len,$len,#256
+ subs $len,$len,#320
.Loop_neon:
sub $ctr,$ctr,#1
___
- my @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,0);
- my @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,0);
- my @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,0);
- my @thread3=&ROUND(0,4,8,12);
-
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
-
- @thread0=&NEONROUND($A0,$B0,$C0,$D0,$T0,1);
- @thread1=&NEONROUND($A1,$B1,$C1,$D1,$T1,1);
- @thread2=&NEONROUND($A2,$B2,$C2,$D2,$T2,1);
- @thread3=&ROUND(0,5,10,15);
+ my @plus_one=&ROUND(0,4,8,12);
+ foreach (&NEON_lane_ROUND(0,4,8,12)) { eval; eval(shift(@plus_one)); }
- foreach (@thread0) {
- eval; eval(shift(@thread3));
- eval(shift(@thread1)); eval(shift(@thread3));
- eval(shift(@thread2)); eval(shift(@thread3));
- }
+ @plus_one=&ROUND(0,5,10,15);
+ foreach (&NEON_lane_ROUND(0,5,10,15)) { eval; eval(shift(@plus_one)); }
$code.=<<___;
cbnz $ctr,.Loop_neon
- add.32 @x[0],@x[0],@d[0] // accumulate key block
- add $A0,$A0,@K[0]
- add @x[1],@x[1],@d[0],lsr#32
- add $A1,$A1,@K[0]
- add.32 @x[2],@x[2],@d[1]
- add $A2,$A2,@K[0]
- add @x[3],@x[3],@d[1],lsr#32
- add $C0,$C0,@K[2]
- add.32 @x[4],@x[4],@d[2]
- add $C1,$C1,@K[2]
- add @x[5],@x[5],@d[2],lsr#32
- add $C2,$C2,@K[2]
- add.32 @x[6],@x[6],@d[3]
- add $D0,$D0,@K[3]
- add @x[7],@x[7],@d[3],lsr#32
- add.32 @x[8],@x[8],@d[4]
- add $D1,$D1,@K[4]
- add @x[9],@x[9],@d[4],lsr#32
- add.32 @x[10],@x[10],@d[5]
- add $D2,$D2,@K[5]
- add @x[11],@x[11],@d[5],lsr#32
- add.32 @x[12],@x[12],@d[6]
- add $B0,$B0,@K[1]
- add @x[13],@x[13],@d[6],lsr#32
- add.32 @x[14],@x[14],@d[7]
- add $B1,$B1,@K[1]
- add @x[15],@x[15],@d[7],lsr#32
- add $B2,$B2,@K[1]
+ add $xd0,$xd0,$CTR
+
+ zip1 $xt0,$xa0,$xa1 // transpose data
+ zip1 $xt1,$xa2,$xa3
+ zip2 $xt2,$xa0,$xa1
+ zip2 $xt3,$xa2,$xa3
+ zip1.64 $xa0,$xt0,$xt1
+ zip2.64 $xa1,$xt0,$xt1
+ zip1.64 $xa2,$xt2,$xt3
+ zip2.64 $xa3,$xt2,$xt3
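+	// (editor's note, not upstream) the zip1/zip2 pairs above and in
+	// the three groups below transpose the 4x4 matrix of 32-bit lanes,
+	// turning per-word slices of four blocks back into contiguous
+	// key-stream blocks before they are XORed with the input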
+
+ zip1 $xt0,$xb0,$xb1
+ zip1 $xt1,$xb2,$xb3
+ zip2 $xt2,$xb0,$xb1
+ zip2 $xt3,$xb2,$xb3
+ zip1.64 $xb0,$xt0,$xt1
+ zip2.64 $xb1,$xt0,$xt1
+ zip1.64 $xb2,$xt2,$xt3
+ zip2.64 $xb3,$xt2,$xt3
+
+ zip1 $xt0,$xc0,$xc1
+ add.32 @x[0],@x[0],@d[0] // accumulate key block
+ zip1 $xt1,$xc2,$xc3
+ add @x[1],@x[1],@d[0],lsr#32
+ zip2 $xt2,$xc0,$xc1
+ add.32 @x[2],@x[2],@d[1]
+ zip2 $xt3,$xc2,$xc3
+ add @x[3],@x[3],@d[1],lsr#32
+ zip1.64 $xc0,$xt0,$xt1
+ add.32 @x[4],@x[4],@d[2]
+ zip2.64 $xc1,$xt0,$xt1
+ add @x[5],@x[5],@d[2],lsr#32
+ zip1.64 $xc2,$xt2,$xt3
+ add.32 @x[6],@x[6],@d[3]
+ zip2.64 $xc3,$xt2,$xt3
+ add @x[7],@x[7],@d[3],lsr#32
+
+ zip1 $xt0,$xd0,$xd1
+ add.32 @x[8],@x[8],@d[4]
+ zip1 $xt1,$xd2,$xd3
+ add @x[9],@x[9],@d[4],lsr#32
+ zip2 $xt2,$xd0,$xd1
+ add.32 @x[10],@x[10],@d[5]
+ zip2 $xt3,$xd2,$xd3
+ add @x[11],@x[11],@d[5],lsr#32
+ zip1.64 $xd0,$xt0,$xt1
+ add.32 @x[12],@x[12],@d[6]
+ zip2.64 $xd1,$xt0,$xt1
+ add @x[13],@x[13],@d[6],lsr#32
+ zip1.64 $xd2,$xt2,$xt3
+ add.32 @x[14],@x[14],@d[7]
+ zip2.64 $xd3,$xt2,$xt3
+ add @x[15],@x[15],@d[7],lsr#32
b.lo .Ltail_neon
add @x[0],@x[0],@x[1],lsl#32 // pack
add @x[2],@x[2],@x[3],lsl#32
ldp @x[1],@x[3],[$inp,#0] // load input
+ add $xa0,$xa0,@K[0] // accumulate key block
add @x[4],@x[4],@x[5],lsl#32
add @x[6],@x[6],@x[7],lsl#32
ldp @x[5],@x[7],[$inp,#16]
+ add $xb0,$xb0,@K[1]
add @x[8],@x[8],@x[9],lsl#32
add @x[10],@x[10],@x[11],lsl#32
ldp @x[9],@x[11],[$inp,#32]
+ add $xc0,$xc0,@K[2]
add @x[12],@x[12],@x[13],lsl#32
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
+ add $xd0,$xd0,@K[3]
add $inp,$inp,#64
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -531,48 +598,68 @@ $code.=<<___;
rev @x[12],@x[12]
rev @x[14],@x[14]
#endif
- ld1.8 {$T0-$T3},[$inp],#64
+ ld1.8 {$xt0-$xt3},[$inp],#64
eor @x[0],@x[0],@x[1]
+ add $xa1,$xa1,@K[0]
eor @x[2],@x[2],@x[3]
+ add $xb1,$xb1,@K[1]
eor @x[4],@x[4],@x[5]
+ add $xc1,$xc1,@K[2]
eor @x[6],@x[6],@x[7]
+ add $xd1,$xd1,@K[3]
eor @x[8],@x[8],@x[9]
- eor $A0,$A0,$T0
+ eor $xa0,$xa0,$xt0
+ movi $xt0,#5
eor @x[10],@x[10],@x[11]
- eor $B0,$B0,$T1
+ eor $xb0,$xb0,$xt1
eor @x[12],@x[12],@x[13]
- eor $C0,$C0,$T2
+ eor $xc0,$xc0,$xt2
eor @x[14],@x[14],@x[15]
- eor $D0,$D0,$T3
- ld1.8 {$T0-$T3},[$inp],#64
+ eor $xd0,$xd0,$xt3
+ add $CTR,$CTR,$xt0 // += 5
+ ld1.8 {$xt0-$xt3},[$inp],#64
stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
+ add @d[6],@d[6],#5 // increment counter
stp @x[4],@x[6],[$out,#16]
- add @K[3],@K[3],$ONE // += 4
stp @x[8],@x[10],[$out,#32]
- add @K[4],@K[4],$ONE
stp @x[12],@x[14],[$out,#48]
- add @K[5],@K[5],$ONE
add $out,$out,#64
- st1.8 {$A0-$D0},[$out],#64
- ld1.8 {$A0-$D0},[$inp],#64
-
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
-
- eor $A2,$A2,$A0
- eor $B2,$B2,$B0
- eor $C2,$C2,$C0
- eor $D2,$D2,$D0
- st1.8 {$A2-$D2},[$out],#64
+ st1.8 {$xa0-$xd0},[$out],#64
+ add $xa2,$xa2,@K[0]
+ add $xb2,$xb2,@K[1]
+ add $xc2,$xc2,@K[2]
+ add $xd2,$xd2,@K[3]
+ ld1.8 {$xa0-$xd0},[$inp],#64
+
+ eor $xa1,$xa1,$xt0
+ eor $xb1,$xb1,$xt1
+ eor $xc1,$xc1,$xt2
+ eor $xd1,$xd1,$xt3
+ st1.8 {$xa1-$xd1},[$out],#64
+ add $xa3,$xa3,@K[0]
+ add $xb3,$xb3,@K[1]
+ add $xc3,$xc3,@K[2]
+ add $xd3,$xd3,@K[3]
+ ld1.8 {$xa1-$xd1},[$inp],#64
+
+ eor $xa2,$xa2,$xa0
+ eor $xb2,$xb2,$xb0
+ eor $xc2,$xc2,$xc0
+ eor $xd2,$xd2,$xd0
+ st1.8 {$xa2-$xd2},[$out],#64
+
+ eor $xa3,$xa3,$xa1
+ eor $xb3,$xb3,$xb1
+ eor $xc3,$xc3,$xc1
+ eor $xd3,$xd3,$xd1
+ st1.8 {$xa3-$xd3},[$out],#64
b.hi .Loop_outer_neon
+ ldp d8,d9,[sp] // meet ABI requirements
+
ldp x19,x20,[x29,#16]
add sp,sp,#64
ldp x21,x22,[x29,#32]
@@ -583,8 +670,10 @@ $code.=<<___;
.inst 0xd50323bf // autiasp
ret
+.align 4
.Ltail_neon:
- add $len,$len,#256
+ add $len,$len,#320
+ ldp d8,d9,[sp] // meet ABI requirements
cmp $len,#64
b.lo .Less_than_64
@@ -601,7 +690,7 @@ $code.=<<___;
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -621,48 +710,68 @@ $code.=<<___;
eor @x[14],@x[14],@x[15]
stp @x[0],@x[2],[$out,#0] // store output
- add @d[6],@d[6],#4 // increment counter
+ add $xa0,$xa0,@K[0] // accumulate key block
stp @x[4],@x[6],[$out,#16]
+ add $xb0,$xb0,@K[1]
stp @x[8],@x[10],[$out,#32]
+ add $xc0,$xc0,@K[2]
stp @x[12],@x[14],[$out,#48]
+ add $xd0,$xd0,@K[3]
add $out,$out,#64
b.eq .Ldone_neon
sub $len,$len,#64
cmp $len,#64
- b.lo .Less_than_128
+ b.lo .Last_neon
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A0,$A0,$T0
- eor $B0,$B0,$T1
- eor $C0,$C0,$T2
- eor $D0,$D0,$T3
- st1.8 {$A0-$D0},[$out],#64
+ ld1.8 {$xt0-$xt3},[$inp],#64
+ eor $xa0,$xa0,$xt0
+ eor $xb0,$xb0,$xt1
+ eor $xc0,$xc0,$xt2
+ eor $xd0,$xd0,$xt3
+ st1.8 {$xa0-$xd0},[$out],#64
b.eq .Ldone_neon
+
+ add $xa0,$xa1,@K[0]
+ add $xb0,$xb1,@K[1]
sub $len,$len,#64
+ add $xc0,$xc1,@K[2]
cmp $len,#64
- b.lo .Less_than_192
+ add $xd0,$xd1,@K[3]
+ b.lo .Last_neon
- ld1.8 {$T0-$T3},[$inp],#64
- eor $A1,$A1,$T0
- eor $B1,$B1,$T1
- eor $C1,$C1,$T2
- eor $D1,$D1,$T3
- st1.8 {$A1-$D1},[$out],#64
+ ld1.8 {$xt0-$xt3},[$inp],#64
+ eor $xa1,$xa0,$xt0
+ eor $xb1,$xb0,$xt1
+ eor $xc1,$xc0,$xt2
+ eor $xd1,$xd0,$xt3
+ st1.8 {$xa1-$xd1},[$out],#64
b.eq .Ldone_neon
+
+ add $xa0,$xa2,@K[0]
+ add $xb0,$xb2,@K[1]
sub $len,$len,#64
+ add $xc0,$xc2,@K[2]
+ cmp $len,#64
+ add $xd0,$xd2,@K[3]
+ b.lo .Last_neon
- st1.8 {$A2-$D2},[sp]
- b .Last_neon
+ ld1.8 {$xt0-$xt3},[$inp],#64
+ eor $xa2,$xa0,$xt0
+ eor $xb2,$xb0,$xt1
+ eor $xc2,$xc0,$xt2
+ eor $xd2,$xd0,$xt3
+ st1.8 {$xa2-$xd2},[$out],#64
+ b.eq .Ldone_neon
-.Less_than_128:
- st1.8 {$A0-$D0},[sp]
- b .Last_neon
-.Less_than_192:
- st1.8 {$A1-$D1},[sp]
- b .Last_neon
+ add $xa0,$xa3,@K[0]
+ add $xb0,$xb3,@K[1]
+ add $xc0,$xc3,@K[2]
+ add $xd0,$xd3,@K[3]
+ sub $len,$len,#64
-.align 4
.Last_neon:
+ st1.8 {$xa0-$xd0},[sp]
+
sub $out,$out,#1
add $inp,$inp,$len
add $out,$out,$len
@@ -695,9 +804,41 @@ $code.=<<___;
.size ChaCha20_neon,.-ChaCha20_neon
___
{
+my @K = map("v$_.4s",(0..6));
my ($T0,$T1,$T2,$T3,$T4,$T5)=@K;
my ($A0,$B0,$C0,$D0,$A1,$B1,$C1,$D1,$A2,$B2,$C2,$D2,
- $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(0..23));
+ $A3,$B3,$C3,$D3,$A4,$B4,$C4,$D4,$A5,$B5,$C5,$D5) = map("v$_.4s",(8..31));
+my $rot24 = @K[6];
+my $ONE = "v7.4s";
+
+sub NEONROUND {
+my $odd = pop;
+my ($a,$b,$c,$d,$t)=@_;
+
+ (
+ "&add ('$a','$a','$b')",
+ "&eor ('$d','$d','$a')",
+ "&rev32_16 ('$d','$d')", # vrot ($d,16)
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',20)",
+ "&sli ('$b','$t',12)",
+
+ "&add ('$a','$a','$b')",
+ "&eor ('$d','$d','$a')",
+ "&tbl ('$d','{$d}','$rot24')",
+
+ "&add ('$c','$c','$d')",
+ "&eor ('$t','$b','$c')",
+ "&ushr ('$b','$t',25)",
+ "&sli ('$b','$t',7)",
+
+ "&ext ('$c','$c','$c',8)",
+ "&ext ('$d','$d','$d',$odd?4:12)",
+ "&ext ('$b','$b','$b',$odd?12:4)"
+ );
+}
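+
+# Editor's sketch (assumption, not upstream): the .Lrot24 mask drives
+# tbl to rotate every 32-bit lane left by 8 bits; result byte i of a
+# lane selects source byte (i+3)%4. A throwaway scalar check of that
+# index pattern:
+sub rot24_selftest {
+my @idx=(3,0,1,2);			# per-lane byte order from .Lrot24
+my $x=0xaabbccdd;
+my @b=map { ($x>>8*$_)&0xff } (0..3);
+my $y=0; $y|=$b[$idx[$_]]<<8*$_ for (0..3);
+	die "rot24 mask is wrong" if $y!=((($x<<8)|($x>>24))&0xffffffff);
+}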
$code.=<<___;
.type ChaCha20_512_neon,%function
@@ -717,6 +858,7 @@ ChaCha20_512_neon:
.L512_or_more_neon:
sub sp,sp,#128+64
+ eor $ONE,$ONE,$ONE
ldp @d[0],@d[1],[@x[0]] // load sigma
ld1 {@K[0]},[@x[0]],#16
ldp @d[2],@d[3],[$key] // load key
@@ -724,8 +866,9 @@ ChaCha20_512_neon:
ld1 {@K[1],@K[2]},[$key]
ldp @d[6],@d[7],[$ctr] // load counter
ld1 {@K[3]},[$ctr]
- ld1 {$ONE},[@x[0]]
-#ifdef __ARMEB__
+ ld1 {$ONE}[0],[@x[0]]
+ add $key,@x[0],#16 // .Lrot24
+#ifdef __AARCH64EB__
rev64 @K[0],@K[0]
ror @d[2],@d[2],#32
ror @d[3],@d[3],#32
@@ -792,9 +935,10 @@ ChaCha20_512_neon:
mov $C4,@K[2]
stp @K[3],@K[4],[sp,#48] // off-load key block, variable part
mov $C5,@K[2]
- str @K[5],[sp,#80]
+ stp @K[5],@K[6],[sp,#80]
mov $ctr,#5
+ ld1 {$rot24},[$key]
subs $len,$len,#512
.Loop_upper_neon:
sub $ctr,$ctr,#1
@@ -867,7 +1011,7 @@ $code.=<<___;
add @x[14],@x[14],@x[15],lsl#32
ldp @x[13],@x[15],[$inp,#48]
add $inp,$inp,#64
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -956,6 +1100,7 @@ $code.=<<___;
add.32 @x[2],@x[2],@d[1]
ldp @K[4],@K[5],[sp,#64]
add @x[3],@x[3],@d[1],lsr#32
+ ldr @K[6],[sp,#96]
add $A0,$A0,@K[0]
add.32 @x[4],@x[4],@d[2]
add $A1,$A1,@K[0]
@@ -1008,7 +1153,7 @@ $code.=<<___;
add $inp,$inp,#64
add $B5,$B5,@K[1]
-#ifdef __ARMEB__
+#ifdef __AARCH64EB__
rev @x[0],@x[0]
rev @x[2],@x[2]
rev @x[4],@x[4]
@@ -1086,26 +1231,26 @@ $code.=<<___;
b.hs .Loop_outer_512_neon
adds $len,$len,#512
- ushr $A0,$ONE,#2 // 4 -> 1
+ ushr $ONE,$ONE,#1 // 4 -> 2
- ldp d8,d9,[sp,#128+0] // meet ABI requirements
ldp d10,d11,[sp,#128+16]
ldp d12,d13,[sp,#128+32]
ldp d14,d15,[sp,#128+48]
- stp @K[0],$ONE,[sp,#0] // wipe off-load area
- stp @K[0],$ONE,[sp,#32]
- stp @K[0],$ONE,[sp,#64]
+ stp @K[0],@K[0],[sp,#0] // wipe off-load area
+ stp @K[0],@K[0],[sp,#32]
+ stp @K[0],@K[0],[sp,#64]
b.eq .Ldone_512_neon
+ sub $key,$key,#16 // .Lone
cmp $len,#192
- sub @K[3],@K[3],$A0 // -= 1
- sub @K[4],@K[4],$A0
- sub @K[5],@K[5],$A0
add sp,sp,#128
+ sub @K[3],@K[3],$ONE // -= 2
+ ld1 {$CTR,$ROT24},[$key]
b.hs .Loop_outer_neon
+ ldp d8,d9,[sp,#0] // meet ABI requirements
eor @K[1],@K[1],@K[1]
eor @K[2],@K[2],@K[2]
eor @K[3],@K[3],@K[3]
@@ -1115,6 +1260,7 @@ $code.=<<___;
b .Loop_outer
.Ldone_512_neon:
+ ldp d8,d9,[sp,#128+0] // meet ABI requirements
ldp x19,x20,[x29,#16]
add sp,sp,#128+64
ldp x21,x22,[x29,#32]
@@ -1133,9 +1279,11 @@ foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
(s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or
- (m/\b(eor|ext|mov)\b/ and (s/\.4s/\.16b/g or 1)) or
+ (m/\b(eor|ext|mov|tbl)\b/ and (s/\.4s/\.16b/g or 1)) or
(s/\b((?:ld|st)1)\.8\b/$1/ and (s/\.4s/\.16b/g or 1)) or
(m/\b(ld|st)[rp]\b/ and (s/v([0-9]+)\.4s/q$1/g or 1)) or
+ (m/\b(dup|ld1)\b/ and (s/\.4(s}?\[[0-3]\])/.$1/g or 1)) or
+ (s/\b(zip[12])\.64\b/$1/ and (s/\.4s/\.2d/g or 1)) or
(s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1));
#s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo;
diff -up openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl
--- openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl 2020-12-09 10:37:38.408558954 +0100
@@ -42,6 +42,7 @@
# Denver 0.51 0.65 6.02
# Mongoose 0.65 1.10 8.06
# Kryo 0.76 1.16 8.00
+# ThunderX2 1.05
#
# (*) presented for reference/comparison purposes;
diff -up openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl
--- openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl 2020-12-09 10:37:38.408558954 +0100
@@ -29,6 +29,7 @@
# X-Gene 2.13/+68% 2.27
# Mongoose 1.77/+75% 1.12
# Kryo 2.70/+55% 1.13
+# ThunderX2 1.17/+95% 1.36
#
# (*) estimate based on resources availability is less than 1.0,
# i.e. measured result is worse than expected, presumably binary
diff -up openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl
--- openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl 2020-12-09 10:37:38.408558954 +0100
@@ -51,6 +51,7 @@
# Kryo 12
# Denver 7.8
# Apple A7 7.2
+# ThunderX2 9.7
#
# (*) Corresponds to SHA3-256. No improvement coefficients are listed
# because they vary too much from compiler to compiler. Newer
diff -up openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl
--- openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl 2020-12-09 10:37:38.408558954 +0100
@@ -27,6 +27,7 @@
# X-Gene 8.80 (+200%)
# Mongoose 2.05 6.50 (+160%)
# Kryo 1.88 8.00 (+90%)
+# ThunderX2 2.64 6.36 (+150%)
#
# (*) Software results are presented mostly for reference purposes.
# (**) Keep in mind that Denver relies on binary translation, which
diff -up openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl
--- openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100
+++ openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl 2020-12-09 10:37:38.408558954 +0100
@@ -28,6 +28,7 @@
# X-Gene 20.0 (+100%) 12.8 (+300%(***))
# Mongoose 2.36 13.0 (+50%) 8.36 (+33%)
# Kryo 1.92 17.4 (+30%) 11.2 (+8%)
+# ThunderX2 2.54 13.2 (+40%) 8.40 (+18%)
#
# (*) Software SHA256 results are of lesser relevance, presented
# mostly for informational purposes.