diff --git a/.gitignore b/.gitignore
index 3305a0f..d1abce3 100644
--- a/.gitignore
+++ b/.gitignore
@@ -49,3 +49,4 @@ openssl-1.0.0a-usa.tar.bz2
 /openssl-1.1.1f-hobbled.tar.xz
 /openssl-1.1.1g-hobbled.tar.xz
 /openssl-1.1.1h-hobbled.tar.xz
+/openssl-1.1.1i-hobbled.tar.xz
diff --git a/openssl-1.1.1-arm-update.patch b/openssl-1.1.1-arm-update.patch
index 998905f..2b8c549 100644
--- a/openssl-1.1.1-arm-update.patch
+++ b/openssl-1.1.1-arm-update.patch
@@ -1,6 +1,6 @@
-diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl
---- openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update	2019-05-28 15:12:21.000000000 +0200
-+++ openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl	2019-11-20 11:36:22.389506155 +0100
+diff -up openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl
+--- openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update	2020-12-08 14:20:59.000000000 +0100
++++ openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl	2020-12-09 10:39:50.645705385 +0100
@@ -27,44 +27,72 @@
 # CBC encrypt case. On Cortex-A57 parallelizable mode performance
 # seems to be limited by sheer amount of NEON instructions...
@@ -85,7 +85,844 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c
 ___
 # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax,
-@@ -514,6 +542,13 @@ $code.=<<___;
+@@ -361,6 +389,836 @@ ___
+ &gen_block("en");
+ &gen_block("de");
+ }}}
++
++# Performance in cycles per byte.
++# Measured with AES-ECB and different key sizes.
++# The values before and after the optimization are shown below
++# (before/after):
++#
++#		AES-128-ECB	AES-192-ECB	AES-256-ECB
++# Cortex-A57	1.85/0.82	2.16/0.96	2.47/1.10
++# Cortex-A72	1.64/0.85	1.82/0.99	2.13/1.14
++
++# The optimization is implemented by loop unrolling and interleaving.
++# We normally choose 5 as the unrolling factor; if the input data
++# size is smaller than 5 blocks, but not smaller than 3 blocks,
++# we choose 3 as the unrolling factor instead.
++# If the input data size dsize >= 5*16 bytes, 5 blocks are taken per
++# iteration, and each loop reduces the remaining size lsize by 5*16.
++# If 5*16 > lsize >= 3*16 bytes, 3 blocks are taken per iteration,
++# and each loop reduces lsize by 3*16.
++# If lsize < 3*16 bytes, the remainder is treated as the tail, with
++# the AES instructions of the (up to) two blocks interleaved.
++# There is one special case: if the original input data size dsize
++# = 16 bytes, it is handled separately to improve performance, as
++# one independent code block without LR and FP load/store, just like
++# the original ECB implementation.
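The comment above fully determines how many iterations of each unrolled path a given
input takes. As a rough illustration — in C rather than in the generated assembly,
with a hypothetical ecb_schedule() helper that exists only for this sketch — the
partitioning works out as follows:

    #include <stdio.h>
    #include <stddef.h>

    /* Illustrative model of the unrolling policy described above: count
     * how many 5-block and 3-block iterations would run for an input of
     * dsize bytes (a multiple of 16 for ECB), and how many blocks fall
     * into the interleaved tail.  A sketch only, not part of the patch. */
    static void ecb_schedule(size_t dsize)
    {
        size_t lsize = dsize, n5 = 0, n3 = 0;

        while (lsize >= 5 * 16) { n5++; lsize -= 5 * 16; } /* 5-way unrolled loop */
        while (lsize >= 3 * 16) { n3++; lsize -= 3 * 16; } /* 3-way unrolled loop */
        /* lsize is now 0, 16 or 32: at most two tail blocks with their AES
         * instructions interleaved.  (In the real code, dsize == 16
         * short-circuits to the dedicated single-block path instead.) */
        printf("%zu bytes: %zu x5, %zu x3, %zu tail block(s)\n",
               dsize, n5, n3, lsize / 16);
    }

    int main(void)
    {
        ecb_schedule(16);      /* handled by the single-block path in the asm */
        ecb_schedule(7 * 16);  /* one 5x iteration, two-block tail            */
        ecb_schedule(13 * 16); /* two 5x iterations, one 3x iteration         */
        return 0;
    }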
++ ++{{{ ++my ($inp,$out,$len,$key)=map("x$_",(0..3)); ++my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++### q7 last round key ++### q10-q15 q7 Last 7 round keys ++### q8-q9 preloaded round keys except last 7 keys for big size ++### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ subs $len,$len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lecb_big_size ++ vld1.8 {$dat0},[$inp] ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ vld1.32 {q5-q6},[$key],#32 // load key schedule... ++ ++ b.eq .Lecb_small_dec ++ aese $dat0,q5 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... ++ aese $dat0,q6 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing ++ b.eq .Lecb_128_enc ++.Lecb_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_round_loop ++.Lecb_128_enc: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_small_dec: ++ aesd $dat0,q5 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... ++ aesd $dat0,q6 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lecb_128_dec ++.Lecb_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_dec_round_loop ++.Lecb_128_dec: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp x29,x30,[sp,#-16]! 
++ add x29,sp,#0 ++___ ++$code.=<<___ if ($flavour !~ /64/); ++ mov ip,sp ++ stmdb sp!,{r4-r8,lr} ++ vstmdb sp!,{d8-d15} @ ABI specification says so ++ ldmia ip,{r4-r5} @ load remaining args ++ subs $len,$len,#16 ++___ ++$code.=<<___; ++ mov $step,#16 ++ b.lo .Lecb_done ++ cclr $step,eq ++ ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ and $len,$len,#-16 ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key] // load key schedule... ++ sub $rounds,$rounds,#6 ++ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys ++ sub $rounds,$rounds,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key,#32 ++ mov $cnt,$rounds ++ b.eq .Lecb_dec ++ ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_enc_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_enc ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_enc_tail4x ++ sub $len,$len,#0x50 ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aese 
$dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_enc ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_enc_tail ++ ++ b .Loop3x_ecb_enc ++ ++.align 4 ++.Lecb_enc_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr $dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_enc ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_enc_tail: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_enc_tail ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ 
aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lecb_enc_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_enc_one: ++ veor $tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ b .Lecb_done ++___ ++ ++$code.=<<___; ++.align 5 ++.Lecb_dec: ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_dec_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_dec ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_tail4x ++ sub $len,$len,#0x50 ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ 
veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_dec ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_dec_tail ++ ++ b .Loop3x_ecb_dec ++ ++.align 4 ++.Lecb_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr $dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_dec ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_dec_tail: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_dec_tail ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lecb_dec_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_dec_one: ++ veor 
$tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ ++.Lecb_done: ++___ ++} ++$code.=<<___ if ($flavour !~ /64/); ++ vldmia sp!,{d8-d15} ++ ldmia sp!,{r4-r8,pc} ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ ldr x29,[sp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++.Lecb_Final_abort: ++ ret ++___ ++$code.=<<___; ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} + {{{ + my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; + my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +@@ -519,6 +1377,13 @@ $code.=<<___; ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); @@ -99,7 +936,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c $code.=<<___; .align 5 .Lcbc_dec: -@@ -530,7 +565,196 @@ $code.=<<___; +@@ -535,7 +1400,196 @@ $code.=<<___; vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 @@ -225,7 +1062,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c + aesimc $dat3,$dat3 + aesd $dat4,q14 + aesimc $dat4,$dat4 -+ + + veor $tmp0,$ivec,$rndlast + aesd $dat0,q15 + veor $tmp1,$in0,$rndlast @@ -277,7 +1114,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c + b.lo .Lcbc_dec_tail + + b .Loop3x_cbc_dec - ++ +.align 4 +.Lcbc_tail4x: + veor $tmp1,$tmp0,$dat1 @@ -296,7 +1133,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 -@@ -691,6 +915,9 @@ my $step="x12"; # aliases with $tctr2 +@@ -696,6 +1750,9 @@ my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); @@ -306,10 +1143,10 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule -@@ -743,6 +970,175 @@ $code.=<<___; - rev $tctr2, $ctr +@@ -751,6 +1808,175 @@ $code.=<<___; + vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias - vmov.32 ${dat2}[3],$tctr2 + vorr $dat2,$ivec,$ivec +___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#2 @@ -482,7 +1319,1440 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c b .Loop3x_ctr32 .align 4 -@@ -955,7 +1351,7 @@ if ($flavour =~ /64/) { ######## 64-bi +@@ -905,6 +2131,1432 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++# Performance in cycles per byte. ++# Processed with AES-XTS different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-XTS AES-256-XTS ++# Cortex-A57 3.36/1.09 4.02/1.37 ++# Cortex-A72 3.03/1.02 3.28/1.33 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes ++# will be processed specially, which be integrated into the 5*16 bytes ++# loop to improve the efficiency. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store. 
++# Encryption will process the (length -tailcnt) bytes as mentioned ++# previously, then encrypt the composite block as last second ++# cipher block. ++# Decryption will process the (length -tailcnt -1) bytes as mentioned ++# previously, then decrypt the last second cipher block to get the ++# last plain block(tail), decrypt the composite block as last second ++# plain text block. ++ ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($tmpin)=("v26.16b"); ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_encrypt ++.type ${prefix}_xts_encrypt,%function ++.align 5 ++${prefix}_xts_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_enc_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_enc_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_enc_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aese $dat0,q20 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aese $dat0,q21 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing ++ b.eq .Lxts_128_enc ++.Lxts_enc_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_enc_round_loop ++.Lxts_128_enc: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$dat0,$iv0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_enc_final_abort ++ ++.align 4 ++.Lxts_enc_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ // tailcnt store the tail value of length%16. ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_abort ++ csel $step,xzr,$step,eq ++ ++ // Firstly, encrypt the iv with key2, as the first iv of XEX. ++ ldr $rounds,[$key2,#240] ++ vld1.32 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key2],#16 ++ ++.Loop_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // next starting point ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ ++ // Encryption ++.Lxts_enc: ++ vld1.8 {$dat2},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_enc_tail ++ veor $dat,$dat,$iv0 // before encryption, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // the third block ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_enc_tail ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ vld1.8 {$dat3},[$inp],#16 ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_enc ++ ++.align 4 ++.Loop5x_xts_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese 
$dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_xts_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aese $dat0,q15 ++ // The iv for first block of one iteration ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ 
veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_enc ++ ++ ++ // If left 4 blocks, borrow the five block's processing. ++ cmn $len,#0x10 ++ b.ne .Loop5x_enc_after ++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_enc ++ ++.Loop5x_enc_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_enc_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_enc_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_enc_tail ++ ++.align 4 ++.Lxts_enc_tail4x: ++ add $inp,$inp,#16 ++ veor $tmp1,$dat1,$tmp1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_enc_done ++.align 4 ++.Lxts_outer_enc_tail: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_enc_tail ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ //mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset ++ mov $key_,$key1 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ add $rounds,$rounds0,#2 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ cmn $len,#0x30 ++ b.eq .Lxts_enc_done ++.Lxts_encxor_one: ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_enc_tail: ++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_enc_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_enc_tail_loop: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 
++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_enc_tail_loop ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lxts_enc_one ++ veor $tmp1,$tmp1,$dat1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vst1.8 {$tmp2},[$out],#16 ++ fmov $ivl,$ivd10 ++ fmov $ivh,$ivd11 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++ ++.Lxts_enc_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv0,$iv0 ++ vst1.8 {$tmp1},[$out],#16 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++.align 5 ++.Lxts_enc_done: ++ // Process the tail block with cipher stealing. ++ tst $tailcnt,#0xf ++ b.eq .Lxts_abort ++ ++ mov $tmpinp,$inp ++ mov $tmpoutp,$out ++ sub $out,$out,#16 ++.composite_enc_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_enc_loop ++.Lxts_enc_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Encrypt the composite block to get the last second encrypted text block ++ ldr $rounds,[$key1,#240] // load key schedule... ++ vld1.8 {$dat},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key1],#16 // load key schedule... 
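++ // .composite_enc_loop above performed the ciphertext stealing: the
++ // head of the previous ciphertext block was copied out as the final
++ // partial output and the plaintext tail spliced into its place, so
++ // the composite block loaded from [$out] (already xored with the
++ // tweak $iv0) is encrypted below as the new last full cipher block.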
++.Loop_final_enc: ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 ++ subs $rounds,$rounds,#2 ++ aese $tmpin,$dat1 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 ++ b.gt .Loop_final_enc ++ ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aese $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++.Lxts_enc_final_abort: ++ ret ++.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ++___ ++ ++}}} ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_decrypt ++.type ${prefix}_xts_decrypt,%function ++.align 5 ++${prefix}_xts_decrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_dec_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_small_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_small_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aesd $dat0,q20 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aesd $dat0,q21 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lxts_128_dec ++.Lxts_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_dec_round_loop ++.Lxts_128_dec: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... 
++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$iv0,$dat0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_dec_final_abort ++.Lxts_dec_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_dec_abort ++ ++ // Encrypt the iv with key2, as the first XEX iv ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // load rounds number ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 // load key schedule... 
++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ b .Lxts_dec ++ ++ // Decryption ++.align 5 ++.Lxts_dec: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_begin ++ subs $len,$len,#16 ++ csel $step,xzr,$step,eq ++ vld1.8 {$dat},[$inp],#16 ++ b.lo .Lxts_done ++ sub $inp,$inp,#16 ++.Lxts_dec_begin: ++ vld1.8 {$dat},[$inp],$step ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_dec_tail ++ veor $dat,$dat,$iv0 // before decryt, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // third block xox with third iv ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_dec_tail ++ ++ vld1.8 {$dat3},[$inp],#16 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_dec ++ ++.align 4 ++.Loop5x_xts_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // load key schedule... 
++ b.gt .Loop5x_xts_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aesd $dat0,q15 ++ // The iv for first block of next iteration. 
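++ // This update, repeated below for each of the five blocks, is the
++ // XTS multiply-by-x in GF(2^128): the 128-bit tweak held in
++ // $ivl/$ivh is shifted left by one bit and, when the top bit was
++ // set before the shift, 0x87 ($constnum) is xor-ed into the low
++ // byte (reduction by x^128 = x^7 + x^2 + x + 1).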
++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_dec_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_dec ++ ++ cmn $len,#0x10 ++ b.ne .Loop5x_dec_after ++ // If x2($len) equal to -0x10, the left blocks is 4. ++ // After specially processing, utilize the five blocks processing again. ++ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_dec ++ ++.Loop5x_dec_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_dec_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_dec_tail ++ ++.align 4 ++.Lxts_dec_tail4x: ++ add $inp,$inp,#16 ++ vld1.32 {$dat0},[$inp],#16 ++ veor $tmp1,$dat1,$tmp0 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_done ++.align 4 ++.Lxts_outer_dec_tail: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_dec_tail ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset // $inp is adjusted to the last data ++ ++ mov $key_,$key1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $rounds,$rounds0,#2 ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ ++ cmn $len,#0x30 ++ add $len,$len,#0x30 ++ b.eq .Lxts_done ++ sub $len,$len,#0x30 ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_dec_tail: ++ // $len == -0x10 means two blocks left. 
++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_dec_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_dec_tail_loop: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_dec_tail_loop ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lxts_dec_one ++ veor $tmp1,$tmp1,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv2,$iv2 ++ vorr $iv1,$iv3,$iv3 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ add $len,$len,#16 ++ b .Lxts_done ++ ++.Lxts_dec_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vorr $iv1,$iv2,$iv2 ++ vst1.8 {$tmp1},[$out],#16 ++ add $len,$len,#32 ++ ++.Lxts_done: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_abort ++ // Processing the last two blocks with cipher stealing. ++ mov x7,x3 ++ cbnz x2,.Lxts_dec_1st_done ++ vld1.32 {$dat0},[$inp],#16 ++ ++ // Decrypt the last secod block to get the last plain text block ++.Lxts_dec_1st_done: ++ eor $tmpin,$dat0,$iv1 ++ ldr $rounds,[$key1,#240] ++ vld1.32 {$dat0},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key1],#16 ++.Loop_final_2nd_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 // load key schedule... ++ b.gt .Loop_final_2nd_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv1 ++ vst1.8 {$tmpin},[$out] ++ ++ mov $tmpinp,$inp ++ add $tmpoutp,$out,#16 ++ ++ // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks ++ // to get the last encrypted block. ++.composite_dec_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_dec_loop ++.Lxts_dec_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Decrypt the composite block to get the last second plain text block ++ ldr $rounds,[$key_,#240] ++ vld1.8 {$dat},[$key_],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key_],#16 ++.Loop_final_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key_],#16 // load key schedule... 
++ b.gt .Loop_final_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_dec_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++ ++.Lxts_dec_final_abort: ++ ret ++.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt ++___ ++} ++}}} + $code.=<<___; + #endif + ___ +@@ -963,7 +3615,7 @@ if ($flavour =~ /64/) { ######## 64-bi # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( @@ -491,7 +2761,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; -@@ -996,14 +1392,17 @@ if ($flavour =~ /64/) { ######## 64-bi +@@ -1004,14 +3656,17 @@ if ($flavour =~ /64/) { ######## 64-bi s/\],#[0-9]+/]!/o; s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or @@ -511,9 +2781,9 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c print $_,"\n"; } } -diff -up openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl ---- openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl 2019-11-20 11:36:22.389506155 +0100 +diff -up openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl +--- openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl 2020-12-09 10:37:38.405558929 +0100 @@ -30,6 +30,7 @@ # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] @@ -522,9 +2792,9 @@ diff -up openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1c/ # # (*) ECB denotes approximate result for parallelizable modes # such as CBC decrypt, CTR, etc.; -diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl ---- openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl 2019-11-21 16:44:50.814651553 +0100 +diff -up openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl +--- openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl 2020-12-09 10:40:57.922288627 +0100 @@ -18,32 +18,44 @@ # # ChaCha20 for ARMv8. 
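The chacha-armv8.pl hunks that follow restructure the scalar and NEON paths, but the primitive they interleave is still the standard quarter round. For orientation, here is a plain C rendering of that operation — the textbook RFC 8439 form, offered as an assumed reference point rather than code taken from the patch:

#include <stdint.h>
#include <stdio.h>

#define ROTL32(v, n) (((v) << (n)) | ((v) >> (32 - (n))))

/* ChaCha quarter round with the 16/12/8/7 rotation schedule. */
static void chacha_qr(uint32_t *a, uint32_t *b, uint32_t *c, uint32_t *d)
{
    *a += *b; *d ^= *a; *d = ROTL32(*d, 16);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 12);
    *a += *b; *d ^= *a; *d = ROTL32(*d, 8);
    *c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}

int main(void)
{
    /* RFC 8439 section 2.1.1 test vector; expect a == 0xea2a92f4. */
    uint32_t a = 0x11111111, b = 0x01020304, c = 0x9b8d6f43, d = 0x01234567;
    chacha_qr(&a, &b, &c, &d);
    printf("%08x %08x %08x %08x\n", a, b, c, d);
    return 0;
}

The assembly runs four such quarter rounds in parallel over the 4x4 state matrix (first down the columns, then along the diagonals), which is why the perl source indexes the state as @x[0..15] in groups of four.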
@@ -585,20 +2855,22 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 *STDOUT=*OUT; sub AUTOLOAD() # thunk [simplified] x86-style perlasm -@@ -120,41 +132,36 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) +@@ -120,42 +132,37 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) } $code.=<<___; -#include "arm_arch.h" +- +-.text +- +#ifndef __KERNEL__ +# include "arm_arch.h" -+.extern OPENSSL_armcap_P + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P +#endif ++ ++.text - .text - --.extern OPENSSL_armcap_P -- .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral @@ -641,7 +2913,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 .Lshort: .inst 0xd503233f // paciasp -@@ -173,7 +180,7 @@ ChaCha20_ctr32: +@@ -174,7 +181,7 @@ ChaCha20_ctr32: ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ldp @d[6],@d[7],[$ctr] // load counter @@ -650,7 +2922,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 -@@ -242,7 +249,7 @@ $code.=<<___; +@@ -243,7 +250,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -659,7 +2931,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -299,7 +306,7 @@ $code.=<<___; +@@ -300,7 +307,7 @@ $code.=<<___; add @x[10],@x[10],@x[11],lsl#32 add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 @@ -668,7 +2940,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -340,46 +347,91 @@ $code.=<<___; +@@ -341,46 +348,91 @@ $code.=<<___; ___ {{{ @@ -789,7 +3061,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
add x29,sp,#0 -@@ -402,8 +454,9 @@ ChaCha20_neon: +@@ -403,8 +455,9 @@ ChaCha20_neon: ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] @@ -801,7 +3073,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 -@@ -412,115 +465,129 @@ ChaCha20_neon: +@@ -413,115 +466,129 @@ ChaCha20_neon: ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif @@ -1013,7 +3285,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -530,48 +597,68 @@ $code.=<<___; +@@ -531,48 +598,68 @@ $code.=<<___; rev @x[12],@x[12] rev @x[14],@x[14] #endif @@ -1106,7 +3378,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] -@@ -582,8 +669,10 @@ $code.=<<___; +@@ -583,8 +670,10 @@ $code.=<<___; .inst 0xd50323bf // autiasp ret @@ -1118,7 +3390,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 cmp $len,#64 b.lo .Less_than_64 -@@ -600,7 +689,7 @@ $code.=<<___; +@@ -601,7 +690,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -1127,7 +3399,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -620,48 +709,68 @@ $code.=<<___; +@@ -621,48 +710,68 @@ $code.=<<___; eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output @@ -1220,7 +3492,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len -@@ -694,9 +803,41 @@ $code.=<<___; +@@ -695,9 +804,41 @@ $code.=<<___; .size ChaCha20_neon,.-ChaCha20_neon ___ { @@ -1263,7 +3535,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 $code.=<<___; .type ChaCha20_512_neon,%function -@@ -716,6 +857,7 @@ ChaCha20_512_neon: +@@ -717,6 +858,7 @@ ChaCha20_512_neon: .L512_or_more_neon: sub sp,sp,#128+64 @@ -1271,7 +3543,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key -@@ -723,8 +865,9 @@ ChaCha20_512_neon: +@@ -724,8 +866,9 @@ ChaCha20_512_neon: ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] @@ -1283,7 +3555,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 -@@ -791,9 +934,10 @@ ChaCha20_512_neon: +@@ -792,9 +935,10 @@ ChaCha20_512_neon: mov $C4,@K[2] stp @K[3],@K[4],[sp,#48] // off-load key block, variable part mov $C5,@K[2] @@ -1295,7 +3567,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 subs $len,$len,#512 .Loop_upper_neon: sub $ctr,$ctr,#1 -@@ -866,7 +1010,7 @@ $code.=<<___; +@@ -867,7 +1011,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -1304,7 +3576,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -955,6 +1099,7 @@ $code.=<<___; +@@ -956,6 +1100,7 @@ $code.=<<___; add.32 @x[2],@x[2],@d[1] ldp @K[4],@K[5],[sp,#64] add @x[3],@x[3],@d[1],lsr#32 @@ -1312,7 +3584,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 add $A0,$A0,@K[0] add.32 
@x[4],@x[4],@d[2] add $A1,$A1,@K[0] -@@ -1007,7 +1152,7 @@ $code.=<<___; +@@ -1008,7 +1153,7 @@ $code.=<<___; add $inp,$inp,#64 add $B5,$B5,@K[1] @@ -1321,7 +3593,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -1085,26 +1230,26 @@ $code.=<<___; +@@ -1086,26 +1231,26 @@ $code.=<<___; b.hs .Loop_outer_512_neon adds $len,$len,#512 @@ -1356,7 +3628,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 eor @K[1],@K[1],@K[1] eor @K[2],@K[2],@K[2] eor @K[3],@K[3],@K[3] -@@ -1114,6 +1259,7 @@ $code.=<<___; +@@ -1115,6 +1260,7 @@ $code.=<<___; b .Loop_outer .Ldone_512_neon: @@ -1364,7 +3636,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] -@@ -1132,9 +1278,11 @@ foreach (split("\n",$code)) { +@@ -1133,9 +1279,11 @@ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or @@ -1377,9 +3649,9 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; -diff -up openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl ---- openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl 2019-11-20 11:36:22.389506155 +0100 +diff -up openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl +--- openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl 2020-12-09 10:37:38.408558954 +0100 @@ -42,6 +42,7 @@ # Denver 0.51 0.65 6.02 # Mongoose 0.65 1.10 8.06 @@ -1388,9 +3660,9 @@ diff -up openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1. # # (*) presented for reference/comparison purposes; -diff -up openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl ---- openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl +--- openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -29,6 +29,7 @@ # X-Gene 2.13/+68% 2.27 # Mongoose 1.77/+75% 1.12 @@ -1399,9 +3671,9 @@ diff -up openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl # # (*) estimate based on resources availability is less than 1.0, # i.e. 
measured result is worse than expected, presumably binary -diff -up openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -51,6 +51,7 @@ # Kryo 12 # Denver 7.8 @@ -1410,9 +3682,9 @@ diff -up openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1. # # (*) Corresponds to SHA3-256. No improvement coefficients are listed # because they vary too much from compiler to compiler. Newer -diff -up openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -27,6 +27,7 @@ # X-Gene 8.80 (+200%) # Mongoose 2.05 6.50 (+160%) @@ -1421,9 +3693,9 @@ diff -up openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1c/c # # (*) Software results are presented mostly for reference purposes. 
# (**) Keep in mind that Denver relies on binary translation, which -diff -up openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -28,6 +28,7 @@ # X-Gene 20.0 (+100%) 12.8 (+300%(***)) # Mongoose 2.36 13.0 (+50%) 8.36 (+33%) diff --git a/openssl-1.1.1-fips-post-rand.patch b/openssl-1.1.1-fips-post-rand.patch index 18a01fe..027dc55 100644 --- a/openssl-1.1.1-fips-post-rand.patch +++ b/openssl-1.1.1-fips-post-rand.patch @@ -1,6 +1,6 @@ -diff -up openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand openssl-1.1.1e/crypto/fips/fips.c ---- openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand 2020-03-17 18:06:16.822418854 +0100 -+++ openssl-1.1.1e/crypto/fips/fips.c 2020-03-17 18:06:16.861418172 +0100 +diff -up openssl-1.1.1i/crypto/fips/fips.c.fips-post-rand openssl-1.1.1i/crypto/fips/fips.c +--- openssl-1.1.1i/crypto/fips/fips.c.fips-post-rand 2020-12-09 10:26:41.634106328 +0100 ++++ openssl-1.1.1i/crypto/fips/fips.c 2020-12-09 10:26:41.652106475 +0100 @@ -68,6 +68,7 @@ # include @@ -51,10 +51,10 @@ diff -up openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand openssl-1.1.1e/crypto/ ret = 1; goto end; } -diff -up openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1e/crypto/rand/drbg_lib.c ---- openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/crypto/rand/drbg_lib.c 2020-03-17 18:07:35.305045521 +0100 -@@ -1009,6 +1009,20 @@ size_t rand_drbg_seedlen(RAND_DRBG *drbg +diff -up openssl-1.1.1i/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1i/crypto/rand/drbg_lib.c +--- openssl-1.1.1i/crypto/rand/drbg_lib.c.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/rand/drbg_lib.c 2020-12-09 10:26:41.652106475 +0100 +@@ -1005,6 +1005,20 @@ size_t rand_drbg_seedlen(RAND_DRBG *drbg return min_entropy > min_entropylen ? 
min_entropy : min_entropylen; } @@ -75,9 +75,9 @@ diff -up openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1e/cry /* Implements the default OpenSSL RAND_add() method */ static int drbg_add(const void *buf, int num, double randomness) { -diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/crypto/rand/rand_unix.c ---- openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/crypto/rand/rand_unix.c 2020-03-17 18:09:01.503537189 +0100 +diff -up openssl-1.1.1i/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1i/crypto/rand/rand_unix.c +--- openssl-1.1.1i/crypto/rand/rand_unix.c.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/rand/rand_unix.c 2020-12-09 10:36:59.531221903 +0100 @@ -17,10 +17,12 @@ #include #include "rand_local.h" @@ -91,7 +91,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr # ifdef DEVRANDOM_WAIT # include # include -@@ -342,7 +344,7 @@ static ssize_t sysctl_random(char *buf, +@@ -344,7 +346,7 @@ static ssize_t sysctl_random(char *buf, * syscall_random(): Try to get random data using a system call * returns the number of bytes returned in buf, or < 0 on error. */ @@ -100,15 +100,15 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr { /* * Note: 'buflen' equals the size of the buffer which is used by the -@@ -364,6 +366,7 @@ static ssize_t syscall_random(void *buf, - * - Linux since 3.17 with glibc 2.25 - * - FreeBSD since 12.0 (1200061) +@@ -369,6 +371,7 @@ static ssize_t syscall_random(void *buf, + * Note: Sometimes getentropy() can be provided but not implemented + * internally. So we need to check errno for ENOSYS */ +# if 0 # if defined(__GNUC__) && __GNUC__>=2 && defined(__ELF__) && !defined(__hpux) extern int getentropy(void *buffer, size_t length) __attribute__((weak)); -@@ -385,10 +388,10 @@ static ssize_t syscall_random(void *buf, +@@ -394,10 +397,10 @@ static ssize_t syscall_random(void *buf, if (p_getentropy.p != NULL) return p_getentropy.f(buf, buflen) == 0 ? 
(ssize_t)buflen : -1; # endif @@ -122,7 +122,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr # elif (defined(__FreeBSD__) || defined(__NetBSD__)) && defined(KERN_ARND) return sysctl_random(buf, buflen); # else -@@ -623,6 +626,9 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -633,6 +636,9 @@ size_t rand_pool_acquire_entropy(RAND_PO size_t entropy_available; # if defined(OPENSSL_RAND_SEED_GETRANDOM) @@ -132,7 +132,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr { size_t bytes_needed; unsigned char *buffer; -@@ -633,7 +639,7 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -643,7 +649,7 @@ size_t rand_pool_acquire_entropy(RAND_PO bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/); while (bytes_needed != 0 && attempts-- > 0) { buffer = rand_pool_add_begin(pool, bytes_needed); @@ -141,7 +141,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr if (bytes > 0) { rand_pool_add_end(pool, bytes, 8 * bytes); bytes_needed -= bytes; -@@ -668,8 +674,10 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -678,8 +684,10 @@ size_t rand_pool_acquire_entropy(RAND_PO int attempts = 3; const int fd = get_random_device(i); @@ -153,7 +153,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr while (bytes_needed != 0 && attempts-- > 0) { buffer = rand_pool_add_begin(pool, bytes_needed); -@@ -732,7 +740,9 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -742,7 +750,9 @@ size_t rand_pool_acquire_entropy(RAND_PO return entropy_available; } # endif @@ -164,9 +164,9 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr return rand_pool_entropy_available(pool); # endif } -diff -up openssl-1.1.1e/include/crypto/fips.h.fips-post-rand openssl-1.1.1e/include/crypto/fips.h ---- openssl-1.1.1e/include/crypto/fips.h.fips-post-rand 2020-03-17 18:06:16.831418696 +0100 -+++ openssl-1.1.1e/include/crypto/fips.h 2020-03-17 18:06:16.861418172 +0100 +diff -up openssl-1.1.1i/include/crypto/fips.h.fips-post-rand openssl-1.1.1i/include/crypto/fips.h +--- openssl-1.1.1i/include/crypto/fips.h.fips-post-rand 2020-12-09 10:26:41.639106369 +0100 ++++ openssl-1.1.1i/include/crypto/fips.h 2020-12-09 10:26:41.657106516 +0100 @@ -77,6 +77,8 @@ int FIPS_selftest_hmac(void); int FIPS_selftest_drbg(void); int FIPS_selftest_cmac(void); @@ -176,9 +176,9 @@ diff -up openssl-1.1.1e/include/crypto/fips.h.fips-post-rand openssl-1.1.1e/incl int fips_pkey_signature_test(EVP_PKEY *pkey, const unsigned char *tbs, int tbslen, const unsigned char *kat, -diff -up openssl-1.1.1e/include/crypto/rand.h.fips-post-rand openssl-1.1.1e/include/crypto/rand.h ---- openssl-1.1.1e/include/crypto/rand.h.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/include/crypto/rand.h 2020-03-17 18:07:35.303045555 +0100 +diff -up openssl-1.1.1i/include/crypto/rand.h.fips-post-rand openssl-1.1.1i/include/crypto/rand.h +--- openssl-1.1.1i/include/crypto/rand.h.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/include/crypto/rand.h 2020-12-09 10:26:41.657106516 +0100 @@ -24,6 +24,7 @@ typedef struct rand_pool_st RAND_POOL; diff --git a/openssl-1.1.1-version-override.patch b/openssl-1.1.1-version-override.patch index ff69bdb..727cc26 100644 --- a/openssl-1.1.1-version-override.patch +++ b/openssl-1.1.1-version-override.patch @@ -1,12 +1,12 @@ -diff -up openssl-1.1.1g/include/openssl/opensslv.h.version-override openssl-1.1.1g/include/openssl/opensslv.h ---- 
openssl-1.1.1g/include/openssl/opensslv.h.version-override 2020-04-23 13:29:37.802673513 +0200 -+++ openssl-1.1.1g/include/openssl/opensslv.h 2020-04-23 13:30:13.064008458 +0200 +diff -up openssl-1.1.1i/include/openssl/opensslv.h.version-override openssl-1.1.1i/include/openssl/opensslv.h +--- openssl-1.1.1i/include/openssl/opensslv.h.version-override 2020-12-09 10:25:12.042374409 +0100 ++++ openssl-1.1.1i/include/openssl/opensslv.h 2020-12-09 10:26:00.362769170 +0100 @@ -40,7 +40,7 @@ extern "C" { * major minor fix final patch/beta) */ - # define OPENSSL_VERSION_NUMBER 0x1010108fL --# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1h 22 Sep 2020" -+# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1h FIPS 22 Sep 2020" + # define OPENSSL_VERSION_NUMBER 0x1010109fL +-# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1i 8 Dec 2020" ++# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1i FIPS 8 Dec 2020" /*- * The macros below are to be used for shared library (.so, .dll, ...) diff --git a/openssl.spec b/openssl.spec index 3f6403c..2e26e49 100644 --- a/openssl.spec +++ b/openssl.spec @@ -21,7 +21,7 @@ Summary: Utilities from the general purpose cryptography library with TLS implementation Name: openssl -Version: 1.1.1h +Version: 1.1.1i Release: 1%{?dist} Epoch: 1 # We have to remove certain patented algorithms from the openssl source @@ -473,6 +473,9 @@ export LD_LIBRARY_PATH %ldconfig_scriptlets libs %changelog +* Wed Dec 9 2020 Tomáš Mráz 1.1.1i-1 +- Update to the 1.1.1i release fixing CVE-2020-1971 + * Mon Nov 9 2020 Sahana Prasad - 1.1.1h-1 - Upgrade to version 1.1.1.h diff --git a/sources b/sources index 2bae151..4c1e648 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openssl-1.1.1h-hobbled.tar.xz) = 75e1d3f34f93462b97db92aa6538fd4f2f091ad717438e51d147508738be720d7d0bf4a9b1fda3a1943a4c13aae2a39da3add05f7da833b3c6de40a97bc97908 +SHA512 (openssl-1.1.1i-hobbled.tar.xz) = e131a05e88690a7be7c3d74cbb26620130498ced2ce3d7fd55979aab5ea736ec8b268ba92268bd5bc347989325a3950a066883007cb20c2dd9739fd1eafc513f
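Finally, a quick way to confirm that a rebuilt package picked up the version-override hunk is to print the runtime version string, which comes straight from the OPENSSL_VERSION_TEXT define edited above. This is a sketch under the assumption that the patched 1.1.1i headers and libcrypto are installed; the file name is arbitrary:

#include <stdio.h>
#include <openssl/crypto.h>

/* Build: cc check_version.c -lcrypto -o check_version
 * Expected output with the patch applied:
 *   OpenSSL 1.1.1i FIPS 8 Dec 2020 */
int main(void)
{
    printf("%s\n", OpenSSL_version(OPENSSL_VERSION));
    return 0;
}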