From e33651f4168897db99813d271984ac699be98cd8 Mon Sep 17 00:00:00 2001 From: DistroBaker Date: Thu, 10 Dec 2020 01:32:28 +0100 Subject: [PATCH] Merged update from upstream sources This is an automated DistroBaker update from upstream sources. If you do not know what this is about or would like to opt out, contact the OSCI team. Source: https://src.fedoraproject.org/rpms/openssl.git#a07706cf0e50b02a61d3cb10ecad554d4ac4240c --- .gitignore | 1 + openssl-1.1.1-arm-update.patch | 2392 +++++++++++++++++++++++++- openssl-1.1.1-fips-post-rand.patch | 50 +- openssl-1.1.1-version-override.patch | 12 +- openssl.spec | 5 +- sources | 2 +- 6 files changed, 2369 insertions(+), 93 deletions(-) diff --git a/.gitignore b/.gitignore index 3305a0f..d1abce3 100644 --- a/.gitignore +++ b/.gitignore @@ -49,3 +49,4 @@ openssl-1.0.0a-usa.tar.bz2 /openssl-1.1.1f-hobbled.tar.xz /openssl-1.1.1g-hobbled.tar.xz /openssl-1.1.1h-hobbled.tar.xz +/openssl-1.1.1i-hobbled.tar.xz diff --git a/openssl-1.1.1-arm-update.patch b/openssl-1.1.1-arm-update.patch index 998905f..2b8c549 100644 --- a/openssl-1.1.1-arm-update.patch +++ b/openssl-1.1.1-arm-update.patch @@ -1,6 +1,6 @@ -diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl ---- openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl 2019-11-20 11:36:22.389506155 +0100 +diff -up openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl +--- openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/aes/asm/aesv8-armx.pl 2020-12-09 10:39:50.645705385 +0100 @@ -27,44 +27,72 @@ # CBC encrypt case. On Cortex-A57 parallelizable mode performance # seems to be limited by sheer amount of NEON instructions... @@ -85,7 +85,844 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c ___ # Assembler mnemonics are an eclectic mix of 32- and 64-bit syntax, -@@ -514,6 +542,13 @@ $code.=<<___; +@@ -361,6 +389,836 @@ ___ + &gen_block("en"); + &gen_block("de"); + }}} ++ ++# Performance in cycles per byte. ++# Processed with AES-ECB different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-ECB AES-192-ECB AES-256-ECB ++# Cortex-A57 1.85/0.82 2.16/0.96 2.47/1.10 ++# Cortex-A72 1.64/0.85 1.82/0.99 2.13/1.14 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If 5*16 > lsize >= 3*16 bytes, take 3 blocks as one iteration, ++# every loop lsize -=3*16. ++# If lsize < 3*16 bytes, treat them as the tail, interleave the ++# two blocks AES instructions. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store, just looks like what the original ECB implementation does. 
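++
++# Put differently, the dispatch above is roughly (a C-like pseudocode
++# sketch for illustration only; do_N_blocks()/do_tail() are placeholder
++# names, and the real control flow is the assembler emitted below):
++#
++#   while (lsize >= 5*16) { do_5_blocks(); lsize -= 5*16; }
++#   while (lsize >= 3*16) { do_3_blocks(); lsize -= 3*16; }
++#   if (lsize > 0)        { do_tail(); }  /* 1-2 blocks, interleaved */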
++ ++{{{ ++my ($inp,$out,$len,$key)=map("x$_",(0..3)); ++my ($enc,$rounds,$cnt,$key_,$step)=("w4","w5","w6","x7","x8"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++### q7 last round key ++### q10-q15 q7 Last 7 round keys ++### q8-q9 preloaded round keys except last 7 keys for big size ++### q5, q6, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___; ++.globl ${prefix}_ecb_encrypt ++.type ${prefix}_ecb_encrypt,%function ++.align 5 ++${prefix}_ecb_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ subs $len,$len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lecb_big_size ++ vld1.8 {$dat0},[$inp] ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ vld1.32 {q5-q6},[$key],#32 // load key schedule... ++ ++ b.eq .Lecb_small_dec ++ aese $dat0,q5 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... ++ aese $dat0,q6 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-ecb processing ++ b.eq .Lecb_128_enc ++.Lecb_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_round_loop ++.Lecb_128_enc: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_small_dec: ++ aesd $dat0,q5 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key],#32 // load key schedule... ++ aesd $dat0,q6 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lecb_128_dec ++.Lecb_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lecb_dec_round_loop ++.Lecb_128_dec: ++ vld1.32 {q10-q11},[$key],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key],#32 // load key schedule... ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ vst1.8 {$dat0},[$out] ++ b .Lecb_Final_abort ++.Lecb_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp x29,x30,[sp,#-16]! 
++ add x29,sp,#0 ++___ ++$code.=<<___ if ($flavour !~ /64/); ++ mov ip,sp ++ stmdb sp!,{r4-r8,lr} ++ vstmdb sp!,{d8-d15} @ ABI specification says so ++ ldmia ip,{r4-r5} @ load remaining args ++ subs $len,$len,#16 ++___ ++$code.=<<___; ++ mov $step,#16 ++ b.lo .Lecb_done ++ cclr $step,eq ++ ++ cmp $enc,#0 // en- or decrypting? ++ ldr $rounds,[$key,#240] ++ and $len,$len,#-16 ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key] // load key schedule... ++ sub $rounds,$rounds,#6 ++ add $key_,$key,x5,lsl#4 // pointer to last 7 round keys ++ sub $rounds,$rounds,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key,#32 ++ mov $cnt,$rounds ++ b.eq .Lecb_dec ++ ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_enc_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_enc ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_enc_tail4x ++ sub $len,$len,#0x50 ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aese 
$dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_enc ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_enc_tail ++ ++ b .Loop3x_ecb_enc ++ ++.align 4 ++.Lecb_enc_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr $dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_enc ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_enc_tail: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_enc_tail ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ 
aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lecb_enc_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_enc_one: ++ veor $tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ b .Lecb_done ++___ ++ ++$code.=<<___; ++.align 5 ++.Lecb_dec: ++ vld1.8 {$dat1},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $cnt,$rounds,#2 ++ vorr $in1,$dat1,$dat1 ++ vorr $dat2,$dat1,$dat1 ++ vorr $dat1,$dat,$dat ++ b.lo .Lecb_dec_tail ++ ++ vorr $dat1,$in1,$in1 ++ vld1.8 {$dat2},[$inp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#32 ++ b.lo .Loop3x_ecb_dec ++ ++ vld1.8 {$dat3},[$inp],#16 ++ vld1.8 {$dat4},[$inp],#16 ++ sub $len,$len,#32 // bias ++ mov $cnt,$rounds ++ ++.Loop5x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ cmp $len,#0x40 // because .Lecb_tail4x ++ sub $len,$len,#0x50 ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel x6,xzr,$len,gt // borrow x6, $cnt, "gt" is not typo ++ mov $key_,$key ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat4 ++ // are loaded with last "words" ++ add x6,$len,#0x60 // because .Lecb_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q15 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ vld1.8 {$in4},[$inp],#16 ++ cbz x6,.Lecb_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$rndlast,$dat0 ++ vorr $dat0,$in0,$in0 ++ veor $tmp1,$rndlast,$dat1 ++ vorr $dat1,$in1,$in1 ++ veor $tmp2,$rndlast,$dat2 ++ vorr $dat2,$in2,$in2 ++ 
veor $tmp3,$rndlast,$dat3 ++ vorr $dat3,$in3,$in3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat4,$in4,$in4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $cnt,$rounds ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_ecb_dec ++ ++ add $len,$len,#0x50 ++ cbz $len,.Lecb_done ++ ++ add $cnt,$rounds,#2 ++ subs $len,$len,#0x30 ++ vorr $dat0,$in2,$in2 ++ vorr $dat1,$in3,$in3 ++ vorr $dat2,$in4,$in4 ++ b.lo .Lecb_dec_tail ++ ++ b .Loop3x_ecb_dec ++ ++.align 4 ++.Lecb_tail4x: ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ veor $tmp3,$rndlast,$dat3 ++ veor $tmp4,$rndlast,$dat4 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ ++ b .Lecb_done ++.align 4 ++___ ++$code.=<<___; ++.Loop3x_ecb_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop3x_ecb_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ subs $len,$len,#0x30 ++ mov.lo x6,$len // x6, $cnt, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ add $inp,$inp,x6 // $inp is adjusted in such way that ++ // at exit from the loop $dat1-$dat2 ++ // are loaded with last "words" ++ mov $key_,$key ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $cnt,$rounds,#2 ++ veor $tmp0,$rndlast,$dat0 ++ veor $tmp1,$rndlast,$dat1 ++ veor $dat2,$dat2,$rndlast ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vorr $dat0,$in0,$in0 ++ vst1.8 {$tmp1},[$out],#16 ++ vorr $dat1,$in1,$in1 ++ vst1.8 {$dat2},[$out],#16 ++ vorr $dat2,$in2,$in2 ++ b.hs .Loop3x_ecb_dec ++ ++ cmn $len,#0x30 ++ b.eq .Lecb_done ++ nop ++ ++.Lecb_dec_tail: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $cnt,$cnt,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lecb_dec_tail ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lecb_dec_one ++ veor $tmp1,$rndlast,$dat1 ++ veor $tmp2,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ b .Lecb_done ++ ++.Lecb_dec_one: ++ veor 
$tmp1,$rndlast,$dat2 ++ vst1.8 {$tmp1},[$out],#16 ++ ++.Lecb_done: ++___ ++} ++$code.=<<___ if ($flavour !~ /64/); ++ vldmia sp!,{d8-d15} ++ ldmia sp!,{r4-r8,pc} ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ ldr x29,[sp],#16 ++___ ++$code.=<<___ if ($flavour =~ /64/); ++.Lecb_Final_abort: ++ ret ++___ ++$code.=<<___; ++.size ${prefix}_ecb_encrypt,.-${prefix}_ecb_encrypt ++___ ++}}} + {{{ + my ($inp,$out,$len,$key,$ivp)=map("x$_",(0..4)); my $enc="w5"; + my ($rounds,$cnt,$key_,$step,$step1)=($enc,"w6","x7","x8","x12"); +@@ -519,6 +1377,13 @@ $code.=<<___; ___ { my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); @@ -99,7 +936,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c $code.=<<___; .align 5 .Lcbc_dec: -@@ -530,7 +565,196 @@ $code.=<<___; +@@ -535,7 +1400,196 @@ $code.=<<___; vorr $in0,$dat,$dat vorr $in1,$dat1,$dat1 vorr $in2,$dat2,$dat2 @@ -225,7 +1062,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c + aesimc $dat3,$dat3 + aesd $dat4,q14 + aesimc $dat4,$dat4 -+ + + veor $tmp0,$ivec,$rndlast + aesd $dat0,q15 + veor $tmp1,$in0,$rndlast @@ -277,7 +1114,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c + b.lo .Lcbc_dec_tail + + b .Loop3x_cbc_dec - ++ +.align 4 +.Lcbc_tail4x: + veor $tmp1,$tmp0,$dat1 @@ -296,7 +1133,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c .Loop3x_cbc_dec: aesd $dat0,q8 aesimc $dat0,$dat0 -@@ -691,6 +915,9 @@ my $step="x12"; # aliases with $tctr2 +@@ -696,6 +1750,9 @@ my $step="x12"; # aliases with $tctr2 my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$ivec,$rndlast)=map("q$_",(0..7)); my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); @@ -306,10 +1143,10 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c my ($dat,$tmp)=($dat0,$tmp0); ### q8-q15 preloaded key schedule -@@ -743,6 +970,175 @@ $code.=<<___; - rev $tctr2, $ctr +@@ -751,6 +1808,175 @@ $code.=<<___; + vmov.32 ${ivec}[3],$tctr2 sub $len,$len,#3 // bias - vmov.32 ${dat2}[3],$tctr2 + vorr $dat2,$ivec,$ivec +___ +$code.=<<___ if ($flavour =~ /64/); + cmp $len,#2 @@ -482,7 +1319,1440 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c b .Loop3x_ctr32 .align 4 -@@ -955,7 +1351,7 @@ if ($flavour =~ /64/) { ######## 64-bi +@@ -905,6 +2131,1432 @@ $code.=<<___; + .size ${prefix}_ctr32_encrypt_blocks,.-${prefix}_ctr32_encrypt_blocks + ___ + }}} ++# Performance in cycles per byte. ++# Processed with AES-XTS different key size. ++# It shows the value before and after optimization as below: ++# (before/after): ++# ++# AES-128-XTS AES-256-XTS ++# Cortex-A57 3.36/1.09 4.02/1.37 ++# Cortex-A72 3.03/1.02 3.28/1.33 ++ ++# Optimization is implemented by loop unrolling and interleaving. ++# Commonly, we choose the unrolling factor as 5, if the input ++# data size smaller than 5 blocks, but not smaller than 3 blocks, ++# choose 3 as the unrolling factor. ++# If the input data size dsize >= 5*16 bytes, then take 5 blocks ++# as one iteration, every loop the left size lsize -= 5*16. ++# If lsize < 5*16 bytes, treat them as the tail. Note: left 4*16 bytes ++# will be processed specially, which be integrated into the 5*16 bytes ++# loop to improve the efficiency. ++# There is one special case, if the original input data size dsize ++# = 16 bytes, we will treat it seperately to improve the ++# performance: one independent code block without LR, FP load and ++# store. 
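++# Between consecutive blocks the tweak is advanced by multiplying it
++# by x in GF(2^128) (reduction polynomial 0x87); the extr/and/eor
++# sequences below compute, roughly (a C-like sketch for illustration
++# only, lo/hi being the two 64-bit halves of the tweak):
++#   mask = ((int64_t)hi >> 63) & 0x87;
++#   hi   = (hi << 1) | (lo >> 63);
++#   lo   = (lo << 1) ^ mask;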
++# Encryption will process the (length -tailcnt) bytes as mentioned ++# previously, then encrypt the composite block as last second ++# cipher block. ++# Decryption will process the (length -tailcnt -1) bytes as mentioned ++# previously, then decrypt the last second cipher block to get the ++# last plain block(tail), decrypt the composite block as last second ++# plain text block. ++ ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($tmpin)=("v26.16b"); ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_encrypt ++.type ${prefix}_xts_encrypt,%function ++.align 5 ++${prefix}_xts_encrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_enc_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_enc_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_enc_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aese $dat0,q20 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aese $dat0,q21 ++ aesmc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // if rounds==10, jump to aes-128-xts processing ++ b.eq .Lxts_128_enc ++.Lxts_enc_round_loop: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_enc_round_loop ++.Lxts_128_enc: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... 
++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$dat0,$iv0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_enc_final_abort ++ ++.align 4 ++.Lxts_enc_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ // tailcnt store the tail value of length%16. ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_abort ++ csel $step,xzr,$step,eq ++ ++ // Firstly, encrypt the iv with key2, as the first iv of XEX. ++ ldr $rounds,[$key2,#240] ++ vld1.32 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key2],#16 ++ ++.Loop_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // next starting point ++ vld1.8 {$dat},[$inp],$step ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 ++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ ++ // Encryption ++.Lxts_enc: ++ vld1.8 {$dat2},[$inp],#16 ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_enc_tail ++ veor $dat,$dat,$iv0 // before encryption, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // the third block ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_enc_tail ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ vld1.8 {$dat3},[$inp],#16 ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_enc ++ ++.align 4 ++.Loop5x_xts_enc: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese 
$dat4,q8 ++ aesmc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Loop5x_xts_enc ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat3,q8 ++ aesmc $dat3,$dat3 ++ aese $dat4,q8 ++ aesmc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat3,q9 ++ aesmc $dat3,$dat3 ++ aese $dat4,q9 ++ aesmc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aese $dat0,q10 ++ aesmc $dat0,$dat0 ++ aese $dat1,q10 ++ aesmc $dat1,$dat1 ++ aese $dat2,q10 ++ aesmc $dat2,$dat2 ++ aese $dat3,q10 ++ aesmc $dat3,$dat3 ++ aese $dat4,q10 ++ aesmc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_enc_tail4x ++ ++ aese $dat0,q11 ++ aesmc $dat0,$dat0 ++ aese $dat1,q11 ++ aesmc $dat1,$dat1 ++ aese $dat2,q11 ++ aesmc $dat2,$dat2 ++ aese $dat3,q11 ++ aesmc $dat3,$dat3 ++ aese $dat4,q11 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat3,q12 ++ aesmc $dat3,$dat3 ++ aese $dat4,q12 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat3,q13 ++ aesmc $dat3,$dat3 ++ aese $dat4,q13 ++ aesmc $dat4,$dat4 ++ ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat3,q14 ++ aesmc $dat3,$dat3 ++ aese $dat4,q14 ++ aesmc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aese $dat0,q15 ++ // The iv for first block of one iteration ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aese $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aese $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aese $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aese $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_enc_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ 
veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_enc ++ ++ ++ // If left 4 blocks, borrow the five block's processing. ++ cmn $len,#0x10 ++ b.ne .Loop5x_enc_after ++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_enc ++ ++.Loop5x_enc_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_enc_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_enc_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_enc_tail ++ ++.align 4 ++.Lxts_enc_tail4x: ++ add $inp,$inp,#16 ++ veor $tmp1,$dat1,$tmp1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_enc_done ++.align 4 ++.Lxts_outer_enc_tail: ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_enc_tail ++ ++ aese $dat0,q8 ++ aesmc $dat0,$dat0 ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ //mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr#31 ++ eor $ivl,$tmpmx,$ivl,lsl#1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aese $dat0,q9 ++ aesmc $dat0,$dat0 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset ++ mov $key_,$key1 ++ ++ aese $dat0,q12 ++ aesmc $dat0,$dat0 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ aese $dat0,q13 ++ aesmc $dat0,$dat0 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ aese $dat0,q14 ++ aesmc $dat0,$dat0 ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ aese $dat0,q15 ++ aese $dat1,q15 ++ aese $dat2,q15 ++ vld1.8 {$in2},[$inp],#16 ++ add $rounds,$rounds0,#2 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ cmn $len,#0x30 ++ b.eq .Lxts_enc_done ++.Lxts_encxor_one: ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_enc_tail: ++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_enc_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_enc_tail_loop: ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 
++ aesmc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_enc_tail_loop ++ ++ aese $dat1,q8 ++ aesmc $dat1,$dat1 ++ aese $dat2,q8 ++ aesmc $dat2,$dat2 ++ aese $dat1,q9 ++ aesmc $dat1,$dat1 ++ aese $dat2,q9 ++ aesmc $dat2,$dat2 ++ aese $dat1,q12 ++ aesmc $dat1,$dat1 ++ aese $dat2,q12 ++ aesmc $dat2,$dat2 ++ cmn $len,#0x20 ++ aese $dat1,q13 ++ aesmc $dat1,$dat1 ++ aese $dat2,q13 ++ aesmc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aese $dat1,q14 ++ aesmc $dat1,$dat1 ++ aese $dat2,q14 ++ aesmc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aese $dat1,q15 ++ aese $dat2,q15 ++ b.eq .Lxts_enc_one ++ veor $tmp1,$tmp1,$dat1 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vst1.8 {$tmp2},[$out],#16 ++ fmov $ivl,$ivd10 ++ fmov $ivh,$ivd11 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++ ++.Lxts_enc_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv0,$iv0 ++ vst1.8 {$tmp1},[$out],#16 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ b .Lxts_enc_done ++.align 5 ++.Lxts_enc_done: ++ // Process the tail block with cipher stealing. ++ tst $tailcnt,#0xf ++ b.eq .Lxts_abort ++ ++ mov $tmpinp,$inp ++ mov $tmpoutp,$out ++ sub $out,$out,#16 ++.composite_enc_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_enc_loop ++.Lxts_enc_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Encrypt the composite block to get the last second encrypted text block ++ ldr $rounds,[$key1,#240] // load key schedule... ++ vld1.8 {$dat},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key1],#16 // load key schedule... 
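++	// At this point [$out] holds the composite block produced by
++	// .composite_enc_loop: its first tailcnt bytes are the tail
++	// plaintext and the remainder comes from the last full ciphertext
++	// block (standard ciphertext stealing). It is now encrypted as an
++	// ordinary XTS block.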
++.Loop_final_enc: ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 ++ subs $rounds,$rounds,#2 ++ aese $tmpin,$dat1 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 ++ b.gt .Loop_final_enc ++ ++ aese $tmpin,$dat0 ++ aesmc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aese $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++.Lxts_enc_final_abort: ++ ret ++.size ${prefix}_xts_encrypt,.-${prefix}_xts_encrypt ++___ ++ ++}}} ++{{{ ++my ($inp,$out,$len,$key1,$key2,$ivp)=map("x$_",(0..5)); ++my ($rounds0,$rounds,$key_,$step,$ivl,$ivh)=("w5","w6","x7","x8","x9","x10"); ++my ($tmpoutp,$loutp,$l2outp,$tmpinp)=("x13","w14","w15","x20"); ++my ($tailcnt,$midnum,$midnumx,$constnum,$constnumx)=("x21","w22","x22","w19","x19"); ++my ($xoffset,$tmpmx,$tmpmw)=("x6","x11","w11"); ++my ($dat0,$dat1,$in0,$in1,$tmp0,$tmp1,$tmp2,$rndlast)=map("q$_",(0..7)); ++my ($iv0,$iv1,$iv2,$iv3,$iv4,$tmpin)=("v6.16b","v8.16b","v9.16b","v10.16b","v11.16b","v26.16b"); ++my ($ivd00,$ivd01,$ivd20,$ivd21)=("d6","v6.d[1]","d9","v9.d[1]"); ++my ($ivd10,$ivd11,$ivd30,$ivd31,$ivd40,$ivd41)=("d8","v8.d[1]","d10","v10.d[1]","d11","v11.d[1]"); ++ ++my ($dat,$tmp,$rndzero_n_last)=($dat0,$tmp0,$tmp1); ++ ++# q7 last round key ++# q10-q15, q7 Last 7 round keys ++# q8-q9 preloaded round keys except last 7 keys for big size ++# q20, q21, q8-q9 preloaded round keys except last 7 keys for only 16 byte ++ ++{ ++my ($dat2,$in2,$tmp2)=map("q$_",(10,11,9)); ++ ++my ($dat3,$in3,$tmp3); # used only in 64-bit mode ++my ($dat4,$in4,$tmp4); ++if ($flavour =~ /64/) { ++ ($dat2,$dat3,$dat4,$in2,$in3,$in4,$tmp3,$tmp4)=map("q$_",(16..23)); ++} ++ ++$code.=<<___ if ($flavour =~ /64/); ++.globl ${prefix}_xts_decrypt ++.type ${prefix}_xts_decrypt,%function ++.align 5 ++${prefix}_xts_decrypt: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ cmp $len,#16 ++ // Original input data size bigger than 16, jump to big size processing. ++ b.ne .Lxts_dec_big_size ++ // Encrypt the iv with key2, as the first XEX iv. ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_small_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_small_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ vld1.8 {$dat0},[$inp] ++ veor $dat0,$iv0,$dat0 ++ ++ ldr $rounds,[$key1,#240] ++ vld1.32 {q20-q21},[$key1],#32 // load key schedule... ++ ++ aesd $dat0,q20 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8-q9},[$key1],#32 // load key schedule... ++ aesd $dat0,q21 ++ aesimc $dat0,$dat0 ++ subs $rounds,$rounds,#10 // bias ++ b.eq .Lxts_128_dec ++.Lxts_dec_round_loop: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ vld1.32 {q8},[$key1],#16 // load key schedule... ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q9},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 // bias ++ b.gt .Lxts_dec_round_loop ++.Lxts_128_dec: ++ vld1.32 {q10-q11},[$key1],#32 // load key schedule... ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ vld1.32 {q12-q13},[$key1],#32 // load key schedule... 
++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ vld1.32 {q14-q15},[$key1],#32 // load key schedule... ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ vld1.32 {$rndlast},[$key1] ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat0,q15 ++ veor $dat0,$dat0,$rndlast ++ veor $dat0,$iv0,$dat0 ++ vst1.8 {$dat0},[$out] ++ b .Lxts_dec_final_abort ++.Lxts_dec_big_size: ++___ ++$code.=<<___ if ($flavour =~ /64/); ++ stp $constnumx,$tmpinp,[sp,#-64]! ++ stp $tailcnt,$midnumx,[sp,#48] ++ stp $ivd10,$ivd20,[sp,#32] ++ stp $ivd30,$ivd40,[sp,#16] ++ ++ and $tailcnt,$len,#0xf ++ and $len,$len,#-16 ++ subs $len,$len,#16 ++ mov $step,#16 ++ b.lo .Lxts_dec_abort ++ ++ // Encrypt the iv with key2, as the first XEX iv ++ ldr $rounds,[$key2,#240] ++ vld1.8 {$dat},[$key2],#16 ++ vld1.8 {$iv0},[$ivp] ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key2],#16 ++ ++.Loop_dec_iv_enc: ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2],#16 ++ subs $rounds,$rounds,#2 ++ aese $iv0,$dat1 ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat1},[$key2],#16 ++ b.gt .Loop_dec_iv_enc ++ ++ aese $iv0,$dat ++ aesmc $iv0,$iv0 ++ vld1.32 {$dat},[$key2] ++ aese $iv0,$dat1 ++ veor $iv0,$iv0,$dat ++ ++ // The iv for second block ++ // $ivl- iv(low), $ivh - iv(high) ++ // the five ivs stored into, $iv0,$iv1,$iv2,$iv3,$iv4 ++ fmov $ivl,$ivd00 ++ fmov $ivh,$ivd01 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ ldr $rounds0,[$key1,#240] // load rounds number ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ vld1.32 {q8-q9},[$key1] // load key schedule... ++ sub $rounds0,$rounds0,#6 ++ add $key_,$key1,$ivp,lsl#4 // pointer to last 7 round keys ++ sub $rounds0,$rounds0,#2 ++ vld1.32 {q10-q11},[$key_],#32 // load key schedule... 
++ vld1.32 {q12-q13},[$key_],#32 ++ vld1.32 {q14-q15},[$key_],#32 ++ vld1.32 {$rndlast},[$key_] ++ ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ ++ add $key_,$key1,#32 ++ mov $rounds,$rounds0 ++ b .Lxts_dec ++ ++ // Decryption ++.align 5 ++.Lxts_dec: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_begin ++ subs $len,$len,#16 ++ csel $step,xzr,$step,eq ++ vld1.8 {$dat},[$inp],#16 ++ b.lo .Lxts_done ++ sub $inp,$inp,#16 ++.Lxts_dec_begin: ++ vld1.8 {$dat},[$inp],$step ++ subs $len,$len,#32 // bias ++ add $rounds,$rounds0,#2 ++ vorr $in1,$dat,$dat ++ vorr $dat1,$dat,$dat ++ vorr $in3,$dat,$dat ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in2,$dat2,$dat2 ++ vorr $in4,$dat2,$dat2 ++ b.lo .Lxts_inner_dec_tail ++ veor $dat,$dat,$iv0 // before decryt, xor with iv ++ veor $dat2,$dat2,$iv1 ++ ++ vorr $dat1,$dat2,$dat2 ++ vld1.8 {$dat2},[$inp],#16 ++ vorr $in0,$dat,$dat ++ vorr $in1,$dat1,$dat1 ++ veor $in2,$dat2,$iv2 // third block xox with third iv ++ veor $dat2,$dat2,$iv2 ++ cmp $len,#32 ++ b.lo .Lxts_outer_dec_tail ++ ++ vld1.8 {$dat3},[$inp],#16 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$dat4},[$inp],#16 ++ veor $dat3,$dat3,$iv3 // the fourth block ++ veor $dat4,$dat4,$iv4 ++ sub $len,$len,#32 // bias ++ mov $rounds,$rounds0 ++ b .Loop5x_xts_dec ++ ++.align 4 ++.Loop5x_xts_dec: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ vld1.32 {q8},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ vld1.32 {q9},[$key_],#16 // load key schedule... 
++ b.gt .Loop5x_xts_dec ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q8 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q8 ++ aesimc $dat4,$dat4 ++ subs $len,$len,#0x50 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q9 ++ aesimc $dat0,$dat ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q9 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q9 ++ aesimc $dat4,$dat4 ++ csel $xoffset,xzr,$len,gt // borrow x6, w6, "gt" is not typo ++ mov $key_,$key1 ++ ++ aesd $dat0,q10 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q10 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q10 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q10 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q10 ++ aesimc $dat4,$dat4 ++ add $inp,$inp,$xoffset // x0 is adjusted in such way that ++ // at exit from the loop v1.16b-v26.16b ++ // are loaded with last "words" ++ add $xoffset,$len,#0x60 // because .Lxts_dec_tail4x ++ ++ aesd $dat0,q11 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q11 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q11 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q11 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q11 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q12 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q12 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q13 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q13 ++ aesimc $dat4,$dat4 ++ ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ aesd $dat3,q14 ++ aesimc $dat3,$dat3 ++ aesd $dat4,q14 ++ aesimc $dat4,$dat4 ++ ++ veor $tmp0,$rndlast,$iv0 ++ aesd $dat0,q15 ++ // The iv for first block of next iteration. 
++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$rndlast,$iv1 ++ vld1.8 {$in0},[$inp],#16 ++ aesd $dat1,q15 ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ veor $tmp2,$rndlast,$iv2 ++ vld1.8 {$in1},[$inp],#16 ++ aesd $dat2,q15 ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ veor $tmp3,$rndlast,$iv3 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat3,q15 ++ // The iv for fourth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd30,$ivl ++ fmov $ivd31,$ivh ++ veor $tmp4,$rndlast,$iv4 ++ vld1.8 {$in3},[$inp],#16 ++ aesd $dat4,q15 ++ ++ // The iv for fifth block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd40,$ivl ++ fmov $ivd41,$ivh ++ ++ vld1.8 {$in4},[$inp],#16 ++ cbz $xoffset,.Lxts_dec_tail4x ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ veor $tmp0,$tmp0,$dat0 ++ veor $dat0,$in0,$iv0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat1,$in1,$iv1 ++ veor $tmp2,$tmp2,$dat2 ++ veor $dat2,$in2,$iv2 ++ veor $tmp3,$tmp3,$dat3 ++ veor $dat3,$in3,$iv3 ++ veor $tmp4,$tmp4,$dat4 ++ vst1.8 {$tmp0},[$out],#16 ++ veor $dat4,$in4,$iv4 ++ vst1.8 {$tmp1},[$out],#16 ++ mov $rounds,$rounds0 ++ vst1.8 {$tmp2},[$out],#16 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp3},[$out],#16 ++ vst1.8 {$tmp4},[$out],#16 ++ b.hs .Loop5x_xts_dec ++ ++ cmn $len,#0x10 ++ b.ne .Loop5x_dec_after ++ // If x2($len) equal to -0x10, the left blocks is 4. ++ // After specially processing, utilize the five blocks processing again. ++ // It will use the following IVs: $iv0,$iv0,$iv1,$iv2,$iv3. 
++ vorr $iv4,$iv3,$iv3 ++ vorr $iv3,$iv2,$iv2 ++ vorr $iv2,$iv1,$iv1 ++ vorr $iv1,$iv0,$iv0 ++ fmov $ivl,$ivd40 ++ fmov $ivh,$ivd41 ++ veor $dat0,$iv0,$in0 ++ veor $dat1,$iv1,$in1 ++ veor $dat2,$in2,$iv2 ++ veor $dat3,$in3,$iv3 ++ veor $dat4,$in4,$iv4 ++ b.eq .Loop5x_xts_dec ++ ++.Loop5x_dec_after: ++ add $len,$len,#0x50 ++ cbz $len,.Lxts_done ++ ++ add $rounds,$rounds0,#2 ++ subs $len,$len,#0x30 ++ b.lo .Lxts_inner_dec_tail ++ ++ veor $dat0,$iv0,$in2 ++ veor $dat1,$iv1,$in3 ++ veor $dat2,$in4,$iv2 ++ b .Lxts_outer_dec_tail ++ ++.align 4 ++.Lxts_dec_tail4x: ++ add $inp,$inp,#16 ++ vld1.32 {$dat0},[$inp],#16 ++ veor $tmp1,$dat1,$tmp0 ++ vst1.8 {$tmp1},[$out],#16 ++ veor $tmp2,$dat2,$tmp2 ++ vst1.8 {$tmp2},[$out],#16 ++ veor $tmp3,$dat3,$tmp3 ++ veor $tmp4,$dat4,$tmp4 ++ vst1.8 {$tmp3-$tmp4},[$out],#32 ++ ++ b .Lxts_done ++.align 4 ++.Lxts_outer_dec_tail: ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_outer_dec_tail ++ ++ aesd $dat0,q8 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ veor $tmp0,$iv0,$rndlast ++ subs $len,$len,#0x30 ++ // The iv for first block ++ fmov $ivl,$ivd20 ++ fmov $ivh,$ivd21 ++ mov $constnum,#0x87 ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd00,$ivl ++ fmov $ivd01,$ivh ++ veor $tmp1,$iv1,$rndlast ++ csel $xoffset,$len,$xoffset,lo // x6, w6, is zero at this point ++ aesd $dat0,q9 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv2,$rndlast ++ // The iv for second block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd10,$ivl ++ fmov $ivd11,$ivh ++ ++ add $xoffset,$xoffset,#0x20 ++ add $inp,$inp,$xoffset // $inp is adjusted to the last data ++ ++ mov $key_,$key1 ++ ++ // The iv for third block ++ extr $midnumx,$ivh,$ivh,#32 ++ extr $ivh,$ivh,$ivl,#63 ++ and $tmpmw,$constnum,$midnum,asr #31 ++ eor $ivl,$tmpmx,$ivl,lsl #1 ++ fmov $ivd20,$ivl ++ fmov $ivd21,$ivh ++ ++ aesd $dat0,q12 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q13 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ aesd $dat0,q14 ++ aesimc $dat0,$dat0 ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ vld1.8 {$in2},[$inp],#16 ++ aesd $dat0,q15 ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ vld1.32 {q8},[$key_],#16 // re-pre-load rndkey[0] ++ add $rounds,$rounds0,#2 ++ veor $tmp0,$tmp0,$dat0 ++ veor $tmp1,$tmp1,$dat1 ++ veor $dat2,$dat2,$tmp2 ++ vld1.32 {q9},[$key_],#16 // re-pre-load rndkey[1] ++ vst1.8 {$tmp0},[$out],#16 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$dat2},[$out],#16 ++ ++ cmn $len,#0x30 ++ add $len,$len,#0x30 ++ b.eq .Lxts_done ++ sub $len,$len,#0x30 ++ vorr $in3,$in1,$in1 ++ vorr $in4,$in2,$in2 ++ nop ++ ++.Lxts_inner_dec_tail: ++ // $len == -0x10 means two blocks left. 
++ cmn $len,#0x10 ++ veor $dat1,$in3,$iv0 ++ veor $dat2,$in4,$iv1 ++ b.eq .Lxts_dec_tail_loop ++ veor $dat2,$in4,$iv0 ++.Lxts_dec_tail_loop: ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ vld1.32 {q8},[$key_],#16 ++ subs $rounds,$rounds,#2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ vld1.32 {q9},[$key_],#16 ++ b.gt .Lxts_dec_tail_loop ++ ++ aesd $dat1,q8 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q8 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q9 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q9 ++ aesimc $dat2,$dat2 ++ aesd $dat1,q12 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q12 ++ aesimc $dat2,$dat2 ++ cmn $len,#0x20 ++ aesd $dat1,q13 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q13 ++ aesimc $dat2,$dat2 ++ veor $tmp1,$iv0,$rndlast ++ aesd $dat1,q14 ++ aesimc $dat1,$dat1 ++ aesd $dat2,q14 ++ aesimc $dat2,$dat2 ++ veor $tmp2,$iv1,$rndlast ++ aesd $dat1,q15 ++ aesd $dat2,q15 ++ b.eq .Lxts_dec_one ++ veor $tmp1,$tmp1,$dat1 ++ veor $tmp2,$tmp2,$dat2 ++ vorr $iv0,$iv2,$iv2 ++ vorr $iv1,$iv3,$iv3 ++ vst1.8 {$tmp1},[$out],#16 ++ vst1.8 {$tmp2},[$out],#16 ++ add $len,$len,#16 ++ b .Lxts_done ++ ++.Lxts_dec_one: ++ veor $tmp1,$tmp1,$dat2 ++ vorr $iv0,$iv1,$iv1 ++ vorr $iv1,$iv2,$iv2 ++ vst1.8 {$tmp1},[$out],#16 ++ add $len,$len,#32 ++ ++.Lxts_done: ++ tst $tailcnt,#0xf ++ b.eq .Lxts_dec_abort ++ // Processing the last two blocks with cipher stealing. ++ mov x7,x3 ++ cbnz x2,.Lxts_dec_1st_done ++ vld1.32 {$dat0},[$inp],#16 ++ ++ // Decrypt the last secod block to get the last plain text block ++.Lxts_dec_1st_done: ++ eor $tmpin,$dat0,$iv1 ++ ldr $rounds,[$key1,#240] ++ vld1.32 {$dat0},[$key1],#16 ++ sub $rounds,$rounds,#2 ++ vld1.32 {$dat1},[$key1],#16 ++.Loop_final_2nd_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key1],#16 // load key schedule... ++ b.gt .Loop_final_2nd_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key1] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv1 ++ vst1.8 {$tmpin},[$out] ++ ++ mov $tmpinp,$inp ++ add $tmpoutp,$out,#16 ++ ++ // Composite the tailcnt "16 byte not aligned block" into the last second plain blocks ++ // to get the last encrypted block. ++.composite_dec_loop: ++ subs $tailcnt,$tailcnt,#1 ++ ldrb $l2outp,[$out,$tailcnt] ++ ldrb $loutp,[$tmpinp,$tailcnt] ++ strb $l2outp,[$tmpoutp,$tailcnt] ++ strb $loutp,[$out,$tailcnt] ++ b.gt .composite_dec_loop ++.Lxts_dec_load_done: ++ vld1.8 {$tmpin},[$out] ++ veor $tmpin,$tmpin,$iv0 ++ ++ // Decrypt the composite block to get the last second plain text block ++ ldr $rounds,[$key_,#240] ++ vld1.8 {$dat},[$key_],#16 ++ sub $rounds,$rounds,#2 ++ vld1.8 {$dat1},[$key_],#16 ++.Loop_final_dec: ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_],#16 // load key schedule... ++ subs $rounds,$rounds,#2 ++ aesd $tmpin,$dat1 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat1},[$key_],#16 // load key schedule... 
++ b.gt .Loop_final_dec ++ ++ aesd $tmpin,$dat0 ++ aesimc $tmpin,$tmpin ++ vld1.32 {$dat0},[$key_] ++ aesd $tmpin,$dat1 ++ veor $tmpin,$tmpin,$dat0 ++ veor $tmpin,$tmpin,$iv0 ++ vst1.8 {$tmpin},[$out] ++ ++.Lxts_dec_abort: ++ ldp $tailcnt,$midnumx,[sp,#48] ++ ldp $ivd10,$ivd20,[sp,#32] ++ ldp $ivd30,$ivd40,[sp,#16] ++ ldp $constnumx,$tmpinp,[sp],#64 ++ ++.Lxts_dec_final_abort: ++ ret ++.size ${prefix}_xts_decrypt,.-${prefix}_xts_decrypt ++___ ++} ++}}} + $code.=<<___; + #endif + ___ +@@ -963,7 +3615,7 @@ if ($flavour =~ /64/) { ######## 64-bi # since ARMv7 instructions are always encoded little-endian. # correct solution is to use .inst directive, but older # assemblers don't implement it:-( @@ -491,7 +2761,7 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c $word&0xff,($word>>8)&0xff, ($word>>16)&0xff,($word>>24)&0xff, $mnemonic,$arg; -@@ -996,14 +1392,17 @@ if ($flavour =~ /64/) { ######## 64-bi +@@ -1004,14 +3656,17 @@ if ($flavour =~ /64/) { ######## 64-bi s/\],#[0-9]+/]!/o; s/[v]?(aes\w+)\s+([qv].*)/unaes($1,$2)/geo or @@ -511,9 +2781,9 @@ diff -up openssl-1.1.1c/crypto/aes/asm/aesv8-armx.pl.arm-update openssl-1.1.1c/c print $_,"\n"; } } -diff -up openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl ---- openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl 2019-11-20 11:36:22.389506155 +0100 +diff -up openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl +--- openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/aes/asm/vpaes-armv8.pl 2020-12-09 10:37:38.405558929 +0100 @@ -30,6 +30,7 @@ # Denver(***) 16.6(**) 15.1/17.8(**) [8.80/9.93 ] # Apple A7(***) 22.7(**) 10.9/14.3 [8.45/10.0 ] @@ -522,9 +2792,9 @@ diff -up openssl-1.1.1c/crypto/aes/asm/vpaes-armv8.pl.arm-update openssl-1.1.1c/ # # (*) ECB denotes approximate result for parallelizable modes # such as CBC decrypt, CTR, etc.; -diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl ---- openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl 2019-11-21 16:44:50.814651553 +0100 +diff -up openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl +--- openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/chacha/asm/chacha-armv8.pl 2020-12-09 10:40:57.922288627 +0100 @@ -18,32 +18,44 @@ # # ChaCha20 for ARMv8. 
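The XTS routines added to aesv8-armx.pl above advance the 128-bit tweak with the mov/extr/and/eor sequence built around the 0x87 constant: a multiplication by x (alpha) in GF(2^128) modulo x^128 + x^7 + x^2 + x + 1. A minimal C sketch of that tweak update, assuming the tweak is held as two little-endian 64-bit halves the way the assembly holds it in $ivl/$ivh (the helper name is illustrative only, not part of the patch):

#include <stdint.h>

/*
 * Multiply the XTS tweak by x (alpha) in GF(2^128): shift the 128-bit
 * value left by one bit and, when a bit carries out of the top, fold
 * the reduction constant 0x87 back into the low byte. This is what
 * the "and ...,asr #31" / "eor ...,lsl #1" pair above computes.
 */
static void xts_next_tweak(uint64_t *lo, uint64_t *hi)
{
    uint64_t carry = -(*hi >> 63) & 0x87;   /* 0x87 iff the top bit is set */
    *hi = (*hi << 1) | (*lo >> 63);
    *lo = (*lo << 1) ^ carry;
}

Each 16-byte block is XORed with the current tweak before and after the raw AES rounds, and the tweak advances once per block; because the update needs only cheap integer instructions, the next few tweaks can be computed while earlier blocks are still in the AES pipeline, which is what makes the 5x interleaving profitable.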
@@ -585,20 +2855,22 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 *STDOUT=*OUT; sub AUTOLOAD() # thunk [simplified] x86-style perlasm -@@ -120,41 +132,36 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) +@@ -120,42 +132,37 @@ my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1) } $code.=<<___; -#include "arm_arch.h" +- +-.text +- +#ifndef __KERNEL__ +# include "arm_arch.h" -+.extern OPENSSL_armcap_P + .extern OPENSSL_armcap_P + .hidden OPENSSL_armcap_P +#endif ++ ++.text - .text - --.extern OPENSSL_armcap_P -- .align 5 .Lsigma: .quad 0x3320646e61707865,0x6b20657479622d32 // endian-neutral @@ -641,7 +2913,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 .Lshort: .inst 0xd503233f // paciasp -@@ -173,7 +180,7 @@ ChaCha20_ctr32: +@@ -174,7 +181,7 @@ ChaCha20_ctr32: ldp @d[2],@d[3],[$key] // load key ldp @d[4],@d[5],[$key,#16] ldp @d[6],@d[7],[$ctr] // load counter @@ -650,7 +2922,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 ror @d[4],@d[4],#32 -@@ -242,7 +249,7 @@ $code.=<<___; +@@ -243,7 +250,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -659,7 +2931,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -299,7 +306,7 @@ $code.=<<___; +@@ -300,7 +307,7 @@ $code.=<<___; add @x[10],@x[10],@x[11],lsl#32 add @x[12],@x[12],@x[13],lsl#32 add @x[14],@x[14],@x[15],lsl#32 @@ -668,7 +2940,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -340,46 +347,91 @@ $code.=<<___; +@@ -341,46 +348,91 @@ $code.=<<___; ___ {{{ @@ -789,7 +3061,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 .inst 0xd503233f // paciasp stp x29,x30,[sp,#-96]! 
add x29,sp,#0 -@@ -402,8 +454,9 @@ ChaCha20_neon: +@@ -403,8 +455,9 @@ ChaCha20_neon: ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] @@ -801,7 +3073,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 -@@ -412,115 +465,129 @@ ChaCha20_neon: +@@ -413,115 +466,129 @@ ChaCha20_neon: ror @d[6],@d[6],#32 ror @d[7],@d[7],#32 #endif @@ -1013,7 +3285,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -530,48 +597,68 @@ $code.=<<___; +@@ -531,48 +598,68 @@ $code.=<<___; rev @x[12],@x[12] rev @x[14],@x[14] #endif @@ -1106,7 +3378,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp x19,x20,[x29,#16] add sp,sp,#64 ldp x21,x22,[x29,#32] -@@ -582,8 +669,10 @@ $code.=<<___; +@@ -583,8 +670,10 @@ $code.=<<___; .inst 0xd50323bf // autiasp ret @@ -1118,7 +3390,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 cmp $len,#64 b.lo .Less_than_64 -@@ -600,7 +689,7 @@ $code.=<<___; +@@ -601,7 +690,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -1127,7 +3399,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -620,48 +709,68 @@ $code.=<<___; +@@ -621,48 +710,68 @@ $code.=<<___; eor @x[14],@x[14],@x[15] stp @x[0],@x[2],[$out,#0] // store output @@ -1220,7 +3492,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 sub $out,$out,#1 add $inp,$inp,$len add $out,$out,$len -@@ -694,9 +803,41 @@ $code.=<<___; +@@ -695,9 +804,41 @@ $code.=<<___; .size ChaCha20_neon,.-ChaCha20_neon ___ { @@ -1263,7 +3535,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 $code.=<<___; .type ChaCha20_512_neon,%function -@@ -716,6 +857,7 @@ ChaCha20_512_neon: +@@ -717,6 +858,7 @@ ChaCha20_512_neon: .L512_or_more_neon: sub sp,sp,#128+64 @@ -1271,7 +3543,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp @d[0],@d[1],[@x[0]] // load sigma ld1 {@K[0]},[@x[0]],#16 ldp @d[2],@d[3],[$key] // load key -@@ -723,8 +865,9 @@ ChaCha20_512_neon: +@@ -724,8 +866,9 @@ ChaCha20_512_neon: ld1 {@K[1],@K[2]},[$key] ldp @d[6],@d[7],[$ctr] // load counter ld1 {@K[3]},[$ctr] @@ -1283,7 +3555,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev64 @K[0],@K[0] ror @d[2],@d[2],#32 ror @d[3],@d[3],#32 -@@ -791,9 +934,10 @@ ChaCha20_512_neon: +@@ -792,9 +935,10 @@ ChaCha20_512_neon: mov $C4,@K[2] stp @K[3],@K[4],[sp,#48] // off-load key block, variable part mov $C5,@K[2] @@ -1295,7 +3567,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 subs $len,$len,#512 .Loop_upper_neon: sub $ctr,$ctr,#1 -@@ -866,7 +1010,7 @@ $code.=<<___; +@@ -867,7 +1011,7 @@ $code.=<<___; add @x[14],@x[14],@x[15],lsl#32 ldp @x[13],@x[15],[$inp,#48] add $inp,$inp,#64 @@ -1304,7 +3576,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -955,6 +1099,7 @@ $code.=<<___; +@@ -956,6 +1100,7 @@ $code.=<<___; add.32 @x[2],@x[2],@d[1] ldp @K[4],@K[5],[sp,#64] add @x[3],@x[3],@d[1],lsr#32 @@ -1312,7 +3584,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 add $A0,$A0,@K[0] add.32 
@x[4],@x[4],@d[2] add $A1,$A1,@K[0] -@@ -1007,7 +1152,7 @@ $code.=<<___; +@@ -1008,7 +1153,7 @@ $code.=<<___; add $inp,$inp,#64 add $B5,$B5,@K[1] @@ -1321,7 +3593,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 rev @x[0],@x[0] rev @x[2],@x[2] rev @x[4],@x[4] -@@ -1085,26 +1230,26 @@ $code.=<<___; +@@ -1086,26 +1231,26 @@ $code.=<<___; b.hs .Loop_outer_512_neon adds $len,$len,#512 @@ -1356,7 +3628,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 eor @K[1],@K[1],@K[1] eor @K[2],@K[2],@K[2] eor @K[3],@K[3],@K[3] -@@ -1114,6 +1259,7 @@ $code.=<<___; +@@ -1115,6 +1260,7 @@ $code.=<<___; b .Loop_outer .Ldone_512_neon: @@ -1364,7 +3636,7 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 ldp x19,x20,[x29,#16] add sp,sp,#128+64 ldp x21,x22,[x29,#32] -@@ -1132,9 +1278,11 @@ foreach (split("\n",$code)) { +@@ -1133,9 +1279,11 @@ foreach (split("\n",$code)) { s/\`([^\`]*)\`/eval $1/geo; (s/\b([a-z]+)\.32\b/$1/ and (s/x([0-9]+)/w$1/g or 1)) or @@ -1377,9 +3649,9 @@ diff -up openssl-1.1.1c/crypto/chacha/asm/chacha-armv8.pl.arm-update openssl-1.1 (s/\brev32\.16\b/rev32/ and (s/\.4s/\.8h/g or 1)); #s/\bq([0-9]+)#(lo|hi)/sprintf "d%d",2*$1+($2 eq "hi")/geo; -diff -up openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl ---- openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl 2019-11-20 11:36:22.389506155 +0100 +diff -up openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl +--- openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/modes/asm/ghashv8-armx.pl 2020-12-09 10:37:38.408558954 +0100 @@ -42,6 +42,7 @@ # Denver 0.51 0.65 6.02 # Mongoose 0.65 1.10 8.06 @@ -1388,9 +3660,9 @@ diff -up openssl-1.1.1c/crypto/modes/asm/ghashv8-armx.pl.arm-update openssl-1.1. # # (*) presented for reference/comparison purposes; -diff -up openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl ---- openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl +--- openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/poly1305/asm/poly1305-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -29,6 +29,7 @@ # X-Gene 2.13/+68% 2.27 # Mongoose 1.77/+75% 1.12 @@ -1399,9 +3671,9 @@ diff -up openssl-1.1.1c/crypto/poly1305/asm/poly1305-armv8.pl.arm-update openssl # # (*) estimate based on resources availability is less than 1.0, # i.e. 
measured result is worse than expected, presumably binary -diff -up openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/keccak1600-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -51,6 +51,7 @@ # Kryo 12 # Denver 7.8 @@ -1410,9 +3682,9 @@ diff -up openssl-1.1.1c/crypto/sha/asm/keccak1600-armv8.pl.arm-update openssl-1. # # (*) Corresponds to SHA3-256. No improvement coefficients are listed # because they vary too much from compiler to compiler. Newer -diff -up openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/sha1-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -27,6 +27,7 @@ # X-Gene 8.80 (+200%) # Mongoose 2.05 6.50 (+160%) @@ -1421,9 +3693,9 @@ diff -up openssl-1.1.1c/crypto/sha/asm/sha1-armv8.pl.arm-update openssl-1.1.1c/c # # (*) Software results are presented mostly for reference purposes. 
# (**) Keep in mind that Denver relies on binary translation, which -diff -up openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl ---- openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl.arm-update 2019-05-28 15:12:21.000000000 +0200 -+++ openssl-1.1.1c/crypto/sha/asm/sha512-armv8.pl 2019-11-20 11:36:22.390506137 +0100 +diff -up openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl +--- openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl.arm-update 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/sha/asm/sha512-armv8.pl 2020-12-09 10:37:38.408558954 +0100 @@ -28,6 +28,7 @@ # X-Gene 20.0 (+100%) 12.8 (+300%(***)) # Mongoose 2.36 13.0 (+50%) 8.36 (+33%) diff --git a/openssl-1.1.1-fips-post-rand.patch b/openssl-1.1.1-fips-post-rand.patch index 18a01fe..027dc55 100644 --- a/openssl-1.1.1-fips-post-rand.patch +++ b/openssl-1.1.1-fips-post-rand.patch @@ -1,6 +1,6 @@ -diff -up openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand openssl-1.1.1e/crypto/fips/fips.c ---- openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand 2020-03-17 18:06:16.822418854 +0100 -+++ openssl-1.1.1e/crypto/fips/fips.c 2020-03-17 18:06:16.861418172 +0100 +diff -up openssl-1.1.1i/crypto/fips/fips.c.fips-post-rand openssl-1.1.1i/crypto/fips/fips.c +--- openssl-1.1.1i/crypto/fips/fips.c.fips-post-rand 2020-12-09 10:26:41.634106328 +0100 ++++ openssl-1.1.1i/crypto/fips/fips.c 2020-12-09 10:26:41.652106475 +0100 @@ -68,6 +68,7 @@ # include @@ -51,10 +51,10 @@ diff -up openssl-1.1.1e/crypto/fips/fips.c.fips-post-rand openssl-1.1.1e/crypto/ ret = 1; goto end; } -diff -up openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1e/crypto/rand/drbg_lib.c ---- openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/crypto/rand/drbg_lib.c 2020-03-17 18:07:35.305045521 +0100 -@@ -1009,6 +1009,20 @@ size_t rand_drbg_seedlen(RAND_DRBG *drbg +diff -up openssl-1.1.1i/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1i/crypto/rand/drbg_lib.c +--- openssl-1.1.1i/crypto/rand/drbg_lib.c.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/rand/drbg_lib.c 2020-12-09 10:26:41.652106475 +0100 +@@ -1005,6 +1005,20 @@ size_t rand_drbg_seedlen(RAND_DRBG *drbg return min_entropy > min_entropylen ? 
min_entropy : min_entropylen; } @@ -75,9 +75,9 @@ diff -up openssl-1.1.1e/crypto/rand/drbg_lib.c.fips-post-rand openssl-1.1.1e/cry /* Implements the default OpenSSL RAND_add() method */ static int drbg_add(const void *buf, int num, double randomness) { -diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/crypto/rand/rand_unix.c ---- openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/crypto/rand/rand_unix.c 2020-03-17 18:09:01.503537189 +0100 +diff -up openssl-1.1.1i/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1i/crypto/rand/rand_unix.c +--- openssl-1.1.1i/crypto/rand/rand_unix.c.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/crypto/rand/rand_unix.c 2020-12-09 10:36:59.531221903 +0100 @@ -17,10 +17,12 @@ #include #include "rand_local.h" @@ -91,7 +91,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr # ifdef DEVRANDOM_WAIT # include # include -@@ -342,7 +344,7 @@ static ssize_t sysctl_random(char *buf, +@@ -344,7 +346,7 @@ static ssize_t sysctl_random(char *buf, * syscall_random(): Try to get random data using a system call * returns the number of bytes returned in buf, or < 0 on error. */ @@ -100,15 +100,15 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr { /* * Note: 'buflen' equals the size of the buffer which is used by the -@@ -364,6 +366,7 @@ static ssize_t syscall_random(void *buf, - * - Linux since 3.17 with glibc 2.25 - * - FreeBSD since 12.0 (1200061) +@@ -369,6 +371,7 @@ static ssize_t syscall_random(void *buf, + * Note: Sometimes getentropy() can be provided but not implemented + * internally. So we need to check errno for ENOSYS */ +# if 0 # if defined(__GNUC__) && __GNUC__>=2 && defined(__ELF__) && !defined(__hpux) extern int getentropy(void *buffer, size_t length) __attribute__((weak)); -@@ -385,10 +388,10 @@ static ssize_t syscall_random(void *buf, +@@ -394,10 +397,10 @@ static ssize_t syscall_random(void *buf, if (p_getentropy.p != NULL) return p_getentropy.f(buf, buflen) == 0 ? 
(ssize_t)buflen : -1; # endif @@ -122,7 +122,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr # elif (defined(__FreeBSD__) || defined(__NetBSD__)) && defined(KERN_ARND) return sysctl_random(buf, buflen); # else -@@ -623,6 +626,9 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -633,6 +636,9 @@ size_t rand_pool_acquire_entropy(RAND_PO size_t entropy_available; # if defined(OPENSSL_RAND_SEED_GETRANDOM) @@ -132,7 +132,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr { size_t bytes_needed; unsigned char *buffer; -@@ -633,7 +639,7 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -643,7 +649,7 @@ size_t rand_pool_acquire_entropy(RAND_PO bytes_needed = rand_pool_bytes_needed(pool, 1 /*entropy_factor*/); while (bytes_needed != 0 && attempts-- > 0) { buffer = rand_pool_add_begin(pool, bytes_needed); @@ -141,7 +141,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr if (bytes > 0) { rand_pool_add_end(pool, bytes, 8 * bytes); bytes_needed -= bytes; -@@ -668,8 +674,10 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -678,8 +684,10 @@ size_t rand_pool_acquire_entropy(RAND_PO int attempts = 3; const int fd = get_random_device(i); @@ -153,7 +153,7 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr while (bytes_needed != 0 && attempts-- > 0) { buffer = rand_pool_add_begin(pool, bytes_needed); -@@ -732,7 +740,9 @@ size_t rand_pool_acquire_entropy(RAND_PO +@@ -742,7 +750,9 @@ size_t rand_pool_acquire_entropy(RAND_PO return entropy_available; } # endif @@ -164,9 +164,9 @@ diff -up openssl-1.1.1e/crypto/rand/rand_unix.c.fips-post-rand openssl-1.1.1e/cr return rand_pool_entropy_available(pool); # endif } -diff -up openssl-1.1.1e/include/crypto/fips.h.fips-post-rand openssl-1.1.1e/include/crypto/fips.h ---- openssl-1.1.1e/include/crypto/fips.h.fips-post-rand 2020-03-17 18:06:16.831418696 +0100 -+++ openssl-1.1.1e/include/crypto/fips.h 2020-03-17 18:06:16.861418172 +0100 +diff -up openssl-1.1.1i/include/crypto/fips.h.fips-post-rand openssl-1.1.1i/include/crypto/fips.h +--- openssl-1.1.1i/include/crypto/fips.h.fips-post-rand 2020-12-09 10:26:41.639106369 +0100 ++++ openssl-1.1.1i/include/crypto/fips.h 2020-12-09 10:26:41.657106516 +0100 @@ -77,6 +77,8 @@ int FIPS_selftest_hmac(void); int FIPS_selftest_drbg(void); int FIPS_selftest_cmac(void); @@ -176,9 +176,9 @@ diff -up openssl-1.1.1e/include/crypto/fips.h.fips-post-rand openssl-1.1.1e/incl int fips_pkey_signature_test(EVP_PKEY *pkey, const unsigned char *tbs, int tbslen, const unsigned char *kat, -diff -up openssl-1.1.1e/include/crypto/rand.h.fips-post-rand openssl-1.1.1e/include/crypto/rand.h ---- openssl-1.1.1e/include/crypto/rand.h.fips-post-rand 2020-03-17 15:31:17.000000000 +0100 -+++ openssl-1.1.1e/include/crypto/rand.h 2020-03-17 18:07:35.303045555 +0100 +diff -up openssl-1.1.1i/include/crypto/rand.h.fips-post-rand openssl-1.1.1i/include/crypto/rand.h +--- openssl-1.1.1i/include/crypto/rand.h.fips-post-rand 2020-12-08 14:20:59.000000000 +0100 ++++ openssl-1.1.1i/include/crypto/rand.h 2020-12-09 10:26:41.657106516 +0100 @@ -24,6 +24,7 @@ typedef struct rand_pool_st RAND_POOL; diff --git a/openssl-1.1.1-version-override.patch b/openssl-1.1.1-version-override.patch index ff69bdb..727cc26 100644 --- a/openssl-1.1.1-version-override.patch +++ b/openssl-1.1.1-version-override.patch @@ -1,12 +1,12 @@ -diff -up openssl-1.1.1g/include/openssl/opensslv.h.version-override openssl-1.1.1g/include/openssl/opensslv.h ---- 
openssl-1.1.1g/include/openssl/opensslv.h.version-override 2020-04-23 13:29:37.802673513 +0200 -+++ openssl-1.1.1g/include/openssl/opensslv.h 2020-04-23 13:30:13.064008458 +0200 +diff -up openssl-1.1.1i/include/openssl/opensslv.h.version-override openssl-1.1.1i/include/openssl/opensslv.h +--- openssl-1.1.1i/include/openssl/opensslv.h.version-override 2020-12-09 10:25:12.042374409 +0100 ++++ openssl-1.1.1i/include/openssl/opensslv.h 2020-12-09 10:26:00.362769170 +0100 @@ -40,7 +40,7 @@ extern "C" { * major minor fix final patch/beta) */ - # define OPENSSL_VERSION_NUMBER 0x1010108fL --# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1h 22 Sep 2020" -+# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1h FIPS 22 Sep 2020" + # define OPENSSL_VERSION_NUMBER 0x1010109fL +-# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1i 8 Dec 2020" ++# define OPENSSL_VERSION_TEXT "OpenSSL 1.1.1i FIPS 8 Dec 2020" /*- * The macros below are to be used for shared library (.so, .dll, ...) diff --git a/openssl.spec b/openssl.spec index 3f6403c..2e26e49 100644 --- a/openssl.spec +++ b/openssl.spec @@ -21,7 +21,7 @@ Summary: Utilities from the general purpose cryptography library with TLS implementation Name: openssl -Version: 1.1.1h +Version: 1.1.1i Release: 1%{?dist} Epoch: 1 # We have to remove certain patented algorithms from the openssl source @@ -473,6 +473,9 @@ export LD_LIBRARY_PATH %ldconfig_scriptlets libs %changelog +* Wed Dec 9 2020 Tomáš Mráz 1.1.1i-1 +- Update to the 1.1.1i release fixing CVE-2020-1971 + * Mon Nov 9 2020 Sahana Prasad - 1.1.1h-1 - Upgrade to version 1.1.1.h diff --git a/sources b/sources index 2bae151..4c1e648 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openssl-1.1.1h-hobbled.tar.xz) = 75e1d3f34f93462b97db92aa6538fd4f2f091ad717438e51d147508738be720d7d0bf4a9b1fda3a1943a4c13aae2a39da3add05f7da833b3c6de40a97bc97908 +SHA512 (openssl-1.1.1i-hobbled.tar.xz) = e131a05e88690a7be7c3d74cbb26620130498ced2ce3d7fd55979aab5ea736ec8b268ba92268bd5bc347989325a3950a066883007cb20c2dd9739fd1eafc513f
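In the fips-post-rand hunks above, syscall_random() gains a nonblock parameter and the weak-symbol getentropy() shortcut is compiled out, giving rand_pool_acquire_entropy() explicit control over whether seeding may block until the kernel CRNG is fully initialized. A rough C sketch of the resulting call shape on Linux, assuming getrandom(2) is available; the wrapper name is illustrative, not the actual rand_unix.c code:

#include <errno.h>
#include <sys/random.h>
#include <sys/types.h>

/*
 * Illustrative wrapper in the spirit of the patched syscall_random():
 * with nonblock set, getrandom(2) fails with EAGAIN instead of
 * blocking while the kernel CRNG is still uninitialized, so the
 * caller can retry later or fall back to a /dev/random descriptor.
 */
static ssize_t get_kernel_random(void *buf, size_t buflen, int nonblock)
{
    ssize_t n = getrandom(buf, buflen, nonblock ? GRND_NONBLOCK : 0);
    if (n < 0 && errno == ENOSYS)
        return -1;  /* no getrandom(2): caller must use another source */
    return n;
}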