openssl/openssl-1.1.1-s390x-update.patch

5503 lines
111 KiB
Diff
Raw Normal View History

diff -up openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl.s390x-update openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl
--- openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl.s390x-update 2020-03-17 15:31:17.000000000 +0100
+++ openssl-1.1.1e/crypto/chacha/asm/chacha-s390x.pl 2020-03-19 16:45:05.483440129 +0100
@@ -20,41 +20,53 @@
#
# 3 times faster than compiler-generated code.
-$flavour = shift;
+#
+# August 2018
+#
+# Add vx code path: 4x"vertical".
+#
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+#
+# February 2019
+#
+# Add 6x"horizontal" VX implementation. It's ~25% faster than IBM's
+# 4x"vertical" submission [on z13] and >3 times faster than scalar code.
+# But to keep overheads in check, revert to a transliteration of the VSX
+# code path from the chacha-ppc module, which is also 4x"vertical", to
+# handle inputs not longer than 256 bytes.
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+my $flavour = shift;
+
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-
-sub AUTOLOAD() # thunk [simplified] x86-style perlasm
-{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
- $code .= "\t$opcode\t".join(',',@_)."\n";
-}
my $sp="%r15";
-
my $stdframe=16*$SIZE_T+4*8;
-my $frame=$stdframe+4*20;
-
-my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+sub ROUND {
my @x=map("%r$_",(0..7,"x","x","x","x",(10..13)));
my @t=map("%r$_",(8,9));
-
-sub ROUND {
my ($a0,$b0,$c0,$d0)=@_;
my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));
my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
-my ($xc,$xc_)=map("\"$_\"",@t);
-my @x=map("\"$_\"",@x);
+my ($xc,$xc_)=map("$_",@t);
# Consider order in which variables are addressed by their
# index:
@@ -78,249 +90,967 @@ my @x=map("\"$_\"",@x);
# 'c' stores and loads in the middle, but none in the beginning
# or end.
- (
- "&alr (@x[$a0],@x[$b0])", # Q1
- "&alr (@x[$a1],@x[$b1])", # Q2
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],16)",
- "&rll (@x[$d1],@x[$d1],16)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],12)",
- "&rll (@x[$b1],@x[$b1],12)",
-
- "&alr (@x[$a0],@x[$b0])",
- "&alr (@x[$a1],@x[$b1])",
- "&xr (@x[$d0],@x[$a0])",
- "&xr (@x[$d1],@x[$a1])",
- "&rll (@x[$d0],@x[$d0],8)",
- "&rll (@x[$d1],@x[$d1],8)",
-
- "&alr ($xc,@x[$d0])",
- "&alr ($xc_,@x[$d1])",
- "&xr (@x[$b0],$xc)",
- "&xr (@x[$b1],$xc_)",
- "&rll (@x[$b0],@x[$b0],7)",
- "&rll (@x[$b1],@x[$b1],7)",
-
- "&stm ($xc,$xc_,'$stdframe+4*8+4*$c0($sp)')", # reload pair of 'c's
- "&lm ($xc,$xc_,'$stdframe+4*8+4*$c2($sp)')",
-
- "&alr (@x[$a2],@x[$b2])", # Q3
- "&alr (@x[$a3],@x[$b3])", # Q4
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],16)",
- "&rll (@x[$d3],@x[$d3],16)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],12)",
- "&rll (@x[$b3],@x[$b3],12)",
-
- "&alr (@x[$a2],@x[$b2])",
- "&alr (@x[$a3],@x[$b3])",
- "&xr (@x[$d2],@x[$a2])",
- "&xr (@x[$d3],@x[$a3])",
- "&rll (@x[$d2],@x[$d2],8)",
- "&rll (@x[$d3],@x[$d3],8)",
-
- "&alr ($xc,@x[$d2])",
- "&alr ($xc_,@x[$d3])",
- "&xr (@x[$b2],$xc)",
- "&xr (@x[$b3],$xc_)",
- "&rll (@x[$b2],@x[$b2],7)",
- "&rll (@x[$b3],@x[$b3],7)"
- );
-}
-
-$code.=<<___;
-.text
-
-.globl ChaCha20_ctr32
-.type ChaCha20_ctr32,\@function
-.align 32
-ChaCha20_ctr32:
- lt${g}r $len,$len # $len==0?
- bzr %r14
- a${g}hi $len,-64
- l${g}hi %r1,-$frame
- stm${g} %r6,%r15,`6*$SIZE_T`($sp)
- sl${g}r $out,$inp # difference
- la $len,0($inp,$len) # end of input minus 64
- larl %r7,.Lsigma
- lgr %r0,$sp
- la $sp,0(%r1,$sp)
- st${g} %r0,0($sp)
-
- lmg %r8,%r11,0($key) # load key
- lmg %r12,%r13,0($counter) # load counter
- lmg %r6,%r7,0(%r7) # load sigma constant
-
- la %r14,0($inp)
- st${g} $out,$frame+3*$SIZE_T($sp)
- st${g} $len,$frame+4*$SIZE_T($sp)
- stmg %r6,%r13,$stdframe($sp) # copy key schedule to stack
- srlg @x[12],%r12,32 # 32-bit counter value
- j .Loop_outer
-
-.align 16
-.Loop_outer:
- lm @x[0],@x[7],$stdframe+4*0($sp) # load x[0]-x[7]
- lm @t[0],@t[1],$stdframe+4*10($sp) # load x[10]-x[11]
- lm @x[13],@x[15],$stdframe+4*13($sp) # load x[13]-x[15]
- stm @t[0],@t[1],$stdframe+4*8+4*10($sp) # offload x[10]-x[11]
- lm @t[0],@t[1],$stdframe+4*8($sp) # load x[8]-x[9]
- st @x[12],$stdframe+4*12($sp) # save counter
- st${g} %r14,$frame+2*$SIZE_T($sp) # save input pointer
- lhi %r14,10
- j .Loop
-
-.align 4
-.Loop:
-___
- foreach (&ROUND(0, 4, 8,12)) { eval; }
- foreach (&ROUND(0, 5,10,15)) { eval; }
-$code.=<<___;
- brct %r14,.Loop
-
- l${g} %r14,$frame+2*$SIZE_T($sp) # pull input pointer
- stm @t[0],@t[1],$stdframe+4*8+4*8($sp) # offload x[8]-x[9]
- lm${g} @t[0],@t[1],$frame+3*$SIZE_T($sp)
-
- al @x[0],$stdframe+4*0($sp) # accumulate key schedule
- al @x[1],$stdframe+4*1($sp)
- al @x[2],$stdframe+4*2($sp)
- al @x[3],$stdframe+4*3($sp)
- al @x[4],$stdframe+4*4($sp)
- al @x[5],$stdframe+4*5($sp)
- al @x[6],$stdframe+4*6($sp)
- al @x[7],$stdframe+4*7($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lrvr @x[4],@x[4]
- lrvr @x[5],@x[5]
- lrvr @x[6],@x[6]
- lrvr @x[7],@x[7]
- al @x[12],$stdframe+4*12($sp)
- al @x[13],$stdframe+4*13($sp)
- al @x[14],$stdframe+4*14($sp)
- al @x[15],$stdframe+4*15($sp)
- lrvr @x[12],@x[12]
- lrvr @x[13],@x[13]
- lrvr @x[14],@x[14]
- lrvr @x[15],@x[15]
-
- la @t[0],0(@t[0],%r14) # reconstruct output pointer
- cl${g}r %r14,@t[1]
- jh .Ltail
-
- x @x[0],4*0(%r14) # xor with input
- x @x[1],4*1(%r14)
- st @x[0],4*0(@t[0]) # store output
- x @x[2],4*2(%r14)
- st @x[1],4*1(@t[0])
- x @x[3],4*3(%r14)
- st @x[2],4*2(@t[0])
- x @x[4],4*4(%r14)
- st @x[3],4*3(@t[0])
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp) # load x[8]-x[11]
- x @x[5],4*5(%r14)
- st @x[4],4*4(@t[0])
- x @x[6],4*6(%r14)
- al @x[0],$stdframe+4*8($sp)
- st @x[5],4*5(@t[0])
- x @x[7],4*7(%r14)
- al @x[1],$stdframe+4*9($sp)
- st @x[6],4*6(@t[0])
- x @x[12],4*12(%r14)
- al @x[2],$stdframe+4*10($sp)
- st @x[7],4*7(@t[0])
- x @x[13],4*13(%r14)
- al @x[3],$stdframe+4*11($sp)
- st @x[12],4*12(@t[0])
- x @x[14],4*14(%r14)
- st @x[13],4*13(@t[0])
- x @x[15],4*15(%r14)
- st @x[14],4*14(@t[0])
- lrvr @x[0],@x[0]
- st @x[15],4*15(@t[0])
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- lhi @x[12],1
- x @x[0],4*8(%r14)
- al @x[12],$stdframe+4*12($sp) # increment counter
- x @x[1],4*9(%r14)
- st @x[0],4*8(@t[0])
- x @x[2],4*10(%r14)
- st @x[1],4*9(@t[0])
- x @x[3],4*11(%r14)
- st @x[2],4*10(@t[0])
- st @x[3],4*11(@t[0])
-
- cl${g}r %r14,@t[1] # done yet?
- la %r14,64(%r14)
- jl .Loop_outer
-
-.Ldone:
- xgr %r0,%r0
- xgr %r1,%r1
- xgr %r2,%r2
- xgr %r3,%r3
- stmg %r0,%r3,$stdframe+4*4($sp) # wipe key copy
- stmg %r0,%r3,$stdframe+4*12($sp)
-
- lm${g} %r6,%r15,`$frame+6*$SIZE_T`($sp)
- br %r14
-
-.align 16
-.Ltail:
- la @t[1],64($t[1])
- stm @x[0],@x[7],$stdframe+4*0($sp)
- sl${g}r @t[1],%r14
- lm @x[0],@x[3],$stdframe+4*8+4*8($sp)
- l${g}hi @x[6],0
- stm @x[12],@x[15],$stdframe+4*12($sp)
- al @x[0],$stdframe+4*8($sp)
- al @x[1],$stdframe+4*9($sp)
- al @x[2],$stdframe+4*10($sp)
- al @x[3],$stdframe+4*11($sp)
- lrvr @x[0],@x[0]
- lrvr @x[1],@x[1]
- lrvr @x[2],@x[2]
- lrvr @x[3],@x[3]
- stm @x[0],@x[3],$stdframe+4*8($sp)
-
-.Loop_tail:
- llgc @x[4],0(@x[6],%r14)
- llgc @x[5],$stdframe(@x[6],$sp)
- xr @x[5],@x[4]
- stc @x[5],0(@x[6],@t[0])
- la @x[6],1(@x[6])
- brct @t[1],.Loop_tail
-
- j .Ldone
-.size ChaCha20_ctr32,.-ChaCha20_ctr32
-
-.align 32
-.Lsigma:
-.long 0x61707865,0x3320646e,0x79622d32,0x6b206574 # endian-neutral
-.asciz "ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-.align 4
-___
+ alr (@x[$a0],@x[$b0]); # Q1: a += b
+ alr (@x[$a1],@x[$b1]); # Q2: a += b
+ xr (@x[$d0],@x[$a0]); # d ^= a
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],16); # d <<<= 16
+ rll (@x[$d1],@x[$d1],16);
+
+ alr ($xc,@x[$d0]); # c += d; the 'c' words of Q1/Q2 are held in the two temporaries
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc); # b ^= c
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],12); # b <<<= 12
+ rll (@x[$b1],@x[$b1],12);
+
+ alr (@x[$a0],@x[$b0]); # a += b
+ alr (@x[$a1],@x[$b1]);
+ xr (@x[$d0],@x[$a0]); # d ^= a
+ xr (@x[$d1],@x[$a1]);
+ rll (@x[$d0],@x[$d0],8); # d <<<= 8
+ rll (@x[$d1],@x[$d1],8);
+
+ alr ($xc,@x[$d0]); # c += d
+ alr ($xc_,@x[$d1]);
+ xr (@x[$b0],$xc); # b ^= c
+ xr (@x[$b1],$xc_);
+ rll (@x[$b0],@x[$b0],7); # b <<<= 7
+ rll (@x[$b1],@x[$b1],7);
+
+ stm ($xc,$xc_,"$stdframe+4*8+4*$c0($sp)"); # reload pair of 'c's: store the Q1/Q2 pair ...
+ lm ($xc,$xc_,"$stdframe+4*8+4*$c2($sp)"); # ... and fetch the Q3/Q4 pair from the stack
+
+ alr (@x[$a2],@x[$b2]); # Q3: a += b
+ alr (@x[$a3],@x[$b3]); # Q4: a += b
+ xr (@x[$d2],@x[$a2]); # d ^= a
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],16); # d <<<= 16
+ rll (@x[$d3],@x[$d3],16);
+
+ alr ($xc,@x[$d2]); # c += d
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc); # b ^= c
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],12); # b <<<= 12
+ rll (@x[$b3],@x[$b3],12);
+
+ alr (@x[$a2],@x[$b2]); # a += b
+ alr (@x[$a3],@x[$b3]);
+ xr (@x[$d2],@x[$a2]); # d ^= a
+ xr (@x[$d3],@x[$a3]);
+ rll (@x[$d2],@x[$d2],8); # d <<<= 8
+ rll (@x[$d3],@x[$d3],8);
+
+ alr ($xc,@x[$d2]); # c += d
+ alr ($xc_,@x[$d3]);
+ xr (@x[$b2],$xc); # b ^= c
+ xr (@x[$b3],$xc_);
+ rll (@x[$b2],@x[$b2],7); # b <<<= 7
+ rll (@x[$b3],@x[$b3],7);
+}
+
+sub VX_lane_ROUND { # emits one round (four quarter-rounds) on vector registers; used by the 4x code path
+my ($a0,$b0,$c0,$d0)=@_; # state-word indices of the first quarter-round
+my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); # next index, wrapping within each group of four
+my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
+my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
+my @x=map("%v$_",(0..15)); # state word i is kept in vector register %v<i>
-foreach (split("\n",$code)) {
- s/\`([^\`]*)\`/eval $1/ge;
+ vaf (@x[$a0],@x[$a0],@x[$b0]); # Q1: a += b
+ vx (@x[$d0],@x[$d0],@x[$a0]); # d ^= a
+ verllf (@x[$d0],@x[$d0],16); # d <<<= 16
+ vaf (@x[$a1],@x[$a1],@x[$b1]); # Q2
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],16);
+ vaf (@x[$a2],@x[$a2],@x[$b2]); # Q3
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],16);
+ vaf (@x[$a3],@x[$a3],@x[$b3]); # Q4
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],16);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]); # c += d
+ vx (@x[$b0],@x[$b0],@x[$c0]); # b ^= c
+ verllf (@x[$b0],@x[$b0],12); # b <<<= 12
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],12);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],12);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],12);
+
+ vaf (@x[$a0],@x[$a0],@x[$b0]); # a += b
+ vx (@x[$d0],@x[$d0],@x[$a0]); # d ^= a
+ verllf (@x[$d0],@x[$d0],8); # d <<<= 8
+ vaf (@x[$a1],@x[$a1],@x[$b1]);
+ vx (@x[$d1],@x[$d1],@x[$a1]);
+ verllf (@x[$d1],@x[$d1],8);
+ vaf (@x[$a2],@x[$a2],@x[$b2]);
+ vx (@x[$d2],@x[$d2],@x[$a2]);
+ verllf (@x[$d2],@x[$d2],8);
+ vaf (@x[$a3],@x[$a3],@x[$b3]);
+ vx (@x[$d3],@x[$d3],@x[$a3]);
+ verllf (@x[$d3],@x[$d3],8);
+
+ vaf (@x[$c0],@x[$c0],@x[$d0]); # c += d
+ vx (@x[$b0],@x[$b0],@x[$c0]); # b ^= c
+ verllf (@x[$b0],@x[$b0],7); # b <<<= 7
+ vaf (@x[$c1],@x[$c1],@x[$d1]);
+ vx (@x[$b1],@x[$b1],@x[$c1]);
+ verllf (@x[$b1],@x[$b1],7);
+ vaf (@x[$c2],@x[$c2],@x[$d2]);
+ vx (@x[$b2],@x[$b2],@x[$c2]);
+ verllf (@x[$b2],@x[$b2],7);
+ vaf (@x[$c3],@x[$c3],@x[$d3]);
+ vx (@x[$b3],@x[$b3],@x[$c3]);
+ verllf (@x[$b3],@x[$b3],7);
+}
- print $_,"\n";
+sub VX_ROUND { # emits one round for six states at once; takes 24 register names plus an odd/even flag
+my @a=@_[0..5]; # 'a' row register of each of the six states
+my @b=@_[6..11]; # 'b' row registers
+my @c=@_[12..17]; # 'c' row registers
+my @d=@_[18..23]; # 'd' row registers
+my $odd=@_[24]; # callers pass 0 for the first and 1 for the second round of a pair
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5); # a += b
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5); # d ^= a
+ verllf (@d[$_],@d[$_],16) for (0..5); # d <<<= 16
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5); # c += d
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5); # b ^= c
+ verllf (@b[$_],@b[$_],12) for (0..5); # b <<<= 12
+
+ vaf (@a[$_],@a[$_],@b[$_]) for (0..5); # a += b
+ vx (@d[$_],@d[$_],@a[$_]) for (0..5); # d ^= a
+ verllf (@d[$_],@d[$_],8) for (0..5); # d <<<= 8
+
+ vaf (@c[$_],@c[$_],@d[$_]) for (0..5); # c += d
+ vx (@b[$_],@b[$_],@c[$_]) for (0..5); # b ^= c
+ verllf (@b[$_],@b[$_],7) for (0..5); # b <<<= 7
+
+ vsldb (@c[$_],@c[$_],@c[$_],8) for (0..5); # rotate the 'c' row by 8 bytes
+ vsldb (@b[$_],@b[$_],@b[$_],$odd?12:4) for (0..5); # rotate 'b' by 4 bytes (even) or 12 bytes (odd)
+ vsldb (@d[$_],@d[$_],@d[$_],$odd?4:12) for (0..5); # rotate 'd' the opposite way, so an even+odd pair restores the layout
}
-close STDOUT or die "error closing STDOUT: $!";
+
+PERLASM_BEGIN($output);
+
+INCLUDE ("s390x_arch.h");
+TEXT ();
+
+################
+# void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, size_t len,
+# const unsigned int key[8], const unsigned int counter[4])
+my ($out,$inp,$len,$key,$counter)=map("%r$_",(2..6));
+{
+my $frame=$stdframe+4*20; # standard frame + 64-byte key-schedule copy + 16 bytes for offloaded 'c' words
+my @x=map("%r$_",(0..7,"x","x","x","x",(10..13))); # x[8]-x[11] get the placeholder "%rx": they live on the stack or in @t
+my @t=map("%r$_",(8,9)); # temporaries holding the active pair of 'c' words
+
+GLOBL ("ChaCha20_ctr32");
+TYPE ("ChaCha20_ctr32","\@function");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32");
+ larl ("%r1","OPENSSL_s390xcap_P"); # capability vector, consulted for the vx bit below
+
+ lghi ("%r0",64); # inputs of at most 64 bytes stay on the scalar path
+&{$z? \&ltgr:\&ltr} ($len,$len); # len==0?
+ bzr ("%r14"); # nothing to do
+ lg ("%r1","S390X_STFLE+16(%r1)"); # doubleword of the capability vector that is tested below
+&{$z? \&clgr:\&clr} ($len,"%r0");
+ jle (".Lshort");
+
+ tmhh ("%r1",0x4000); # check for vx bit
+ jnz (".LChaCha20_ctr32_vx");
+
+LABEL (".Lshort");
+&{$z? \&aghi:\&ahi} ($len,-64);
+&{$z? \&lghi:\&lhi} ("%r1",-$frame);
+&{$z? \&stmg:\&stm} ("%r6","%r15","6*$SIZE_T($sp)"); # save %r6-%r15 in the caller-provided frame
+&{$z? \&slgr:\&slr} ($out,$inp); # difference
+ la ($len,"0($inp,$len)"); # end of input minus 64
+ larl ("%r7",".Lsigma");
+ lgr ("%r0",$sp); # remember the incoming stack pointer
+ la ($sp,"0(%r1,$sp)"); # allocate $frame bytes
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # store the old stack pointer at the bottom of the new frame
+
+ lmg ("%r8","%r11","0($key)"); # load key
+ lmg ("%r12","%r13","0($counter)"); # load counter
+ lmg ("%r6","%r7","0(%r7)"); # load sigma constant
+
+ la ("%r14","0($inp)"); # %r14 is the running input pointer from here on
+&{$z? \&stg:\&st} ($out,"$frame+3*$SIZE_T($sp)"); # out-inp difference
+&{$z? \&stg:\&st} ($len,"$frame+4*$SIZE_T($sp)"); # end of input minus 64
+ stmg ("%r6","%r13","$stdframe($sp)");# copy key schedule to stack
+ srlg (@x[12],"%r12",32); # 32-bit counter value
+ j (".Loop_outer");
+
+ALIGN (16);
+LABEL (".Loop_outer");
+ lm (@x[0],@x[7],"$stdframe+4*0($sp)"); # load x[0]-x[7]
+ lm (@t[0],@t[1],"$stdframe+4*10($sp)"); # load x[10]-x[11]
+ lm (@x[13],@x[15],"$stdframe+4*13($sp)"); # load x[13]-x[15]
+ stm (@t[0],@t[1],"$stdframe+4*8+4*10($sp)");# offload x[10]-x[11]
+ lm (@t[0],@t[1],"$stdframe+4*8($sp)"); # load x[8]-x[9]
+ st (@x[12],"$stdframe+4*12($sp)"); # save counter
+&{$z? \&stg:\&st} ("%r14","$frame+2*$SIZE_T($sp)");# save input pointer
+ lhi ("%r14",10); # 10 iterations of the two rounds below
+ j (".Loop");
+
+ALIGN (4);
+LABEL (".Loop");
+ ROUND (0, 4, 8,12);
+ ROUND (0, 5,10,15);
+ brct ("%r14",".Loop");
+
+&{$z? \&lg:\&l} ("%r14","$frame+2*$SIZE_T($sp)");# pull input pointer
+ stm (@t[0],@t[1],"$stdframe+4*8+4*8($sp)"); # offload x[8]-x[9]
+&{$z? \&lmg:\&lm} (@t[0],@t[1],"$frame+3*$SIZE_T($sp)"); # t[0]=out-inp difference, t[1]=end of input minus 64
+
+ al (@x[0],"$stdframe+4*0($sp)"); # accumulate key schedule
+ al (@x[1],"$stdframe+4*1($sp)");
+ al (@x[2],"$stdframe+4*2($sp)");
+ al (@x[3],"$stdframe+4*3($sp)");
+ al (@x[4],"$stdframe+4*4($sp)");
+ al (@x[5],"$stdframe+4*5($sp)");
+ al (@x[6],"$stdframe+4*6($sp)");
+ al (@x[7],"$stdframe+4*7($sp)");
+ lrvr (@x[0],@x[0]); # byte-reverse each key-stream word
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lrvr (@x[4],@x[4]);
+ lrvr (@x[5],@x[5]);
+ lrvr (@x[6],@x[6]);
+ lrvr (@x[7],@x[7]);
+ al (@x[12],"$stdframe+4*12($sp)");
+ al (@x[13],"$stdframe+4*13($sp)");
+ al (@x[14],"$stdframe+4*14($sp)");
+ al (@x[15],"$stdframe+4*15($sp)");
+ lrvr (@x[12],@x[12]);
+ lrvr (@x[13],@x[13]);
+ lrvr (@x[14],@x[14]);
+ lrvr (@x[15],@x[15]);
+
+ la (@t[0],"0(@t[0],%r14)"); # reconstruct output pointer
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # input pointer past "end minus 64": fewer than 64 bytes left
+ jh (".Ltail");
+
+ x (@x[0],"4*0(%r14)"); # xor with input
+ x (@x[1],"4*1(%r14)");
+ st (@x[0],"4*0(@t[0])"); # store output
+ x (@x[2],"4*2(%r14)");
+ st (@x[1],"4*1(@t[0])");
+ x (@x[3],"4*3(%r14)");
+ st (@x[2],"4*2(@t[0])");
+ x (@x[4],"4*4(%r14)");
+ st (@x[3],"4*3(@t[0])");
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)"); # load x[8]-x[11]
+ x (@x[5],"4*5(%r14)");
+ st (@x[4],"4*4(@t[0])");
+ x (@x[6],"4*6(%r14)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ st (@x[5],"4*5(@t[0])");
+ x (@x[7],"4*7(%r14)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ st (@x[6],"4*6(@t[0])");
+ x (@x[12],"4*12(%r14)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ st (@x[7],"4*7(@t[0])");
+ x (@x[13],"4*13(%r14)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ st (@x[12],"4*12(@t[0])");
+ x (@x[14],"4*14(%r14)");
+ st (@x[13],"4*13(@t[0])");
+ x (@x[15],"4*15(%r14)");
+ st (@x[14],"4*14(@t[0])");
+ lrvr (@x[0],@x[0]);
+ st (@x[15],"4*15(@t[0])");
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ lhi (@x[12],1);
+ x (@x[0],"4*8(%r14)");
+ al (@x[12],"$stdframe+4*12($sp)"); # increment counter
+ x (@x[1],"4*9(%r14)");
+ st (@x[0],"4*8(@t[0])");
+ x (@x[2],"4*10(%r14)");
+ st (@x[1],"4*9(@t[0])");
+ x (@x[3],"4*11(%r14)");
+ st (@x[2],"4*10(@t[0])");
+ st (@x[3],"4*11(@t[0])");
+
+&{$z? \&clgr:\&clr} ("%r14",@t[1]); # done yet?
+ la ("%r14","64(%r14)"); # advance input pointer by one block
+ jl (".Loop_outer");
+
+LABEL (".Ldone");
+ xgr ("%r0","%r0");
+ xgr ("%r1","%r1");
+ xgr ("%r2","%r2");
+ xgr ("%r3","%r3");
+ stmg ("%r0","%r3","$stdframe+4*4($sp)"); # wipe key copy
+ stmg ("%r0","%r3","$stdframe+4*12($sp)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r15","$frame+6*$SIZE_T($sp)"); # restore %r6-%r15; reloading %r15 also drops the frame
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail");
+ la (@t[1],"64($t[1])"); # end of input
+ stm (@x[0],@x[7],"$stdframe+4*0($sp)"); # park key-stream words 0-7 on the stack
+&{$z? \&slgr:\&slr} (@t[1],"%r14"); # number of bytes still to process
+ lm (@x[0],@x[3],"$stdframe+4*8+4*8($sp)");
+&{$z? \&lghi:\&lhi} (@x[6],0); # byte index
+ stm (@x[12],@x[15],"$stdframe+4*12($sp)");
+ al (@x[0],"$stdframe+4*8($sp)");
+ al (@x[1],"$stdframe+4*9($sp)");
+ al (@x[2],"$stdframe+4*10($sp)");
+ al (@x[3],"$stdframe+4*11($sp)");
+ lrvr (@x[0],@x[0]);
+ lrvr (@x[1],@x[1]);
+ lrvr (@x[2],@x[2]);
+ lrvr (@x[3],@x[3]);
+ stm (@x[0],@x[3],"$stdframe+4*8($sp)");
+
+LABEL (".Loop_tail");
+ llgc (@x[4],"0(@x[6],%r14)"); # input byte
+ llgc (@x[5],"$stdframe(@x[6],$sp)"); # key-stream byte from the stack
+ xr (@x[5],@x[4]);
+ stc (@x[5],"0(@x[6],@t[0])"); # output byte
+ la (@x[6],"1(@x[6])");
+ brct (@t[1],".Loop_tail");
+
+ j (".Ldone");
+SIZE ("ChaCha20_ctr32",".-ChaCha20_ctr32");
+}
+
+########################################################################
+# 4x"vertical" layout minimizes amount of instructions, but pipeline
+# runs underutilized [because of vector instructions' high latency].
+# On the other hand minimum amount of data it takes to fully utilize
+# the pipeline is higher, so that effectively, short inputs would be
+# processed slower. Hence this code path targeting <=256 bytes lengths.
+#
+{
+my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
+ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%v$_",(0..15)); # one vector register per state word
+my @K=map("%v$_",(16..19)); # sigma, key (two registers), counter block
+my $CTR="%v26"; # per-lane block counters
+my ($xt0,$xt1,$xt2,$xt3)=map("%v$_",(27..30)); # scratch
+my $beperm="%v31"; # byte-swap permutation
+my ($x00,$x10,$x20,$x30)=(0,map("r$_",(8..10))); # NOTE(review): not referenced anywhere in this block
+my $FRAME=$stdframe+4*16; # standard frame plus 64 bytes (FPR save slots / tail buffer)
+
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_4x");
+LABEL (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); # save %r6-%r7 in the caller-provided frame
+if (!$z) { # S/390 ABI: %f4 and %f6 are saved before the frame is allocated
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate $FRAME bytes
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) { # zSeries ABI: %f8-%f15 are saved in the new frame
+ std ("%f8","$stdframe+8*0($sp)");
+ std ("%f9","$stdframe+8*1($sp)");
+ std ("%f10","$stdframe+8*2($sp)");
+ std ("%f11","$stdframe+8*3($sp)");
+ std ("%f12","$stdframe+8*4($sp)");
+ std ("%f13","$stdframe+8*5($sp)");
+ std ("%f14","$stdframe+8*6($sp)");
+ std ("%f15","$stdframe+8*7($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10); # 10 iterations of the two rounds in .Loop_4x
+ lhi ("%r1",0); # zero, used to clear the counter word below
+
+ vl (@K[0],"0(%r7)"); # load sigma
+ vl (@K[1],"0($key)"); # load key
+ vl (@K[2],"16($key)");
+ vl (@K[3],"0($counter)"); # load counter
+
+ vl ($beperm,"0x40(%r7)"); # byte-swap permutation from the constant table
+ vl ($xt1,"0x50(%r7)"); # lane offsets 0,1,2,3 from the constant table
+ vrepf ($CTR,@K[3],0); # counter word replicated into all four lanes
+ vlvgf (@K[3],"%r1",0); # clear @K[3].word[0]
+ vaf ($CTR,$CTR,$xt1); # per-lane counters: counter+0 .. counter+3
+
+#LABEL (".Loop_outer_4x");
+ vlm ($xa0,$xa3,"0x60(%r7)"); # load [smashed] sigma
+
+ vrepf ($xb0,@K[1],0); # smash the key
+ vrepf ($xb1,@K[1],1);
+ vrepf ($xb2,@K[1],2);
+ vrepf ($xb3,@K[1],3);
+
+ vrepf ($xc0,@K[2],0);
+ vrepf ($xc1,@K[2],1);
+ vrepf ($xc2,@K[2],2);
+ vrepf ($xc3,@K[2],3);
+
+ vlr ($xd0,$CTR); # word 12 starts as the per-lane counters
+ vrepf ($xd1,@K[3],1);
+ vrepf ($xd2,@K[3],2);
+ vrepf ($xd3,@K[3],3);
+
+LABEL (".Loop_4x");
+ VX_lane_ROUND(0, 4, 8,12);
+ VX_lane_ROUND(0, 5,10,15);
+ brct ("%r0",".Loop_4x");
+
+ vaf ($xd0,$xd0,$CTR); # add the per-lane counters here, since @K[3].word[0] was cleared
+
+ vmrhf ($xt0,$xa0,$xa1); # transpose data
+ vmrhf ($xt1,$xa2,$xa3);
+ vmrlf ($xt2,$xa0,$xa1);
+ vmrlf ($xt3,$xa2,$xa3);
+ vpdi ($xa0,$xt0,$xt1,0b0000);
+ vpdi ($xa1,$xt0,$xt1,0b0101);
+ vpdi ($xa2,$xt2,$xt3,0b0000);
+ vpdi ($xa3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xb0,$xb1);
+ vmrhf ($xt1,$xb2,$xb3);
+ vmrlf ($xt2,$xb0,$xb1);
+ vmrlf ($xt3,$xb2,$xb3);
+ vpdi ($xb0,$xt0,$xt1,0b0000);
+ vpdi ($xb1,$xt0,$xt1,0b0101);
+ vpdi ($xb2,$xt2,$xt3,0b0000);
+ vpdi ($xb3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xc0,$xc1);
+ vmrhf ($xt1,$xc2,$xc3);
+ vmrlf ($xt2,$xc0,$xc1);
+ vmrlf ($xt3,$xc2,$xc3);
+ vpdi ($xc0,$xt0,$xt1,0b0000);
+ vpdi ($xc1,$xt0,$xt1,0b0101);
+ vpdi ($xc2,$xt2,$xt3,0b0000);
+ vpdi ($xc3,$xt2,$xt3,0b0101);
+
+ vmrhf ($xt0,$xd0,$xd1);
+ vmrhf ($xt1,$xd2,$xd3);
+ vmrlf ($xt2,$xd0,$xd1);
+ vmrlf ($xt3,$xd2,$xd3);
+ vpdi ($xd0,$xt0,$xt1,0b0000);
+ vpdi ($xd1,$xt0,$xt1,0b0101);
+ vpdi ($xd2,$xt2,$xt3,0b0000);
+ vpdi ($xd3,$xt2,$xt3,0b0101);
+
+ #vrepif ($xt0,4);
+ #vaf ($CTR,$CTR,$xt0); # next counter value
+
+ vaf ($xa0,$xa0,@K[0]); # first block: accumulate sigma, key and counter block
+ vaf ($xb0,$xb0,@K[1]);
+ vaf ($xc0,$xc0,@K[2]);
+ vaf ($xd0,$xd0,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm); # byte-swap each 32-bit word
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+ #&{$z? \&clgfi:\&clfi} ($len,0x40);
+ #jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)"); # 64 bytes of input
+
+ vx ($xt0,$xt0,$xa0); # xor with key stream
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)"); # 64 bytes of output
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ #je (".Ldone_4x");
+
+ vaf ($xa0,$xa1,@K[0]); # second block
+ vaf ($xb0,$xb1,@K[1]);
+ vaf ($xc0,$xc1,@K[2]);
+ vaf ($xd0,$xd1,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40); # fewer than 64 bytes left: finish byte by byte
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa2,@K[0]); # third block
+ vaf ($xb0,$xb2,@K[1]);
+ vaf ($xc0,$xc2,@K[2]);
+ vaf ($xd0,$xd2,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_4x");
+
+ vaf ($xa0,$xa3,@K[0]); # fourth block
+ vaf ($xb0,$xb3,@K[1]);
+ vaf ($xc0,$xc3,@K[2]);
+ vaf ($xd0,$xd3,@K[3]);
+
+ vperm ($xa0,$xa0,$xa0,$beperm);
+ vperm ($xb0,$xb0,$xb0,$beperm);
+ vperm ($xc0,$xc0,$xc0,$beperm);
+ vperm ($xd0,$xd0,$xd0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_4x");
+
+ vlm ($xt0,$xt3,"0($inp)");
+
+ vx ($xt0,$xt0,$xa0);
+ vx ($xt1,$xt1,$xb0);
+ vx ($xt2,$xt2,$xc0);
+ vx ($xt3,$xt3,$xd0);
+
+ vstm ($xt0,$xt3,"0($out)");
+
+ #la $inp,0x40($inp));
+ #la $out,0x40($out));
+ #lhi %r0,10);
+ #&{$z? \&aghi:\&ahi} $len,-0x40);
+ #jne .Loop_outer_4x);
+
+LABEL (".Ldone_4x");
+if (!$z) { # restore the floating-point registers saved on entry
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); # restore %r6-%r7
+ la ($sp,"$FRAME($sp)"); # release the frame
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_4x");
+if (!$z) {
+ vlr ($xt0,$xb0); # copy first - presumably because reloading %f4 clobbers %v4 ($xb0); TODO confirm
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)"); # spill the pending 64 bytes of key stream
+ vst ($xt0,"$stdframe+0x10($sp)");
+ vst ($xc0,"$stdframe+0x20($sp)");
+ vst ($xd0,"$stdframe+0x30($sp)");
+} else {
+ vlr ($xt0,$xc0); # copy first - presumably because reloading %f8 clobbers %v8 ($xc0); TODO confirm
+ ld ("%f8","$stdframe+8*0($sp)");
+ ld ("%f9","$stdframe+8*1($sp)");
+ ld ("%f10","$stdframe+8*2($sp)");
+ ld ("%f11","$stdframe+8*3($sp)");
+ vlr ($xt1,$xd0); # likewise for %f12 and %v12 ($xd0)
+ ld ("%f12","$stdframe+8*4($sp)");
+ ld ("%f13","$stdframe+8*5($sp)");
+ ld ("%f14","$stdframe+8*6($sp)");
+ ld ("%f15","$stdframe+8*7($sp)");
+
+ vst ($xa0,"$stdframe+0x00($sp)"); # key stream reuses the FPR save slots, which were reloaded above
+ vst ($xb0,"$stdframe+0x10($sp)");
+ vst ($xt0,"$stdframe+0x20($sp)");
+ vst ($xt1,"$stdframe+0x30($sp)");
+}
+ lghi ("%r1",0); # byte index
+
+LABEL (".Loop_tail_4x");
+ llgc ("%r5","0(%r1,$inp)"); # input byte
+ llgc ("%r6","$stdframe(%r1,$sp)"); # key-stream byte from the stack
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)"); # output byte
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_4x");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_4x",".-ChaCha20_ctr32_4x");
+}
+
+########################################################################
+# 6x"horizontal" layout is optimal fit for the platform in its current
+# shape, more specifically for given vector instructions' latency. Well,
+# computational part of 8x"vertical" would be faster, but it consumes
+# all registers and dealing with that will diminish the return...
+#
+{
+my ($a0,$b0,$c0,$d0, $a1,$b1,$c1,$d1,
+ $a2,$b2,$c2,$d2, $a3,$b3,$c3,$d3,
+ $a4,$b4,$c4,$d4, $a5,$b5,$c5,$d5)=map("%v$_",(0..23)); # six states, four row registers each
+my @K=map("%v$_",(27,24..26)); # K[0] (sigma) shares %v27 with $t0
+my ($t0,$t1,$t2,$t3)=map("%v$_",27..30); # scratch; $t1-$t3 also hold the counter increments
+my $beperm="%v31"; # byte-swap permutation
+my $FRAME=$stdframe + 4*16; # standard frame plus 64 bytes (FPR save slots / tail buffer)
+
+GLOBL ("ChaCha20_ctr32_vx");
+ALIGN (32);
+LABEL ("ChaCha20_ctr32_vx");
+LABEL (".LChaCha20_ctr32_vx");
+&{$z? \&clgfi:\&clfi} ($len,256); # inputs of at most 256 bytes go to the 4x code path
+ jle (".LChaCha20_ctr32_4x");
+&{$z? \&stmg:\&stm} ("%r6","%r7","6*$SIZE_T($sp)"); # save %r6-%r7 in the caller-provided frame
+if (!$z) { # S/390 ABI: %f4 and %f6 are saved before the frame is allocated
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+}
+&{$z? \&lghi:\&lhi} ("%r1",-$FRAME);
+ lgr ("%r0",$sp);
+ la ($sp,"0(%r1,$sp)"); # allocate $FRAME bytes
+&{$z? \&stg:\&st} ("%r0","0($sp)"); # back-chain
+if ($z) { # zSeries ABI: %f8-%f15 are saved in the top 64 bytes of the new frame
+ std ("%f8","$FRAME-8*8($sp)");
+ std ("%f9","$FRAME-8*7($sp)");
+ std ("%f10","$FRAME-8*6($sp)");
+ std ("%f11","$FRAME-8*5($sp)");
+ std ("%f12","$FRAME-8*4($sp)");
+ std ("%f13","$FRAME-8*3($sp)");
+ std ("%f14","$FRAME-8*2($sp)");
+ std ("%f15","$FRAME-8*1($sp)");
+}
+ larl ("%r7",".Lsigma");
+ lhi ("%r0",10); # 10 iterations of the two rounds in .Loop_vx
+
+ vlm (@K[1],@K[2],"0($key)"); # load key
+ vl (@K[3],"0($counter)"); # load counter
+
+ vlm (@K[0],"$beperm","0(%r7)"); # load sigma, increments, ...
+
+LABEL (".Loop_outer_vx");
+ vlr ($a0,@K[0]); # every state starts from sigma and the key
+ vlr ($b0,@K[1]);
+ vlr ($a1,@K[0]);
+ vlr ($b1,@K[1]);
+ vlr ($a2,@K[0]);
+ vlr ($b2,@K[1]);
+ vlr ($a3,@K[0]);
+ vlr ($b3,@K[1]);
+ vlr ($a4,@K[0]);
+ vlr ($b4,@K[1]);
+ vlr ($a5,@K[0]);
+ vlr ($b5,@K[1]);
+
+ vlr ($d0,@K[3]);
+ vaf ($d1,@K[3],$t1); # K[3]+1
+ vaf ($d2,@K[3],$t2); # K[3]+2
+ vaf ($d3,@K[3],$t3); # K[3]+3
+ vaf ($d4,$d2,$t2); # K[3]+4
+ vaf ($d5,$d2,$t3); # K[3]+5
+
+ vlr ($c0,@K[2]);
+ vlr ($c1,@K[2]);
+ vlr ($c2,@K[2]);
+ vlr ($c3,@K[2]);
+ vlr ($c4,@K[2]);
+ vlr ($c5,@K[2]);
+
+ vlr ($t1,$d1); # keep K[3]+1..3 for the accumulation after the rounds
+ vlr ($t2,$d2);
+ vlr ($t3,$d3);
+
+ALIGN (4);
+LABEL (".Loop_vx");
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 0); # even round
+
+ VX_ROUND($a0,$a1,$a2,$a3,$a4,$a5,
+ $b0,$b1,$b2,$b3,$b4,$b5,
+ $c0,$c1,$c2,$c3,$c4,$c5,
+ $d0,$d1,$d2,$d3,$d4,$d5,
+ 1); # odd round
+
+ brct ("%r0",".Loop_vx");
+
+ vaf ($a0,$a0,@K[0]); # first block: accumulate sigma, key and counter block
+ vaf ($b0,$b0,@K[1]);
+ vaf ($c0,$c0,@K[2]);
+ vaf ($d0,$d0,@K[3]);
+ vaf ($a1,$a1,@K[0]); # done early: K[0] is overwritten by the input load below
+ vaf ($d1,$d1,$t1); # +K[3]+1
+
+ vperm ($a0,$a0,$a0,$beperm); # byte-swap each 32-bit word
+ vperm ($b0,$b0,$b0,$beperm);
+ vperm ($c0,$c0,$c0,$beperm);
+ vperm ($d0,$d0,$d0,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40); # fewer than 64 bytes left: finish byte by byte
+ jl (".Ltail_vx");
+
+ vaf ($d2,$d2,$t2); # +K[3]+2
+ vaf ($d3,$d3,$t3); # +K[3]+3
+ vlm ($t0,$t3,"0($inp)"); # 64 bytes of input; overwrites K[0] and the saved counters
+
+ vx ($a0,$a0,$t0); # xor with key stream
+ vx ($b0,$b0,$t1);
+ vx ($c0,$c0,$t2);
+ vx ($d0,$d0,$t3);
+
+ vlm (@K[0],$t3,"0(%r7)"); # re-load sigma and increments
+
+ vstm ($a0,$d0,"0($out)"); # 64 bytes of output
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($b1,$b1,@K[1]); # second block
+ vaf ($c1,$c1,@K[2]);
+
+ vperm ($a0,$a1,$a1,$beperm);
+ vperm ($b0,$b1,$b1,$beperm);
+ vperm ($c0,$c1,$c1,$beperm);
+ vperm ($d0,$d1,$d1,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)"); # state 1's registers are free now and serve as the input buffer
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a2,$a2,@K[0]); # third block
+ vaf ($b2,$b2,@K[1]);
+ vaf ($c2,$c2,@K[2]);
+
+ vperm ($a0,$a2,$a2,$beperm);
+ vperm ($b0,$b2,$b2,$beperm);
+ vperm ($c0,$c2,$c2,$beperm);
+ vperm ($d0,$d2,$d2,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a3,$a3,@K[0]); # fourth block
+ vaf ($b3,$b3,@K[1]);
+ vaf ($c3,$c3,@K[2]);
+ vaf ($d2,@K[3],$t3); # K[3]+3
+
+ vperm ($a0,$a3,$a3,$beperm);
+ vperm ($b0,$b3,$b3,$beperm);
+ vperm ($c0,$c3,$c3,$beperm);
+ vperm ($d0,$d3,$d3,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vaf ($d3,$d2,$t1); # K[3]+4
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a4,$a4,@K[0]); # fifth block
+ vaf ($b4,$b4,@K[1]);
+ vaf ($c4,$c4,@K[2]);
+ vaf ($d4,$d4,$d3); # +K[3]+4
+ vaf ($d3,$d3,$t1); # K[3]+5
+ vaf (@K[3],$d2,$t3); # K[3]+=6
+
+ vperm ($a0,$a4,$a4,$beperm);
+ vperm ($b0,$b4,$b4,$beperm);
+ vperm ($c0,$c4,$c4,$beperm);
+ vperm ($d0,$d4,$d4,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ je (".Ldone_vx");
+
+ vaf ($a5,$a5,@K[0]); # sixth block
+ vaf ($b5,$b5,@K[1]);
+ vaf ($c5,$c5,@K[2]);
+ vaf ($d5,$d5,$d3); # +K[3]+5
+
+ vperm ($a0,$a5,$a5,$beperm);
+ vperm ($b0,$b5,$b5,$beperm);
+ vperm ($c0,$c5,$c5,$beperm);
+ vperm ($d0,$d5,$d5,$beperm);
+
+&{$z? \&clgfi:\&clfi} ($len,0x40);
+ jl (".Ltail_vx");
+
+ vlm ($a1,$d1,"0($inp)");
+
+ vx ($a0,$a0,$a1);
+ vx ($b0,$b0,$b1);
+ vx ($c0,$c0,$c1);
+ vx ($d0,$d0,$d1);
+
+ vstm ($a0,$d0,"0($out)");
+
+ la ($inp,"0x40($inp)");
+ la ($out,"0x40($out)");
+ lhi ("%r0",10); # re-arm the round counter for the next six blocks
+&{$z? \&aghi:\&ahi} ($len,-0x40);
+ jne (".Loop_outer_vx");
+
+LABEL (".Ldone_vx");
+if (!$z) { # restore the floating-point registers saved on entry
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)"); # restore %r6-%r7
+ la ($sp,"$FRAME($sp)"); # release the frame
+ br ("%r14");
+
+ALIGN (16);
+LABEL (".Ltail_vx");
+if (!$z) { # restore the floating-point registers before the save slots are reused below
+ ld ("%f4","$FRAME+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$FRAME+16*$SIZE_T+3*8($sp)");
+} else {
+ ld ("%f8","$FRAME-8*8($sp)");
+ ld ("%f9","$FRAME-8*7($sp)");
+ ld ("%f10","$FRAME-8*6($sp)");
+ ld ("%f11","$FRAME-8*5($sp)");
+ ld ("%f12","$FRAME-8*4($sp)");
+ ld ("%f13","$FRAME-8*3($sp)");
+ ld ("%f14","$FRAME-8*2($sp)");
+ ld ("%f15","$FRAME-8*1($sp)");
+}
+ vstm ($a0,$d0,"$stdframe($sp)"); # spill the pending 64 bytes of key stream ($FRAME-8*8 equals $stdframe)
+ lghi ("%r1",0); # byte index
+
+LABEL (".Loop_tail_vx");
+ llgc ("%r5","0(%r1,$inp)"); # input byte
+ llgc ("%r6","$stdframe(%r1,$sp)"); # key-stream byte from the stack
+ xr ("%r6","%r5");
+ stc ("%r6","0(%r1,$out)"); # output byte
+ la ("%r1","1(%r1)");
+ brct ($len,".Loop_tail_vx");
+
+&{$z? \&lmg:\&lm} ("%r6","%r7","$FRAME+6*$SIZE_T($sp)");
+ la ($sp,"$FRAME($sp)");
+ br ("%r14");
+SIZE ("ChaCha20_ctr32_vx",".-ChaCha20_ctr32_vx");
+}
+################
+
+ALIGN (32);
+LABEL (".Lsigma"); # constant table shared by the scalar, 4x and 6x code paths
+LONG (0x61707865,0x3320646e,0x79622d32,0x6b206574); # endian-neutral sigma
+LONG (1,0,0,0); # counter increment 1, loaded together with sigma by the 6x path
+LONG (2,0,0,0); # counter increment 2
+LONG (3,0,0,0); # counter increment 3
+LONG (0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c); # byte swap; read at .Lsigma+0x40 by the 4x path
+
+LONG (0,1,2,3); # per-lane counter offsets; read at .Lsigma+0x50 by the 4x path
+LONG (0x61707865,0x61707865,0x61707865,0x61707865); # smashed sigma; read at .Lsigma+0x60 by the 4x path
+LONG (0x3320646e,0x3320646e,0x3320646e,0x3320646e);
+LONG (0x79622d32,0x79622d32,0x79622d32,0x79622d32);
+LONG (0x6b206574,0x6b206574,0x6b206574,0x6b206574);
+
+ASCIZ ("\"ChaCha20 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
+ALIGN (4);
+
+PERLASM_END();
diff -up openssl-1.1.1e/crypto/perlasm/s390x.pm.s390x-update openssl-1.1.1e/crypto/perlasm/s390x.pm
--- openssl-1.1.1e/crypto/perlasm/s390x.pm.s390x-update 2020-03-19 16:20:22.039227394 +0100
+++ openssl-1.1.1e/crypto/perlasm/s390x.pm 2020-03-19 16:20:22.039227394 +0100
@@ -0,0 +1,3060 @@
+#!/usr/bin/env perl
+# Copyright 2018 The OpenSSL Project Authors. All Rights Reserved.
+#
+# Licensed under the OpenSSL license (the "License"). You may not use
+# this file except in compliance with the License. You can obtain a copy
+# in the file LICENSE in the source distribution or at
+# https://www.openssl.org/source/license.html
+
+# Copyright IBM Corp. 2018
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+package perlasm::s390x;
+
+use strict;
+use warnings;
+use Carp qw(confess);
+use Exporter qw(import);
+
+our @EXPORT=qw(PERLASM_BEGIN PERLASM_END);
+our @EXPORT_OK=qw(AUTOLOAD LABEL INCLUDE stfle);
+our %EXPORT_TAGS=(
+ MSA => [qw(kmac km kmc kimd klmd)],
+ MSA4 => [qw(kmf kmo pcc kmctr)],
+ MSA5 => [qw(ppno prno)],
+ MSA8 => [qw(kma)],
+ VX => [qw(vgef vgeg vgbm vzero vone vgm vgmb vgmh vgmf vgmg
+ vl vlr vlrep vlrepb vlreph vlrepf vlrepg vleb vleh vlef vleg vleib
+ vleih vleif vleig vlgv vlgvb vlgvh vlgvf vlgvg vllez vllezb vllezh
+ vllezf vllezg vlm vlbb vlvg vlvgb vlvgh vlvgf vlvgg vlvgp
+ vll vmrh vmrhb vmrhh vmrhf vmrhg vmrl vmrlb vmrlh vmrlf vmrlg vpk
+ vpkh vpkf vpkg vpks vpksh vpksf vpksg vpkshs vpksfs vpksgs vpkls
+ vpklsh vpklsf vpklsg vpklshs vpklsfs vpklsgs vperm vpdi vrep vrepb
+ vreph vrepf vrepg vrepi vrepib vrepih vrepif vrepig vscef vsceg
+ vsel vseg vsegb vsegh vsegf vst vsteb vsteh vstef vsteg vstm vstl
+ vuph vuphb vuphh vuphf vuplh vuplhb vuplhh vuplhf vupl vuplb vuplhw
+ vuplf vupll vupllb vupllh vupllf va vab vah vaf vag vaq vacc vaccb
+ vacch vaccf vaccg vaccq vac vacq vaccc vacccq vn vnc vavg vavgb
+ vavgh vavgf vavgg vavgl vavglb vavglh vavglf vavglg vcksm vec_ vecb
+ vech vecf vecg vecl veclb veclh veclf veclg vceq vceqb vceqh vceqf
+ vceqg vceqbs vceqhs vceqfs vceqgs vch vchb vchh vchf vchg vchbs
+ vchhs vchfs vchgs vchl vchlb vchlh vchlf vchlg vchlbs vchlhs vchlfs
+ vchlgs vclz vclzb vclzh vclzf vclzg vctz vctzb vctzh vctzf vctzg
+ vx vgfm vgfmb vgfmh vgfmf vgfmg vgfma vgfmab vgfmah vgfmaf vgfmag
+ vlc vlcb vlch vlcf vlcg vlp vlpb vlph vlpf vlpg vmx vmxb vmxh vmxf
+ vmxg vmxl vmxlb vmxlh vmxlf vmxlg vmn vmnb vmnh vmnf vmng vmnl
+ vmnlb vmnlh vmnlf vmnlg vmal vmalb vmalhw vmalf vmah vmahb vmahh
+ vmahf vmalh vmalhb vmalhh vmalhf vmae vmaeb vmaeh vmaef vmale
+ vmaleb vmaleh vmalef vmao vmaob vmaoh vmaof vmalo vmalob vmaloh
+ vmalof vmh vmhb vmhh vmhf vmlh vmlhb vmlhh vmlhf vml vmlb vmlhw
+ vmlf vme vmeb vmeh vmef vmle vmleb vmleh vmlef vmo vmob vmoh vmof
+ vmlo vmlob vmloh vmlof vno vnot vo vpopct verllv verllvb verllvh
+ verllvf verllvg verll verllb verllh verllf verllg verim verimb
+ verimh verimf verimg veslv veslvb veslvh veslvf veslvg vesl veslb
+ veslh veslf veslg vesrav vesravb vesravh vesravf vesravg vesra
+ vesrab vesrah vesraf vesrag vesrlv vesrlvb vesrlvh vesrlvf vesrlvg
+ vesrl vesrlb vesrlh vesrlf vesrlg vsl vslb vsldb vsra vsrab vsrl
+ vsrlb vs vsb vsh vsf vsg vsq vscbi vscbib vscbih vscbif vscbig
+ vscbiq vsbi vsbiq vsbcbi vsbcbiq vsumg vsumgh vsumgf vsumq vsumqf
+ vsumqg vsum vsumb vsumh vtm vfae vfaeb vfaeh vfaef vfaebs vfaehs
+ vfaefs vfaezb vfaezh vfaezf vfaezbs vfaezhs vfaezfs vfee vfeeb
+ vfeeh vfeef vfeebs vfeehs vfeefs vfeezb vfeezh vfeezf vfeezbs
+ vfeezhs vfeezfs vfene vfeneb vfeneh vfenef vfenebs vfenehs vfenefs
+ vfenezb vfenezh vfenezf vfenezbs vfenezhs vfenezfs vistr vistrb
+ vistrh vistrf vistrbs vistrhs vistrfs vstrc vstrcb vstrch vstrcf
+ vstrcbs vstrchs vstrcfs vstrczb vstrczh vstrczf vstrczbs vstrczhs
+ vstrczfs vfa vfadb wfadb wfc wfcdb wfk wfkdb vfce vfcedb wfcedb
+ vfcedbs wfcedbs vfch vfchdb wfchdb vfchdbs wfchdbs vfche vfchedb
+ wfchedb vfchedbs wfchedbs vcdg vcdgb wcdgb vcdlg vcdlgb wcdlgb vcgd
+ vcgdb wcgdb vclgd vclgdb wclgdb vfd vfddb wfddb vfi vfidb wfidb
+ vlde vldeb wldeb vled vledb wledb vfm vfmdb wfmdb vfma vfmadb
+ wfmadb vfms vfmsdb wfmsdb vfpso vfpsodb wfpsodb vflcdb wflcdb
+ vflndb wflndb vflpdb wflpdb vfsq vfsqdb wfsqdb vfs vfsdb wfsdb
+ vftci vftcidb wftcidb)],
+ VXE => [qw(vbperm vllezlf vmsl vmslg vnx vnn voc vpopctb vpopcth
+ vpopctf vpopctg vfasb wfasb wfaxb wfcsb wfcxb wfksb wfkxb vfcesb
+ vfcesbs wfcesb wfcesbs wfcexb wfcexbs vfchsb vfchsbs wfchsb wfchsbs
+ wfchxb wfchxbs vfchesb vfchesbs wfchesb wfchesbs wfchexb wfchexbs
+ vfdsb wfdsb wfdxb vfisb wfisb wfixb vfll vflls wflls wflld vflr
+ vflrd wflrd wflrx vfmax vfmaxsb vfmaxdb wfmaxsb wfmaxdb wfmaxxb
+ vfmin vfminsb vfmindb wfminsb wfmindb wfminxb vfmsb wfmsb wfmxb
+ vfnma vfnms vfmasb wfmasb wfmaxb vfmssb wfmssb wfmsxb vfnmasb
+ vfnmadb wfnmasb wfnmadb wfnmaxb vfnmssb vfnmsdb wfnmssb wfnmsdb
+ wfnmsxb vfpsosb wfpsosb vflcsb wflcsb vflnsb wflnsb vflpsb wflpsb
+ vfpsoxb wfpsoxb vflcxb wflcxb vflnxb wflnxb vflpxb wflpxb vfsqsb
+ wfsqsb wfsqxb vfssb wfssb wfsxb vftcisb wftcisb wftcixb)],
+ VXD => [qw(vlrlr vlrl vstrlr vstrl vap vcp vcvb vcvbg vcvd vcvdg vdp
+ vlip vmp vmsp vpkz vpsop vrp vsdp vsrp vsp vtp vupkz)],
+);
+Exporter::export_ok_tags(qw(MSA MSA4 MSA5 MSA8 VX VXE VXD));
+
+our $AUTOLOAD;
+
+my $GR='(?:%r)?([0-9]|1[0-5])';
+my $VR='(?:%v)?([0-9]|1[0-9]|2[0-9]|3[0-1])';
+
+my ($file,$out);
+
+sub PERLASM_BEGIN # start a fresh run: remember the output file name passed in and empty the text buffer
+{
+ $file=shift; $out="";
+}
+sub PERLASM_END # write the accumulated text to $file if one was set, else to the default output handle
+{
+ if (defined($file)) {
+ open(my $fd,'>',$file)||die("can't open $file: $!");
+ print({$fd}$out)||die("can't write $file: $!"); # do not leave a truncated file unnoticed
+ close($fd)||die("can't close $file: $!"); # buffered write errors surface at close
+ } else {
+ print($out);
+ }
+}
+
+sub AUTOLOAD { # fallback emitter: the called name plus its comma-joined operands become one output line
+ confess(err("PARSE")) if (grep { !defined($_) } @_);
+ my $line;
+ if ($AUTOLOAD=~/^.*::([A-Z_]+)$/) { # uppercase: directive, emitted with a leading dot
+ $line=".$1";
+ } elsif ($AUTOLOAD=~/^.*::([a-z]+)$/) { # lowercase: mnemonic, emitted after a tab
+ $line="\t$1";
+ } else { confess(err("PARSE")); } # anything else (mixed case, digits) is rejected
+ $line.="\t" if (@_);
+ $out.=$line.join(',',@_)."\n";
+}
+
+sub LABEL { # label directive: appends "<name>:" on its own line; takes exactly one argument
+ confess(err("ARGNUM")) unless (@_==1);
+ my $name=$_[0];
+ $out.=$name.":\n";
+}
+
+sub INCLUDE { # appends a preprocessor #include line for the one file name given
+ confess(err("ARGNUM")) unless (@_==1);
+ my $header=$_[0]; # local name chosen so the file-scoped $file is not shadowed
+ $out.='#include "'.$header."\"\n";
+}
+
+#
+# Mnemonics
+#
+
+sub stfle {
+ confess(err("ARGNUM")) if ($#_!=0);
+ S(0xb2b0,@_);
+}
+
+# MSA
+
+sub kmac {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb91e,@_);
+}
+
+sub km {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb92e,@_);
+}
+
+sub kmc {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb92f,@_);
+}
+
+sub kimd {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb93e,@_);
+}
+
+sub klmd {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb93f,@_);
+}
+
+# MSA4
+
+sub kmf {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb92a,@_);
+}
+
+sub kmo {
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb92b,@_);
+}
+
+sub pcc {
+ confess(err("ARGNUM")) if ($#_!=-1);
+ RRE(0xb92c,@_);
+}
+
+sub kmctr {
+ confess(err("ARGNUM")) if ($#_!=2);
+ RRFb(0xb92d,@_);
+}
+
+# MSA5
+
+sub prno { # same instruction as ppno under its newer name; forwards all operands unchanged
+ return ppno(@_);
+}
+
+sub ppno { # deprecated, use prno
+ confess(err("ARGNUM")) if ($#_!=1);
+ RRE(0xb93c,@_);
+}
+
+# MSA8
+
+sub kma {
+ confess(err("ARGNUM")) if ($#_!=2);
+ RRFb(0xb929,@_);
+}
+
+# VX - Support Instructions
+
+sub vgef {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRV(0xe713,@_);
+}
+sub vgeg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRV(0xe712,@_);
+}
+
+sub vgbm {
+ confess(err("ARGNUM")) if ($#_!=1);
+ VRIa(0xe744,@_);
+}
+sub vzero {
+ vgbm(@_,0);
+}
+sub vone {
+ vgbm(@_,0xffff);
+}
+
+sub vgm {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRIb(0xe746,@_);
+}
+sub vgmb {
+ vgm(@_,0);
+}
+sub vgmh {
+ vgm(@_,1);
+}
+sub vgmf {
+ vgm(@_,2);
+}
+sub vgmg {
+ vgm(@_,3);
+}
+
+sub vl {
+ confess(err("ARGNUM")) if ($#_<1||$#_>2);
+ VRX(0xe706,@_);
+}
+
+sub vlr {
+ confess(err("ARGNUM")) if ($#_!=1);
+ VRRa(0xe756,@_);
+}
+
+sub vlrep {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe705,@_);
+}
+sub vlrepb {
+ vlrep(@_,0);
+}
+sub vlreph {
+ vlrep(@_,1);
+}
+sub vlrepf {
+ vlrep(@_,2);
+}
+sub vlrepg {
+ vlrep(@_,3);
+}
+
+sub vleb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe700,@_);
+}
+sub vleh {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe701,@_);
+}
+sub vlef {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe703,@_);
+}
+sub vleg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe702,@_);
+}
+
+sub vleib {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIa(0xe740,@_);
+}
+sub vleih {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIa(0xe741,@_);
+}
+sub vleif {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIa(0xe743,@_);
+}
+sub vleig {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIa(0xe742,@_);
+}
+
+sub vlgv {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSc(0xe721,@_);
+}
+sub vlgvb {
+ vlgv(@_,0);
+}
+sub vlgvh {
+ vlgv(@_,1);
+}
+sub vlgvf {
+ vlgv(@_,2);
+}
+sub vlgvg {
+ vlgv(@_,3);
+}
+
+sub vllez {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe704,@_);
+}
+sub vllezb {
+ vllez(@_,0);
+}
+sub vllezh {
+ vllez(@_,1);
+}
+sub vllezf {
+ vllez(@_,2);
+}
+sub vllezg {
+ vllez(@_,3);
+}
+
+sub vlm {
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ VRSa(0xe736,@_);
+}
+
+sub vlbb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe707,@_);
+}
+
+sub vlvg {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSb(0xe722,@_);
+}
+sub vlvgb {
+ vlvg(@_,0);
+}
+sub vlvgh {
+ vlvg(@_,1);
+}
+sub vlvgf {
+ vlvg(@_,2);
+}
+sub vlvgg {
+ vlvg(@_,3);
+}
+
+sub vlvgp {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRf(0xe762,@_);
+}
+
+sub vll {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRSb(0xe737,@_);
+}
+
+sub vmrh {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe761,@_);
+}
+sub vmrhb {
+ vmrh(@_,0);
+}
+sub vmrhh {
+ vmrh(@_,1);
+}
+sub vmrhf {
+ vmrh(@_,2);
+}
+sub vmrhg {
+ vmrh(@_,3);
+}
+
+sub vmrl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe760,@_);
+}
+sub vmrlb {
+ vmrl(@_,0);
+}
+sub vmrlh {
+ vmrl(@_,1);
+}
+sub vmrlf {
+ vmrl(@_,2);
+}
+sub vmrlg {
+ vmrl(@_,3);
+}
+
+sub vpk {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe794,@_);
+}
+sub vpkh {
+ vpk(@_,1);
+}
+sub vpkf {
+ vpk(@_,2);
+}
+sub vpkg {
+ vpk(@_,3);
+}
+
+sub vpks {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRb(0xe797,@_);
+}
+sub vpksh {
+ vpks(@_,1,0);
+}
+sub vpksf {
+ vpks(@_,2,0);
+}
+sub vpksg {
+ vpks(@_,3,0);
+}
+sub vpkshs {
+ vpks(@_,1,1);
+}
+sub vpksfs {
+ vpks(@_,2,1);
+}
+sub vpksgs {
+ vpks(@_,3,1);
+}
+
+sub vpkls {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRb(0xe795,@_);
+}
+sub vpklsh {
+ vpkls(@_,1,0);
+}
+sub vpklsf {
+ vpkls(@_,2,0);
+}
+sub vpklsg {
+ vpkls(@_,3,0);
+}
+sub vpklshs {
+ vpkls(@_,1,1);
+}
+sub vpklsfs {
+ vpkls(@_,2,1);
+}
+sub vpklsgs {
+ vpkls(@_,3,1);
+}
+
+sub vperm {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRe(0xe78c,@_);
+}
+
+sub vpdi {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe784,@_);
+}
+
+sub vrep {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRIc(0xe74d,@_);
+}
+sub vrepb {
+ vrep(@_,0);
+}
+sub vreph {
+ vrep(@_,1);
+}
+sub vrepf {
+ vrep(@_,2);
+}
+sub vrepg {
+ vrep(@_,3);
+}
+
+sub vrepi {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIa(0xe745,@_);
+}
+sub vrepib {
+ vrepi(@_,0);
+}
+sub vrepih {
+ vrepi(@_,1);
+}
+sub vrepif {
+ vrepi(@_,2);
+}
+sub vrepig {
+ vrepi(@_,3);
+}
+
+sub vscef {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRV(0xe71b,@_);
+}
+sub vsceg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRV(0xe71a,@_);
+}
+
+sub vsel {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRe(0xe78d,@_);
+}
+
+sub vseg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe75f,@_);
+}
+sub vsegb {
+ vseg(@_,0);
+}
+sub vsegh {
+ vseg(@_,1);
+}
+sub vsegf {
+ vseg(@_,2);
+}
+
+sub vst {
+ confess(err("ARGNUM")) if ($#_<1||$#_>2);
+ VRX(0xe70e,@_);
+}
+
+sub vsteb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe708,@_);
+}
+sub vsteh {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe709,@_);
+}
+sub vstef {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe70b,@_);
+}
+sub vsteg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRX(0xe70a,@_);
+}
+
+sub vstm {
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ VRSa(0xe73e,@_);
+}
+
+sub vstl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRSb(0xe73f,@_);
+}
+
+sub vuph {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7d7,@_);
+}
+sub vuphb {
+ vuph(@_,0);
+}
+sub vuphh {
+ vuph(@_,1);
+}
+sub vuphf {
+ vuph(@_,2);
+}
+
+sub vuplh {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7d5,@_);
+}
+sub vuplhb {
+ vuplh(@_,0);
+}
+sub vuplhh {
+ vuplh(@_,1);
+}
+sub vuplhf {
+ vuplh(@_,2);
+}
+
+sub vupl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7d6,@_);
+}
+sub vuplb {
+ vupl(@_,0);
+}
+sub vuplhw { # vupl with trailing operand 1; NOTE(review): the "hw" suffix presumably avoids a clash with sub vuplh - confirm
+ return vupl(@_,1);
+}
+sub vuplf {
+ vupl(@_,2);
+}
+
+sub vupll {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7d4,@_);
+}
+sub vupllb {
+ vupll(@_,0);
+}
+sub vupllh {
+ vupll(@_,1);
+}
+sub vupllf {
+ vupll(@_,2);
+}
+
+# VX - Integer Instructions
+
+sub va {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f3,@_);
+}
+sub vab {
+ va(@_,0);
+}
+sub vah {
+ va(@_,1);
+}
+sub vaf {
+ va(@_,2);
+}
+sub vag {
+ va(@_,3);
+}
+sub vaq {
+ va(@_,4);
+}
+
+sub vacc {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f1,@_);
+}
+sub vaccb {
+ vacc(@_,0);
+}
+sub vacch {
+ vacc(@_,1);
+}
+sub vaccf {
+ vacc(@_,2);
+}
+sub vaccg {
+ vacc(@_,3);
+}
+sub vaccq {
+ vacc(@_,4);
+}
+
+sub vac {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7bb,@_);
+}
+sub vacq {
+ vac(@_,4);
+}
+
+sub vaccc {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7b9,@_);
+}
+sub vacccq {
+ vaccc(@_,4);
+}
+
+sub vn {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe768,@_);
+}
+
+sub vnc {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe769,@_);
+}
+
+sub vavg {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f2,@_);
+}
+sub vavgb {
+ vavg(@_,0);
+}
+sub vavgh {
+ vavg(@_,1);
+}
+sub vavgf {
+ vavg(@_,2);
+}
+sub vavgg {
+ vavg(@_,3);
+}
+
+sub vavgl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f0,@_);
+}
+sub vavglb {
+ vavgl(@_,0);
+}
+sub vavglh {
+ vavgl(@_,1);
+}
+sub vavglf {
+ vavgl(@_,2);
+}
+sub vavglg {
+ vavgl(@_,3);
+}
+
+sub vcksm {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe766,@_);
+}
+
+sub vec_ {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7db,@_);
+}
+sub vecb {
+ vec_(@_,0);
+}
+sub vech {
+ vec_(@_,1);
+}
+sub vecf {
+ vec_(@_,2);
+}
+sub vecg {
+ vec_(@_,3);
+}
+
+sub vecl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7d9,@_);
+}
+sub veclb {
+ vecl(@_,0);
+}
+sub veclh {
+ vecl(@_,1);
+}
+sub veclf {
+ vecl(@_,2);
+}
+sub veclg {
+ vecl(@_,3);
+}
+
+sub vceq {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRb(0xe7f8,@_);
+}
+sub vceqb {
+ vceq(@_,0,0);
+}
+sub vceqh {
+ vceq(@_,1,0);
+}
+sub vceqf {
+ vceq(@_,2,0);
+}
+sub vceqg {
+ vceq(@_,3,0);
+}
+sub vceqbs {
+ vceq(@_,0,1);
+}
+sub vceqhs {
+ vceq(@_,1,1);
+}
+sub vceqfs {
+ vceq(@_,2,1);
+}
+sub vceqgs {
+ vceq(@_,3,1);
+}
+
+sub vch {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRb(0xe7fb,@_);
+}
+sub vchb {
+ vch(@_,0,0);
+}
+sub vchh {
+ vch(@_,1,0);
+}
+sub vchf {
+ vch(@_,2,0);
+}
+sub vchg {
+ vch(@_,3,0);
+}
+sub vchbs {
+ vch(@_,0,1);
+}
+sub vchhs {
+ vch(@_,1,1);
+}
+sub vchfs {
+ vch(@_,2,1);
+}
+sub vchgs {
+ vch(@_,3,1);
+}
+
+sub vchl {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRb(0xe7f9,@_);
+}
+sub vchlb {
+ vchl(@_,0,0);
+}
+sub vchlh {
+ vchl(@_,1,0);
+}
+sub vchlf {
+ vchl(@_,2,0);
+}
+sub vchlg {
+ vchl(@_,3,0);
+}
+sub vchlbs {
+ vchl(@_,0,1);
+}
+sub vchlhs {
+ vchl(@_,1,1);
+}
+sub vchlfs {
+ vchl(@_,2,1);
+}
+sub vchlgs {
+ vchl(@_,3,1);
+}
+
+sub vclz {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe753,@_);
+}
+sub vclzb {
+ vclz(@_,0);
+}
+sub vclzh {
+ vclz(@_,1);
+}
+sub vclzf {
+ vclz(@_,2);
+}
+sub vclzg {
+ vclz(@_,3);
+}
+
+sub vctz {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe752,@_);
+}
+sub vctzb {
+ vctz(@_,0);
+}
+sub vctzh {
+ vctz(@_,1);
+}
+sub vctzf {
+ vctz(@_,2);
+}
+sub vctzg {
+ vctz(@_,3);
+}
+
+sub vx {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76d,@_);
+}
+
+sub vgfm {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7b4,@_);
+}
+sub vgfmb {
+ vgfm(@_,0);
+}
+sub vgfmh {
+ vgfm(@_,1);
+}
+sub vgfmf {
+ vgfm(@_,2);
+}
+sub vgfmg {
+ vgfm(@_,3);
+}
+
+sub vgfma {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7bc,@_);
+}
+sub vgfmab {
+ vgfma(@_,0);
+}
+sub vgfmah {
+ vgfma(@_,1);
+}
+sub vgfmaf {
+ vgfma(@_,2);
+}
+sub vgfmag {
+ vgfma(@_,3);
+}
+
+sub vlc {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7de,@_);
+}
+sub vlcb {
+ vlc(@_,0);
+}
+sub vlch {
+ vlc(@_,1);
+}
+sub vlcf {
+ vlc(@_,2);
+}
+sub vlcg {
+ vlc(@_,3);
+}
+
+sub vlp {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe7df,@_);
+}
+sub vlpb {
+ vlp(@_,0);
+}
+sub vlph {
+ vlp(@_,1);
+}
+sub vlpf {
+ vlp(@_,2);
+}
+sub vlpg {
+ vlp(@_,3);
+}
+
+sub vmx {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7ff,@_);
+}
+sub vmxb {
+ vmx(@_,0);
+}
+sub vmxh {
+ vmx(@_,1);
+}
+sub vmxf {
+ vmx(@_,2);
+}
+sub vmxg {
+ vmx(@_,3);
+}
+
+sub vmxl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7fd,@_);
+}
+sub vmxlb {
+ vmxl(@_,0);
+}
+sub vmxlh {
+ vmxl(@_,1);
+}
+sub vmxlf {
+ vmxl(@_,2);
+}
+sub vmxlg {
+ vmxl(@_,3);
+}
+
+sub vmn {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7fe,@_);
+}
+sub vmnb {
+ vmn(@_,0);
+}
+sub vmnh {
+ vmn(@_,1);
+}
+sub vmnf {
+ vmn(@_,2);
+}
+sub vmng {
+ vmn(@_,3);
+}
+
+sub vmnl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7fc,@_);
+}
+sub vmnlb {
+ vmnl(@_,0);
+}
+sub vmnlh {
+ vmnl(@_,1);
+}
+sub vmnlf {
+ vmnl(@_,2);
+}
+sub vmnlg {
+ vmnl(@_,3);
+}
+
+sub vmal {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7aa,@_);
+}
+sub vmalb {
+ vmal(@_,0);
+}
+sub vmalhw {
+ vmal(@_,1);
+}
+sub vmalf {
+ vmal(@_,2);
+}
+
+sub vmah {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7ab,@_);
+}
+sub vmahb {
+ vmah(@_,0);
+}
+sub vmahh {
+ vmah(@_,1);
+}
+sub vmahf {
+ vmah(@_,2);
+}
+
+sub vmalh {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7a9,@_);
+}
+sub vmalhb {
+ vmalh(@_,0);
+}
+sub vmalhh {
+ vmalh(@_,1);
+}
+sub vmalhf {
+ vmalh(@_,2);
+}
+
+sub vmae {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7ae,@_);
+}
+sub vmaeb {
+ vmae(@_,0);
+}
+sub vmaeh {
+ vmae(@_,1);
+}
+sub vmaef {
+ vmae(@_,2);
+}
+
+sub vmale {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7ac,@_);
+}
+sub vmaleb {
+ vmale(@_,0);
+}
+sub vmaleh {
+ vmale(@_,1);
+}
+sub vmalef {
+ vmale(@_,2);
+}
+
+sub vmao {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7af,@_);
+}
+sub vmaob {
+ vmao(@_,0);
+}
+sub vmaoh {
+ vmao(@_,1);
+}
+sub vmaof {
+ vmao(@_,2);
+}
+
+sub vmalo {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7ad,@_);
+}
+sub vmalob {
+ vmalo(@_,0);
+}
+sub vmaloh {
+ vmalo(@_,1);
+}
+sub vmalof {
+ vmalo(@_,2);
+}
+
+sub vmh {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a3,@_);
+}
+sub vmhb {
+ vmh(@_,0);
+}
+sub vmhh {
+ vmh(@_,1);
+}
+sub vmhf {
+ vmh(@_,2);
+}
+
+sub vmlh {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a1,@_);
+}
+sub vmlhb {
+ vmlh(@_,0);
+}
+sub vmlhh {
+ vmlh(@_,1);
+}
+sub vmlhf {
+ vmlh(@_,2);
+}
+
+sub vml {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a2,@_);
+}
+sub vmlb {
+ vml(@_,0);
+}
+sub vmlhw {
+ vml(@_,1);
+}
+sub vmlf {
+ vml(@_,2);
+}
+
+sub vme {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a6,@_);
+}
+sub vmeb {
+ vme(@_,0);
+}
+sub vmeh {
+ vme(@_,1);
+}
+sub vmef {
+ vme(@_,2);
+}
+
+sub vmle {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a4,@_);
+}
+sub vmleb {
+ vmle(@_,0);
+}
+sub vmleh {
+ vmle(@_,1);
+}
+sub vmlef {
+ vmle(@_,2);
+}
+
+sub vmo {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a7,@_);
+}
+sub vmob {
+ vmo(@_,0);
+}
+sub vmoh {
+ vmo(@_,1);
+}
+sub vmof {
+ vmo(@_,2);
+}
+
+sub vmlo {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7a5,@_);
+}
+sub vmlob {
+ vmlo(@_,0);
+}
+sub vmloh {
+ vmlo(@_,1);
+}
+sub vmlof {
+ vmlo(@_,2);
+}
+
+sub vno {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76b,@_);
+}
+sub vnot { # two-operand shorthand: vno with the second operand repeated as the third
+ return vno(@_,$_[1]);
+}
+
+sub vo {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76a,@_);
+}
+
+sub vpopct {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRa(0xe750,@_);
+}
+
+sub verllv {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe773,@_);
+}
+sub verllvb {
+ verllv(@_,0);
+}
+sub verllvh {
+ verllv(@_,1);
+}
+sub verllvf {
+ verllv(@_,2);
+}
+sub verllvg {
+ verllv(@_,3);
+}
+
+sub verll {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSa(0xe733,@_);
+}
+sub verllb {
+ verll(@_,0);
+}
+sub verllh {
+ verll(@_,1);
+}
+sub verllf {
+ verll(@_,2);
+}
+sub verllg {
+ verll(@_,3);
+}
+
+sub verim {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRId(0xe772,@_);
+}
+sub verimb {
+ verim(@_,0);
+}
+sub verimh {
+ verim(@_,1);
+}
+sub verimf {
+ verim(@_,2);
+}
+sub verimg {
+ verim(@_,3);
+}
+
+sub veslv {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe770,@_);
+}
+sub veslvb {
+ veslv(@_,0);
+}
+sub veslvh {
+ veslv(@_,1);
+}
+sub veslvf {
+ veslv(@_,2);
+}
+sub veslvg {
+ veslv(@_,3);
+}
+
+sub vesl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSa(0xe730,@_);
+}
+sub veslb {
+ vesl(@_,0);
+}
+sub veslh {
+ vesl(@_,1);
+}
+sub veslf {
+ vesl(@_,2);
+}
+sub veslg {
+ vesl(@_,3);
+}
+
+sub vesrav {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe77a,@_);
+}
+sub vesravb {
+ vesrav(@_,0);
+}
+sub vesravh {
+ vesrav(@_,1);
+}
+sub vesravf {
+ vesrav(@_,2);
+}
+sub vesravg {
+ vesrav(@_,3);
+}
+
+sub vesra {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSa(0xe73a,@_);
+}
+sub vesrab {
+ vesra(@_,0);
+}
+sub vesrah {
+ vesra(@_,1);
+}
+sub vesraf {
+ vesra(@_,2);
+}
+sub vesrag {
+ vesra(@_,3);
+}
+
+sub vesrlv {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe778,@_);
+}
+sub vesrlvb {
+ vesrlv(@_,0);
+}
+sub vesrlvh {
+ vesrlv(@_,1);
+}
+sub vesrlvf {
+ vesrlv(@_,2);
+}
+sub vesrlvg {
+ vesrlv(@_,3);
+}
+
+sub vesrl {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRSa(0xe738,@_);
+}
+sub vesrlb {
+ vesrl(@_,0);
+}
+sub vesrlh {
+ vesrl(@_,1);
+}
+sub vesrlf {
+ vesrl(@_,2);
+}
+sub vesrlg {
+ vesrl(@_,3);
+}
+
+sub vsl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe774,@_);
+}
+
+sub vslb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe775,@_);
+}
+
+sub vsldb {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRId(0xe777,@_);
+}
+
+sub vsra {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe77e,@_);
+}
+
+sub vsrab {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe77f,@_);
+}
+
+sub vsrl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe77c,@_);
+}
+
+sub vsrlb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe77d,@_);
+}
+
+sub vs {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f7,@_);
+}
+sub vsb {
+ vs(@_,0);
+}
+sub vsh {
+ vs(@_,1);
+}
+sub vsf {
+ vs(@_,2);
+}
+sub vsg {
+ vs(@_,3);
+}
+sub vsq {
+ vs(@_,4);
+}
+
+sub vscbi {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe7f5,@_);
+}
+sub vscbib {
+ vscbi(@_,0);
+}
+sub vscbih {
+ vscbi(@_,1);
+}
+sub vscbif {
+ vscbi(@_,2);
+}
+sub vscbig {
+ vscbi(@_,3);
+}
+sub vscbiq {
+ vscbi(@_,4);
+}
+
+sub vsbi {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7bf,@_);
+}
+sub vsbiq {
+ vsbi(@_,4);
+}
+
+sub vsbcbi {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRd(0xe7bd,@_);
+}
+sub vsbcbiq {
+ vsbcbi(@_,4);
+}
+
+sub vsumg {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe765,@_);
+}
+sub vsumgh {
+ vsumg(@_,1);
+}
+sub vsumgf {
+ vsumg(@_,2);
+}
+
+sub vsumq {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe767,@_);
+}
+sub vsumqf {
+ vsumq(@_,2);
+}
+sub vsumqg {
+ vsumq(@_,3);
+}
+
+sub vsum {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRc(0xe764,@_);
+}
+sub vsumb {
+ vsum(@_,0);
+}
+sub vsumh {
+ vsum(@_,1);
+}
+
+sub vtm {
+ confess(err("ARGNUM")) if ($#_!=1);
+ VRRa(0xe7d8,@_);
+}
+
+# VX - String Instructions
+
+sub vfae { # base form: 4 or 5 operands, passed unchanged to VRRb with opcode 0xe782
+ confess(err("ARGNUM")) if ($#_<3||$#_>4);
+ VRRb(0xe782,@_);
+}
+sub vfaeb { # 4th operand fixed to 0; the caller's optional 4th argument is forwarded as the 5th (may be undef)
+ vfae(@_[0..2],0,$_[3]);
+}
+sub vfaeh { # as vfaeb, with 4th operand 1
+ vfae(@_[0..2],1,$_[3]);
+}
+sub vfaef { # as vfaeb, with 4th operand 2
+ vfae(@_[0..2],2,$_[3]);
+}
+sub vfaebs { # "s" forms OR 0x1 into the optional flags operand
+ my $m5=defined($_[3])?$_[3]:0; # copy: assigning to $_[3] would write through to the caller's argument
+ vfae(@_[0..2],0,0x1|$m5);
+}
+sub vfaehs {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],1,0x1|$m5);
+}
+sub vfaefs {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],2,0x1|$m5);
+}
+sub vfaezb { # "z" forms OR 0x2 into the optional flags operand
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],0,0x2|$m5);
+}
+sub vfaezh {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],1,0x2|$m5);
+}
+sub vfaezf {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],2,0x2|$m5);
+}
+sub vfaezbs { # "zs" forms OR 0x3 (both bits) into the optional flags operand
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],0,0x3|$m5);
+}
+sub vfaezhs {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],1,0x3|$m5);
+}
+sub vfaezfs {
+ my $m5=defined($_[3])?$_[3]:0; # optional flags default to 0; caller's argument is left untouched
+ vfae(@_[0..2],2,0x3|$m5);
+}
+
+sub vfee {
+ confess(err("ARGNUM")) if ($#_<3||$#_>4);
+ VRRb(0xe780,@_);
+}
+sub vfeeb {
+ vfee(@_[0..2],0,$_[3]);
+}
+sub vfeeh {
+ vfee(@_[0..2],1,$_[3]);
+}
+sub vfeef {
+ vfee(@_[0..2],2,$_[3]);
+}
+sub vfeebs {
+ vfee(@_,0,1);
+}
+sub vfeehs {
+ vfee(@_,1,1);
+}
+sub vfeefs {
+ vfee(@_,2,1);
+}
+sub vfeezb {
+ vfee(@_,0,2);
+}
+sub vfeezh {
+ vfee(@_,1,2);
+}
+sub vfeezf {
+ vfee(@_,2,2);
+}
+sub vfeezbs {
+ vfee(@_,0,3);
+}
+sub vfeezhs {
+ vfee(@_,1,3);
+}
+sub vfeezfs {
+ vfee(@_,2,3);
+}
+
+sub vfene {
+ confess(err("ARGNUM")) if ($#_<3||$#_>4);
+ VRRb(0xe781,@_);
+}
+sub vfeneb {
+ vfene(@_[0..2],0,$_[3]);
+}
+sub vfeneh {
+ vfene(@_[0..2],1,$_[3]);
+}
+sub vfenef {
+ vfene(@_[0..2],2,$_[3]);
+}
+sub vfenebs {
+ vfene(@_,0,1);
+}
+sub vfenehs {
+ vfene(@_,1,1);
+}
+sub vfenefs {
+ vfene(@_,2,1);
+}
+sub vfenezb {
+ vfene(@_,0,2);
+}
+sub vfenezh {
+ vfene(@_,1,2);
+}
+sub vfenezf {
+ vfene(@_,2,2);
+}
+sub vfenezbs {
+ vfene(@_,0,3);
+}
+sub vfenezhs {
+ vfene(@_,1,3);
+}
+sub vfenezfs {
+ vfene(@_,2,3);
+}
+
+sub vistr {
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ VRRa(0xe75c,@_[0..2],0,$_[3]);
+}
+sub vistrb {
+ vistr(@_[0..1],0,$_[2]);
+}
+sub vistrh {
+ vistr(@_[0..1],1,$_[2]);
+}
+sub vistrf {
+ vistr(@_[0..1],2,$_[2]);
+}
+sub vistrbs {
+ vistr(@_,0,1);
+}
+sub vistrhs {
+ vistr(@_,1,1);
+}
+sub vistrfs {
+ vistr(@_,2,1);
+}
+
+sub vstrc { # base form: 5 or 6 operands, passed unchanged to VRRd with opcode 0xe78a
+ confess(err("ARGNUM")) if ($#_<4||$#_>5);
+ VRRd(0xe78a,@_);
+}
+sub vstrcb { # 5th operand fixed to 0; the caller's optional 5th argument is forwarded as the 6th (may be undef)
+ vstrc(@_[0..3],0,$_[4]);
+}
+sub vstrch { # as vstrcb, with 5th operand 1
+ vstrc(@_[0..3],1,$_[4]);
+}
+sub vstrcf { # as vstrcb, with 5th operand 2
+ vstrc(@_[0..3],2,$_[4]);
+}
+sub vstrcbs { # "s" forms OR 0x1 into the optional flags operand
+ my $m6=defined($_[4])?$_[4]:0; # copy: assigning to $_[4] would write through to the caller's argument
+ vstrc(@_[0..3],0,0x1|$m6);
+}
+sub vstrchs {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],1,0x1|$m6);
+}
+sub vstrcfs {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],2,0x1|$m6);
+}
+sub vstrczb { # "z" forms OR 0x2 into the optional flags operand
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],0,0x2|$m6);
+}
+sub vstrczh {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],1,0x2|$m6);
+}
+sub vstrczf {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],2,0x2|$m6);
+}
+sub vstrczbs { # "zs" forms OR 0x3 (both bits) into the optional flags operand
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],0,0x3|$m6);
+}
+sub vstrczhs {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],1,0x3|$m6);
+}
+sub vstrczfs {
+ my $m6=defined($_[4])?$_[4]:0; # optional flags default to 0; caller's argument is left untouched
+ vstrc(@_[0..3],2,0x3|$m6);
+}
+
+# VX - Floating-point Instructions
+
+sub vfa {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRc(0xe7e3,@_);
+}
+sub vfadb {
+ vfa(@_,3,0);
+}
+sub wfadb {
+ vfa(@_,3,8);
+}
+
+sub wfc {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRa(0xe7cb,@_);
+}
+sub wfcdb {
+ wfc(@_,3,0);
+}
+
+sub wfk {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRa(0xe7ca,@_);
+}
+sub wfksb {
+ wfk(@_,2,0);
+}
+sub wfkdb {
+ wfk(@_,3,0);
+}
+sub wfkxb {
+ wfk(@_,4,0);
+}
+
+sub vfce {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRc(0xe7e8,@_);
+}
+sub vfcedb {
+ vfce(@_,3,0,0);
+}
+sub vfcedbs {
+ vfce(@_,3,0,1);
+}
+sub wfcedb {
+ vfce(@_,3,8,0);
+}
+sub wfcedbs {
+ vfce(@_,3,8,1);
+}
+
+sub vfch {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRc(0xe7eb,@_);
+}
+sub vfchdb {
+ vfch(@_,3,0,0);
+}
+sub vfchdbs {
+ vfch(@_,3,0,1);
+}
+sub wfchdb {
+ vfch(@_,3,8,0);
+}
+sub wfchdbs {
+ vfch(@_,3,8,1);
+}
+
+sub vfche {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRc(0xe7ea,@_);
+}
+sub vfchedb {
+ vfche(@_,3,0,0);
+}
+sub vfchedbs {
+ vfche(@_,3,0,1);
+}
+sub wfchedb {
+ vfche(@_,3,8,0);
+}
+sub wfchedbs {
+ vfche(@_,3,8,1);
+}
+
+sub vcdg {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c3,@_);
+}
+sub vcdgb {
+ vcdg(@_[0..1],3,@_[2..3]);
+}
+sub wcdgb {
+ vcdg(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vcdlg {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c1,@_);
+}
+sub vcdlgb {
+ vcdlg(@_[0..1],3,@_[2..3]);
+}
+sub wcdlgb {
+ vcdlg(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vcgd {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c2,@_);
+}
+sub vcgdb {
+ vcgd(@_[0..1],3,@_[2..3]);
+}
+sub wcgdb {
+ vcgd(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vclgd {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c0,@_);
+}
+sub vclgdb {
+ vclgd(@_[0..1],3,@_[2..3]);
+}
+sub wclgdb {
+ vclgd(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vfd {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRc(0xe7e5,@_);
+}
+sub vfddb {
+ vfd(@_,3,0);
+}
+sub wfddb {
+ vfd(@_,3,8);
+}
+
+sub vfi {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c7,@_);
+}
+sub vfidb {
+ vfi(@_[0..1],3,@_[2..3]);
+}
+sub wfidb {
+ vfi(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vlde { # deprecated, use vfll
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRa(0xe7c4,@_);
+}
+sub vldeb { # deprecated, use vflls
+ vlde(@_,2,0);
+}
+sub wldeb { # deprecated, use wflls
+ vlde(@_,2,8);
+}
+
+sub vled { # deprecated, use vflr
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7c5,@_);
+}
+sub vledb { # deprecated, use vflrd
+ vled(@_[0..1],3,@_[2..3]);
+}
+sub wledb { # deprecated, use wflrd
+ vled(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+
+sub vfm {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRc(0xe7e7,@_);
+}
+sub vfmdb {
+ vfm(@_,3,0);
+}
+sub wfmdb {
+ vfm(@_,3,8);
+}
+
+sub vfma {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRe(0xe78f,@_);
+}
+sub vfmadb {
+ vfma(@_,0,3);
+}
+sub wfmadb {
+ vfma(@_,8,3);
+}
+
+sub vfms {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRe(0xe78e,@_);
+}
+sub vfmsdb {
+ vfms(@_,0,3);
+}
+sub wfmsdb {
+ vfms(@_,8,3);
+}
+
+sub vfpso {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRa(0xe7cc,@_);
+}
+sub vfpsodb {
+ vfpso(@_[0..1],3,0,$_[2]);
+}
+sub wfpsodb {
+ vfpso(@_[0..1],3,8,$_[2]);
+}
+sub vflcdb {
+ vfpso(@_,3,0,0);
+}
+sub wflcdb {
+ vfpso(@_,3,8,0);
+}
+sub vflndb {
+ vfpso(@_,3,0,1);
+}
+sub wflndb {
+ vfpso(@_,3,8,1);
+}
+sub vflpdb {
+ vfpso(@_,3,0,2);
+}
+sub wflpdb {
+ vfpso(@_,3,8,2);
+}
+
+sub vfsq {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRRa(0xe7ce,@_);
+}
+sub vfsqdb {
+ vfsq(@_,3,0);
+}
+sub wfsqdb {
+ vfsq(@_,3,8);
+}
+
+sub vfs {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRRc(0xe7e2,@_);
+}
+sub vfsdb {
+ vfs(@_,3,0);
+}
+sub wfsdb {
+ vfs(@_,3,8);
+}
+
+sub vftci {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIe(0xe74a,@_);
+}
+sub vftcidb {
+ vftci(@_,3,0);
+}
+sub wftcidb {
+ vftci(@_,3,8);
+}
+
+# VXE - Support Instructions
+
+sub vbperm {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe785,@_);
+}
+
+sub vllezlf {
+ vllez(@_,6);
+}
+
+# VXE - Integer Instructions
+
+sub vmsl {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRd(0xe7b8,@_);
+}
+sub vmslg {
+ vmsl(@_[0..3],3,$_[4]);
+}
+
+sub vnx {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76c,@_);
+}
+
+sub vnn {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76e,@_);
+}
+
+sub voc {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRc(0xe76f,@_);
+}
+
+sub vpopctb {
+ vpopct(@_,0);
+}
+sub vpopcth {
+ vpopct(@_,1);
+}
+sub vpopctf {
+ vpopct(@_,2);
+}
+sub vpopctg {
+ vpopct(@_,3);
+}
+
+# VXE - Floating-Point Instructions
+
+sub vfasb {
+ vfa(@_,2,0);
+}
+sub wfasb {
+ vfa(@_,2,8);
+}
+sub wfaxb {
+ vfa(@_,4,8);
+}
+
+sub wfcsb {
+ wfc(@_,2,0);
+}
+sub wfcxb {
+ wfc(@_,4,0);
+}
+
+sub vfcesb {
+ vfce(@_,2,0,0);
+}
+sub vfcesbs {
+ vfce(@_,2,0,1);
+}
+sub wfcesb {
+ vfce(@_,2,8,0);
+}
+sub wfcesbs {
+ vfce(@_,2,8,1);
+}
+sub wfcexb {
+ vfce(@_,4,8,0);
+}
+sub wfcexbs {
+ vfce(@_,4,8,1);
+}
+
+sub vfchsb {
+ vfch(@_,2,0,0);
+}
+sub vfchsbs {
+ vfch(@_,2,0,1);
+}
+sub wfchsb {
+ vfch(@_,2,8,0);
+}
+sub wfchsbs {
+ vfch(@_,2,8,1);
+}
+sub wfchxb {
+ vfch(@_,4,8,0);
+}
+sub wfchxbs {
+ vfch(@_,4,8,1);
+}
+
+sub vfchesb {
+ vfche(@_,2,0,0);
+}
+sub vfchesbs {
+ vfche(@_,2,0,1);
+}
+sub wfchesb {
+ vfche(@_,2,8,0);
+}
+sub wfchesbs {
+ vfche(@_,2,8,1);
+}
+sub wfchexb {
+ vfche(@_,4,8,0);
+}
+sub wfchexbs {
+ vfche(@_,4,8,1);
+}
+
+sub vfdsb {
+ vfd(@_,2,0);
+}
+sub wfdsb {
+ vfd(@_,2,8);
+}
+sub wfdxb {
+ vfd(@_,4,8);
+}
+
+sub vfisb {
+ vfi(@_[0..1],2,@_[2..3]);
+}
+sub wfisb {
+ vfi(@_[0..1],2,0x8|$_[2],$_[3]);
+}
+sub wfixb {
+ vfi(@_[0..1],4,0x8|$_[2],$_[3]);
+}
+
+sub vfll {
+ vlde(@_);
+}
+sub vflls {
+ vfll(@_,2,0);
+}
+sub wflls {
+ vfll(@_,2,8);
+}
+sub wflld {
+ vfll(@_,3,8);
+}
+
+sub vflr {
+ vled(@_);
+}
+sub vflrd {
+ vflr(@_[0..1],3,@_[2..3]);
+}
+sub wflrd {
+ vflr(@_[0..1],3,0x8|$_[2],$_[3]);
+}
+sub wflrx {
+ vflr(@_[0..1],4,0x8|$_[2],$_[3]);
+}
+
+sub vfmax {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRc(0xe7ef,@_);
+}
+sub vfmaxsb {
+ vfmax(@_[0..2],2,0,$_[3]);
+}
+sub vfmaxdb {
+ vfmax(@_[0..2],3,0,$_[3]);
+}
+sub wfmaxsb {
+ vfmax(@_[0..2],2,8,$_[3]);
+}
+sub wfmaxdb {
+ vfmax(@_[0..2],3,8,$_[3]);
+}
+sub wfmaxxb {
+ vfmax(@_[0..2],4,8,$_[3]);
+}
+
+sub vfmin {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRc(0xe7ee,@_);
+}
+sub vfminsb {	# operands (v1,v2,v3,m6); forwards to vfmin with m4=2, m5=0
+	vfmin(@_[0..2],2,0,$_[3]);	# m6 is the 4th operand, as in vfmaxsb; $_[5] is undef for a 4-operand call and get_M() encodes undef as 0
+}
+sub vfmindb {	# operands (v1,v2,v3,m6); forwards to vfmin with m4=3, m5=0
+	vfmin(@_[0..2],3,0,$_[3]);	# m6 is the 4th operand, as in vfmaxdb; $_[5] is undef for a 4-operand call and get_M() encodes undef as 0
+}
+sub wfminsb {	# operands (v1,v2,v3,m6); forwards to vfmin with m4=2, m5=8
+	vfmin(@_[0..2],2,8,$_[3]);	# m6 is the 4th operand, as in wfmaxsb; $_[5] is undef for a 4-operand call and get_M() encodes undef as 0
+}
+sub wfmindb {	# operands (v1,v2,v3,m6); forwards to vfmin with m4=3, m5=8
+	vfmin(@_[0..2],3,8,$_[3]);	# m6 is the 4th operand, as in wfmaxdb; $_[5] is undef for a 4-operand call and get_M() encodes undef as 0
+}
+sub wfminxb {	# operands (v1,v2,v3,m6); forwards to vfmin with m4=4, m5=8
+	vfmin(@_[0..2],4,8,$_[3]);	# m6 is the 4th operand, as in wfmaxxb; $_[5] is undef for a 4-operand call and get_M() encodes undef as 0
+}
+
+sub vfmsb {
+ vfm(@_,2,0);
+}
+sub wfmsb {
+ vfm(@_,2,8);
+}
+sub wfmxb {
+ vfm(@_,4,8);
+}
+
+sub vfmasb {
+ vfma(@_,0,2);
+}
+sub wfmasb {
+ vfma(@_,8,2);
+}
+sub wfmaxb {
+ vfma(@_,8,4);
+}
+
+sub vfmssb {
+ vfms(@_,0,2);
+}
+sub wfmssb {
+ vfms(@_,8,2);
+}
+sub wfmsxb {
+ vfms(@_,8,4);
+}
+
+sub vfnma {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRe(0xe79f,@_);
+}
+sub vfnmasb {
+ vfnma(@_,0,2);
+}
+sub vfnmadb {
+ vfnma(@_,0,3);
+}
+sub wfnmasb {
+ vfnma(@_,8,2);
+}
+sub wfnmadb {
+ vfnma(@_,8,3);
+}
+sub wfnmaxb {
+ vfnma(@_,8,4);
+}
+
+sub vfnms {
+ confess(err("ARGNUM")) if ($#_!=5);
+ VRRe(0xe79e,@_);
+}
+sub vfnmssb {
+ vfnms(@_,0,2);
+}
+sub vfnmsdb {
+ vfnms(@_,0,3);
+}
+sub wfnmssb {
+ vfnms(@_,8,2);
+}
+sub wfnmsdb {
+ vfnms(@_,8,3);
+}
+sub wfnmsxb {
+ vfnms(@_,8,4);
+}
+
+sub vfpsosb {
+ vfpso(@_[0..1],2,0,$_[2]);
+}
+sub wfpsosb {
+ vfpso(@_[0..1],2,8,$_[2]);
+}
+sub vflcsb {
+ vfpso(@_,2,0,0);
+}
+sub wflcsb {
+ vfpso(@_,2,8,0);
+}
+sub vflnsb {
+ vfpso(@_,2,0,1);
+}
+sub wflnsb {
+ vfpso(@_,2,8,1);
+}
+sub vflpsb {
+ vfpso(@_,2,0,2);
+}
+sub wflpsb {
+ vfpso(@_,2,8,2);
+}
+sub vfpsoxb {
+ vfpso(@_[0..1],4,0,$_[2]);
+}
+sub wfpsoxb {
+ vfpso(@_[0..1],4,8,$_[2]);
+}
+sub vflcxb {
+ vfpso(@_,4,0,0);
+}
+sub wflcxb {
+ vfpso(@_,4,8,0);
+}
+sub vflnxb {
+ vfpso(@_,4,0,1);
+}
+sub wflnxb {
+ vfpso(@_,4,8,1);
+}
+sub vflpxb {
+ vfpso(@_,4,0,2);
+}
+sub wflpxb {
+ vfpso(@_,4,8,2);
+}
+
+sub vfsqsb {
+ vfsq(@_,2,0);
+}
+sub wfsqsb {
+ vfsq(@_,2,8);
+}
+sub wfsqxb {
+ vfsq(@_,4,8);
+}
+
+sub vfssb {
+ vfs(@_,2,0);
+}
+sub wfssb {
+ vfs(@_,2,8);
+}
+sub wfsxb {
+ vfs(@_,4,8);
+}
+
+sub vftcisb {
+ vftci(@_,2,0);
+}
+sub wftcisb {
+ vftci(@_,2,8);
+}
+sub wftcixb {
+ vftci(@_,4,8);
+}
+
+# VXD - Support Instructions
+
+sub vlrlr {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRSd(0xe637,@_);
+}
+
+sub vlrl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VSI(0xe635,@_);
+}
+
+sub vstrlr {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRSd(0xe63f,@_);
+}
+
+sub vstrl {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VSI(0xe63d,@_);
+}
+
+sub vap {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe671,@_);
+}
+
+sub vcp {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRh(0xe677,@_);
+}
+
+sub vcvb {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRi(0xe650,@_);
+}
+
+sub vcvbg {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRRi(0xe652,@_);
+}
+
+sub vcvd {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRIi(0xe658,@_);
+}
+
+sub vcvdg {
+ confess(err("ARGNUM")) if ($#_!=3);
+ VRIi(0xe65a,@_);
+}
+
+sub vdp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe67a,@_);
+}
+
+sub vlip {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VRIh(0xe649,@_);
+}
+
+sub vmp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe678,@_);
+}
+
+sub vmsp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe679,@_);
+}
+
+sub vpkz {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VSI(0xe634,@_);
+}
+
+sub vpsop {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIg(0xe65b,@_);
+}
+
+sub vrp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe67b,@_);
+}
+
+sub vsdp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe67e,@_);
+}
+
+sub vsrp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIg(0xe659,@_);
+}
+
+sub vsp {
+ confess(err("ARGNUM")) if ($#_!=4);
+ VRIf(0xe673,@_);
+}
+
+sub vtp {
+ confess(err("ARGNUM")) if ($#_!=0);
+ VRRg(0xe65f,@_);
+}
+
+sub vupkz {
+ confess(err("ARGNUM")) if ($#_!=2);
+ VSI(0xe63c,@_);
+}
+
+#
+# Instruction Formats
+#
+
+sub RRE {	# append one 32-bit RRE-format word to $out; args: opcode[,r1[,r2]]
+ confess(err("ARGNUM")) if ($#_<0||2<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$r1,$r2)=(shift,get_R(shift),get_R(shift));	# absent operands arrive as undef, which get_R maps to 0
+
+ $out.="\t.long\t".sprintf("%#010x",($opcode<<16|$r1<<4|$r2));	# opcode in the upper halfword, r1 and r2 in the low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub RRFb {	# append one 32-bit RRFb-format word to $out; args: opcode,r1,r3,r2[,m4]
+ confess(err("ARGNUM")) if ($#_<3||4<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$r1,$r3,$r2,$m4)=(shift,get_R(shift),get_R(shift)
+ ,get_R(shift),get_M(shift));	# operands are consumed in the order r1,r3,r2,m4
+
+ $out.="\t.long\t"
+ .sprintf("%#010x",($opcode<<16|$r3<<12|$m4<<8|$r1<<4|$r2));	# field order inside the word: opcode,r3,m4,r1,r2
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub S {	# append one 32-bit S-format word to $out; args: opcode[,d2(b2)]
+ confess(err("ARGNUM")) if ($#_<0||1<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$d2,$b2)=(shift,get_DB(shift));	# get_DB returns (displacement,base)
+
+ $out.="\t.long\t".sprintf("%#010x",($opcode<<16|$b2<<12|$d2));	# opcode, then base<<12 | displacement in the low halfword
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIa {	# append a VRIa-format instruction to $out as three 16-bit words; args: opcode,v1,i2[,m3]
+ confess(err("ARGNUM")) if ($#_<2||3<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$i2,$m3)=(shift,get_V(shift),get_I(shift,16),
+ get_M(shift));
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";	# opcode high byte, low 4 bits of v1
+ $out.=sprintf("%#06x",$i2).",";	# 16-bit immediate
+ $out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff));	# m3, RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIb {	# append a VRIb-format instruction to $out as three 16-bit words; args: opcode,v1,i2,i3,m4
+ confess(err("ARGNUM")) if ($#_!=4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$i2,$i3,$m4)=(shift,get_V(shift),get_I(shift,8),
+ ,get_I(shift,8),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";	# opcode high byte, low 4 bits of v1
+ $out.=sprintf("%#06x",($i2<<8|$i3)).",";	# two 8-bit immediates, i2 in the high byte
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff));	# m4, RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIc {	# append a VRIc-format instruction to $out as three 16-bit words; args: opcode,v1,v3,i2,m4
+ confess(err("ARGNUM")) if ($#_!=4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v3,$i2,$m4)=(shift,get_V(shift),get_V(shift),
+ ,get_I(shift,16),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v3&0xf)).",";	# opcode high byte, low 4 bits of v1, low 4 bits of v3
+ $out.=sprintf("%#06x",$i2).",";	# 16-bit immediate
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff));	# m4, RXB nibble for v1/v3, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRId {	# append a VRId-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3,i4[,m5]
+ confess(err("ARGNUM")) if ($#_<4||$#_>5);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
+ ,get_V(shift),get_I(shift,8),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$i4)).",";	# low 4 bits of v3 on top, 8-bit immediate in the low byte
+ $out.=sprintf("%#06x",($m5<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));	# m5, RXB nibble for v1/v2/v3, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIe {	# append a VRIe-format instruction to $out as three 16-bit words; args: opcode,v1,v2,i3,m4,m5
+ confess(err("ARGNUM")) if ($#_!=5);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$i3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
+ ,get_I(shift,12),get_M(shift),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",($i3<<4|$m5)).",";	# 12-bit immediate above m5
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2)<<8|$opcode&0xff));	# m4, RXB nibble for v1/v2, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIf {	# append a VRIf-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3,i4,m5
+ confess(err("ARGNUM")) if ($#_!=5);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
+ ,get_V(shift),get_I(shift,8),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)|$i4>>4).",";	# low 4 bits of v3, m5, upper nibble of the 8-bit i4
+ $out.=sprintf("%#06x",(($i4&0xf)<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));	# lower nibble of i4, RXB nibble, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIg {	# append a VRIg-format instruction to $out as three 16-bit words; args: opcode,v1,v2,i3,i4,m5
+ confess(err("ARGNUM")) if ($#_!=5);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$i3,$i4,$m5)=(shift,get_V(shift),get_V(shift),
+ ,get_I(shift,8),get_I(shift,8),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|($v2&0xf)).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",($i4<<8|$m5<<4|$i3>>4)).",";	# i4 in the high byte, then m5, then upper nibble of i3
+ $out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1,$v2)<<8|$opcode&0xff));	# lower nibble of i3, RXB nibble, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIh {	# append a VRIh-format instruction to $out as three 16-bit words; args: opcode,v1,i2,i3
+ confess(err("ARGNUM")) if ($#_!=3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$i2,$i3)=(shift,get_V(shift),get_I(shift,16),
+ get_I(shift,4));
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)).",";	# opcode high byte, low 4 bits of v1
+ $out.=sprintf("%#06x",$i2).",";	# 16-bit immediate
+ $out.=sprintf("%#06x",($i3<<12|RXB($v1)<<8|$opcode&0xff));	# 4-bit i3, RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRIi {	# append a VRIi-format instruction to $out as three 16-bit words; args: opcode,v1,r2,i3,m4
+ confess(err("ARGNUM")) if ($#_!=4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$r2,$i3,$m4)=(shift,get_V(shift),get_R(shift),
+ ,get_I(shift,8),get_M(shift));	# the doubled comma is an empty list element and contributes nothing
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4)|$r2).",";	# opcode high byte, low 4 bits of v1, r2
+ $out.=sprintf("%#06x",($m4<<4|$i3>>4)).",";	# m4 above the upper nibble of the 8-bit i3
+ $out.=sprintf("%#06x",(($i3&0xf)<<12|RXB($v1)<<8|$opcode&0xff));	# lower nibble of i3, RXB nibble, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRa {	# append a VRRa-format instruction to $out as three 16-bit words; args: opcode,v1,v2[,m3[,m4[,m5]]]
+ confess(err("ARGNUM")) if ($#_<2||5<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$m3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
+ get_M(shift),get_M(shift),get_M(shift));	# omitted masks arrive as undef, which get_M maps to 0
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",($m5<<4|$m4)).",";	# m5 above m4 in the low byte
+ $out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff));	# m3, RXB nibble for v1/v2, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRb {	# append a VRRb-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3[,m4[,m5]]
+ confess(err("ARGNUM")) if ($#_<3||5<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$m4,$m5)=(shift,get_V(shift),get_V(shift),
+ get_V(shift),get_M(shift),get_M(shift));	# omitted masks arrive as undef, which get_M maps to 0
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<4)).",";	# low 4 bits of v3 on top, m5 in bits 4..7
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));	# m4, RXB nibble for v1/v2/v3, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRc {	# append a VRRc-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3[,m4[,m5[,m6]]]
+ confess(err("ARGNUM")) if ($#_<3||6<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$m4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
+ get_V(shift),get_M(shift),get_M(shift),get_M(shift));	# omitted or undef masks are mapped to 0 by get_M
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<4|$m5)).",";	# low 4 bits of v3 on top, m6 above m5 in the low byte
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v2,$v3)<<8|$opcode&0xff));	# m4, RXB nibble for v1/v2/v3, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRd {	# append a VRRd-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3,v4[,m5[,m6]]
+ confess(err("ARGNUM")) if ($#_<4||6<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
+ get_V(shift),get_V(shift),get_M(shift),get_M(shift));	# omitted masks arrive as undef, which get_M maps to 0
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$m5<<8|$m6<<4)).",";	# low 4 bits of v3, then m5, then m6; low nibble stays 0
+ $out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff));	# low 4 bits of v4, RXB nibble for all four registers, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRe {	# append a VRRe-format instruction to $out as three 16-bit words; args: opcode,v1,v2,v3,v4[,m5[,m6]]
+ confess(err("ARGNUM")) if ($#_<4||6<$#_);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$v3,$v4,$m5,$m6)=(shift,get_V(shift),get_V(shift),
+ get_V(shift),get_V(shift),get_M(shift),get_M(shift));	# omitted masks arrive as undef, which get_M maps to 0
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and v2
+ $out.=sprintf("%#06x",(($v3&0xf)<<12|$m6<<8|$m5)).",";	# low 4 bits of v3, then m6, with m5 in the low nibble (differs from VRRd)
+ $out.=sprintf("%#06x",(($v4&0xf)<<12|RXB($v1,$v2,$v3,$v4)<<8|$opcode&0xff));	# low 4 bits of v4, RXB nibble for all four registers, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRf {	# append a VRRf-format instruction to $out as three 16-bit words; args: opcode,v1,r2,r3
+ confess(err("ARGNUM")) if ($#_!=3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$r2,$r3)=(shift,get_V(shift),get_R(shift),
+ get_R(shift));
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r2)).",";	# opcode high byte, low 4 bits of v1, r2
+ $out.=sprintf("%#06x",($r3<<12)).",";	# r3 in the top nibble, rest 0
+ $out.=sprintf("%#06x",(RXB($v1)<<8|$opcode&0xff));	# RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRg {	# append a VRRg-format instruction to $out as three 16-bit words; args: opcode,v1
+ confess(err("ARGNUM")) if ($#_!=1);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1)=(shift,get_V(shift));
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).",";	# opcode high byte; v1 sits in the low nibble, not bits 4..7
+ $out.=sprintf("%#06x",0x0000).",";	# middle halfword is all zero
+ $out.=sprintf("%#06x",(RXB(0,$v1)<<8|$opcode&0xff));	# v1 is passed as the second RXB slot (0x04); opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRh {	# append a VRRh-format instruction to $out as three 16-bit words; args: opcode,v1,v2[,m3]
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v2,$m3)=(shift,get_V(shift),get_V(shift),
+ get_M(shift));	# an omitted m3 arrives as undef, which get_M maps to 0
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf))).",";	# opcode high byte; v1 sits in the low nibble
+ $out.=sprintf("%#06x",(($v2&0xf)<<12|$m3<<4)).",";	# low 4 bits of v2 on top, m3 in bits 4..7
+ $out.=sprintf("%#06x",(RXB(0,$v1,$v2)<<8|$opcode&0xff));	# v1/v2 use the second and third RXB slots; opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRRi {	# append a VRRi-format instruction to $out as three 16-bit words; args: opcode,r1,v2,m3
+	confess(err("ARGNUM")) if ($#_!=3);
+	my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+	my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+	$memn=~s/^.*:://;	# drop the package prefix
+	my ($opcode,$r1,$v2,$m3)=(shift,get_R(shift),get_V(shift),
+	    get_M(shift));
+
+	$out.="\t.word\t";
+	$out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v2&0xf))).",";	# opcode high byte, r1, low 4 bits of v2
+	$out.=sprintf("%#06x",($m3<<4)).",";	# m3 in bits 4..7; separator is a plain "," (was "\,", an unrecognized escape)
+	$out.=sprintf("%#06x",(RXB(0,$v2)<<8|$opcode&0xff));	# v2 uses the second RXB slot; opcode low byte
+	$out.="\t# $memn\t$ops\n"
+}
+
+sub VRSa {	# append a VRSa-format instruction to $out as three 16-bit words; args: opcode,v1,v3,d2(b2)[,m4]
+ confess(err("ARGNUM")) if ($#_<3||$#_>4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$v3,$d2,$b2,$m4)=(shift,get_V(shift),get_V(shift),
+ get_DB(shift),get_M(shift));	# get_DB contributes two list elements: displacement, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v3&0xf))).",";	# opcode high byte, low 4 bits of v1 and v3
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1,$v3)<<8|$opcode&0xff));	# m4, RXB nibble for v1/v3, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRSb {	# append a VRSb-format instruction to $out as three 16-bit words; args: opcode,v1,r3,d2(b2)[,m4]
+ confess(err("ARGNUM")) if ($#_<3||$#_>4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$r3,$d2,$b2,$m4)=(shift,get_V(shift),get_R(shift),
+ get_DB(shift),get_M(shift));	# get_DB contributes two list elements: displacement, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|$r3)).",";	# opcode high byte, low 4 bits of v1, r3
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",($m4<<12|RXB($v1)<<8|$opcode&0xff));	# m4, RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRSc {	# append a VRSc-format instruction to $out as three 16-bit words; args: opcode,r1,v3,d2(b2),m4
+ confess(err("ARGNUM")) if ($#_!=4);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$r1,$v3,$d2,$b2,$m4)=(shift,get_R(shift),get_V(shift),
+ get_DB(shift),get_M(shift));	# get_DB contributes two list elements: displacement, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|$r1<<4|($v3&0xf))).",";	# opcode high byte, r1, low 4 bits of v3
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",($m4<<12|RXB(0,$v3)<<8|$opcode&0xff));	# m4; v3 uses the second RXB slot; opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRSd {	# append a VRSd-format instruction to $out as three 16-bit words; args: opcode,v1,r3,d2(b2)
+ confess(err("ARGNUM")) if ($#_!=3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$r3,$d2,$b2)=(shift,get_V(shift),get_R(shift),
+ get_DB(shift));	# get_DB contributes two list elements: displacement, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|$r3)).",";	# opcode high byte, r3 in the low nibble
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff));	# v1 lives in the last word and uses the fourth RXB slot (0x01)
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRV {	# append a VRV-format instruction to $out as three 16-bit words; args: opcode,v1,d2(v2,b2)[,m3]
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$d2,$v2,$b2,$m3)=(shift,get_V(shift),get_DVB(shift),
+ get_M(shift));	# get_DVB contributes three list elements: displacement, vector index, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($v2&0xf))).",";	# opcode high byte, low 4 bits of v1 and of the index register v2
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",($m3<<12|RXB($v1,$v2)<<8|$opcode&0xff));	# m3, RXB nibble for v1/v2, opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VRX {	# append a VRX-format instruction to $out as three 16-bit words; args: opcode,v1,d2(x2,b2)[,m3]
+ confess(err("ARGNUM")) if ($#_<2||$#_>3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$d2,$x2,$b2,$m3)=(shift,get_V(shift),get_DXB(shift),
+ get_M(shift));	# get_DXB contributes three list elements: displacement, index, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|($v1&0xf)<<4|($x2))).",";	# opcode high byte, low 4 bits of v1, index register x2
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",($m3<<12|RXB($v1)<<8|$opcode&0xff));	# m3, RXB nibble (bit 0x10 of v1), opcode low byte
+ $out.="\t# $memn\t$ops\n"
+}
+
+sub VSI {	# append a VSI-format instruction to $out as three 16-bit words; args: opcode,v1,d2(b2),i3
+ confess(err("ARGNUM")) if ($#_!=3);
+ my $ops=join(',',@_[1..$#_]);	# operand text echoed in the trailing assembler comment
+ my $memn=(caller(1))[3];	# qualified name of the sub that called this one
+ $memn=~s/^.*:://;	# drop the package prefix
+ my ($opcode,$v1,$d2,$b2,$i3)=(shift,get_V(shift),get_DB(shift),
+ get_I(shift,8));	# get_DB contributes two list elements: displacement, base
+
+ $out.="\t.word\t";
+ $out.=sprintf("%#06x",($opcode&0xff00|$i3)).",";	# opcode high byte, 8-bit immediate in the low byte
+ $out.=sprintf("%#06x",($b2<<12|$d2)).",";	# base register above the 12-bit displacement
+ $out.=sprintf("%#06x",(($v1&0xf)<<12|RXB(0,0,0,$v1)<<8|$opcode&0xff));	# v1 lives in the last word and uses the fourth RXB slot (0x01)
+ $out.="\t# $memn\t$ops\n"
+}
+
+#
+# Internal
+#
+
+sub get_R {
+	# Parse a general register operand; undef stands for register 0.
+	confess(err("ARGNUM")) unless (@_==1);
+	my ($operand)=@_;
+	my $num;
+
+	if (!defined($operand)) {
+		$num=0;
+	} elsif ($operand=~/^$GR$/) {
+		$num=$1;
+	} else {
+		confess(err("PARSE"));
+	}
+	confess(err("ARGRANGE")) if ($num&~0xf);
+
+	return $num;
+}
+
+sub get_V {
+	# Parse a vector register operand; undef stands for register 0.
+	confess(err("ARGNUM")) unless (@_==1);
+	my ($operand)=@_;
+	my $num;
+
+	if (!defined($operand)) {
+		$num=0;
+	} elsif ($operand=~/^$VR$/) {
+		$num=$1;
+	} else {
+		confess(err("PARSE"));
+	}
+	confess(err("ARGRANGE")) if ($num&~0x1f);
+
+	return $num;
+}
+
+sub get_I {
+	# Parse a $bits-wide immediate: undef is 0, else eval, range-check, mask.
+	confess(err("ARGNUM")) unless (@_==2);
+	my ($imm,$bits)=@_;
+	my $limit=2**$bits-1;
+	$imm=defined($imm)?eval($imm):0;
+	confess(err("PARSE")) unless (defined($imm));
+	confess(err("ARGRANGE")) if (abs($imm)&~$limit);
+	return $imm&$limit;
+}
+
+sub get_M {
+	# Parse a 4-bit mask operand: undef means 0, anything else is eval'ed.
+	confess(err("ARGNUM")) unless (@_==1);
+	my ($arg)=@_;
+	my $mask=0;
+	$mask=eval($arg) if (defined($arg));
+	confess(err("PARSE")) unless (defined($mask));
+	confess(err("ARGRANGE")) if ($mask&~0xf);
+	return $mask;
+}
+
+sub get_DB	# parse a "disp(base)" operand and return the list (d,b)
+{
+ confess(err("ARGNUM")) if ($#_!=0);
+ my ($d,$b);
+
+ for (shift) {	# one-pass loop that aliases the operand to $_
+ if (!defined) {
+ ($d,$b)=(0,0);	# missing operand: both fields are 0
+ } elsif (/^(.+)\($GR\)$/) {	# "expr(reg)"; presumably $GR holds one capture group so the register is $2 - confirm at its definition
+ ($d,$b)=(eval($1),$2);	# the displacement text is eval'ed as a Perl expression
+ confess(err("PARSE")) if (!defined($d));
+ } elsif (/^(.+)$/) {	# bare expression: base register 0
+ ($d,$b)=(eval($1),0);
+ confess(err("PARSE")) if (!defined($d));
+ } else {
+ confess(err("PARSE"));
+ }
+ }
+ confess(err("ARGRANGE")) if ($d&~0xfff||$b&~0xf);	# d must fit in 12 bits, b in 4 bits
+
+ return ($d,$b);
+}
+
+sub get_DVB	# parse a "disp(vreg,base)" operand and return the list (d,v,b)
+{
+ confess(err("ARGNUM")) if ($#_!=0);
+ my ($d,$v,$b);
+
+ for (shift) {	# one-pass loop that aliases the operand to $_
+ if (!defined) {
+ ($d,$v,$b)=(0,0,0);	# missing operand: all three fields are 0
+ } elsif (/^(.+)\($VR,$GR\)$/) {	# "expr(vreg,reg)"; presumably $VR and $GR hold one capture group each ($2,$3) - confirm at their definitions
+ ($d,$v,$b)=(eval($1),$2,$3);	# the displacement text is eval'ed as a Perl expression
+ confess(err("PARSE")) if (!defined($d));
+ } elsif (/^(.+)\($GR\)$/) {	# "expr(reg)": no vector index
+ ($d,$v,$b)=(eval($1),0,$2);
+ confess(err("PARSE")) if (!defined($d));
+ } elsif (/^(.+)$/) {	# bare expression: index and base are 0
+ ($d,$v,$b)=(eval($1),0,0);
+ confess(err("PARSE")) if (!defined($d));
+ } else {
+ confess(err("PARSE"));
+ }
+ }
+ confess(err("ARGRANGE")) if ($d&~0xfff||$v&~0x1f||$b&~0xf);	# d: 12 bits, v: 5 bits, b: 4 bits
+
+ return ($d,$v,$b);
+}
+
+sub get_DXB	# parse a "disp(index,base)" operand and return the list (d,x,b)
+{
+ confess(err("ARGNUM")) if ($#_!=0);
+ my ($d,$x,$b);
+
+ for (shift) {	# one-pass loop that aliases the operand to $_
+ if (!defined) {
+ ($d,$x,$b)=(0,0,0);	# missing operand: all three fields are 0
+ } elsif (/^(.+)\($GR,$GR\)$/) {	# "expr(reg,reg)"; presumably $GR holds one capture group, giving $2 and $3 - confirm at its definition
+ ($d,$x,$b)=(eval($1),$2,$3);	# the displacement text is eval'ed as a Perl expression
+ confess(err("PARSE")) if (!defined($d));
+ } elsif (/^(.+)\($GR\)$/) {	# "expr(reg)": the single register is the base, index is 0
+ ($d,$x,$b)=(eval($1),0,$2);
+ confess(err("PARSE")) if (!defined($d));
+ } elsif (/^(.+)$/) {	# bare expression: index and base are 0
+ ($d,$x,$b)=(eval($1),0,0);
+ confess(err("PARSE")) if (!defined($d));
+ } else {
+ confess(err("PARSE"));
+ }
+ }
+ confess(err("ARGRANGE")) if ($d&~0xfff||$x&~0xf||$b&~0xf);	# d: 12 bits, x and b: 4 bits each
+
+ return ($d,$x,$b);
+}
+
+sub RXB
+{
+	# One result bit per operand slot (0x08,0x04,0x02,0x01 for slots 0..3),
+	# set when that slot holds a defined register number with bit 0x10 set.
+	confess(err("ARGNUM")) if (@_<1||@_>4);
+	my $field=0;
+
+	for my $slot (0..$#_) {
+		$field|=0x08>>$slot if (defined($_[$slot])&&($_[$slot]&0x10));
+	}
+	return $field;
+}
+
+sub err {
+	# Map an error tag to its message text; an unknown tag yields undef.
+	my ($tag)=@_;
+	my %text_for=(
+		ARGNUM	=> 'Wrong number of arguments',
+		ARGRANGE=> 'Argument out of range',
+		PARSE	=> 'Parse error',
+	);
+	confess($text_for{ARGNUM}) unless (@_==1);
+	return $text_for{$tag};
+}
+
+1;
diff -up openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl
--- openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl.s390x-update 2020-03-19 16:20:22.041227359 +0100
+++ openssl-1.1.1e/crypto/poly1305/asm/poly1305-s390x.pl 2020-03-19 16:23:22.364098257 +0100
@@ -24,204 +24,961 @@
#
# On side note, z13 enables vector base 2^26 implementation...
-$flavour = shift;
+#
+# January 2019
+#
+# Add vx code path (base 2^26).
+#
+# Copyright IBM Corp. 2019
+# Author: Patrick Steuer <patrick.steuer@de.ibm.com>
+
+#
+# January 2019
+#
+# Add vector base 2^26 implementation. It's problematic to accurately
+# measure performance, because reference system is hardly idle. But
+# it's sub-cycle, i.e. less than 1 cycle per processed byte, and it's
+# >=20% faster than IBM's submission on long inputs, and much faster on
+# short ones, because calculation of key powers is postponed till we
+# know that input is long enough to justify the additional overhead.
+
+use strict;
+use FindBin qw($Bin);
+use lib "$Bin/../..";
+use perlasm::s390x qw(:DEFAULT :VX AUTOLOAD LABEL INCLUDE);
+
+my $flavour = shift;
+my ($z,$SIZE_T);
if ($flavour =~ /3[12]/) {
+ $z=0; # S/390 ABI
$SIZE_T=4;
- $g="";
} else {
+ $z=1; # zSeries ABI
$SIZE_T=8;
- $g="g";
}
+my $output;
while (($output=shift) && ($output!~/\w[\w\-]*\.\w+$/)) {}
-open STDOUT,">$output";
-$sp="%r15";
+my $stdframe=16*$SIZE_T+4*8;
+my $sp="%r15";
my ($ctx,$inp,$len,$padbit) = map("%r$_",(2..5));
-$code.=<<___;
-.text
+PERLASM_BEGIN($output);
-.globl poly1305_init
-.type poly1305_init,\@function
-.align 16
-poly1305_init:
- lghi %r0,0
- lghi %r1,-1
- stg %r0,0($ctx) # zero hash value
- stg %r0,8($ctx)
- stg %r0,16($ctx)
-
- cl${g}r $inp,%r0
- je .Lno_key
-
- lrvg %r4,0($inp) # load little-endian key
- lrvg %r5,8($inp)
-
- nihl %r1,0xffc0 # 0xffffffc0ffffffff
- srlg %r0,%r1,4 # 0x0ffffffc0fffffff
- srlg %r1,%r1,4
- nill %r1,0xfffc # 0x0ffffffc0ffffffc
-
- ngr %r4,%r0
- ngr %r5,%r1
-
- stg %r4,32($ctx)
- stg %r5,40($ctx)
-
-.Lno_key:
- lghi %r2,0
- br %r14
-.size poly1305_init,.-poly1305_init
-___
+INCLUDE ("s390x_arch.h");
+TEXT ();
+
+################
+# static void poly1305_init(void *ctx, const unsigned char key[16])
+{
+GLOBL ("poly1305_init");
+TYPE ("poly1305_init","\@function");
+ALIGN (16);
+LABEL ("poly1305_init");
+ lghi ("%r0",0);
+ lghi ("%r1",-1);
+ stg ("%r0","0($ctx)"); # zero hash value
+ stg ("%r0","8($ctx)");
+ stg ("%r0","16($ctx)");
+ st ("%r0","24($ctx)"); # clear is_base2_26
+ lgr ("%r5",$ctx); # reassign $ctx
+ lghi ("%r2",0);
+
+&{$z? \&clgr:\&clr} ($inp,"%r0");
+ je (".Lno_key");
+
+ lrvg ("%r2","0($inp)"); # load little-endian key
+ lrvg ("%r3","8($inp)");
+
+ nihl ("%r1",0xffc0); # 0xffffffc0ffffffff
+ srlg ("%r0","%r1",4); # 0x0ffffffc0fffffff
+ srlg ("%r1","%r1",4);
+ nill ("%r1",0xfffc); # 0x0ffffffc0ffffffc
+
+ ngr ("%r2","%r0");
+ ngr ("%r3","%r1");
+
+ stmg ("%r2","%r3","32(%r5)");
+
+ larl ("%r1","OPENSSL_s390xcap_P");
+ lg ("%r0","16(%r1)");
+ srlg ("%r0","%r0",62);
+ nill ("%r0",1); # extract vx bit
+ lcgr ("%r0","%r0");
+ larl ("%r1",".Lpoly1305_blocks");
+ larl ("%r2",".Lpoly1305_blocks_vx");
+ larl ("%r3",".Lpoly1305_emit");
+&{$z? \&xgr:\&xr} ("%r2","%r1"); # select between scalar and vector
+&{$z? \&ngr:\&nr} ("%r2","%r0");
+&{$z? \&xgr:\&xr} ("%r2","%r1");
+&{$z? \&stmg:\&stm} ("%r2","%r3","0(%r4)");
+ lghi ("%r2",1);
+LABEL (".Lno_key");
+ br ("%r14");
+SIZE ("poly1305_init",".-poly1305_init");
+}
+
+################
+# static void poly1305_blocks(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
{
my ($d0hi,$d0lo,$d1hi,$d1lo,$t0,$h0,$t1,$h1,$h2) = map("%r$_",(6..14));
my ($r0,$r1,$s1) = map("%r$_",(0..2));
-$code.=<<___;
-.globl poly1305_blocks
-.type poly1305_blocks,\@function
-.align 16
-poly1305_blocks:
- srl${g} $len,4 # fixed-up in 64-bit build
- lghi %r0,0
- cl${g}r $len,%r0
- je .Lno_data
-
- stm${g} %r6,%r14,`6*$SIZE_T`($sp)
-
- llgfr $padbit,$padbit # clear upper half, much needed with
- # non-64-bit ABI
- lg $r0,32($ctx) # load key
- lg $r1,40($ctx)
-
- lg $h0,0($ctx) # load hash value
- lg $h1,8($ctx)
- lg $h2,16($ctx)
-
- st$g $ctx,`2*$SIZE_T`($sp) # off-load $ctx
- srlg $s1,$r1,2
- algr $s1,$r1 # s1 = r1 + r1>>2
- j .Loop
-
-.align 16
-.Loop:
- lrvg $d0lo,0($inp) # load little-endian input
- lrvg $d1lo,8($inp)
- la $inp,16($inp)
-
- algr $d0lo,$h0 # accumulate input
- alcgr $d1lo,$h1
-
- lgr $h0,$d0lo
- mlgr $d0hi,$r0 # h0*r0 -> $d0hi:$d0lo
- lgr $h1,$d1lo
- mlgr $d1hi,$s1 # h1*5*r1 -> $d1hi:$d1lo
-
- mlgr $t0,$r1 # h0*r1 -> $t0:$h0
- mlgr $t1,$r0 # h1*r0 -> $t1:$h1
- alcgr $h2,$padbit
-
- algr $d0lo,$d1lo
- lgr $d1lo,$h2
- alcgr $d0hi,$d1hi
- lghi $d1hi,0
-
- algr $h1,$h0
- alcgr $t1,$t0
-
- msgr $d1lo,$s1 # h2*s1
- msgr $h2,$r0 # h2*r0
-
- algr $h1,$d1lo
- alcgr $t1,$d1hi # $d1hi is zero
-
- algr $h1,$d0hi
- alcgr $h2,$t1
-
- lghi $h0,-4 # final reduction step
- ngr $h0,$h2
- srlg $t0,$h2,2
- algr $h0,$t0
- lghi $t1,3
- ngr $h2,$t1
-
- algr $h0,$d0lo
- alcgr $h1,$d1hi # $d1hi is still zero
- alcgr $h2,$d1hi # $d1hi is still zero
-
- brct$g $len,.Loop
-
- l$g $ctx,`2*$SIZE_T`($sp) # restore $ctx
-
- stg $h0,0($ctx) # store hash value
- stg $h1,8($ctx)
- stg $h2,16($ctx)
-
- lm${g} %r6,%r14,`6*$SIZE_T`($sp)
-.Lno_data:
- br %r14
-.size poly1305_blocks,.-poly1305_blocks
-___
+GLOBL ("poly1305_blocks");
+TYPE ("poly1305_blocks","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks");
+LABEL (".Lpoly1305_blocks");
+&{$z? \&ltgr:\&ltr} ("%r0",$len);
+ jz (".Lno_data");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ lg ($h0,"0($ctx)"); # load hash value
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+LABEL (".Lpoly1305_blocks_entry");
+if ($z) {
+ srlg ($len,$len,4);
+} else {
+ srl ($len,4);
+}
+ llgfr ($padbit,$padbit); # clear upper half, much needed with
+ # non-64-bit ABI
+ lg ($r0,"32($ctx)"); # load key
+ lg ($r1,"40($ctx)");
+
+&{$z? \&stg:\&st} ($ctx,"2*$SIZE_T($sp)"); # off-load $ctx
+ srlg ($s1,$r1,2);
+ algr ($s1,$r1); # s1 = r1 + r1>>2
+ j (".Loop");
+
+ALIGN (16);
+LABEL (".Loop");
+ lrvg ($d0lo,"0($inp)"); # load little-endian input
+ lrvg ($d1lo,"8($inp)");
+ la ($inp,"16($inp)");
+
+ algr ($d0lo,$h0); # accumulate input
+ alcgr ($d1lo,$h1);
+ alcgr ($h2,$padbit);
+
+ lgr ($h0,$d0lo);
+ mlgr ($d0hi,$r0); # h0*r0 -> $d0hi:$d0lo
+ lgr ($h1,$d1lo);
+ mlgr ($d1hi,$s1); # h1*5*r1 -> $d1hi:$d1lo
+
+ mlgr ($t0,$r1); # h0*r1 -> $t0:$h0
+ mlgr ($t1,$r0); # h1*r0 -> $t1:$h1
+
+ algr ($d0lo,$d1lo);
+ lgr ($d1lo,$h2);
+ alcgr ($d0hi,$d1hi);
+ lghi ($d1hi,0);
+
+ algr ($h1,$h0);
+ alcgr ($t1,$t0);
+
+ msgr ($d1lo,$s1); # h2*s1
+ msgr ($h2,$r0); # h2*r0
+
+ algr ($h1,$d1lo);
+ alcgr ($t1,$d1hi); # $d1hi is zero
+
+ algr ($h1,$d0hi);
+ alcgr ($h2,$t1);
+
+ lghi ($h0,-4); # final reduction step
+ ngr ($h0,$h2);
+ srlg ($t0,$h2,2);
+ algr ($h0,$t0);
+ lghi ($t1,3);
+ ngr ($h2,$t1);
+
+ algr ($h0,$d0lo);
+ alcgr ($h1,$d1hi); # $d1hi is still zero
+ alcgr ($h2,$d1hi); # $d1hi is still zero
+
+&{$z? \&brctg:\&brct} ($len,".Loop");
+
+&{$z? \&lg:\&l} ($ctx,"2*$SIZE_T($sp)");# restore $ctx
+
+ stg ($h0,"0($ctx)"); # store hash value
+ stg ($h1,"8($ctx)");
+ stg ($h2,"16($ctx)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r14","6*$SIZE_T($sp)");
+LABEL (".Lno_data");
+ br ("%r14");
+SIZE ("poly1305_blocks",".-poly1305_blocks");
}
+
+################
+# static void poly1305_blocks_vx(void *ctx, const unsigned char *inp,
+# size_t len, u32 padbit)
+{
+my ($H0, $H1, $H2, $H3, $H4) = map("%v$_",(0..4));
+my ($I0, $I1, $I2, $I3, $I4) = map("%v$_",(5..9));
+my ($R0, $R1, $S1, $R2, $S2) = map("%v$_",(10..14));
+my ($R3, $S3, $R4, $S4) = map("%v$_",(15..18));
+my ($ACC0, $ACC1, $ACC2, $ACC3, $ACC4) = map("%v$_",(19..23));
+my ($T1, $T2, $T3, $T4) = map("%v$_",(24..27));
+my ($mask26,$bswaplo,$bswaphi,$bswapmi) = map("%v$_",(28..31));
+
+my ($d2,$d0,$h0,$d1,$h1,$h2)=map("%r$_",(9..14));
+
+TYPE ("poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("poly1305_blocks_vx");
+LABEL (".Lpoly1305_blocks_vx");
+&{$z? \&clgfi:\&clfi} ($len,128);
+ jhe ("__poly1305_blocks_vx");
+
+&{$z? \&stmg:\&stm} ("%r6","%r14","6*$SIZE_T($sp)");
+
+ lg ($d0,"0($ctx)");
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0);
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0");
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0");
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+
+ lhi ("%r0",0);
+ st ("%r0","24($ctx)"); # clear is_base2_26
+
+ j (".Lpoly1305_blocks_entry");
+SIZE ("poly1305_blocks_vx",".-poly1305_blocks_vx");
+
+TYPE ("__poly1305_mul","\@function");
+ALIGN (16);
+LABEL ("__poly1305_mul");
+ vmlof ($ACC0,$H0,$R0);
+ vmlof ($ACC1,$H0,$R1);
+ vmlof ($ACC2,$H0,$R2);
+ vmlof ($ACC3,$H0,$R3);
+ vmlof ($ACC4,$H0,$R4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4);
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+ br ("%r14");
+SIZE ("__poly1305_mul",".-__poly1305_mul");
+
+TYPE ("__poly1305_blocks_vx","\@function");
+ALIGN (16);
+LABEL ("__poly1305_blocks_vx");
+&{$z? \&lgr:\&lr} ("%r0",$sp);
+&{$z? \&stmg:\&stm} ("%r10","%r15","10*$SIZE_T($sp)");
+if (!$z) {
+ std ("%f4","16*$SIZE_T+2*8($sp)");
+ std ("%f6","16*$SIZE_T+3*8($sp)");
+ ahi ($sp,-$stdframe);
+ st ("%r0","0($sp)"); # back-chain
+
+ llgfr ($len,$len); # so that srlg works on $len
+} else {
+ aghi ($sp,"-($stdframe+8*8)");
+ stg ("%r0","0($sp)"); # back-chain
+
+ std ("%f8","$stdframe+0*8($sp)");
+ std ("%f9","$stdframe+1*8($sp)");
+ std ("%f10","$stdframe+2*8($sp)");
+ std ("%f11","$stdframe+3*8($sp)");
+ std ("%f12","$stdframe+4*8($sp)");
+ std ("%f13","$stdframe+5*8($sp)");
+ std ("%f14","$stdframe+6*8($sp)");
+ std ("%f15","$stdframe+7*8($sp)");
+}
+ larl ("%r1",".Lconst");
+ vgmg ($mask26,38,63);
+ vlm ($bswaplo,$bswapmi,"16(%r1)");
+
+ &lt ("%r0","24($ctx)"); # is_base2_26?
+ jnz (".Lskip_init");
+
+ lg ($h0,"32($ctx)"); # load key base 2^64
+ lg ($h1,"40($ctx)");
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($R0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($R1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($R2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($R3,$d0,0);
+ vlvgg ($R4,$d1,0);
+
+ veslg ($S1,$R1,2);
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vlr ($H0,$R0);
+ vlr ($H1,$R1);
+ vlr ($H2,$R2);
+ vlr ($H3,$R3);
+ vlr ($H4,$R4);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+ brasl ("%r14","__poly1305_mul"); # r^1:- * r^1:-
+
+ vpdi ($R0,$H0,$R0,0); # r^2:r^1
+ vpdi ($R1,$H1,$R1,0);
+ vpdi ($R2,$H2,$R2,0);
+ vpdi ($R3,$H3,$R3,0);
+ vpdi ($R4,$H4,$R4,0);
+ vpdi ($H0,$H0,$H0,0); # r^2:r^2
+ vpdi ($H1,$H1,$H1,0);
+ vpdi ($H2,$H2,$H2,0);
+ vpdi ($H3,$H3,$H3,0);
+ vpdi ($H4,$H4,$H4,0);
+ veslg ($S1,$R1,2); # << 2; adding r below gives * 5
+ veslg ($S2,$R2,2);
+ veslg ($S3,$R3,2);
+ veslg ($S4,$R4,2);
+ vag ($S1,$S1,$R1); # * 5
+ vag ($S2,$S2,$R2);
+ vag ($S3,$S3,$R3);
+ vag ($S4,$S4,$R4);
+
+ brasl ("%r14","__poly1305_mul"); # r^2:r^2 * r^2:r^1
+
+ vl ($I0,"0(%r1)"); # borrow $I0
+ vperm ($R0,$R0,$H0,$I0); # r^2:r^4:r^1:r^3
+ vperm ($R1,$R1,$H1,$I0);
+ vperm ($R2,$R2,$H2,$I0);
+ vperm ($R3,$R3,$H3,$I0);
+ vperm ($R4,$R4,$H4,$I0);
+ veslf ($S1,$R1,2); # word-wise << 2; adding r below gives * 5
+ veslf ($S2,$R2,2);
+ veslf ($S3,$R3,2);
+ veslf ($S4,$R4,2);
+ vaf ($S1,$S1,$R1); # * 5
+ vaf ($S2,$S2,$R2);
+ vaf ($S3,$S3,$R3);
+ vaf ($S4,$S4,$R4);
+
+ lg ($h0,"0($ctx)"); # load hash base 2^64
+ lg ($h1,"8($ctx)");
+ lg ($h2,"16($ctx)");
+
+ vzero ($H0); # clear before the limbs are inserted
+ vzero ($H1);
+ vzero ($H2);
+ vzero ($H3);
+ vzero ($H4);
+
+ risbg ($d0,$h0,38,0x80+63,38); # base 2^64 -> 2^26
+ srlg ($d1,$h0,52);
+ risbg ($h0,$h0,38,0x80+63,0);
+ vlvgg ($H0,$h0,0);
+ risbg ($d1,$h1,38,51,12);
+ vlvgg ($H1,$d0,0);
+ risbg ($d0,$h1,38,63,50);
+ vlvgg ($H2,$d1,0);
+ srlg ($d1,$h1,40);
+ vlvgg ($H3,$d0,0);
+ risbg ($d1,$h2,37,39,24);
+ vlvgg ($H4,$d1,0);
+
+ lhi ("%r0",1);
+ st ("%r0","24($ctx)"); # set is_base2_26
+
+ vstm ($R0,$S4,"48($ctx)"); # save key schedule base 2^26
+
+ vpdi ($R0,$R0,$R0,0); # broadcast r^2:r^4
+ vpdi ($R1,$R1,$R1,0);
+ vpdi ($S1,$S1,$S1,0);
+ vpdi ($R2,$R2,$R2,0);
+ vpdi ($S2,$S2,$S2,0);
+ vpdi ($R3,$R3,$R3,0);
+ vpdi ($S3,$S3,$S3,0);
+ vpdi ($R4,$R4,$R4,0);
+ vpdi ($S4,$S4,$S4,0);
+
+ j (".Loaded_hash"); # registers are set up: skip the reload from ctx
+
+ALIGN (16);
+LABEL (".Lskip_init"); # reached when is_base2_26 is non-zero
+ vllezf ($H0,"0($ctx)"); # load hash base 2^26
+ vllezf ($H1,"4($ctx)");
+ vllezf ($H2,"8($ctx)");
+ vllezf ($H3,"12($ctx)");
+ vllezf ($H4,"16($ctx)");
+
+ vlrepg ($R0,"0x30($ctx)"); # broadcast r^2:r^4
+ vlrepg ($R1,"0x40($ctx)"); # 0x30 = 48($ctx), where the init path stores the key schedule
+ vlrepg ($S1,"0x50($ctx)");
+ vlrepg ($R2,"0x60($ctx)");
+ vlrepg ($S2,"0x70($ctx)");
+ vlrepg ($R3,"0x80($ctx)");
+ vlrepg ($S3,"0x90($ctx)");
+ vlrepg ($R4,"0xa0($ctx)");
+ vlrepg ($S4,"0xb0($ctx)");
+
+LABEL (".Loaded_hash"); # both init paths join here
+ vzero ($I1);
+ vzero ($I3);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load first input block
+ la ($inp,"0x40($inp)"); # advance input by 64 bytes
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63); # back to the mask used by the reduction
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ srlg ("%r0",$len,6); # %r0 = $len / 64
+&{$z? \&aghi:\&ahi} ("%r0",-1); # loop count = 64-byte chunks - 1
+
+ALIGN (16);
+LABEL (".Loop_vx"); # multiply current block, load and split the next one
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H2,$H2,$I2); # H += I, word-wise
+ vaf ($H0,$H0,$I0);
+ vaf ($H3,$H3,$I3);
+ vaf ($H1,$H1,$I1);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vlm ($T1,$T4,"0x00($inp)"); # load next input block
+ la ($inp,"0x40($inp)"); # advance input by 64 bytes
+ vgmg ($mask26,6,31);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vgmf ($I4,5,5); # padbit<<2
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vgmg ($mask26,38,63); # back to the mask used by the reduction
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
+ # and P. Schwabe
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4); # + carry(h4); + 4*carry below makes it * 5
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&brctg:\&brct} ("%r0",".Loop_vx"); # decrement %r0, loop while non-zero
+
+ vlm ($R0,$S4,"48($ctx)"); # load all powers
+
+ lghi ("%r0",0x30);
+&{$z? \&lcgr:\&lcr} ($len,$len);
+&{$z? \&ngr:\&nr} ($len,"%r0"); # $len = (-$len) & 0x30
+&{$z? \&slgr:\&slr} ($inp,$len); # step $inp back by that amount
+
+LABEL (".Last"); # also re-entered once from the tail code with $len = 0
+ vmlef ($ACC0,$I0,$R0);
+ vmlef ($ACC1,$I0,$R1);
+ vmlef ($ACC2,$I0,$R2);
+ vmlef ($ACC3,$I0,$R3);
+ vmlef ($ACC4,$I0,$R4);
+
+ vmalef ($ACC0,$I1,$S4,$ACC0);
+ vmalef ($ACC1,$I1,$R0,$ACC1);
+ vmalef ($ACC2,$I1,$R1,$ACC2);
+ vmalef ($ACC3,$I1,$R2,$ACC3);
+ vmalef ($ACC4,$I1,$R3,$ACC4);
+
+ vaf ($H0,$H0,$I0); # H += I, word-wise
+ vaf ($H1,$H1,$I1);
+ vaf ($H2,$H2,$I2);
+ vaf ($H3,$H3,$I3);
+ vaf ($H4,$H4,$I4);
+
+ vmalef ($ACC0,$I2,$S3,$ACC0);
+ vmalef ($ACC1,$I2,$S4,$ACC1);
+ vmalef ($ACC2,$I2,$R0,$ACC2);
+ vmalef ($ACC3,$I2,$R1,$ACC3);
+ vmalef ($ACC4,$I2,$R2,$ACC4);
+
+ vmalef ($ACC0,$I3,$S2,$ACC0);
+ vmalef ($ACC1,$I3,$S3,$ACC1);
+ vmalef ($ACC2,$I3,$S4,$ACC2);
+ vmalef ($ACC3,$I3,$R0,$ACC3);
+ vmalef ($ACC4,$I3,$R1,$ACC4);
+
+ vmalef ($ACC0,$I4,$S1,$ACC0);
+ vmalef ($ACC1,$I4,$S2,$ACC1);
+ vmalef ($ACC2,$I4,$S3,$ACC2);
+ vmalef ($ACC3,$I4,$S4,$ACC3);
+ vmalef ($ACC4,$I4,$R0,$ACC4);
+
+ vmalof ($ACC0,$H0,$R0,$ACC0);
+ vmalof ($ACC1,$H0,$R1,$ACC1);
+ vmalof ($ACC2,$H0,$R2,$ACC2);
+ vmalof ($ACC3,$H0,$R3,$ACC3);
+ vmalof ($ACC4,$H0,$R4,$ACC4);
+
+ vmalof ($ACC0,$H1,$S4,$ACC0);
+ vmalof ($ACC1,$H1,$R0,$ACC1);
+ vmalof ($ACC2,$H1,$R1,$ACC2);
+ vmalof ($ACC3,$H1,$R2,$ACC3);
+ vmalof ($ACC4,$H1,$R3,$ACC4);
+
+ vmalof ($ACC0,$H2,$S3,$ACC0);
+ vmalof ($ACC1,$H2,$S4,$ACC1);
+ vmalof ($ACC2,$H2,$R0,$ACC2);
+ vmalof ($ACC3,$H2,$R1,$ACC3);
+ vmalof ($ACC4,$H2,$R2,$ACC4);
+
+ vmalof ($ACC0,$H3,$S2,$ACC0);
+ vmalof ($ACC1,$H3,$S3,$ACC1);
+ vmalof ($ACC2,$H3,$S4,$ACC2);
+ vmalof ($ACC3,$H3,$R0,$ACC3);
+ vmalof ($ACC4,$H3,$R1,$ACC4);
+
+ vmalof ($ACC0,$H4,$S1,$ACC0);
+ vmalof ($ACC1,$H4,$S2,$ACC1);
+ vmalof ($ACC2,$H4,$S3,$ACC2);
+ vmalof ($ACC3,$H4,$S4,$ACC3);
+ vmalof ($ACC4,$H4,$R0,$ACC4);
+
+ ################################################################
+ # horizontal addition
+
+ vzero ($H0); # zero operand for vsumqg
+ vsumqg ($ACC0,$ACC0,$H0);
+ vsumqg ($ACC1,$ACC1,$H0);
+ vsumqg ($ACC2,$ACC2,$H0);
+ vsumqg ($ACC3,$ACC3,$H0);
+ vsumqg ($ACC4,$ACC4,$H0);
+
+ ################################################################
+ # lazy reduction
+
+ vesrlg ($H4,$ACC3,26);
+ vesrlg ($H1,$ACC0,26);
+ vn ($H3,$ACC3,$mask26);
+ vn ($H0,$ACC0,$mask26);
+ vag ($H4,$H4,$ACC4); # h3 -> h4
+ vag ($H1,$H1,$ACC1); # h0 -> h1
+
+ vesrlg ($ACC4,$H4,26);
+ vesrlg ($ACC1,$H1,26);
+ vn ($H4,$H4,$mask26);
+ vn ($H1,$H1,$mask26);
+ vag ($H0,$H0,$ACC4); # + carry(h4); + 4*carry below makes it * 5
+ vag ($H2,$ACC2,$ACC1); # h1 -> h2
+
+ veslg ($ACC4,$ACC4,2); # <<2
+ vesrlg ($ACC2,$H2,26);
+ vn ($H2,$H2,$mask26);
+ vag ($H0,$H0,$ACC4); # h4 -> h0
+ vag ($H3,$H3,$ACC2); # h2 -> h3
+
+ vesrlg ($ACC0,$H0,26);
+ vesrlg ($ACC3,$H3,26);
+ vn ($H0,$H0,$mask26);
+ vn ($H3,$H3,$mask26);
+ vag ($H1,$H1,$ACC0); # h0 -> h1
+ vag ($H4,$H4,$ACC3); # h3 -> h4
+
+&{$z? \&clgfi:\&clfi} ($len,0); # any tail left to process?
+ je (".Ldone"); # $len == 0: store the hash and return
+
+ vlm ($T1,$T4,"0x00($inp)"); # load last partial block
+ vgmg ($mask26,6,31);
+ vgmf ($I4,5,5); # padbit<<2
+
+ vperm ($I0,$T3,$T4,$bswaplo);
+ vperm ($I2,$T3,$T4,$bswapmi);
+ vperm ($T3,$T3,$T4,$bswaphi);
+
+ vl ($ACC0,"0x30($len,%r1)"); # borrow $ACC0,1
+ vl ($ACC1,"0x60($len,%r1)"); # both indexed by $len into .Lconst
+
+ verimg ($I1,$I0,$mask26,6); # >>26
+ veslg ($I0,$I0,32);
+ veslg ($I2,$I2,28); # >>4
+ verimg ($I3,$T3,$mask26,18); # >>14
+ verimg ($I4,$T3,$mask26,58); # >>38
+ vn ($I0,$I0,$mask26);
+ vn ($I2,$I2,$mask26);
+ vesrlf ($I4,$I4,2); # >>2
+
+ vgmg ($mask26,38,63); # back to the mask used by the reduction
+ vperm ($T3,$T1,$T2,$bswaplo);
+ vperm ($T4,$T1,$T2,$bswaphi);
+ vperm ($T2,$T1,$T2,$bswapmi);
+
+ verimg ($I0,$T3,$mask26,0);
+ verimg ($I1,$T3,$mask26,38); # >>26
+ verimg ($I2,$T2,$mask26,60); # >>4
+ verimg ($I3,$T4,$mask26,50); # >>14
+ vesrlg ($T4,$T4,40);
+ vo ($I4,$I4,$T4);
+
+ vperm ($H0,$H0,$H0,$ACC0); # move hash to right lane
+ vn ($I0,$I0,$ACC1); # mask redundant lane[s]
+ vperm ($H1,$H1,$H1,$ACC0);
+ vn ($I1,$I1,$ACC1);
+ vperm ($H2,$H2,$H2,$ACC0);
+ vn ($I2,$I2,$ACC1);
+ vperm ($H3,$H3,$H3,$ACC0);
+ vn ($I3,$I3,$ACC1);
+ vperm ($H4,$H4,$H4,$ACC0);
+ vn ($I4,$I4,$ACC1);
+
+ vaf ($I0,$I0,$H0); # accumulate hash
+ vzero ($H0); # wipe hash value
+ vaf ($I1,$I1,$H1);
+ vzero ($H1);
+ vaf ($I2,$I2,$H2);
+ vzero ($H2);
+ vaf ($I3,$I3,$H3);
+ vzero ($H3);
+ vaf ($I4,$I4,$H4);
+ vzero ($H4);
+
+&{$z? \&lghi:\&lhi} ($len,0); # next pass through .Last exits at .Ldone
+ j (".Last");
+ # I don't bother to tell apart cases when only one multiplication
+ # pass is sufficient, because I argue that mispredicted branch
+ # penalties are comparable to overhead of sometimes redundant
+ # multiplication pass...
+
+LABEL (".Ldone"); # store the hash, restore saved registers, return
+ vstef ($H0,"0($ctx)",3); # store hash base 2^26
+ vstef ($H1,"4($ctx)",3);
+ vstef ($H2,"8($ctx)",3);
+ vstef ($H3,"12($ctx)",3);
+ vstef ($H4,"16($ctx)",3);
+
+if ($z) { # zSeries ABI: restore %f8-%f15 saved in the prologue
+ ld ("%f8","$stdframe+0*8($sp)");
+ ld ("%f9","$stdframe+1*8($sp)");
+ ld ("%f10","$stdframe+2*8($sp)");
+ ld ("%f11","$stdframe+3*8($sp)");
+ ld ("%f12","$stdframe+4*8($sp)");
+ ld ("%f13","$stdframe+5*8($sp)");
+ ld ("%f14","$stdframe+6*8($sp)");
+ ld ("%f15","$stdframe+7*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+8*8+10*$SIZE_T($sp)"); # reloads %r15 ($sp) too
+} else { # S/390 ABI: restore %f4 and %f6 saved in the prologue
+ ld ("%f4","$stdframe+16*$SIZE_T+2*8($sp)");
+ ld ("%f6","$stdframe+16*$SIZE_T+3*8($sp)");
+&{$z? \&lmg:\&lm} ("%r10","%r15","$stdframe+10*$SIZE_T($sp)"); # reloads %r15 ($sp) too
+}
+ br ("%r14");
+SIZE ("__poly1305_blocks_vx",".-__poly1305_blocks_vx");
+}
+
+################
+# static void poly1305_emit(void *ctx, unsigned char mac[16],
+# const u32 nonce[4])
{
my ($mac,$nonce)=($inp,$len);
-my ($h0,$h1,$h2,$d0,$d1)=map("%r$_",(5..9));
+my ($h0,$h1,$h2,$d0,$d1,$d2)=map("%r$_",(5..10));
-$code.=<<___;
-.globl poly1305_emit
-.type poly1305_emit,\@function
-.align 16
-poly1305_emit:
- stm${g} %r6,%r9,`6*$SIZE_T`($sp)
-
- lg $h0,0($ctx)
- lg $h1,8($ctx)
- lg $h2,16($ctx)
-
- lghi %r0,5
- lghi %r1,0
- lgr $d0,$h0
- lgr $d1,$h1
-
- algr $h0,%r0 # compare to modulus
- alcgr $h1,%r1
- alcgr $h2,%r1
-
- srlg $h2,$h2,2 # did it borrow/carry?
- slgr %r1,$h2 # 0-$h2>>2
- lg $h2,0($nonce) # load nonce
- lghi %r0,-1
- lg $ctx,8($nonce)
- xgr %r0,%r1 # ~%r1
-
- ngr $h0,%r1
- ngr $d0,%r0
- ngr $h1,%r1
- ngr $d1,%r0
- ogr $h0,$d0
- rllg $d0,$h2,32 # flip nonce words
- ogr $h1,$d1
- rllg $d1,$ctx,32
-
- algr $h0,$d0 # accumulate nonce
- alcgr $h1,$d1
-
- strvg $h0,0($mac) # write little-endian result
- strvg $h1,8($mac)
-
- lm${g} %r6,%r9,`6*$SIZE_T`($sp)
- br %r14
-.size poly1305_emit,.-poly1305_emit
-
-.string "Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>"
-___
+GLOBL ("poly1305_emit");
+TYPE ("poly1305_emit","\@function");
+ALIGN (16);
+LABEL ("poly1305_emit");
+LABEL (".Lpoly1305_emit");
+&{$z? \&stmg:\&stm} ("%r6","%r10","6*$SIZE_T($sp)"); # save %r6-%r10
+
+ lg ($d0,"0($ctx)"); # raw hash words, radix not yet known
+ lg ($d1,"8($ctx)");
+ lg ($d2,"16($ctx)");
+
+ llgfr ("%r0",$d0); # base 2^26 -> base 2^64
+ srlg ($h0,$d0,32);
+ llgfr ("%r1",$d1);
+ srlg ($h1,$d1,32);
+ srlg ($h2,$d2,32);
+
+ sllg ("%r0","%r0",26);
+ algr ($h0,"%r0");
+ sllg ("%r0",$h1,52);
+ srlg ($h1,$h1,12);
+ sllg ("%r1","%r1",14);
+ algr ($h0,"%r0");
+ alcgr ($h1,"%r1");
+ sllg ("%r0",$h2,40);
+ srlg ($h2,$h2,24);
+ lghi ("%r1",0); # %r1 = 0, reused below as carry-only addend
+ algr ($h1,"%r0");
+ alcgr ($h2,"%r1");
+
+ llgf ("%r0","24($ctx)"); # is_base2_26
+ lcgr ("%r0","%r0"); # negate: flag 1 becomes an all-ones mask
+
+ xgr ($h0,$d0); # choose between radixes
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+ ngr ($h0,"%r0"); # h = d ^ ((h ^ d) & mask)
+ ngr ($h1,"%r0");
+ ngr ($h2,"%r0");
+ xgr ($h0,$d0);
+ xgr ($h1,$d1);
+ xgr ($h2,$d2);
+
+ lghi ("%r0",5);
+ lgr ($d0,$h0); # keep the unmodified h0:h1
+ lgr ($d1,$h1);
+
+ algr ($h0,"%r0"); # compare to modulus
+ alcgr ($h1,"%r1"); # %r1 is still 0: carry propagation only
+ alcgr ($h2,"%r1");
+
+ srlg ($h2,$h2,2); # did it borrow/carry?
+ slgr ("%r1",$h2); # 0-$h2>>2
+ lg ($d2,"0($nonce)"); # load nonce
+ lg ($ctx,"8($nonce)");
+
+ xgr ($h0,$d0); # select h+5 or the saved h with mask %r1
+ xgr ($h1,$d1);
+ ngr ($h0,"%r1");
+ ngr ($h1,"%r1");
+ xgr ($h0,$d0);
+ rllg ($d0,$d2,32); # flip nonce words
+ xgr ($h1,$d1);
+ rllg ($d1,$ctx,32);
+
+ algr ($h0,$d0); # accumulate nonce
+ alcgr ($h1,$d1);
+
+ strvg ($h0,"0($mac)"); # write little-endian result
+ strvg ($h1,"8($mac)");
+
+&{$z? \&lmg:\&lm} ("%r6","%r10","6*$SIZE_T($sp)"); # restore %r6-%r10
+ br ("%r14");
+SIZE ("poly1305_emit",".-poly1305_emit");
}
-$code =~ s/\`([^\`]*)\`/eval $1/gem;
-$code =~ s/\b(srlg\s+)(%r[0-9]+\s*,)\s*([0-9]+)/$1$2$2$3/gm;
+################
+
+ALIGN (16);
+LABEL (".Lconst"); # 16-byte rows; the code addresses them by byte offset
+LONG (0x04050607,0x14151617,0x0c0d0e0f,0x1c1d1e1f); # merge odd
+LONG (0x07060504,0x03020100,0x17161514,0x13121110); # byte swap masks
+LONG (0x0f0e0d0c,0x0b0a0908,0x1f1e1d1c,0x1b1a1918); # (the three rows from offset 16)
+LONG (0x00000000,0x09080706,0x00000000,0x19181716);
+
+LONG (0x00000000,0x00000000,0x00000000,0x0c0d0e0f); # magic tail masks
+LONG (0x0c0d0e0f,0x00000000,0x00000000,0x00000000); # (vperm control, read at 0x30+$len)
+LONG (0x00000000,0x00000000,0x0c0d0e0f,0x00000000);
+
+LONG (0xffffffff,0x00000000,0xffffffff,0xffffffff); # lane masks for the tail input
+LONG (0xffffffff,0x00000000,0xffffffff,0x00000000); # (vn operand, read at 0x60+$len)
+LONG (0x00000000,0x00000000,0xffffffff,0x00000000);
+
+STRING ("\"Poly1305 for s390x, CRYPTOGAMS by <appro\@openssl.org>\"");
-print $code;
-close STDOUT or die "error closing STDOUT: $!";
+PERLASM_END();
diff -up openssl-1.1.1e/crypto/poly1305/build.info.s390x-update openssl-1.1.1e/crypto/poly1305/build.info
--- openssl-1.1.1e/crypto/poly1305/build.info.s390x-update 2020-03-17 15:31:17.000000000 +0100
+++ openssl-1.1.1e/crypto/poly1305/build.info 2020-03-19 16:20:22.042227342 +0100
@@ -18,6 +18,7 @@ INCLUDE[poly1305-armv8.o]=..
GENERATE[poly1305-mips.S]=asm/poly1305-mips.pl $(PERLASM_SCHEME)
INCLUDE[poly1305-mips.o]=..
GENERATE[poly1305-s390x.S]=asm/poly1305-s390x.pl $(PERLASM_SCHEME)
+INCLUDE[poly1305-s390x.o]=..
BEGINRAW[Makefile(unix)]
{- $builddir -}/poly1305-%.S: {- $sourcedir -}/asm/poly1305-%.pl