From 3e6d5a385b19adb1e87d7584b37603182326fb73 Mon Sep 17 00:00:00 2001 From: Clemens Lang Date: Thu, 14 Jul 2022 16:54:25 +0200 Subject: [PATCH] Improve AES-GCM & ChaCha20 perf on Power9+ ppc64le Backport patches that improve performance of AES-GCM on Power9 and newer, and ChaCha20 on Power10. Resolves: rhbz#2051312 Signed-off-by: Clemens Lang --- 0071-AES-GCM-performance-optimization.patch | 1635 +++++++++++++++++ ...erformance-optimizations-for-ppc64le.patch | 1493 +++++++++++++++ openssl.spec | 11 + 3 files changed, 3139 insertions(+) create mode 100644 0071-AES-GCM-performance-optimization.patch create mode 100644 0072-ChaCha20-performance-optimizations-for-ppc64le.patch diff --git a/0071-AES-GCM-performance-optimization.patch b/0071-AES-GCM-performance-optimization.patch new file mode 100644 index 0000000..edf40ec --- /dev/null +++ b/0071-AES-GCM-performance-optimization.patch @@ -0,0 +1,1635 @@ +Upstream-Status: Backport [https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c, https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd] +diff --git a/crypto/modes/asm/aes-gcm-ppc.pl b/crypto/modes/asm/aes-gcm-ppc.pl +new file mode 100644 +index 0000000..6624e6c +--- /dev/null ++++ b/crypto/modes/asm/aes-gcm-ppc.pl +@@ -0,0 +1,1438 @@ ++#! /usr/bin/env perl ++# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved. ++# Copyright 2021- IBM Inc. All rights reserved ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++# ++#=================================================================================== ++# Written by Danny Tsen for OpenSSL Project, ++# ++# GHASH is based on the Karatsuba multiplication method. 
++# ++# Xi xor X1 ++# ++# X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H = ++# (X1.h * H4.h + X1.l * H4.l + X1 * H4) + ++# (X2.h * H3.h + X2.l * H3.l + X2 * H3) + ++# (X3.h * H2.h + X3.l * H2.l + X3 * H2) + ++# (X4.h * H.h + X4.l * H.l + X4 * H) ++# ++# Xi = v0 ++# H Poly = v2 ++# Hash keys = v3 - v14 ++# ( H.l, H, H.h) ++# ( H^2.l, H^2, H^2.h) ++# ( H^3.l, H^3, H^3.h) ++# ( H^4.l, H^4, H^4.h) ++# ++# v30 is IV ++# v31 - counter 1 ++# ++# AES used, ++# vs0 - vs14 for round keys ++# v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted) ++# ++# This implementation uses stitched AES-GCM approach to improve overall performance. ++# AES is implemented with 8x blocks and GHASH is using 2 4x blocks. ++# ++# Current large block (16384 bytes) performance per second with 128 bit key -- ++# ++# Encrypt Decrypt ++# Power10[le] (3.5GHz) 5.32G 5.26G ++# ++# =================================================================================== ++# ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T=8; ++ $LRSAVE=2*$SIZE_T; ++ $STU="stdu"; ++ $POP="ld"; ++ $PUSH="std"; ++ $UCMP="cmpld"; ++ $SHRI="srdi"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T=4; ++ $LRSAVE=$SIZE_T; ++ $STU="stwu"; ++ $POP="lwz"; ++ $PUSH="stw"; ++ $UCMP="cmplw"; ++ $SHRI="srwi"; ++} else { die "nonsense $flavour"; } ++ ++$sp="r1"; ++$FRAME=6*$SIZE_T+13*16; # 13*16 is for v20-v31 offload ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$code=<<___; ++.machine "any" ++.text ++ ++# 4x loops ++# v15 - v18 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle4x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 16, 16, 19 ++ vcipher 17, 17, 19 ++ vcipher 18, 18, 19 ++ ++ vcipher 15, 15, 20 ++ vcipher 16, 16, 20 ++ vcipher 17, 17, 20 ++ vcipher 18, 18, 20 ++ ++ vcipher 15, 15, 21 ++ vcipher 16, 16, 21 ++ vcipher 17, 17, 21 ++ vcipher 18, 18, 21 ++ ++ vcipher 15, 15, 22 ++ vcipher 16, 16, 22 ++ vcipher 17, 17, 22 ++ vcipher 18, 18, 22 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++.endm ++ ++# 8x loops ++# v15 - v22 - input states ++# vs1 - vs9 - round keys ++# ++.macro Loop_aes_middle8x ++ xxlor 23+32, 1, 1 ++ xxlor 
24+32, 2, 2 ++ xxlor 25+32, 3, 3 ++ xxlor 26+32, 4, 4 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 5, 5 ++ xxlor 24+32, 6, 6 ++ xxlor 25+32, 7, 7 ++ xxlor 26+32, 8, 8 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ vcipher 15, 15, 25 ++ vcipher 16, 16, 25 ++ vcipher 17, 17, 25 ++ vcipher 18, 18, 25 ++ vcipher 19, 19, 25 ++ vcipher 20, 20, 25 ++ vcipher 21, 21, 25 ++ vcipher 22, 22, 25 ++ ++ vcipher 15, 15, 26 ++ vcipher 16, 16, 26 ++ vcipher 17, 17, 26 ++ vcipher 18, 18, 26 ++ vcipher 19, 19, 26 ++ vcipher 20, 20, 26 ++ vcipher 21, 21, 26 ++ vcipher 22, 22, 26 ++ ++ xxlor 23+32, 9, 9 ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++.endm ++ ++# ++# Compute 4x hash values based on Karatsuba method. 
++# ++ppc_aes_gcm_ghash: ++ vxor 15, 15, 0 ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # M ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 ++ ++ vxor 24, 24, 29 ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++ blr ++ ++# ++# Combine two 4x ghash ++# v15 - v22 - input blocks ++# ++.macro ppc_aes_gcm_ghash2_4x ++ # first 4x hash ++ vxor 15, 15, 0 # Xi + X ++ ++ xxlxor 29, 29, 29 ++ ++ vpmsumd 23, 12, 15 # H4.L * X.L ++ vpmsumd 24, 9, 16 ++ vpmsumd 25, 6, 17 ++ vpmsumd 26, 3, 18 ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 15 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 16 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 17 ++ vpmsumd 27, 4, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 15 # H4.H * X.H ++ vpmsumd 25, 11, 16 ++ vpmsumd 26, 8, 17 ++ vpmsumd 27, 5, 18 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H 
++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 27, 23, 27 # 1st Xi ++ ++ # 2nd 4x hash ++ vpmsumd 24, 9, 20 ++ vpmsumd 25, 6, 21 ++ vpmsumd 26, 3, 22 ++ vxor 19, 19, 27 # Xi + X ++ vpmsumd 23, 12, 19 # H4.L * X.L ++ ++ vxor 23, 23, 24 ++ vxor 23, 23, 25 ++ vxor 23, 23, 26 # L ++ ++ vpmsumd 24, 13, 19 # H4.L * X.H + H4.H * X.L ++ vpmsumd 25, 10, 20 # H3.L * X1.H + H3.H * X1.L ++ vpmsumd 26, 7, 21 ++ vpmsumd 27, 4, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ ++ # sum hash and reduction with H Poly ++ vpmsumd 28, 23, 2 # reduction ++ ++ xxlor 29+32, 29, 29 ++ ++ vxor 24, 24, 27 # M ++ vsldoi 26, 24, 29, 8 # mL ++ vsldoi 29, 29, 24, 8 # mH ++ vxor 23, 23, 26 # mL + L ++ ++ vsldoi 23, 23, 23, 8 # swap ++ vxor 23, 23, 28 ++ ++ vpmsumd 24, 14, 19 # H4.H * X.H ++ vpmsumd 25, 11, 20 ++ vpmsumd 26, 8, 21 ++ vpmsumd 27, 5, 22 ++ ++ vxor 24, 24, 25 ++ vxor 24, 24, 26 ++ vxor 24, 24, 27 # H ++ ++ vxor 24, 24, 29 # H + mH ++ ++ # sum hash and reduction with H Poly ++ vsldoi 27, 23, 23, 8 # swap ++ vpmsumd 23, 23, 2 ++ vxor 27, 27, 24 ++ vxor 23, 23, 27 ++ ++ xxlor 32, 23+32, 23+32 # update hash ++ ++.endm ++ ++# ++# Compute update single hash ++# ++.macro ppc_update_hash_1x ++ vxor 28, 28, 0 ++ ++ vxor 19, 19, 19 ++ ++ vpmsumd 22, 3, 28 # L ++ vpmsumd 23, 4, 28 # M ++ vpmsumd 24, 5, 28 # H ++ ++ vpmsumd 27, 22, 2 # reduction ++ ++ vsldoi 25, 23, 19, 8 # mL ++ vsldoi 26, 19, 23, 8 # mH ++ vxor 22, 22, 25 # LL + LL ++ vxor 24, 24, 26 # HH + HH ++ ++ vsldoi 22, 22, 22, 8 # swap ++ vxor 22, 22, 27 ++ ++ vsldoi 20, 22, 22, 8 # swap ++ vpmsumd 22, 22, 2 # reduction ++ vxor 20, 20, 24 ++ vxor 22, 22, 20 ++ ++ vmr 0, 22 # update hash ++ ++.endm ++ ++# ++# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len, ++# const AES_KEY *key, unsigned char iv[16], ++# void *Xip); ++# ++# r3 - inp ++# r4 - out ++# r5 - len ++# r6 - AES round keys ++# r7 - iv ++# r8 - Xi, HPoli, hash 
keys ++# ++.global ppc_aes_gcm_encrypt ++.align 5 ++ppc_aes_gcm_encrypt: ++_ppc_aes_gcm_encrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 64 ++ lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq 
Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blocks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 ++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_ghash ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_ghash ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 
15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_ghash ++ b aes_gcm_out ++ ++Do_next_ghash: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block ++ ++ vmr 30, 29 ++ ++Loop_last_block: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10, 240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block ++ ++.macro Loop_aes_middle_1x ++ xxlor 19+32, 1, 1 ++ xxlor 20+32, 2, 2 ++ xxlor 21+32, 3, 3 ++ xxlor 22+32, 4, 4 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 5, 5 ++ xxlor 20+32, 6, 6 ++ xxlor 21+32, 7, 7 ++ 
xxlor 22+32, 8, 8 ++ ++ vcipher 15, 15, 19 ++ vcipher 15, 15, 20 ++ vcipher 15, 15, 21 ++ vcipher 15, 15, 22 ++ ++ xxlor 19+32, 9, 9 ++ vcipher 15, 15, 19 ++.endm ++ ++Next_rem_block: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x ++ ++Do_next_1x: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_final_1x ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x ++ ++Do_final_1x: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load last block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load partial block mask ++ xxland 47, 47, 16 ++ ++ vmr 28, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. 
++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++# ++# Write partial block ++# r9 - output ++# r12 - remaining bytes ++# v15 - partial input data ++# ++Write_partial_block: ++ li 10, 192 ++ stxvb16x 15+32, 10, 1 # last block ++ ++ #add 10, 9, 11 # Output ++ addi 10, 9, -1 ++ addi 16, 1, 191 ++ ++ mtctr 12 # remaining bytes ++ li 15, 0 ++ ++Write_last_byte: ++ lbzu 14, 1(16) ++ stbu 14, 1(10) ++ bdnz Write_last_byte ++ blr ++ ++aes_gcm_out: ++ # out = state ++ stxvb16x 32, 0, 8 # write out Xi ++ add 3, 11, 12 # return count ++ ++ li 9, 256 ++ lvx 20, 9, 1 ++ addi 9, 9, 16 ++ lvx 21, 9, 1 ++ addi 9, 9, 16 ++ lvx 22, 9, 1 ++ addi 9, 9, 16 ++ lvx 23, 9, 1 ++ addi 9, 9, 16 ++ lvx 24, 9, 1 ++ addi 9, 9, 16 ++ lvx 25, 9, 1 ++ addi 9, 9, 16 ++ lvx 26, 9, 1 ++ addi 9, 9, 16 ++ lvx 27, 9, 1 ++ addi 9, 9, 16 ++ lvx 28, 9, 1 ++ addi 9, 9, 16 ++ lvx 29, 9, 1 ++ addi 9, 9, 16 ++ lvx 30, 9, 1 ++ addi 9, 9, 16 ++ lvx 31, 9, 1 ++ ++ ld 0, 528(1) ++ ld 14,112(1) ++ ld 15,120(1) ++ ld 16,128(1) ++ ld 17,136(1) ++ ld 18,144(1) ++ ld 19,152(1) ++ ld 20,160(1) ++ ld 21,168(1) ++ ++ mtlr 0 ++ addi 1, 1, 512 ++ blr ++ ++# ++# 8x Decrypt ++# ++.global ppc_aes_gcm_decrypt ++.align 5 ++ppc_aes_gcm_decrypt: ++_ppc_aes_gcm_decrypt: ++ ++ stdu 1,-512(1) ++ mflr 0 ++ ++ std 14,112(1) ++ std 15,120(1) ++ std 16,128(1) ++ std 17,136(1) ++ std 18,144(1) ++ std 19,152(1) ++ std 20,160(1) ++ std 21,168(1) ++ li 9, 256 ++ stvx 20, 9, 1 ++ addi 9, 9, 16 ++ stvx 21, 9, 1 ++ addi 9, 9, 16 ++ stvx 22, 9, 1 ++ addi 9, 9, 16 ++ stvx 23, 9, 1 ++ addi 9, 9, 16 ++ stvx 24, 9, 1 ++ addi 9, 9, 16 ++ stvx 25, 9, 1 ++ addi 9, 9, 16 ++ stvx 26, 9, 1 ++ addi 9, 9, 16 ++ stvx 27, 9, 1 ++ addi 9, 9, 16 ++ stvx 28, 9, 1 ++ addi 9, 9, 16 ++ stvx 29, 9, 1 ++ addi 9, 9, 16 ++ stvx 30, 9, 1 ++ addi 9, 9, 16 ++ stvx 31, 9, 1 ++ std 0, 528(1) ++ ++ # Load Xi ++ lxvb16x 32, 0, 8 # load Xi ++ ++ # load Hash - h^4, h^3, h^2, h ++ li 10, 32 ++ lxvd2x 2+32, 10, 8 # H Poli ++ li 10, 48 ++ lxvd2x 3+32, 10, 8 # Hl ++ li 10, 64 ++ 
lxvd2x 4+32, 10, 8 # H ++ li 10, 80 ++ lxvd2x 5+32, 10, 8 # Hh ++ ++ li 10, 96 ++ lxvd2x 6+32, 10, 8 # H^2l ++ li 10, 112 ++ lxvd2x 7+32, 10, 8 # H^2 ++ li 10, 128 ++ lxvd2x 8+32, 10, 8 # H^2h ++ ++ li 10, 144 ++ lxvd2x 9+32, 10, 8 # H^3l ++ li 10, 160 ++ lxvd2x 10+32, 10, 8 # H^3 ++ li 10, 176 ++ lxvd2x 11+32, 10, 8 # H^3h ++ ++ li 10, 192 ++ lxvd2x 12+32, 10, 8 # H^4l ++ li 10, 208 ++ lxvd2x 13+32, 10, 8 # H^4 ++ li 10, 224 ++ lxvd2x 14+32, 10, 8 # H^4h ++ ++ # initialize ICB: GHASH( IV ), IV - r7 ++ lxvb16x 30+32, 0, 7 # load IV - v30 ++ ++ mr 12, 5 # length ++ li 11, 0 # block index ++ ++ # counter 1 ++ vxor 31, 31, 31 ++ vspltisb 22, 1 ++ vsldoi 31, 31, 22,1 # counter 1 ++ ++ # load round key to VSR ++ lxv 0, 0(6) ++ lxv 1, 0x10(6) ++ lxv 2, 0x20(6) ++ lxv 3, 0x30(6) ++ lxv 4, 0x40(6) ++ lxv 5, 0x50(6) ++ lxv 6, 0x60(6) ++ lxv 7, 0x70(6) ++ lxv 8, 0x80(6) ++ lxv 9, 0x90(6) ++ lxv 10, 0xa0(6) ++ ++ # load rounds - 10 (128), 12 (192), 14 (256) ++ lwz 9,240(6) ++ ++ # ++ # vxor state, state, w # addroundkey ++ xxlor 32+29, 0, 0 ++ vxor 15, 30, 29 # IV + round key - add round key 0 ++ ++ cmpdi 9, 10 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12) ++ lxv 11, 0xb0(6) ++ lxv 12, 0xc0(6) ++ ++ cmpdi 9, 12 ++ beq Loop_aes_gcm_8x_dec ++ ++ # load 2 more round keys (v11, v12, v13, v14) ++ lxv 13, 0xd0(6) ++ lxv 14, 0xe0(6) ++ cmpdi 9, 14 ++ beq Loop_aes_gcm_8x_dec ++ ++ b aes_gcm_out ++ ++.align 5 ++Loop_aes_gcm_8x_dec: ++ mr 14, 3 ++ mr 9, 4 ++ ++ # n blocks ++ li 10, 128 ++ divdu 10, 5, 10 # n 128 bytes-blocks ++ cmpdi 10, 0 ++ beq Loop_last_block_dec ++ ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 16, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 29 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 29 ++ ++ mtctr 10 ++ ++ li 15, 16 ++ li 16, 32 ++ li 17, 48 ++ li 18, 64 ++ li 19, 80 ++ li 20, 96 ++ 
li 21, 112 ++ ++ lwz 10, 240(6) ++ ++Loop_8x_block_dec: ++ ++ lxvb16x 15, 0, 14 # load block ++ lxvb16x 16, 15, 14 # load block ++ lxvb16x 17, 16, 14 # load block ++ lxvb16x 18, 17, 14 # load block ++ lxvb16x 19, 18, 14 # load block ++ lxvb16x 20, 19, 14 # load block ++ lxvb16x 21, 20, 14 # load block ++ lxvb16x 22, 21, 14 # load block ++ addi 14, 14, 128 ++ ++ Loop_aes_middle8x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_last_aes_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_last_aes_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 16, 16, 23 ++ vcipher 17, 17, 23 ++ vcipher 18, 18, 23 ++ vcipher 19, 19, 23 ++ vcipher 20, 20, 23 ++ vcipher 21, 21, 23 ++ vcipher 22, 22, 23 ++ ++ vcipher 15, 15, 24 ++ vcipher 16, 16, 24 ++ vcipher 17, 17, 24 ++ vcipher 18, 18, 24 ++ vcipher 19, 19, 24 ++ vcipher 20, 20, 24 ++ vcipher 21, 21, 24 ++ vcipher 22, 22, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_last_aes_dec ++ b aes_gcm_out ++ ++Do_last_aes_dec: ++ ++ # ++ # last round ++ vcipherlast 15, 15, 23 ++ vcipherlast 16, 16, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ xxlxor 48, 48, 16 ++ stxvb16x 48, 15, 9 # store output ++ ++ vcipherlast 17, 17, 23 ++ vcipherlast 18, 18, 23 ++ ++ xxlxor 49, 49, 17 ++ stxvb16x 49, 16, 9 # store output ++ xxlxor 50, 50, 18 ++ stxvb16x 50, 17, 9 # store output ++ ++ vcipherlast 19, 19, 23 ++ vcipherlast 20, 20, 23 ++ ++ xxlxor 51, 51, 19 ++ stxvb16x 51, 18, 9 # store output ++ xxlxor 52, 52, 20 ++ stxvb16x 52, 19, 9 # store output ++ ++ vcipherlast 21, 21, 23 ++ 
vcipherlast 22, 22, 23 ++ ++ xxlxor 53, 53, 21 ++ stxvb16x 53, 20, 9 # store output ++ xxlxor 54, 54, 22 ++ stxvb16x 54, 21, 9 # store output ++ ++ addi 9, 9, 128 ++ ++ xxlor 15+32, 15, 15 ++ xxlor 16+32, 16, 16 ++ xxlor 17+32, 17, 17 ++ xxlor 18+32, 18, 18 ++ xxlor 19+32, 19, 19 ++ xxlor 20+32, 20, 20 ++ xxlor 21+32, 21, 21 ++ xxlor 22+32, 22, 22 ++ ++ # ghash here ++ ppc_aes_gcm_ghash2_4x ++ ++ xxlor 27+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vmr 29, 30 ++ vxor 15, 30, 27 # add round key ++ vaddudm 30, 30, 31 ++ vxor 16, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 17, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 18, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 19, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 20, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 21, 30, 27 ++ vaddudm 30, 30, 31 ++ vxor 22, 30, 27 ++ addi 12, 12, -128 ++ addi 11, 11, 128 ++ ++ bdnz Loop_8x_block_dec ++ ++ vmr 30, 29 ++ ++Loop_last_block_dec: ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++ # loop last few blocks ++ li 10, 16 ++ divdu 10, 12, 10 ++ ++ mtctr 10 ++ ++ lwz 10,240(6) ++ ++ cmpdi 12, 16 ++ blt Final_block_dec ++ ++Next_rem_block_dec: ++ lxvb16x 15, 0, 14 # load block ++ ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 ++ ++ cmpdi 10, 10 ++ beq Do_next_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_next_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_next_1x_dec ++ ++Do_next_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ xxlxor 47, 47, 15 ++ stxvb16x 47, 0, 9 # store output ++ addi 14, 14, 16 ++ addi 9, 9, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ addi 12, 12, -16 ++ addi 11, 11, 16 ++ xxlor 19+32, 0, 0 ++ vaddudm 30, 30, 31 # IV + counter ++ vxor 15, 30, 19 # add round key ++ ++ bdnz Next_rem_block_dec ++ ++ cmpdi 12, 0 ++ beq aes_gcm_out ++ ++Final_block_dec: ++ Loop_aes_middle_1x ++ ++ xxlor 23+32, 10, 10 
++ ++ cmpdi 10, 10 ++ beq Do_final_1x_dec ++ ++ # 192 bits ++ xxlor 24+32, 11, 11 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 12, 12 ++ ++ cmpdi 10, 12 ++ beq Do_final_1x_dec ++ ++ # 256 bits ++ xxlor 24+32, 13, 13 ++ ++ vcipher 15, 15, 23 ++ vcipher 15, 15, 24 ++ ++ xxlor 23+32, 14, 14 ++ ++ cmpdi 10, 14 ++ beq Do_final_1x_dec ++ ++Do_final_1x_dec: ++ vcipherlast 15, 15, 23 ++ ++ lxvb16x 15, 0, 14 # load block ++ xxlxor 47, 47, 15 ++ ++ # create partial block mask ++ li 15, 16 ++ sub 15, 15, 12 # index to the mask ++ ++ vspltisb 16, -1 # first 16 bytes - 0xffff...ff ++ vspltisb 17, 0 # second 16 bytes - 0x0000...00 ++ li 10, 192 ++ stvx 16, 10, 1 ++ addi 10, 10, 16 ++ stvx 17, 10, 1 ++ ++ addi 10, 1, 192 ++ lxvb16x 16, 15, 10 # load block mask ++ xxland 47, 47, 16 ++ ++ xxlor 28+32, 15, 15 ++ ppc_update_hash_1x ++ ++ # * should store only the remaining bytes. ++ bl Write_partial_block ++ ++ b aes_gcm_out ++ ++ ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/geo; ++ ++ if ($flavour =~ /le$/o) { # little-endian ++ s/le\?//o or ++ s/be\?/#be#/o; ++ } else { ++ s/le\?/#le#/o or ++ s/be\?//o; ++ } ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; # enforce flush +diff --git a/crypto/modes/build.info b/crypto/modes/build.info +index 687e872..0ea122e 100644 +--- a/crypto/modes/build.info ++++ b/crypto/modes/build.info +@@ -32,7 +32,7 @@ IF[{- !$disabled{asm} -}] + $MODESASM_parisc20_64=$MODESASM_parisc11 + $MODESDEF_parisc20_64=$MODESDEF_parisc11 + +- $MODESASM_ppc32=ghashp8-ppc.s ++ $MODESASM_ppc32=ghashp8-ppc.s aes-gcm-ppc.s + $MODESDEF_ppc32= + $MODESASM_ppc64=$MODESASM_ppc32 + $MODESDEF_ppc64=$MODESDEF_ppc32 +@@ -71,6 +71,7 @@ INCLUDE[ghash-sparcv9.o]=.. 
+ GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl + GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl + GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl ++GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl + GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl + INCLUDE[ghash-armv4.o]=.. + GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl +diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h +index e95ad5a..0c281a3 100644 +--- a/include/crypto/aes_platform.h ++++ b/include/crypto/aes_platform.h +@@ -74,6 +74,26 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len, + # define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks + # define HWAES_xts_encrypt aes_p8_xts_encrypt + # define HWAES_xts_decrypt aes_p8_xts_decrypt ++# define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300) ++# define AES_GCM_ENC_BYTES 128 ++# define AES_GCM_DEC_BYTES 128 ++size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, unsigned char ivec[16], ++ u64 *Xi); ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, ++ size_t len, const void *key, ++ unsigned char ivec[16], u64 *Xi); ++# define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap ++# define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap ++# define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \ ++ (gctx)->gcm.ghash==gcm_ghash_p8) ++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len); + # endif /* PPC */ + + # if (defined(__arm__) || defined(__arm) || defined(__aarch64__)) +diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c +index 44fa9d4..789ec12 100644 
+--- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c ++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c +@@ -141,6 +141,8 @@ static const PROV_GCM_HW aes_gcm = { + # include "cipher_aes_gcm_hw_t4.inc" + #elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM) + # include "cipher_aes_gcm_hw_armv8.inc" ++#elif defined(PPC_AES_GCM_CAPABLE) ++# include "cipher_aes_gcm_hw_ppc.inc" + #else + const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits) + { +diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc +new file mode 100644 +index 0000000..4eed0f4 +--- /dev/null ++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc +@@ -0,0 +1,119 @@ ++/* ++ * Copyright 2001-2021 The OpenSSL Project Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License 2.0 (the "License"). You may not use ++ * this file except in compliance with the License. You can obtain a copy ++ * in the file LICENSE in the source distribution or at ++ * https://www.openssl.org/source/license.html ++ */ ++ ++/*- ++ * PPC support for AES GCM. 
++ * This file is included by cipher_aes_gcm_hw.c ++ */ ++ ++static int aes_ppc_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key, ++ size_t keylen) ++{ ++ PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx; ++ AES_KEY *ks = &actx->ks.ks; ++ ++ GCM_HW_SET_KEY_CTR_FN(ks, aes_p8_set_encrypt_key, aes_p8_encrypt, ++ aes_p8_ctr32_encrypt_blocks); ++ return 1; ++} ++ ++ ++extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi); ++ ++static inline u32 UTO32(unsigned char *buf) ++{ ++ return ((u32) buf[0] << 24) | ((u32) buf[1] << 16) | ((u32) buf[2] << 8) | ((u32) buf[3]); ++} ++ ++static inline u32 add32TOU(unsigned char buf[4], u32 n) ++{ ++ u32 r; ++ ++ r = UTO32(buf); ++ r += n; ++ buf[0] = (unsigned char) (r >> 24) & 0xFF; ++ buf[1] = (unsigned char) (r >> 16) & 0xFF; ++ buf[2] = (unsigned char) (r >> 8) & 0xFF; ++ buf[3] = (unsigned char) r & 0xFF; ++ return r; ++} ++ ++static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi, int encrypt) ++{ ++ int s = 0; ++ int ndone = 0; ++ int ctr_reset = 0; ++ u64 blocks_unused; ++ u64 nb = len / 16; ++ u64 next_ctr = 0; ++ unsigned char ctr_saved[12]; ++ ++ memcpy(ctr_saved, ivec, 12); ++ ++ while (nb) { ++ blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12); ++ if (nb > blocks_unused) { ++ len = blocks_unused * 16; ++ nb -= blocks_unused; ++ next_ctr = blocks_unused; ++ ctr_reset = 1; ++ } else { ++ len = nb * 16; ++ next_ctr = nb; ++ nb = 0; ++ } ++ ++ s = encrypt ? 
ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi) ++ : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi); ++ ++ /* add counter to ivec */ ++ add32TOU(ivec + 12, (u32) next_ctr); ++ if (ctr_reset) { ++ ctr_reset = 0; ++ in += len; ++ out += len; ++ } ++ memcpy(ivec, ctr_saved, 12); ++ ndone += s; ++ } ++ ++ return ndone; ++} ++ ++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1); ++} ++ ++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len, ++ const void *key, unsigned char ivec[16], u64 *Xi) ++{ ++ return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0); ++} ++ ++ ++static const PROV_GCM_HW aes_ppc_gcm = { ++ aes_ppc_gcm_initkey, ++ ossl_gcm_setiv, ++ ossl_gcm_aad_update, ++ generic_aes_gcm_cipher_update, ++ ossl_gcm_cipher_final, ++ ossl_gcm_one_shot ++}; ++ ++const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits) ++{ ++ return PPC_AES_GCM_CAPABLE ? &aes_ppc_gcm : &aes_gcm; ++} ++ diff --git a/0072-ChaCha20-performance-optimizations-for-ppc64le.patch b/0072-ChaCha20-performance-optimizations-for-ppc64le.patch new file mode 100644 index 0000000..527b901 --- /dev/null +++ b/0072-ChaCha20-performance-optimizations-for-ppc64le.patch @@ -0,0 +1,1493 @@ +Upstream-Status: Backport [ + https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149, + https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa, + hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447 +] +diff --git a/crypto/chacha/asm/chachap10-ppc.pl b/crypto/chacha/asm/chachap10-ppc.pl +new file mode 100755 +index 0000000..36e9a8d +--- /dev/null ++++ b/crypto/chacha/asm/chachap10-ppc.pl +@@ -0,0 +1,1288 @@ ++#! /usr/bin/env perl ++# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved. 
++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ++# ==================================================================== ++# Written by Andy Polyakov for the OpenSSL ++# project. The module is, however, dual licensed under OpenSSL and ++# CRYPTOGAMS licenses depending on where you obtain it. For further ++# details see http://www.openssl.org/~appro/cryptogams/. ++# ==================================================================== ++# ++# October 2015 ++# ++# ChaCha20 for PowerPC/AltiVec. ++# ++# June 2018 ++# ++# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for ++# processors that can't issue more than one vector instruction per ++# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x ++# interleave would perform better. Incidentally PowerISA 2.07 (first ++# implemented by POWER8) defined new usable instructions, hence 4xVSX ++# code path... ++# ++# Performance in cycles per byte out of large buffer. ++# ++# IALU/gcc-4.x 3xAltiVec+1xIALU 4xVSX ++# ++# Freescale e300 13.6/+115% - - ++# PPC74x0/G4e 6.81/+310% 3.81 - ++# PPC970/G5 9.29/+160% ? - ++# POWER7 8.62/+61% 3.35 - ++# POWER8 8.70/+51% 2.91 2.09 ++# POWER9 8.80/+29% 4.44(*) 2.45(**) ++# ++# (*) this is trade-off result, it's possible to improve it, but ++# then it would negatively affect all others; ++# (**) POWER9 seems to be "allergic" to mixing vector and integer ++# instructions, which is why switch to vector-only code pays ++# off that much; ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? 
shift : undef; ++ ++if ($flavour =~ /64/) { ++ $SIZE_T =8; ++ $LRSAVE =2*$SIZE_T; ++ $STU ="stdu"; ++ $POP ="ld"; ++ $PUSH ="std"; ++ $UCMP ="cmpld"; ++} elsif ($flavour =~ /32/) { ++ $SIZE_T =4; ++ $LRSAVE =$SIZE_T; ++ $STU ="stwu"; ++ $POP ="lwz"; ++ $PUSH ="stw"; ++ $UCMP ="cmplw"; ++} else { die "nonsense $flavour"; } ++ ++$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++$LOCALS=6*$SIZE_T; ++$FRAME=$LOCALS+64+18*$SIZE_T; # 64 is for local variables ++ ++sub AUTOLOAD() # thunk [simplified] x86-style perlasm ++{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://; $opcode =~ s/_/\./; ++ $code .= "\t$opcode\t".join(',',@_)."\n"; ++} ++ ++my $sp = "r1"; ++ ++my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++ ++ ++{{{ ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15)); ++my @K = map("v$_",(16..19)); ++my $CTR = "v26"; ++my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30)); ++my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3); ++my $beperm = "v31"; ++ ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++ ++my $FRAME=$LOCALS+64+7*16; # 7*16 is for v26-v31 offload ++ ++ ++sub VSX_lane_ROUND_4x { ++my ($a0,$b0,$c0,$d0)=@_; ++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my @x=map("\"v$_\"",(0..15)); ++ ++ ( ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor 
(@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$sixteen')", ++ "&vrlw (@x[$d1],@x[$d1],'$sixteen')", ++ "&vrlw (@x[$d2],@x[$d2],'$sixteen')", ++ "&vrlw (@x[$d3],@x[$d3],'$sixteen')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$twelve')", ++ "&vrlw (@x[$b1],@x[$b1],'$twelve')", ++ "&vrlw (@x[$b2],@x[$b2],'$twelve')", ++ "&vrlw (@x[$b3],@x[$b3],'$twelve')", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vrlw (@x[$d0],@x[$d0],'$eight')", ++ "&vrlw (@x[$d1],@x[$d1],'$eight')", ++ "&vrlw (@x[$d2],@x[$d2],'$eight')", ++ "&vrlw (@x[$d3],@x[$d3],'$eight')", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vrlw (@x[$b0],@x[$b0],'$seven')", ++ "&vrlw (@x[$b1],@x[$b1],'$seven')", ++ "&vrlw (@x[$b2],@x[$b2],'$seven')", ++ "&vrlw (@x[$b3],@x[$b3],'$seven')" ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_p10 ++.align 5 ++.ChaCha20_ctr32_vsx_p10: ++ ${UCMP}i $len,255 ++ bgt ChaCha20_ctr32_vsx_8x ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ 
stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ ++ vxor $xt0,$xt0,$xt0 ++ lvx_4w $xt1,r11,r12 ++ vspltw $CTR,@K[3],0 ++ vsldoi @K[3],@K[3],$xt0,4 ++ vsldoi @K[3],$xt0,@K[3],12 # clear @K[3].word[0] ++ vadduwm $CTR,$CTR,$xt1 ++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx ++ ++.align 5 ++Loop_outer_vsx: ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ ++ vmr $xd0,$CTR # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ ++ vspltisw $sixteen,-16 # synthesize constants ++ vspltisw $twelve,12 ++ vspltisw $eight,8 ++ vspltisw $seven,7 ++ ++Loop_vsx_4x: ++___ ++ foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; } ++ foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_4x ++ ++ vadduwm $xd0,$xd0,$CTR ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ vmrgew $xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow 
$xb2,$xb2,$xb3 ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ vspltisw $xt0,4 ++ vadduwm $CTR,$CTR,$xt0 # next counter value ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ 
addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ mtctr r0 ++ bne Loop_outer_vsx ++ ++Ldone_vsx: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w $xb0,$x10,r11 ++ stvx_4w 
$xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ ++Loop_tail_vsx: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10 ++___ ++}}} ++ ++##This is 8 block in parallel implementation. The heart of chacha round uses vector instruction that has access to ++# vsr[32+X]. To perform the 8 parallel block we tend to use all 32 register to hold the 8 block info. ++# WE need to store few register value on side, so we can use VSR{32+X} for few vector instructions used in round op and hold intermediate value. ++# WE use the VSR[0]-VSR[31] for holding intermediate value and perform 8 block in parallel. ++# ++{{{ ++#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7)); ++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3, ++ $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3, ++ $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7, ++ $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31)); ++my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15)); ++my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3)); ++my @K = map("v$_",27,(24..26)); ++my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31)); ++my $xr0 = "v4"; ++my $CTR0 = "v22"; ++my $CTR1 = "v5"; ++my $beperm = "v31"; ++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10))); ++my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17)); ++my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21)); ++my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26)); ++ ++my $FRAME=$LOCALS+64+9*16; # 8*16 is for v24-v31 offload ++ ++sub VSX_lane_ROUND_8x { ++my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_; 
++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0)); ++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1)); ++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2)); ++my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4)); ++my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5)); ++my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6)); ++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17)); ++my @x=map("\"v$_\"",(0..31)); ++ ++ ( ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", #copy v30 to v13 ++ "&vxxlorc (@x[$c7], $xv9,$xv9)", ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", # Q1 ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", # Q2 ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", # Q3 ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", # Q4 ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", # Q1 ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", # Q2 ++ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", # Q3 ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", # Q4 ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlorc (@x[$a7], $xv10,$xv10)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ "&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm 
(@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor (@x[$b7],@x[$b7],@x[$c7])", ++ ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ "&vxxlor ($xv15 ,@x[$c7],@x[$c7])", ++ "&vxxlorc (@x[$c7], $xv11,$xv11)", ++ ++ ++ "&vadduwm (@x[$a0],@x[$a0],@x[$b0])", ++ "&vadduwm (@x[$a1],@x[$a1],@x[$b1])", ++ "&vadduwm (@x[$a2],@x[$a2],@x[$b2])", ++ "&vadduwm (@x[$a3],@x[$a3],@x[$b3])", ++ "&vadduwm (@x[$a4],@x[$a4],@x[$b4])", ++ "&vadduwm (@x[$a5],@x[$a5],@x[$b5])", ++ "&vadduwm (@x[$a6],@x[$a6],@x[$b6])", ++ "&vadduwm (@x[$a7],@x[$a7],@x[$b7])", ++ ++ "&vxor (@x[$d0],@x[$d0],@x[$a0])", ++ "&vxor (@x[$d1],@x[$d1],@x[$a1])", ++ "&vxor (@x[$d2],@x[$d2],@x[$a2])", ++ "&vxor (@x[$d3],@x[$d3],@x[$a3])", ++ "&vxor (@x[$d4],@x[$d4],@x[$a4])", ++ "&vxor (@x[$d5],@x[$d5],@x[$a5])", ++ "&vxor (@x[$d6],@x[$d6],@x[$a6])", ++ "&vxor (@x[$d7],@x[$d7],@x[$a7])", ++ ++ "&vrlw (@x[$d0],@x[$d0],@x[$c7])", ++ "&vrlw (@x[$d1],@x[$d1],@x[$c7])", ++ "&vrlw (@x[$d2],@x[$d2],@x[$c7])", ++ "&vrlw (@x[$d3],@x[$d3],@x[$c7])", ++ "&vrlw (@x[$d4],@x[$d4],@x[$c7])", ++ "&vrlw (@x[$d5],@x[$d5],@x[$c7])", ++ "&vrlw (@x[$d6],@x[$d6],@x[$c7])", ++ "&vrlw (@x[$d7],@x[$d7],@x[$c7])", ++ ++ "&vxxlorc (@x[$c7], $xv15,$xv15)", ++ "&vxxlor ($xv13 ,@x[$a7],@x[$a7])", ++ "&vxxlorc (@x[$a7], $xv12,$xv12)", ++ ++ "&vadduwm (@x[$c0],@x[$c0],@x[$d0])", ++ "&vadduwm (@x[$c1],@x[$c1],@x[$d1])", ++ "&vadduwm (@x[$c2],@x[$c2],@x[$d2])", ++ 
"&vadduwm (@x[$c3],@x[$c3],@x[$d3])", ++ "&vadduwm (@x[$c4],@x[$c4],@x[$d4])", ++ "&vadduwm (@x[$c5],@x[$c5],@x[$d5])", ++ "&vadduwm (@x[$c6],@x[$c6],@x[$d6])", ++ "&vadduwm (@x[$c7],@x[$c7],@x[$d7])", ++ "&vxor (@x[$b0],@x[$b0],@x[$c0])", ++ "&vxor (@x[$b1],@x[$b1],@x[$c1])", ++ "&vxor (@x[$b2],@x[$b2],@x[$c2])", ++ "&vxor (@x[$b3],@x[$b3],@x[$c3])", ++ "&vxor (@x[$b4],@x[$b4],@x[$c4])", ++ "&vxor (@x[$b5],@x[$b5],@x[$c5])", ++ "&vxor (@x[$b6],@x[$b6],@x[$c6])", ++ "&vxor (@x[$b7],@x[$b7],@x[$c7])", ++ "&vrlw (@x[$b0],@x[$b0],@x[$a7])", ++ "&vrlw (@x[$b1],@x[$b1],@x[$a7])", ++ "&vrlw (@x[$b2],@x[$b2],@x[$a7])", ++ "&vrlw (@x[$b3],@x[$b3],@x[$a7])", ++ "&vrlw (@x[$b4],@x[$b4],@x[$a7])", ++ "&vrlw (@x[$b5],@x[$b5],@x[$a7])", ++ "&vrlw (@x[$b6],@x[$b6],@x[$a7])", ++ "&vrlw (@x[$b7],@x[$b7],@x[$a7])", ++ ++ "&vxxlorc (@x[$a7], $xv13,$xv13)", ++ ); ++} ++ ++$code.=<<___; ++ ++.globl .ChaCha20_ctr32_vsx_8x ++.align 5 ++.ChaCha20_ctr32_vsx_8x: ++ $STU $sp,-$FRAME($sp) ++ mflr r0 ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ mfspr r12,256 ++ stvx v24,r10,$sp ++ addi r10,r10,32 ++ stvx v25,r11,$sp ++ addi r11,r11,32 ++ stvx v26,r10,$sp ++ addi r10,r10,32 ++ stvx v27,r11,$sp ++ addi r11,r11,32 ++ stvx v28,r10,$sp ++ addi r10,r10,32 ++ stvx v29,r11,$sp ++ addi r11,r11,32 ++ stvx v30,r10,$sp ++ stvx v31,r11,$sp ++ stw r12,`$FRAME-4`($sp) # save vrsave ++ li r12,-4096+63 ++ $PUSH r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # preserve 29 AltiVec registers ++ ++ bl Lconsts # returns pointer Lsigma in r12 ++ ++ lvx_4w @K[0],0,r12 # load sigma ++ addi r12,r12,0x70 ++ li $x10,16 ++ li $x20,32 ++ li $x30,48 ++ li r11,64 ++ ++ vspltisw $xa4,-16 # synthesize constants ++ vspltisw $xb4,12 # synthesize constants ++ vspltisw $xc4,8 # synthesize constants ++ vspltisw $xd4,7 # synthesize constants ++ ++ lvx $xa0,$x00,r12 # load [smashed] sigma ++ lvx $xa1,$x10,r12 ++ lvx $xa2,$x20,r12 ++ lvx $xa3,$x30,r12 ++ ++ vxxlor $xv9 ,$xa4,$xa4 #save shift val in vr9-12 ++ vxxlor $xv10 
,$xb4,$xb4 ++ vxxlor $xv11 ,$xc4,$xc4 ++ vxxlor $xv12 ,$xd4,$xd4 ++ vxxlor $xv22 ,$xa0,$xa0 #save sigma in vr22-25 ++ vxxlor $xv23 ,$xa1,$xa1 ++ vxxlor $xv24 ,$xa2,$xa2 ++ vxxlor $xv25 ,$xa3,$xa3 ++ ++ lvx_4w @K[1],0,$key # load key ++ lvx_4w @K[2],$x10,$key ++ lvx_4w @K[3],0,$ctr # load counter ++ vspltisw $xt3,4 ++ ++ ++ vxor $xt2,$xt2,$xt2 ++ lvx_4w $xt1,r11,r12 ++ vspltw $xa2,@K[3],0 #save the original count after spltw ++ vsldoi @K[3],@K[3],$xt2,4 ++ vsldoi @K[3],$xt2,@K[3],12 # clear @K[3].word[0] ++ vadduwm $xt1,$xa2,$xt1 ++ vadduwm $xt3,$xt1,$xt3 # next counter value ++ vspltw $xa0,@K[2],2 # save the K[2] spltw 2 and save v8. ++ ++ be?lvsl $beperm,0,$x10 # 0x00..0f ++ be?vspltisb $xt0,3 # 0x03..03 ++ be?vxor $beperm,$beperm,$xt0 # swap bytes within words ++ be?vxxlor $xv26 ,$beperm,$beperm ++ ++ vxxlor $xv0 ,@K[0],@K[0] # K0,k1,k2 to vr0,1,2 ++ vxxlor $xv1 ,@K[1],@K[1] ++ vxxlor $xv2 ,@K[2],@K[2] ++ vxxlor $xv3 ,@K[3],@K[3] ++ vxxlor $xv4 ,$xt1,$xt1 #CTR ->4, CTR+4-> 5 ++ vxxlor $xv5 ,$xt3,$xt3 ++ vxxlor $xv8 ,$xa0,$xa0 ++ ++ li r0,10 # inner loop counter ++ mtctr r0 ++ b Loop_outer_vsx_8x ++ ++.align 5 ++Loop_outer_vsx_8x: ++ vxxlorc $xa0,$xv22,$xv22 # load [smashed] sigma ++ vxxlorc $xa1,$xv23,$xv23 ++ vxxlorc $xa2,$xv24,$xv24 ++ vxxlorc $xa3,$xv25,$xv25 ++ vxxlorc $xa4,$xv22,$xv22 ++ vxxlorc $xa5,$xv23,$xv23 ++ vxxlorc $xa6,$xv24,$xv24 ++ vxxlorc $xa7,$xv25,$xv25 ++ ++ vspltw $xb0,@K[1],0 # smash the key ++ vspltw $xb1,@K[1],1 ++ vspltw $xb2,@K[1],2 ++ vspltw $xb3,@K[1],3 ++ vspltw $xb4,@K[1],0 # smash the key ++ vspltw $xb5,@K[1],1 ++ vspltw $xb6,@K[1],2 ++ vspltw $xb7,@K[1],3 ++ ++ vspltw $xc0,@K[2],0 ++ vspltw $xc1,@K[2],1 ++ vspltw $xc2,@K[2],2 ++ vspltw $xc3,@K[2],3 ++ vspltw $xc4,@K[2],0 ++ vspltw $xc7,@K[2],3 ++ vspltw $xc5,@K[2],1 ++ ++ vxxlorc $xd0,$xv4,$xv4 # smash the counter ++ vspltw $xd1,@K[3],1 ++ vspltw $xd2,@K[3],2 ++ vspltw $xd3,@K[3],3 ++ vxxlorc $xd4,$xv5,$xv5 # smash the counter ++ vspltw $xd5,@K[3],1 ++ vspltw $xd6,@K[3],2 ++ vspltw 
$xd7,@K[3],3 ++ vxxlorc $xc6,$xv8,$xv8 #copy of vlspt k[2],2 is in v8.v26 ->k[3] so need to wait until k3 is done ++ ++Loop_vsx_8x: ++___ ++ foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; } ++ foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; } ++$code.=<<___; ++ ++ bdnz Loop_vsx_8x ++ vxxlor $xv13 ,$xd4,$xd4 # save the register vr24-31 ++ vxxlor $xv14 ,$xd5,$xd5 # ++ vxxlor $xv15 ,$xd6,$xd6 # ++ vxxlor $xv16 ,$xd7,$xd7 # ++ ++ vxxlor $xv18 ,$xc4,$xc4 # ++ vxxlor $xv19 ,$xc5,$xc5 # ++ vxxlor $xv20 ,$xc6,$xc6 # ++ vxxlor $xv21 ,$xc7,$xc7 # ++ ++ vxxlor $xv6 ,$xb6,$xb6 # save vr23, so we get 8 regs ++ vxxlor $xv7 ,$xb7,$xb7 # save vr23, so we get 8 regs ++ be?vxxlorc $beperm,$xv26,$xv26 # copy back the the beperm. ++ ++ vxxlorc @K[0],$xv0,$xv0 #27 ++ vxxlorc @K[1],$xv1,$xv1 #24 ++ vxxlorc @K[2],$xv2,$xv2 #25 ++ vxxlorc @K[3],$xv3,$xv3 #26 ++ vxxlorc $CTR0,$xv4,$xv4 ++###changing to vertical ++ ++ vmrgew $xt0,$xa0,$xa1 # transpose data ++ vmrgew $xt1,$xa2,$xa3 ++ vmrgow $xa0,$xa0,$xa1 ++ vmrgow $xa2,$xa2,$xa3 ++ ++ vmrgew $xt2,$xb0,$xb1 ++ vmrgew $xt3,$xb2,$xb3 ++ vmrgow $xb0,$xb0,$xb1 ++ vmrgow $xb2,$xb2,$xb3 ++ ++ vadduwm $xd0,$xd0,$CTR0 ++ ++ vpermdi $xa1,$xa0,$xa2,0b00 ++ vpermdi $xa3,$xa0,$xa2,0b11 ++ vpermdi $xa0,$xt0,$xt1,0b00 ++ vpermdi $xa2,$xt0,$xt1,0b11 ++ vpermdi $xb1,$xb0,$xb2,0b00 ++ vpermdi $xb3,$xb0,$xb2,0b11 ++ vpermdi $xb0,$xt2,$xt3,0b00 ++ vpermdi $xb2,$xt2,$xt3,0b11 ++ ++ vmrgew $xt0,$xc0,$xc1 ++ vmrgew $xt1,$xc2,$xc3 ++ vmrgow $xc0,$xc0,$xc1 ++ vmrgow $xc2,$xc2,$xc3 ++ vmrgew $xt2,$xd0,$xd1 ++ vmrgew $xt3,$xd2,$xd3 ++ vmrgow $xd0,$xd0,$xd1 ++ vmrgow $xd2,$xd2,$xd3 ++ ++ vpermdi $xc1,$xc0,$xc2,0b00 ++ vpermdi $xc3,$xc0,$xc2,0b11 ++ vpermdi $xc0,$xt0,$xt1,0b00 ++ vpermdi $xc2,$xt0,$xt1,0b11 ++ vpermdi $xd1,$xd0,$xd2,0b00 ++ vpermdi $xd3,$xd0,$xd2,0b11 ++ vpermdi $xd0,$xt2,$xt3,0b00 ++ vpermdi $xd2,$xt2,$xt3,0b11 ++ ++ vspltisw $xt0,8 ++ vadduwm $CTR0,$CTR0,$xt0 # next counter value ++ vxxlor $xv4 ,$CTR0,$CTR0 #CTR+4-> 5 ++ ++ 
vadduwm $xa0,$xa0,@K[0] ++ vadduwm $xb0,$xb0,@K[1] ++ vadduwm $xc0,$xc0,@K[2] ++ vadduwm $xd0,$xd0,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa1,@K[0] ++ vadduwm $xb0,$xb1,@K[1] ++ vadduwm $xc0,$xc1,@K[2] ++ vadduwm $xd0,$xd1,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa2,@K[0] ++ vadduwm $xb0,$xb2,@K[1] ++ vadduwm $xc0,$xc2,@K[2] ++ vadduwm $xd0,$xd2,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi 
$len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xa0,$xa3,@K[0] ++ vadduwm $xb0,$xb3,@K[1] ++ vadduwm $xc0,$xc3,@K[2] ++ vadduwm $xd0,$xd3,@K[3] ++ ++ be?vperm $xa0,$xa0,$xa0,$beperm ++ be?vperm $xb0,$xb0,$xb0,$beperm ++ be?vperm $xc0,$xc0,$xc0,$beperm ++ be?vperm $xd0,$xd0,$xd0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x ++ ++ lvx_4w $xt0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xt0,$xt0,$xa0 ++ vxor $xt1,$xt1,$xb0 ++ vxor $xt2,$xt2,$xc0 ++ vxor $xt3,$xt3,$xd0 ++ ++ stvx_4w $xt0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++#blk4-7: 24:31 remain the same as we can use the same logic above . Reg a4-b7 remain same.Load c4,d7--> position 8-15.we can reuse vr24-31. ++#VR0-3 : are used to load temp value, vr4 --> as xr0 instead of xt0. ++ ++ vxxlorc $CTR1 ,$xv5,$xv5 ++ ++ vxxlorc $xcn4 ,$xv18,$xv18 ++ vxxlorc $xcn5 ,$xv19,$xv19 ++ vxxlorc $xcn6 ,$xv20,$xv20 ++ vxxlorc $xcn7 ,$xv21,$xv21 ++ ++ vxxlorc $xdn4 ,$xv13,$xv13 ++ vxxlorc $xdn5 ,$xv14,$xv14 ++ vxxlorc $xdn6 ,$xv15,$xv15 ++ vxxlorc $xdn7 ,$xv16,$xv16 ++ vadduwm $xdn4,$xdn4,$CTR1 ++ ++ vxxlorc $xb6 ,$xv6,$xv6 ++ vxxlorc $xb7 ,$xv7,$xv7 ++#use xa1->xr0, as xt0...in the block 4-7 ++ ++ vmrgew $xr0,$xa4,$xa5 # transpose data ++ vmrgew $xt1,$xa6,$xa7 ++ vmrgow $xa4,$xa4,$xa5 ++ vmrgow $xa6,$xa6,$xa7 ++ vmrgew $xt2,$xb4,$xb5 ++ vmrgew $xt3,$xb6,$xb7 ++ vmrgow $xb4,$xb4,$xb5 ++ vmrgow $xb6,$xb6,$xb7 ++ ++ vpermdi $xa5,$xa4,$xa6,0b00 ++ vpermdi $xa7,$xa4,$xa6,0b11 ++ vpermdi $xa4,$xr0,$xt1,0b00 ++ vpermdi $xa6,$xr0,$xt1,0b11 ++ vpermdi $xb5,$xb4,$xb6,0b00 ++ vpermdi $xb7,$xb4,$xb6,0b11 ++ vpermdi $xb4,$xt2,$xt3,0b00 ++ vpermdi $xb6,$xt2,$xt3,0b11 ++ ++ vmrgew $xr0,$xcn4,$xcn5 ++ vmrgew $xt1,$xcn6,$xcn7 ++ vmrgow $xcn4,$xcn4,$xcn5 ++ vmrgow $xcn6,$xcn6,$xcn7 ++ vmrgew 
$xt2,$xdn4,$xdn5 ++ vmrgew $xt3,$xdn6,$xdn7 ++ vmrgow $xdn4,$xdn4,$xdn5 ++ vmrgow $xdn6,$xdn6,$xdn7 ++ ++ vpermdi $xcn5,$xcn4,$xcn6,0b00 ++ vpermdi $xcn7,$xcn4,$xcn6,0b11 ++ vpermdi $xcn4,$xr0,$xt1,0b00 ++ vpermdi $xcn6,$xr0,$xt1,0b11 ++ vpermdi $xdn5,$xdn4,$xdn6,0b00 ++ vpermdi $xdn7,$xdn4,$xdn6,0b11 ++ vpermdi $xdn4,$xt2,$xt3,0b00 ++ vpermdi $xdn6,$xt2,$xt3,0b11 ++ ++ vspltisw $xr0,8 ++ vadduwm $CTR1,$CTR1,$xr0 # next counter value ++ vxxlor $xv5 ,$CTR1,$CTR1 #CTR+4-> 5 ++ ++ vadduwm $xan0,$xa4,@K[0] ++ vadduwm $xbn0,$xb4,@K[1] ++ vadduwm $xcn0,$xcn4,@K[2] ++ vadduwm $xdn0,$xdn4,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm # byte-swap the keyed sums, as for the other blocks ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa5,@K[0] ++ vadduwm $xbn0,$xb5,@K[1] ++ vadduwm $xcn0,$xcn5,@K[2] ++ vadduwm $xdn0,$xdn5,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa6,@K[0] ++ vadduwm $xbn0,$xb6,@K[1] ++ vadduwm $xcn0,$xcn6,@K[2] ++
vadduwm $xdn0,$xdn6,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ vadduwm $xan0,$xa7,@K[0] ++ vadduwm $xbn0,$xb7,@K[1] ++ vadduwm $xcn0,$xcn7,@K[2] ++ vadduwm $xdn0,$xdn7,@K[3] ++ ++ be?vperm $xan0,$xan0,$xan0,$beperm ++ be?vperm $xbn0,$xbn0,$xbn0,$beperm ++ be?vperm $xcn0,$xcn0,$xcn0,$beperm ++ be?vperm $xdn0,$xdn0,$xdn0,$beperm ++ ++ ${UCMP}i $len,0x40 ++ blt Ltail_vsx_8x_1 ++ ++ lvx_4w $xr0,$x00,$inp ++ lvx_4w $xt1,$x10,$inp ++ lvx_4w $xt2,$x20,$inp ++ lvx_4w $xt3,$x30,$inp ++ ++ vxor $xr0,$xr0,$xan0 ++ vxor $xt1,$xt1,$xbn0 ++ vxor $xt2,$xt2,$xcn0 ++ vxor $xt3,$xt3,$xdn0 ++ ++ stvx_4w $xr0,$x00,$out ++ stvx_4w $xt1,$x10,$out ++ addi $inp,$inp,0x40 ++ stvx_4w $xt2,$x20,$out ++ subi $len,$len,0x40 ++ stvx_4w $xt3,$x30,$out ++ addi $out,$out,0x40 ++ beq Ldone_vsx_8x ++ ++ mtctr r0 ++ bne Loop_outer_vsx_8x ++ ++Ldone_vsx_8x: ++ lwz r12,`$FRAME-4`($sp) # pull vrsave ++ li r10,`15+$LOCALS+64` ++ li r11,`31+$LOCALS+64` ++ $POP r0, `$FRAME+$LRSAVE`($sp) ++ mtspr 256,r12 # restore vrsave ++ lvx v24,r10,$sp ++ addi r10,r10,32 ++ lvx v25,r11,$sp ++ addi r11,r11,32 ++ lvx v26,r10,$sp ++ addi r10,r10,32 ++ lvx v27,r11,$sp ++ addi r11,r11,32 ++ lvx v28,r10,$sp ++ addi r10,r10,32 ++ lvx v29,r11,$sp ++ addi r11,r11,32 ++ lvx v30,r10,$sp ++ lvx v31,r11,$sp ++ mtlr r0 ++ addi $sp,$sp,$FRAME ++ blr ++ ++.align 4 ++Ltail_vsx_8x: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xa0,$x00,r11 # offload block to stack ++ stvx_4w 
$xb0,$x10,r11 ++ stvx_4w $xc0,$x20,r11 ++ stvx_4w $xd0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++Ltail_vsx_8x_1: ++ addi r11,$sp,$LOCALS ++ mtctr $len ++ stvx_4w $xan0,$x00,r11 # offload block to stack ++ stvx_4w $xbn0,$x10,r11 ++ stvx_4w $xcn0,$x20,r11 ++ stvx_4w $xdn0,$x30,r11 ++ subi r12,r11,1 # prepare for *++ptr ++ subi $inp,$inp,1 ++ subi $out,$out,1 ++ bl Loop_tail_vsx_8x ++ ++Loop_tail_vsx_8x: ++ lbzu r6,1(r12) ++ lbzu r7,1($inp) ++ xor r6,r6,r7 ++ stbu r6,1($out) ++ bdnz Loop_tail_vsx_8x ++ ++ stvx_4w $K[0],$x00,r11 # wipe copy of the block ++ stvx_4w $K[0],$x10,r11 ++ stvx_4w $K[0],$x20,r11 ++ stvx_4w $K[0],$x30,r11 ++ ++ b Ldone_vsx_8x ++ .long 0 ++ .byte 0,12,0x04,1,0x80,0,5,0 ++ .long 0 ++.size .ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x ++___ ++}}} ++ ++ ++$code.=<<___; ++.align 5 ++Lconsts: ++ mflr r0 ++ bcl 20,31,\$+4 ++ mflr r12 #vvvvv "distance between . and Lsigma ++ addi r12,r12,`64-8` ++ mtlr r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++ .space `64-9*4` ++Lsigma: ++ .long 0x61707865,0x3320646e,0x79622d32,0x6b206574 ++ .long 1,0,0,0 ++ .long 2,0,0,0 ++ .long 3,0,0,0 ++ .long 4,0,0,0 ++___ ++$code.=<<___ if ($LITTLE_ENDIAN); ++ .long 0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001 ++ .long 0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300 ++___ ++$code.=<<___ if (!$LITTLE_ENDIAN); # flipped words ++ .long 0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d ++ .long 0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c ++___ ++$code.=<<___; ++ .long 0x61707865,0x61707865,0x61707865,0x61707865 ++ .long 0x3320646e,0x3320646e,0x3320646e,0x3320646e ++ .long 0x79622d32,0x79622d32,0x79622d32,0x79622d32 ++ .long 0x6b206574,0x6b206574,0x6b206574,0x6b206574 ++ .long 0,1,2,3 ++ .long 0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c ++.asciz "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by " ++.align 2 ++___ ++ ++foreach (split("\n",$code)) { ++ s/\`([^\`]*)\`/eval $1/ge; ++ ++ # instructions prefixed with '?' 
are endian-specific and need ++ # to be adjusted accordingly... ++ if ($flavour !~ /le$/) { # big-endian ++ s/be\?// or ++ s/le\?/#le#/ or ++ s/\?lvsr/lvsl/ or ++ s/\?lvsl/lvsr/ or ++ s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/; ++ } else { # little-endian ++ s/le\?// or ++ s/be\?/#be#/ or ++ s/\?([a-z]+)/$1/ or ++ s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/; ++ } ++ ++ print $_,"\n"; ++} ++ ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info +index c12cb9c..2a819b2 100644 +--- a/crypto/chacha/build.info ++++ b/crypto/chacha/build.info +@@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}] + $CHACHAASM_armv4=chacha-armv4.S + $CHACHAASM_aarch64=chacha-armv8.S + +- $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s ++ $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s + $CHACHAASM_ppc64=$CHACHAASM_ppc32 + + $CHACHAASM_c64xplus=chacha-c64xplus.s +@@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM + GENERATE[chacha-x86.s]=asm/chacha-x86.pl + GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl + GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl ++GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl + GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl + INCLUDE[chacha-armv4.o]=.. 
+ GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl +diff --git a/crypto/chacha/chacha_ppc.c b/crypto/chacha/chacha_ppc.c +index 5319040..f99cca8 100644 +--- a/crypto/chacha/chacha_ppc.c ++++ b/crypto/chacha/chacha_ppc.c +@@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp, + void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]); ++void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp, ++ size_t len, const unsigned int key[8], ++ const unsigned int counter[4]); + void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp, + size_t len, const unsigned int key[8], + const unsigned int counter[4]) + { +- OPENSSL_ppccap_P & PPC_CRYPTO207 +- ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) +- : OPENSSL_ppccap_P & PPC_ALTIVEC +- ? ChaCha20_ctr32_vmx(out, inp, len, key, counter) +- : ChaCha20_ctr32_int(out, inp, len, key, counter); ++ OPENSSL_ppccap_P & PPC_BRD31 ++ ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter) ++ :OPENSSL_ppccap_P & PPC_CRYPTO207 ++ ? ChaCha20_ctr32_vsx(out, inp, len, key, counter) ++ : OPENSSL_ppccap_P & PPC_ALTIVEC ++ ? 
ChaCha20_ctr32_vmx(out, inp, len, key, counter) ++ : ChaCha20_ctr32_int(out, inp, len, key, counter); + } +diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl +index 2ee4440..4590340 100755 +--- a/crypto/perlasm/ppc-xlate.pl ++++ b/crypto/perlasm/ppc-xlate.pl +@@ -293,6 +293,14 @@ my $vpermdi = sub { # xxpermdi + $dm = oct($dm) if ($dm =~ /^0/); + " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7; + }; ++my $vxxlor = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6; ++}; ++my $vxxlorc = sub { # xxlor ++ my ($f, $vrt, $vra, $vrb) = @_; ++ " .long ".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1; ++}; + + # PowerISA 2.07 stuff + sub vcrypto_op { +@@ -377,6 +385,15 @@ my $addex = sub { + }; + my $vmsumudm = sub { vfour_vsr(@_, 35); }; + ++# PowerISA 3.1 stuff ++my $brd = sub { ++ my ($f, $ra, $rs) = @_; ++ " .long ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1); ++}; ++my $vsrq = sub { vcrypto_op(@_, 517); }; ++ ++ ++ + while($line=<>) { + + $line =~ s|[#!;].*$||; # get rid of asm-style comments... 
+diff --git a/crypto/ppccap.c b/crypto/ppccap.c +index 8bcfed2..664627c 100644 +--- a/crypto/ppccap.c ++++ b/crypto/ppccap.c +@@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void); + void OPENSSL_altivec_probe(void); + void OPENSSL_crypto207_probe(void); + void OPENSSL_madd300_probe(void); ++void OPENSSL_brd31_probe(void); + + long OPENSSL_rdtsc_mftb(void); + long OPENSSL_rdtsc_mfspr268(void); +@@ -117,16 +118,21 @@ static unsigned long getauxval(unsigned long key) + #endif + + /* I wish was universally available */ +-#define HWCAP 16 /* AT_HWCAP */ ++#ifndef AT_HWCAP ++# define AT_HWCAP 16 /* AT_HWCAP */ ++#endif + #define HWCAP_PPC64 (1U << 30) + #define HWCAP_ALTIVEC (1U << 28) + #define HWCAP_FPU (1U << 27) + #define HWCAP_POWER6_EXT (1U << 9) + #define HWCAP_VSX (1U << 7) + +-#define HWCAP2 26 /* AT_HWCAP2 */ ++#ifndef AT_HWCAP2 ++# define AT_HWCAP2 26 /* AT_HWCAP2 */ ++#endif + #define HWCAP_VEC_CRYPTO (1U << 25) + #define HWCAP_ARCH_3_00 (1U << 23) ++#define HWCAP_ARCH_3_1 (1U << 18) + + # if defined(__GNUC__) && __GNUC__>=2 + __attribute__ ((constructor)) +@@ -187,6 +193,9 @@ void OPENSSL_cpuid_setup(void) + if (__power_set(0xffffffffU<<17)) /* POWER9 and later */ + OPENSSL_ppccap_P |= PPC_MADD300; + ++ if (__power_set(0xffffffffU<<18)) /* POWER10 and later */ ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ + return; + # endif + #endif +@@ -215,8 +224,8 @@ void OPENSSL_cpuid_setup(void) + + #ifdef OSSL_IMPLEMENT_GETAUXVAL + { +- unsigned long hwcap = getauxval(HWCAP); +- unsigned long hwcap2 = getauxval(HWCAP2); ++ unsigned long hwcap = getauxval(AT_HWCAP); ++ unsigned long hwcap2 = getauxval(AT_HWCAP2); + + if (hwcap & HWCAP_FPU) { + OPENSSL_ppccap_P |= PPC_FPU; +@@ -242,6 +251,10 @@ void OPENSSL_cpuid_setup(void) + if (hwcap2 & HWCAP_ARCH_3_00) { + OPENSSL_ppccap_P |= PPC_MADD300; + } ++ ++ if (hwcap2 & HWCAP_ARCH_3_1) { ++ OPENSSL_ppccap_P |= PPC_BRD31; ++ } + } + #endif + +@@ -263,7 +276,7 @@ void OPENSSL_cpuid_setup(void) + sigaction(SIGILL, &ill_act, &ill_oact); + + 
#ifndef OSSL_IMPLEMENT_GETAUXVAL +- if (sigsetjmp(ill_jmp,1) == 0) { ++ if (sigsetjmp(ill_jmp, 1) == 0) { + OPENSSL_fpu_probe(); + OPENSSL_ppccap_P |= PPC_FPU; + +diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl +index c6555df..706164a 100755 +--- a/crypto/ppccpuid.pl ++++ b/crypto/ppccpuid.pl +@@ -81,6 +81,17 @@ $code=<<___; + .long 0 + .byte 0,12,0x14,0,0,0,0,0 + ++.globl .OPENSSL_brd31_probe ++.align 4 ++.OPENSSL_brd31_probe: ++ xor r0,r0,r0 ++ brd r3,r0 ++ blr ++ .long 0 ++ .byte 0,12,0x14,0,0,0,0,0 ++.size .OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe ++ ++ + .globl .OPENSSL_wipe_cpu + .align 4 + .OPENSSL_wipe_cpu: +diff --git a/include/crypto/ppc_arch.h b/include/crypto/ppc_arch.h +index 3b3ce4b..fcc846c 100644 +--- a/include/crypto/ppc_arch.h ++++ b/include/crypto/ppc_arch.h +@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P; + # define PPC_MADD300 (1<<4) + # define PPC_MFTB (1<<5) + # define PPC_MFSPR268 (1<<6) ++# define PPC_BRD31 (1<<7) + + #endif diff --git a/openssl.spec b/openssl.spec index 38e2e94..0ba4ba0 100644 --- a/openssl.spec +++ b/openssl.spec @@ -140,6 +140,13 @@ Patch68: 0068-CVE-2022-2068.patch Patch69: 0069-CVE-2022-2097.patch # https://github.com/openssl/openssl/commit/edceec7fe0c9a5534ae155c8398c63dd7dd95483 Patch70: 0070-EVP_PKEY_Q_keygen-Call-OPENSSL_init_crypto-to-init-s.patch +# https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c +# https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd +Patch71: 0071-AES-GCM-performance-optimization.patch +# https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149 +# https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa +# hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447 +Patch72: 0072-ChaCha20-performance-optimizations-for-ppc64le.patch License: ASL 2.0 URL: http://www.openssl.org/ @@ -474,6 +481,10 @@ install -m644 
%{SOURCE9} \ - Fix segfault in EVP_PKEY_Q_keygen() when OpenSSL was not previously initialized. Resolves: rhbz#2103289 +- Improve AES-GCM performance on Power9 and Power10 ppc64le + Resolves: rhbz#2051312 +- Improve ChaCha20 performance on Power10 ppc64le + Resolves: rhbz#2051312 * Tue Jul 05 2022 Clemens Lang - 1:3.0.1-37 - CVE-2022-2097: AES OCB fails to encrypt some bytes on 32-bit x86