From 3e6d5a385b19adb1e87d7584b37603182326fb73 Mon Sep 17 00:00:00 2001
From: Clemens Lang <cllang@redhat.com>
Date: Thu, 14 Jul 2022 16:54:25 +0200
Subject: [PATCH] Improve AES-GCM & ChaCha20 perf on Power9+ ppc64le

Backport patches that improve performance of AES-GCM on Power9 and
newer, and ChaCha20 on Power10.

Resolves: rhbz#2051312
Signed-off-by: Clemens Lang <cllang@redhat.com>
---
 0071-AES-GCM-performance-optimization.patch   | 1635 +++++++++++++++++
 ...erformance-optimizations-for-ppc64le.patch | 1493 +++++++++++++++
 openssl.spec                                  |   11 +
 3 files changed, 3139 insertions(+)
 create mode 100644 0071-AES-GCM-performance-optimization.patch
 create mode 100644 0072-ChaCha20-performance-optimizations-for-ppc64le.patch

diff --git a/0071-AES-GCM-performance-optimization.patch b/0071-AES-GCM-performance-optimization.patch
new file mode 100644
index 0000000..edf40ec
--- /dev/null
+++ b/0071-AES-GCM-performance-optimization.patch
@@ -0,0 +1,1635 @@
+Upstream-Status: Backport [https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c, https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd]
+diff --git a/crypto/modes/asm/aes-gcm-ppc.pl b/crypto/modes/asm/aes-gcm-ppc.pl
+new file mode 100644
+index 0000000..6624e6c
+--- /dev/null
++++ b/crypto/modes/asm/aes-gcm-ppc.pl
+@@ -0,0 +1,1438 @@
++#! /usr/bin/env perl
++# Copyright 2014-2020 The OpenSSL Project Authors. All Rights Reserved.
++# Copyright 2021- IBM Inc. All rights reserved
++#
++# Licensed under the Apache License 2.0 (the "License").  You may not use
++# this file except in compliance with the License.  You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++#
++#===================================================================================
++# Written by Danny Tsen <dtsen@us.ibm.com> for OpenSSL Project,
++#
++# GHASH is based on the Karatsuba multiplication method.
++#
++#    Xi xor X1
++#
++#    X1 * H^4 + X2 * H^3 + X3 * H^2 + X4 * H =
++#      (X1.h * H4.h + X1.l * H4.l + X1 * H4) +
++#      (X2.h * H3.h + X2.l * H3.l + X2 * H3) +
++#      (X3.h * H2.h + X3.l * H2.l + X3 * H2) +
++#      (X4.h * H.h + X4.l * H.l + X4 * H)
++#
++# Xi = v0
++# H Poly = v2
++# Hash keys = v3 - v14
++#     ( H.l, H, H.h)
++#     ( H^2.l, H^2, H^2.h)
++#     ( H^3.l, H^3, H^3.h)
++#     ( H^4.l, H^4, H^4.h)
++#
++# v30 is IV
++# v31 - counter 1
++#
++# AES used,
++#     vs0 - vs14 for round keys
++#     v15, v16, v17, v18, v19, v20, v21, v22 for 8 blocks (encrypted)
++#
++# This implementation uses stitched AES-GCM approach to improve overall performance.
++# AES is implemented with 8x blocks and GHASH is using 2 4x blocks.
++#
++# Current large block (16384 bytes) performance per second with 128 bit key --
++#
++#                        Encrypt  Decrypt
++# Power10[le] (3.5GHz)   5.32G    5.26G
++#
++# ===================================================================================
++#
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++if ($flavour =~ /64/) {	# flavour names a 64-bit target: 8-byte size_t, doubleword mnemonics
++	$SIZE_T=8;
++	$LRSAVE=2*$SIZE_T;
++	$STU="stdu";
++	$POP="ld";
++	$PUSH="std";
++	$UCMP="cmpld";
++	$SHRI="srdi";
++} elsif ($flavour =~ /32/) {	# flavour names a 32-bit target: 4-byte size_t, word mnemonics
++	$SIZE_T=4;
++	$LRSAVE=$SIZE_T;
++	$STU="stwu";
++	$POP="lwz";
++	$PUSH="stw";
++	$UCMP="cmplw";
++	$SHRI="srwi";
++} else { die "nonsense $flavour"; }	# flavour contains neither "64" nor "32"
++
++$sp="r1";	# NOTE(review): $sp, $FRAME and the mnemonic variables above are never interpolated into the assembly text below, which hard-codes stdu/std/ld
++$FRAME=6*$SIZE_T+13*16;	# 13*16 is for v20-v31 offload
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;	# $dir = directory part of this script's own path
++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or	# look for the translator next to this script first
++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or	# then under ../../perlasm/
++die "can't locate ppc-xlate.pl";
++
++open STDOUT,"| $^X $xlate $flavour \"$output\""
++    or die "can't call $xlate: $!";	# everything printed from here on is piped through ppc-xlate.pl
++
++$code=<<___;
++.machine        "any"
++.text
++
++# 4x loops
++# v15 - v18 - input states
++# vs1 - vs9 - round keys
++# Applies AES rounds 1-9 to four states; overwrites v19-v23. Not invoked anywhere in this file.
++.macro Loop_aes_middle4x
++	xxlor	19+32, 1, 1	# v19..v22 = copies of round keys vs1..vs4
++	xxlor	20+32, 2, 2
++	xxlor	21+32, 3, 3
++	xxlor	22+32, 4, 4
++
++	vcipher	15, 15, 19	# round 1 on all four states
++	vcipher	16, 16, 19
++	vcipher	17, 17, 19
++	vcipher	18, 18, 19
++
++	vcipher	15, 15, 20	# round 2
++	vcipher	16, 16, 20
++	vcipher	17, 17, 20
++	vcipher	18, 18, 20
++
++	vcipher	15, 15, 21	# round 3
++	vcipher	16, 16, 21
++	vcipher	17, 17, 21
++	vcipher	18, 18, 21
++
++	vcipher	15, 15, 22	# round 4
++	vcipher	16, 16, 22
++	vcipher	17, 17, 22
++	vcipher	18, 18, 22
++
++	xxlor	19+32, 5, 5	# v19..v22 = copies of round keys vs5..vs8
++	xxlor	20+32, 6, 6
++	xxlor	21+32, 7, 7
++	xxlor	22+32, 8, 8
++
++	vcipher	15, 15, 19	# round 5
++	vcipher	16, 16, 19
++	vcipher	17, 17, 19
++	vcipher	18, 18, 19
++
++	vcipher	15, 15, 20	# round 6
++	vcipher	16, 16, 20
++	vcipher	17, 17, 20
++	vcipher	18, 18, 20
++
++	vcipher	15, 15, 21	# round 7
++	vcipher	16, 16, 21
++	vcipher	17, 17, 21
++	vcipher	18, 18, 21
++
++	vcipher	15, 15, 22	# round 8
++	vcipher	16, 16, 22
++	vcipher	17, 17, 22
++	vcipher	18, 18, 22
++
++	xxlor	23+32, 9, 9	# v23 = copy of round key vs9
++	vcipher	15, 15, 23	# round 9
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++.endm
++
++# 8x loops
++# v15 - v22 - input states
++# vs1 - vs9 - round keys
++# Applies AES rounds 1-9 to eight states; overwrites v23-v26 with round-key copies.
++.macro Loop_aes_middle8x
++	xxlor	23+32, 1, 1	# v23..v26 = copies of round keys vs1..vs4
++	xxlor	24+32, 2, 2
++	xxlor	25+32, 3, 3
++	xxlor	26+32, 4, 4
++
++	vcipher	15, 15, 23	# round 1 on all eight states
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 2
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	vcipher	15, 15, 25	# round 3
++	vcipher	16, 16, 25
++	vcipher	17, 17, 25
++	vcipher	18, 18, 25
++	vcipher	19, 19, 25
++	vcipher	20, 20, 25
++	vcipher	21, 21, 25
++	vcipher	22, 22, 25
++
++	vcipher	15, 15, 26	# round 4
++	vcipher	16, 16, 26
++	vcipher	17, 17, 26
++	vcipher	18, 18, 26
++	vcipher	19, 19, 26
++	vcipher	20, 20, 26
++	vcipher	21, 21, 26
++	vcipher	22, 22, 26
++
++	xxlor	23+32, 5, 5	# v23..v26 = copies of round keys vs5..vs8
++	xxlor	24+32, 6, 6
++	xxlor	25+32, 7, 7
++	xxlor	26+32, 8, 8
++
++	vcipher	15, 15, 23	# round 5
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 6
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	vcipher	15, 15, 25	# round 7
++	vcipher	16, 16, 25
++	vcipher	17, 17, 25
++	vcipher	18, 18, 25
++	vcipher	19, 19, 25
++	vcipher	20, 20, 25
++	vcipher	21, 21, 25
++	vcipher	22, 22, 25
++
++	vcipher	15, 15, 26	# round 8
++	vcipher	16, 16, 26
++	vcipher	17, 17, 26
++	vcipher	18, 18, 26
++	vcipher	19, 19, 26
++	vcipher	20, 20, 26
++	vcipher	21, 21, 26
++	vcipher	22, 22, 26
++
++	xxlor	23+32, 9, 9	# v23 = copy of round key vs9
++	vcipher	15, 15, 23	# round 9
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++.endm
++
++# ppc_aes_gcm_ghash: fold the four blocks in v15-v18 into the running hash v0.
++# Compute 4x hash values based on Karatsuba method.
++# Reads v2 (H Poly) and v3-v14 (hash keys); overwrites v0, v15, v23-v29 and vs29.
++ppc_aes_gcm_ghash:	# NOTE(review): no branch to this label is visible in this file
++	vxor		15, 15, 0		# first block ^= Xi
++
++	xxlxor		29, 29, 29		# vs29 = 0
++
++	vpmsumd		23, 12, 15		# H4.L * X.L
++	vpmsumd		24, 9, 16
++	vpmsumd		25, 6, 17
++	vpmsumd		26, 3, 18
++
++	vxor		23, 23, 24
++	vxor		23, 23, 25
++	vxor		23, 23, 26		# L
++
++	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
++	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
++	vpmsumd		26, 7, 17
++	vpmsumd		27, 4, 18
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++	vxor		24, 24, 27		# M
++
++	# sum hash and reduction with H Poly
++	vpmsumd		28, 23, 2		# reduction
++
++	xxlor		29+32, 29, 29		# v29 = vs29 = 0
++	vsldoi		26, 24, 29, 8		# mL
++	vsldoi		29, 29, 24, 8		# mH
++	vxor		23, 23, 26		# mL + L
++
++	vsldoi		23, 23, 23, 8		# swap
++	vxor		23, 23, 28
++
++	vpmsumd		24, 14, 15		# H4.H * X.H
++	vpmsumd		25, 11, 16
++	vpmsumd		26, 8, 17
++	vpmsumd		27, 5, 18
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++	vxor		24, 24, 27		# H
++
++	vxor		24, 24, 29		# H + mH
++
++	# sum hash and reduction with H Poly
++	vsldoi		27, 23, 23, 8		# swap
++	vpmsumd		23, 23, 2
++	vxor		27, 27, 24
++	vxor		23, 23, 27
++
++	xxlor		32, 23+32, 23+32		# update hash
++
++	blr
++
++# ppc_aes_gcm_ghash2_4x: hash eight blocks as two chained 4-block passes.
++# Combine two 4x ghash
++# v15 - v22 - input blocks
++# Reads v0 (Xi), v2 (H Poly), v3-v14 (hash keys); result in v0. Overwrites v15, v19, v23-v29, vs29.
++.macro ppc_aes_gcm_ghash2_4x
++	# first 4x hash
++	vxor		15, 15, 0		# Xi + X
++
++	xxlxor		29, 29, 29		# vs29 = 0
++
++	vpmsumd		23, 12, 15		# H4.L * X.L
++	vpmsumd		24, 9, 16
++	vpmsumd		25, 6, 17
++	vpmsumd		26, 3, 18
++
++	vxor		23, 23, 24
++	vxor		23, 23, 25
++	vxor		23, 23, 26		# L
++
++	vpmsumd		24, 13, 15		# H4.L * X.H + H4.H * X.L
++	vpmsumd		25, 10, 16		# H3.L * X1.H + H3.H * X1.L
++	vpmsumd		26, 7, 17
++	vpmsumd		27, 4, 18
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++
++	# sum hash and reduction with H Poly
++	vpmsumd		28, 23, 2		# reduction
++
++	xxlor		29+32, 29, 29		# v29 = vs29 = 0
++
++	vxor		24, 24, 27		# M
++	vsldoi		26, 24, 29, 8		# mL
++	vsldoi		29, 29, 24, 8		# mH
++	vxor		23, 23, 26		# mL + L
++
++	vsldoi		23, 23, 23, 8		# swap
++	vxor		23, 23, 28
++
++	vpmsumd		24, 14, 15		# H4.H * X.H
++	vpmsumd		25, 11, 16
++	vpmsumd		26, 8, 17
++	vpmsumd		27, 5, 18
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++	vxor		24, 24, 27		# H
++
++	vxor		24, 24, 29		# H + mH
++
++	# sum hash and reduction with H Poly
++	vsldoi		27, 23, 23, 8		# swap
++	vpmsumd		23, 23, 2
++	vxor		27, 27, 24
++	vxor		27, 23, 27		# 1st Xi
++
++	# 2nd 4x hash
++	vpmsumd		24, 9, 20		# products for blocks 6-8 do not depend on the 1st Xi
++	vpmsumd		25, 6, 21
++	vpmsumd		26, 3, 22
++	vxor		19, 19, 27		# Xi + X
++	vpmsumd		23, 12, 19		# H4.L * X.L
++
++	vxor		23, 23, 24
++	vxor		23, 23, 25
++	vxor		23, 23, 26		# L
++
++	vpmsumd		24, 13, 19		# H4.L * X.H + H4.H * X.L
++	vpmsumd		25, 10, 20		# H3.L * X1.H + H3.H * X1.L
++	vpmsumd		26, 7, 21
++	vpmsumd		27, 4, 22
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++
++	# sum hash and reduction with H Poly
++	vpmsumd		28, 23, 2		# reduction
++
++	xxlor		29+32, 29, 29		# v29 = vs29 = 0 again (v29 was used as mH above)
++
++	vxor		24, 24, 27		# M
++	vsldoi		26, 24, 29, 8		# mL
++	vsldoi		29, 29, 24, 8		# mH
++	vxor		23, 23, 26		# mL + L
++
++	vsldoi		23, 23, 23, 8		# swap
++	vxor		23, 23, 28
++
++	vpmsumd		24, 14, 19		# H4.H * X.H
++	vpmsumd		25, 11, 20
++	vpmsumd		26, 8, 21
++	vpmsumd		27, 5, 22
++
++	vxor		24, 24, 25
++	vxor		24, 24, 26
++	vxor		24, 24, 27		# H
++
++	vxor		24, 24, 29		# H + mH
++
++	# sum hash and reduction with H Poly
++	vsldoi		27, 23, 23, 8		# swap
++	vpmsumd		23, 23, 2
++	vxor		27, 27, 24
++	vxor		23, 23, 27
++
++	xxlor		32, 23+32, 23+32		# update hash
++
++.endm
++
++# ppc_update_hash_1x: fold the single block in v28 into the running hash v0.
++# Compute update single hash
++# Reads v2 (H Poly), v3-v5 (Hl, H, Hh); overwrites v0, v19, v20, v22-v28.
++.macro ppc_update_hash_1x
++	vxor		28, 28, 0		# block ^= Xi
++
++	vxor		19, 19, 19		# v19 = 0
++
++	vpmsumd		22, 3, 28		# L
++	vpmsumd		23, 4, 28		# M
++	vpmsumd		24, 5, 28		# H
++
++	vpmsumd		27, 22, 2		# reduction
++
++	vsldoi		25, 23, 19, 8		# mL
++	vsldoi		26, 19, 23, 8		# mH
++	vxor		22, 22, 25		# LL + LL
++	vxor		24, 24, 26		# HH + HH
++
++	vsldoi		22, 22, 22, 8		# swap
++	vxor		22, 22, 27
++
++	vsldoi		20, 22, 22, 8		# swap
++	vpmsumd		22, 22, 2		# reduction
++	vxor		20, 20, 24
++	vxor		22, 22, 20
++
++	vmr		0, 22			# update hash
++
++.endm
++
++#
++# ppc_aes_gcm_encrypt (const void *inp, void *out, size_t len,
++#               const AES_KEY *key, unsigned char iv[16],
++#               void *Xip);
++#
++#    r3 - inp
++#    r4 - out
++#    r5 - len
++#    r6 - AES round keys
++#    r7 - iv
++#    r8 - Xi, H Poly, hash keys
++# Entry: builds a 512-byte frame, saves r14-r21 and v20-v31, loads Xi/hash keys/IV/round keys.
++.global ppc_aes_gcm_encrypt
++.align 5
++ppc_aes_gcm_encrypt:
++_ppc_aes_gcm_encrypt:
++
++	stdu 1,-512(1)	# allocate a 512-byte stack frame
++	mflr 0
++
++	std	14,112(1)	# save r14-r21 at frame offsets 112..168
++	std	15,120(1)
++	std	16,128(1)
++	std	17,136(1)
++	std	18,144(1)
++	std	19,152(1)
++	std	20,160(1)
++	std	21,168(1)
++	li	9, 256	# save v20-v31 at frame offsets 256..432
++	stvx	20, 9, 1
++	addi	9, 9, 16
++	stvx	21, 9, 1
++	addi	9, 9, 16
++	stvx	22, 9, 1
++	addi	9, 9, 16
++	stvx	23, 9, 1
++	addi	9, 9, 16
++	stvx	24, 9, 1
++	addi	9, 9, 16
++	stvx	25, 9, 1
++	addi	9, 9, 16
++	stvx	26, 9, 1
++	addi	9, 9, 16
++	stvx	27, 9, 1
++	addi	9, 9, 16
++	stvx	28, 9, 1
++	addi	9, 9, 16
++	stvx	29, 9, 1
++	addi	9, 9, 16
++	stvx	30, 9, 1
++	addi	9, 9, 16
++	stvx	31, 9, 1
++	std	0, 528(1)	# save LR at 16 bytes above the pre-stdu stack pointer (512+16)
++
++	# Load Xi
++	lxvb16x	32, 0, 8	# load Xi
++
++	# load Hash - h^4, h^3, h^2, h
++	li	10, 32
++	lxvd2x	2+32, 10, 8	# H Poly
++	li	10, 48
++	lxvd2x	3+32, 10, 8	# Hl
++	li	10, 64
++	lxvd2x	4+32, 10, 8	# H
++	li	10, 80
++	lxvd2x	5+32, 10, 8	# Hh
++
++	li	10, 96
++	lxvd2x	6+32, 10, 8	# H^2l
++	li	10, 112
++	lxvd2x	7+32, 10, 8	# H^2
++	li	10, 128
++	lxvd2x	8+32, 10, 8	# H^2h
++
++	li	10, 144
++	lxvd2x	9+32, 10, 8	# H^3l
++	li	10, 160
++	lxvd2x	10+32, 10, 8	# H^3
++	li	10, 176
++	lxvd2x	11+32, 10, 8	# H^3h
++
++	li	10, 192
++	lxvd2x	12+32, 10, 8	# H^4l
++	li	10, 208
++	lxvd2x	13+32, 10, 8	# H^4
++	li	10, 224
++	lxvd2x	14+32, 10, 8	# H^4h
++
++	# load the 16-byte counter block from r7; it is never stored back through r7 in this file
++	lxvb16x	30+32, 0, 7	# load IV  - v30
++
++	mr	12, 5		# r12 = bytes remaining
++	li	11, 0		# r11 = bytes processed so far
++
++	# counter 1
++	vxor	31, 31, 31
++	vspltisb 22, 1
++	vsldoi	31, 31, 22,1	# counter 1: v31 = fifteen 0x00 bytes followed by 0x01
++
++	# load round key to VSR
++	lxv	0, 0(6)
++	lxv	1, 0x10(6)
++	lxv	2, 0x20(6)
++	lxv	3, 0x30(6)
++	lxv	4, 0x40(6)
++	lxv	5, 0x50(6)
++	lxv	6, 0x60(6)
++	lxv	7, 0x70(6)
++	lxv	8, 0x80(6)
++	lxv	9, 0x90(6)
++	lxv	10, 0xa0(6)
++
++	# load rounds - 10 (128), 12 (192), 14 (256)
++	lwz	9,240(6)
++
++	#
++	# vxor	state, state, w # addroundkey
++	xxlor	32+29, 0, 0	# v29 = vs0 (round key 0)
++	vxor	15, 30, 29	# IV + round key - add round key 0
++
++	cmpdi	9, 10
++	beq	Loop_aes_gcm_8x
++
++	# load 2 more round keys (vs11, vs12)
++	lxv	11, 0xb0(6)
++	lxv	12, 0xc0(6)
++
++	cmpdi	9, 12
++	beq	Loop_aes_gcm_8x
++
++	# load 2 more round keys (vs13, vs14)
++	lxv	13, 0xd0(6)
++	lxv	14, 0xe0(6)	# NOTE(review): vs14 (and vs15-vs22, vs29 used later) are written but not saved; confirm whether the target ABI treats them as callee-saved
++	cmpdi	9, 14
++	beq	Loop_aes_gcm_8x
++
++	b	aes_gcm_out	# rounds is not 10/12/14: exit without processing (return value is still len, see aes_gcm_out)
++
++.align 5
++Loop_aes_gcm_8x:	# set up the 8-blocks-per-iteration loop
++	mr	14, 3	# r14 = input cursor
++	mr	9, 4	# r9 = output cursor
++
++	# n blocks
++	li	10, 128
++	divdu	10, 5, 10	# n 128 bytes-blocks
++	cmpdi	10, 0
++	beq	Loop_last_block	# fewer than 128 bytes: go straight to the 16-byte path
++
++	vaddudm	30, 30, 31	# IV + counter
++	vxor	16, 30, 29	# v16..v22 = next seven counter blocks, each xored with round key 0 (v29)
++	vaddudm	30, 30, 31
++	vxor	17, 30, 29
++	vaddudm	30, 30, 31
++	vxor	18, 30, 29
++	vaddudm	30, 30, 31
++	vxor	19, 30, 29
++	vaddudm	30, 30, 31
++	vxor	20, 30, 29
++	vaddudm	30, 30, 31
++	vxor	21, 30, 29
++	vaddudm	30, 30, 31
++	vxor	22, 30, 29
++
++	mtctr	10	# CTR = number of 128-byte groups
++
++	li	15, 16	# r15..r21 = byte offsets 16..112 of blocks 1..7 within a group
++	li	16, 32
++	li	17, 48
++	li	18, 64
++	li	19, 80
++	li	20, 96
++	li	21, 112
++
++	lwz	10, 240(6)	# r10 = rounds (r10 was reused for the block count above)
++
++Loop_8x_block:	# per iteration: load 8 input blocks, run AES on the 8 counter states v15-v22
++
++	lxvb16x		15, 0, 14	# load block into vs15 (VSX regs vs15-vs22 hold input; v15-v22 hold the AES states)
++	lxvb16x		16, 15, 14	# load block
++	lxvb16x		17, 16, 14	# load block
++	lxvb16x		18, 17, 14	# load block
++	lxvb16x		19, 18, 14	# load block
++	lxvb16x		20, 19, 14	# load block
++	lxvb16x		21, 20, 14	# load block
++	lxvb16x		22, 21, 14	# load block
++	addi		14, 14, 128
++
++	Loop_aes_middle8x	# rounds 1-9
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 (vs10) is the last round key
++	beq	Do_next_ghash
++
++	# 192 bits
++	xxlor	24+32, 11, 11	# v24 = vs11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 11
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 (vs12) is the last round key
++	beq	Do_next_ghash
++
++	# 256 bits
++	xxlor	24+32, 13, 13	# v24 = vs13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 13
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14	# 14 rounds: v23 (vs14) is the last round key
++	beq	Do_next_ghash
++	b	aes_gcm_out	# any other round count leaves the loop
++
++Do_next_ghash:	# final AES round, xor with input, store, hash the 8 output blocks, prepare next counters
++
++	#
++	# last round
++	vcipherlast     15, 15, 23
++	vcipherlast     16, 16, 23
++
++	xxlxor		47, 47, 15	# v15 (= vs47) ^= input block in vs15
++	stxvb16x        47, 0, 9	# store output
++	xxlxor		48, 48, 16
++	stxvb16x        48, 15, 9	# store output
++
++	vcipherlast     17, 17, 23
++	vcipherlast     18, 18, 23
++
++	xxlxor		49, 49, 17
++	stxvb16x        49, 16, 9	# store output
++	xxlxor		50, 50, 18
++	stxvb16x        50, 17, 9	# store output
++
++	vcipherlast     19, 19, 23
++	vcipherlast     20, 20, 23
++
++	xxlxor		51, 51, 19
++	stxvb16x        51, 18, 9	# store output
++	xxlxor		52, 52, 20
++	stxvb16x        52, 19, 9	# store output
++
++	vcipherlast     21, 21, 23
++	vcipherlast     22, 22, 23
++
++	xxlxor		53, 53, 21
++	stxvb16x        53, 20, 9	# store output
++	xxlxor		54, 54, 22
++	stxvb16x        54, 21, 9	# store output
++
++	addi		9, 9, 128
++
++	# ghash here
++	ppc_aes_gcm_ghash2_4x	# hashes v15-v22, which now hold the blocks just stored
++
++	xxlor	27+32, 0, 0	# v27 = vs0 (round key 0)
++	vaddudm 30, 30, 31		# IV + counter
++	vmr	29, 30	# v29 = counter value that goes into v15 for the next group
++	vxor    15, 30, 27		# add round key
++	vaddudm 30, 30, 31
++	vxor    16, 30, 27
++	vaddudm 30, 30, 31
++	vxor    17, 30, 27
++	vaddudm 30, 30, 31
++	vxor    18, 30, 27
++	vaddudm 30, 30, 31
++	vxor    19, 30, 27
++	vaddudm 30, 30, 31
++	vxor    20, 30, 27
++	vaddudm 30, 30, 31
++	vxor    21, 30, 27
++	vaddudm 30, 30, 31
++	vxor    22, 30, 27
++
++	addi    12, 12, -128	# remaining -= 128
++	addi    11, 11, 128	# processed += 128
++
++	bdnz	Loop_8x_block
++
++	vmr	30, 29	# loop done: rewind v30 to the counter already prepared in v15
++
++Loop_last_block:	# handle what is left after the 128-byte groups (r12 = bytes remaining)
++	cmpdi   12, 0
++	beq     aes_gcm_out	# nothing left
++
++	# loop last few blocks
++	li      10, 16
++	divdu   10, 12, 10	# r10 = number of whole 16-byte blocks remaining
++
++	mtctr   10
++
++	lwz	10, 240(6)	# r10 = rounds again
++
++	cmpdi   12, 16
++	blt     Final_block	# less than one full block: partial-block path
++
++.macro Loop_aes_middle_1x	# AES rounds 1-9 on the single state v15; overwrites v19-v22 with round-key copies
++	xxlor	19+32, 1, 1	# v19..v22 = copies of round keys vs1..vs4
++	xxlor	20+32, 2, 2
++	xxlor	21+32, 3, 3
++	xxlor	22+32, 4, 4
++
++	vcipher 15, 15, 19	# rounds 1-4
++	vcipher 15, 15, 20
++	vcipher 15, 15, 21
++	vcipher 15, 15, 22
++
++	xxlor	19+32, 5, 5	# v19..v22 = copies of round keys vs5..vs8
++	xxlor	20+32, 6, 6
++	xxlor	21+32, 7, 7
++	xxlor	22+32, 8, 8
++
++	vcipher 15, 15, 19	# rounds 5-8
++	vcipher 15, 15, 20
++	vcipher 15, 15, 21
++	vcipher 15, 15, 22
++
++	xxlor	19+32, 9, 9	# v19 = copy of round key vs9
++	vcipher 15, 15, 19	# round 9
++.endm
++
++Next_rem_block:	# one full 16-byte block per iteration; CTR = whole blocks remaining
++	lxvb16x 15, 0, 14		# load block
++
++	Loop_aes_middle_1x	# rounds 1-9 on v15
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 is the last round key
++	beq	Do_next_1x
++
++	# 192 bits
++	xxlor	24+32, 11, 11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	15, 15, 24	# round 11
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 is the last round key
++	beq	Do_next_1x
++
++	# 256 bits
++	xxlor	24+32, 13, 13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	15, 15, 24	# round 13
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14
++	beq	Do_next_1x	# taken or not, execution continues at Do_next_1x
++
++Do_next_1x:
++	vcipherlast     15, 15, 23
++
++	xxlxor		47, 47, 15	# v15 (= vs47) ^= input block in vs15
++	stxvb16x	47, 0, 9	# store output
++	addi		14, 14, 16
++	addi		9, 9, 16
++
++	vmr		28, 15	# v28 = output block just stored, the hash input
++	ppc_update_hash_1x
++
++	addi		12, 12, -16	# remaining -= 16
++	addi		11, 11, 16	# processed += 16
++	xxlor		19+32, 0, 0	# v19 = vs0 (round key 0)
++	vaddudm		30, 30, 31		# IV + counter
++	vxor		15, 30, 19		# add round key
++
++	bdnz	Next_rem_block
++
++	cmpdi	12, 0
++	beq	aes_gcm_out	# no partial tail
++
++Final_block:	# partial tail: r12 (< 16) bytes remain; v15 already holds counter ^ round key 0
++	Loop_aes_middle_1x	# rounds 1-9 on v15
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 is the last round key
++	beq	Do_final_1x
++
++	# 192 bits
++	xxlor	24+32, 11, 11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	15, 15, 24	# round 11
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 is the last round key
++	beq	Do_final_1x
++
++	# 256 bits
++	xxlor	24+32, 13, 13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	15, 15, 24	# round 13
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14
++	beq	Do_final_1x	# taken or not, execution continues at Do_final_1x
++
++Do_final_1x:
++	vcipherlast     15, 15, 23
++
++	lxvb16x	15, 0, 14		# load last block; NOTE(review): reads 16 bytes although only r12 remain - confirm callers tolerate the over-read
++	xxlxor	47, 47, 15	# v15 (= vs47) ^= input block in vs15
++
++	# create partial block mask
++	li	15, 16
++	sub	15, 15, 12		# index to the mask
++
++	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
++	vspltisb	17, 0		# second 16 bytes - 0x0000...00
++	li	10, 192
++	stvx	16, 10, 1	# frame[192..207] = 0xff
++	addi	10, 10, 16
++	stvx	17, 10, 1	# frame[208..223] = 0x00
++
++	addi	10, 1, 192
++	lxvb16x	16, 15, 10		# load partial block mask: r12 bytes of 0xff, then zeros
++	xxland	47, 47, 16	# zero the bytes of v15 beyond the remaining length
++
++	vmr	28, 15	# hash input = masked output block
++	ppc_update_hash_1x
++
++	# * should store only the remaining bytes.
++	bl	Write_partial_block
++
++	b aes_gcm_out
++
++# Spill v15 to the stack frame at offset 192, then copy its first r12 bytes to the output.
++# Write partial block
++# r9 - output
++# r12 - remaining bytes
++# v15 - partial input data
++# Overwrites r10, r14, r15, r16 and CTR.
++Write_partial_block:
++	li		10, 192
++	stxvb16x	15+32, 10, 1		# last block
++
++	#add		10, 9, 11		# Output
++	addi		10, 9, -1	# r10 = out - 1 (stbu pre-increments)
++	addi		16, 1, 191	# r16 = frame + 191 (lbzu pre-increments)
++
++        mtctr		12			# remaining bytes
++	li		15, 0	# r15 is not read again in this routine
++
++Write_last_byte:
++        lbzu		14, 1(16)	# r14 = next byte of the spilled block
++	stbu		14, 1(10)	# store it to the output
++        bdnz		Write_last_byte
++	blr
++
++aes_gcm_out:	# common exit for encrypt and decrypt: write Xi back, restore registers, return
++	# out = state
++	stxvb16x	32, 0, 8		# write out Xi
++	add	3, 11, 12		# return count = bytes processed (r11) + bytes remaining counter (r12)
++
++	li	9, 256	# restore v20-v31 from frame offsets 256..432
++	lvx	20, 9, 1
++	addi	9, 9, 16
++	lvx	21, 9, 1
++	addi	9, 9, 16
++	lvx	22, 9, 1
++	addi	9, 9, 16
++	lvx	23, 9, 1
++	addi	9, 9, 16
++	lvx	24, 9, 1
++	addi	9, 9, 16
++	lvx	25, 9, 1
++	addi	9, 9, 16
++	lvx	26, 9, 1
++	addi	9, 9, 16
++	lvx	27, 9, 1
++	addi	9, 9, 16
++	lvx	28, 9, 1
++	addi	9, 9, 16
++	lvx	29, 9, 1
++	addi	9, 9, 16
++	lvx	30, 9, 1
++	addi	9, 9, 16
++	lvx	31, 9, 1
++
++	ld	0, 528(1)	# saved LR
++	ld      14,112(1)	# restore r14-r21
++	ld      15,120(1)
++	ld      16,128(1)
++	ld      17,136(1)
++	ld      18,144(1)
++	ld      19,152(1)
++	ld      20,160(1)
++	ld	21,168(1)
++
++	mtlr	0
++	addi	1, 1, 512	# release the 512-byte frame
++	blr
++
++# ppc_aes_gcm_decrypt: reads r3-r8 exactly as the encrypt entry does (inp, out, len, round keys, iv, Xi/hash keys).
++# 8x Decrypt
++# Entry: builds a 512-byte frame, saves r14-r21 and v20-v31, loads Xi/hash keys/IV/round keys.
++.global ppc_aes_gcm_decrypt
++.align 5
++ppc_aes_gcm_decrypt:
++_ppc_aes_gcm_decrypt:
++
++	stdu 1,-512(1)	# allocate a 512-byte stack frame
++	mflr 0
++
++	std	14,112(1)	# save r14-r21 at frame offsets 112..168
++	std	15,120(1)
++	std	16,128(1)
++	std	17,136(1)
++	std	18,144(1)
++	std	19,152(1)
++	std	20,160(1)
++	std	21,168(1)
++	li	9, 256	# save v20-v31 at frame offsets 256..432
++	stvx	20, 9, 1
++	addi	9, 9, 16
++	stvx	21, 9, 1
++	addi	9, 9, 16
++	stvx	22, 9, 1
++	addi	9, 9, 16
++	stvx	23, 9, 1
++	addi	9, 9, 16
++	stvx	24, 9, 1
++	addi	9, 9, 16
++	stvx	25, 9, 1
++	addi	9, 9, 16
++	stvx	26, 9, 1
++	addi	9, 9, 16
++	stvx	27, 9, 1
++	addi	9, 9, 16
++	stvx	28, 9, 1
++	addi	9, 9, 16
++	stvx	29, 9, 1
++	addi	9, 9, 16
++	stvx	30, 9, 1
++	addi	9, 9, 16
++	stvx	31, 9, 1
++	std	0, 528(1)	# save LR at 16 bytes above the pre-stdu stack pointer (512+16)
++
++	# Load Xi
++	lxvb16x	32, 0, 8	# load Xi
++
++	# load Hash - h^4, h^3, h^2, h
++	li	10, 32
++	lxvd2x	2+32, 10, 8	# H Poly
++	li	10, 48
++	lxvd2x	3+32, 10, 8	# Hl
++	li	10, 64
++	lxvd2x	4+32, 10, 8	# H
++	li	10, 80
++	lxvd2x	5+32, 10, 8	# Hh
++
++	li	10, 96
++	lxvd2x	6+32, 10, 8	# H^2l
++	li	10, 112
++	lxvd2x	7+32, 10, 8	# H^2
++	li	10, 128
++	lxvd2x	8+32, 10, 8	# H^2h
++
++	li	10, 144
++	lxvd2x	9+32, 10, 8	# H^3l
++	li	10, 160
++	lxvd2x	10+32, 10, 8	# H^3
++	li	10, 176
++	lxvd2x	11+32, 10, 8	# H^3h
++
++	li	10, 192
++	lxvd2x	12+32, 10, 8	# H^4l
++	li	10, 208
++	lxvd2x	13+32, 10, 8	# H^4
++	li	10, 224
++	lxvd2x	14+32, 10, 8	# H^4h
++
++	# load the 16-byte counter block from r7; it is never stored back through r7 in this file
++	lxvb16x	30+32, 0, 7	# load IV  - v30
++
++	mr	12, 5		# r12 = bytes remaining
++	li	11, 0		# r11 = bytes processed so far
++
++	# counter 1
++	vxor	31, 31, 31
++	vspltisb 22, 1
++	vsldoi	31, 31, 22,1	# counter 1: v31 = fifteen 0x00 bytes followed by 0x01
++
++	# load round key to VSR
++	lxv	0, 0(6)
++	lxv	1, 0x10(6)
++	lxv	2, 0x20(6)
++	lxv	3, 0x30(6)
++	lxv	4, 0x40(6)
++	lxv	5, 0x50(6)
++	lxv	6, 0x60(6)
++	lxv	7, 0x70(6)
++	lxv	8, 0x80(6)
++	lxv	9, 0x90(6)
++	lxv	10, 0xa0(6)
++
++	# load rounds - 10 (128), 12 (192), 14 (256)
++	lwz	9,240(6)
++
++	#
++	# vxor	state, state, w # addroundkey
++	xxlor	32+29, 0, 0	# v29 = vs0 (round key 0)
++	vxor	15, 30, 29	# IV + round key - add round key 0
++
++	cmpdi	9, 10
++	beq	Loop_aes_gcm_8x_dec
++
++	# load 2 more round keys (vs11, vs12)
++	lxv	11, 0xb0(6)
++	lxv	12, 0xc0(6)
++
++	cmpdi	9, 12
++	beq	Loop_aes_gcm_8x_dec
++
++	# load 2 more round keys (vs13, vs14)
++	lxv	13, 0xd0(6)
++	lxv	14, 0xe0(6)	# NOTE(review): vs14 (and vs15-vs22, vs29 used later) are written but not saved; confirm whether the target ABI treats them as callee-saved
++	cmpdi	9, 14
++	beq	Loop_aes_gcm_8x_dec
++
++	b	aes_gcm_out	# rounds is not 10/12/14: exit without processing
++
++.align 5
++Loop_aes_gcm_8x_dec:	# set up the 8-blocks-per-iteration decrypt loop
++	mr	14, 3	# r14 = input cursor
++	mr	9, 4	# r9 = output cursor
++
++	# n blocks
++	li	10, 128
++	divdu	10, 5, 10	# n 128 bytes-blocks
++	cmpdi	10, 0
++	beq	Loop_last_block_dec	# fewer than 128 bytes: go straight to the 16-byte path
++
++	vaddudm	30, 30, 31	# IV + counter
++	vxor	16, 30, 29	# v16..v22 = next seven counter blocks, each xored with round key 0 (v29)
++	vaddudm	30, 30, 31
++	vxor	17, 30, 29
++	vaddudm	30, 30, 31
++	vxor	18, 30, 29
++	vaddudm	30, 30, 31
++	vxor	19, 30, 29
++	vaddudm	30, 30, 31
++	vxor	20, 30, 29
++	vaddudm	30, 30, 31
++	vxor	21, 30, 29
++	vaddudm	30, 30, 31
++	vxor	22, 30, 29
++
++	mtctr	10	# CTR = number of 128-byte groups
++
++	li	15, 16	# r15..r21 = byte offsets 16..112 of blocks 1..7 within a group
++	li	16, 32
++	li	17, 48
++	li	18, 64
++	li	19, 80
++	li	20, 96
++	li	21, 112
++
++	lwz	10, 240(6)	# r10 = rounds (r10 was reused for the block count above)
++
++Loop_8x_block_dec:	# per iteration: load 8 input blocks, run AES on the 8 counter states v15-v22
++
++	lxvb16x		15, 0, 14	# load block into vs15 (VSX regs vs15-vs22 hold input; v15-v22 hold the AES states)
++	lxvb16x		16, 15, 14	# load block
++	lxvb16x		17, 16, 14	# load block
++	lxvb16x		18, 17, 14	# load block
++	lxvb16x		19, 18, 14	# load block
++	lxvb16x		20, 19, 14	# load block
++	lxvb16x		21, 20, 14	# load block
++	lxvb16x		22, 21, 14	# load block
++	addi		14, 14, 128
++
++	Loop_aes_middle8x	# rounds 1-9
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 (vs10) is the last round key
++	beq	Do_last_aes_dec
++
++	# 192 bits
++	xxlor	24+32, 11, 11	# v24 = vs11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 11
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 (vs12) is the last round key
++	beq	Do_last_aes_dec
++
++	# 256 bits
++	xxlor	24+32, 13, 13	# v24 = vs13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	16, 16, 23
++	vcipher	17, 17, 23
++	vcipher	18, 18, 23
++	vcipher	19, 19, 23
++	vcipher	20, 20, 23
++	vcipher	21, 21, 23
++	vcipher	22, 22, 23
++
++	vcipher	15, 15, 24	# round 13
++	vcipher	16, 16, 24
++	vcipher	17, 17, 24
++	vcipher	18, 18, 24
++	vcipher	19, 19, 24
++	vcipher	20, 20, 24
++	vcipher	21, 21, 24
++	vcipher	22, 22, 24
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14	# 14 rounds: v23 (vs14) is the last round key
++	beq	Do_last_aes_dec
++	b	aes_gcm_out	# any other round count leaves the loop
++
++Do_last_aes_dec:	# final AES round, xor with input, store, hash the 8 INPUT blocks, prepare next counters
++
++	#
++	# last round
++	vcipherlast     15, 15, 23
++	vcipherlast     16, 16, 23
++
++	xxlxor		47, 47, 15	# v15 (= vs47) ^= input block in vs15
++	stxvb16x        47, 0, 9	# store output
++	xxlxor		48, 48, 16
++	stxvb16x        48, 15, 9	# store output
++
++	vcipherlast     17, 17, 23
++	vcipherlast     18, 18, 23
++
++	xxlxor		49, 49, 17
++	stxvb16x        49, 16, 9	# store output
++	xxlxor		50, 50, 18
++	stxvb16x        50, 17, 9	# store output
++
++	vcipherlast     19, 19, 23
++	vcipherlast     20, 20, 23
++
++	xxlxor		51, 51, 19
++	stxvb16x        51, 18, 9	# store output
++	xxlxor		52, 52, 20
++	stxvb16x        52, 19, 9	# store output
++
++	vcipherlast     21, 21, 23
++	vcipherlast     22, 22, 23
++
++	xxlxor		53, 53, 21
++	stxvb16x        53, 20, 9	# store output
++	xxlxor		54, 54, 22
++	stxvb16x        54, 21, 9	# store output
++
++	addi		9, 9, 128
++
++	xxlor		15+32, 15, 15	# v15..v22 = the input blocks still held in vs15..vs22, so the hash runs over the input
++	xxlor		16+32, 16, 16
++	xxlor		17+32, 17, 17
++	xxlor		18+32, 18, 18
++	xxlor		19+32, 19, 19
++	xxlor		20+32, 20, 20
++	xxlor		21+32, 21, 21
++	xxlor		22+32, 22, 22
++
++	# ghash here
++	ppc_aes_gcm_ghash2_4x
++
++	xxlor	27+32, 0, 0	# v27 = vs0 (round key 0)
++	vaddudm 30, 30, 31		# IV + counter
++	vmr	29, 30	# v29 = counter value that goes into v15 for the next group
++	vxor    15, 30, 27		# add round key
++	vaddudm 30, 30, 31
++	vxor    16, 30, 27
++	vaddudm 30, 30, 31
++	vxor    17, 30, 27
++	vaddudm 30, 30, 31
++	vxor    18, 30, 27
++	vaddudm 30, 30, 31
++	vxor    19, 30, 27
++	vaddudm 30, 30, 31
++	vxor    20, 30, 27
++	vaddudm 30, 30, 31
++	vxor    21, 30, 27
++	vaddudm 30, 30, 31
++	vxor    22, 30, 27
++	addi    12, 12, -128	# remaining -= 128
++	addi    11, 11, 128	# processed += 128
++
++	bdnz	Loop_8x_block_dec
++
++	vmr	30, 29	# loop done: rewind v30 to the counter already prepared in v15
++
++Loop_last_block_dec:	# handle what is left after the 128-byte groups (r12 = bytes remaining)
++	cmpdi   12, 0
++	beq     aes_gcm_out	# nothing left
++
++	# loop last few blocks
++	li      10, 16
++	divdu   10, 12, 10	# r10 = number of whole 16-byte blocks remaining
++
++	mtctr   10
++
++	lwz	10,240(6)	# r10 = rounds again
++
++	cmpdi   12, 16
++	blt     Final_block_dec	# less than one full block: partial-block path
++
++Next_rem_block_dec:	# one full 16-byte block per iteration; CTR = whole blocks remaining
++	lxvb16x 15, 0, 14		# load block
++
++	Loop_aes_middle_1x	# rounds 1-9 on v15
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 is the last round key
++	beq	Do_next_1x_dec
++
++	# 192 bits
++	xxlor	24+32, 11, 11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	15, 15, 24	# round 11
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 is the last round key
++	beq	Do_next_1x_dec
++
++	# 256 bits
++	xxlor	24+32, 13, 13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	15, 15, 24	# round 13
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14
++	beq	Do_next_1x_dec	# taken or not, execution continues at Do_next_1x_dec
++
++Do_next_1x_dec:
++	vcipherlast     15, 15, 23
++
++	xxlxor  47, 47, 15	# v15 (= vs47) ^= input block in vs15
++	stxvb16x        47, 0, 9	# store output
++	addi	14, 14, 16
++	addi	9, 9, 16
++
++	xxlor	28+32, 15, 15	# v28 = input block (vs15), the hash input
++	ppc_update_hash_1x
++
++	addi    12, 12, -16	# remaining -= 16
++	addi    11, 11, 16	# processed += 16
++	xxlor	19+32, 0, 0	# v19 = vs0 (round key 0)
++	vaddudm 30, 30, 31		# IV + counter
++	vxor	15, 30, 19		# add round key
++
++	bdnz	Next_rem_block_dec
++
++	cmpdi	12, 0
++	beq	aes_gcm_out	# no partial tail
++
++Final_block_dec:	# partial tail: r12 (< 16) bytes remain; v15 already holds counter ^ round key 0
++	Loop_aes_middle_1x	# rounds 1-9 on v15
++
++	xxlor	23+32, 10, 10	# v23 = vs10
++
++	cmpdi	10, 10	# 10 rounds: v23 is the last round key
++	beq	Do_final_1x_dec
++
++	# 192 bits
++	xxlor	24+32, 11, 11
++
++	vcipher	15, 15, 23	# round 10
++	vcipher	15, 15, 24	# round 11
++
++	xxlor	23+32, 12, 12	# v23 = vs12
++
++	cmpdi	10, 12	# 12 rounds: v23 is the last round key
++	beq	Do_final_1x_dec
++
++	# 256 bits
++	xxlor	24+32, 13, 13
++
++	vcipher	15, 15, 23	# round 12
++	vcipher	15, 15, 24	# round 13
++
++	xxlor	23+32, 14, 14	# v23 = vs14
++
++	cmpdi	10, 14
++	beq	Do_final_1x_dec	# taken or not, execution continues at Do_final_1x_dec
++
++Do_final_1x_dec:
++	vcipherlast     15, 15, 23
++
++	lxvb16x	15, 0, 14		# load block; NOTE(review): reads 16 bytes although only r12 remain - confirm callers tolerate the over-read
++	xxlxor	47, 47, 15	# v15 (= vs47) ^= input block in vs15
++
++	# create partial block mask
++	li	15, 16
++	sub	15, 15, 12		# index to the mask
++
++	vspltisb	16, -1		# first 16 bytes - 0xffff...ff
++	vspltisb	17, 0		# second 16 bytes - 0x0000...00
++	li	10, 192
++	stvx	16, 10, 1	# frame[192..207] = 0xff
++	addi	10, 10, 16
++	stvx	17, 10, 1	# frame[208..223] = 0x00
++
++	addi	10, 1, 192
++	lxvb16x	16, 15, 10		# load block mask: r12 bytes of 0xff, then zeros
++	xxland	47, 47, 16	# zero the bytes of v15 beyond the remaining length
++
++	xxland	28+32, 15, 16	# hash input = input block masked to r12 bytes (was an unmasked copy of vs15, which hashed bytes past the end)
++	ppc_update_hash_1x
++
++	# * should store only the remaining bytes.
++	bl	Write_partial_block
++
++	b aes_gcm_out
++
++
++___
++
++foreach (split("\n",$code)) {	# emit $code one line at a time
++	s/\`([^\`]*)\`/eval $1/geo;	# replace each backtick-quoted span with the result of eval'ing it
++
++	if ($flavour =~ /le$/o) {	# little-endian: strip a "le?" marker, else turn "be?" into "#be#"
++	    s/le\?//o		or
++	    s/be\?/#be#/o;
++	} else {	# otherwise: turn "le?" into "#le#", else strip a "be?" marker
++	    s/le\?/#le#/o	or
++	    s/be\?//o;
++	}
++	print $_,"\n";	# STDOUT is the pipe into ppc-xlate.pl opened earlier in this script
++}
++
++close STDOUT or die "error closing STDOUT: $!"; # enforce flush
+diff --git a/crypto/modes/build.info b/crypto/modes/build.info
+index 687e872..0ea122e 100644
+--- a/crypto/modes/build.info
++++ b/crypto/modes/build.info
+@@ -32,7 +32,7 @@ IF[{- !$disabled{asm} -}]
+   $MODESASM_parisc20_64=$MODESASM_parisc11
+   $MODESDEF_parisc20_64=$MODESDEF_parisc11
+ 
+-  $MODESASM_ppc32=ghashp8-ppc.s
++  $MODESASM_ppc32=ghashp8-ppc.s aes-gcm-ppc.s
+   $MODESDEF_ppc32=
+   $MODESASM_ppc64=$MODESASM_ppc32
+   $MODESDEF_ppc64=$MODESDEF_ppc32
+@@ -71,6 +71,7 @@ INCLUDE[ghash-sparcv9.o]=..
+ GENERATE[ghash-alpha.S]=asm/ghash-alpha.pl
+ GENERATE[ghash-parisc.s]=asm/ghash-parisc.pl
+ GENERATE[ghashp8-ppc.s]=asm/ghashp8-ppc.pl
++GENERATE[aes-gcm-ppc.s]=asm/aes-gcm-ppc.pl
+ GENERATE[ghash-armv4.S]=asm/ghash-armv4.pl
+ INCLUDE[ghash-armv4.o]=..
+ GENERATE[ghashv8-armx.S]=asm/ghashv8-armx.pl
+diff --git a/include/crypto/aes_platform.h b/include/crypto/aes_platform.h
+index e95ad5a..0c281a3 100644
+--- a/include/crypto/aes_platform.h
++++ b/include/crypto/aes_platform.h
+@@ -74,6 +74,26 @@ void AES_xts_decrypt(const unsigned char *inp, unsigned char *out, size_t len,
+ #   define HWAES_ctr32_encrypt_blocks aes_p8_ctr32_encrypt_blocks
+ #   define HWAES_xts_encrypt aes_p8_xts_encrypt
+ #   define HWAES_xts_decrypt aes_p8_xts_decrypt
++#   define PPC_AES_GCM_CAPABLE (OPENSSL_ppccap_P & PPC_MADD300)
++#   define AES_GCM_ENC_BYTES 128
++#   define AES_GCM_DEC_BYTES 128
++size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out,
++                           size_t len, const void *key, unsigned char ivec[16],
++                           u64 *Xi);
++size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out,
++                           size_t len, const void *key, unsigned char ivec[16],
++                           u64 *Xi);
++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out,
++                                size_t len, const void *key,
++                                unsigned char ivec[16], u64 *Xi);
++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out,
++                                size_t len, const void *key,
++                                unsigned char ivec[16], u64 *Xi);
++#   define AES_gcm_encrypt ppc_aes_gcm_encrypt_wrap
++#   define AES_gcm_decrypt ppc_aes_gcm_decrypt_wrap
++#   define AES_GCM_ASM(gctx) ((gctx)->ctr==aes_p8_ctr32_encrypt_blocks && \
++                              (gctx)->gcm.ghash==gcm_ghash_p8)
++void gcm_ghash_p8(u64 Xi[2],const u128 Htable[16],const u8 *inp, size_t len);
+ #  endif /* PPC */
+ 
+ #  if (defined(__arm__) || defined(__arm) || defined(__aarch64__))
+diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw.c b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
+index 44fa9d4..789ec12 100644
+--- a/providers/implementations/ciphers/cipher_aes_gcm_hw.c
++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw.c
+@@ -141,6 +141,8 @@ static const PROV_GCM_HW aes_gcm = {
+ # include "cipher_aes_gcm_hw_t4.inc"
+ #elif defined(AES_PMULL_CAPABLE) && defined(AES_GCM_ASM)
+ # include "cipher_aes_gcm_hw_armv8.inc"
++#elif defined(PPC_AES_GCM_CAPABLE)
++# include "cipher_aes_gcm_hw_ppc.inc"
+ #else
+ const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
+ {
+diff --git a/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
+new file mode 100644
+index 0000000..4eed0f4
+--- /dev/null
++++ b/providers/implementations/ciphers/cipher_aes_gcm_hw_ppc.inc
+@@ -0,0 +1,119 @@
++/*
++ * Copyright 2001-2021 The OpenSSL Project Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License 2.0 (the "License").  You may not use
++ * this file except in compliance with the License.  You can obtain a copy
++ * in the file LICENSE in the source distribution or at
++ * https://www.openssl.org/source/license.html
++ */
++
++/*-
++ * PPC support for AES GCM.
++ * This file is included by cipher_aes_gcm_hw.c
++ */
++/* Key setup for the PPC GCM table: hands the aes_p8_* routines to GCM_HW_SET_KEY_CTR_FN; always returns 1. */
++static int aes_ppc_gcm_initkey(PROV_GCM_CTX *ctx, const unsigned char *key,
++                               size_t keylen)
++{
++    PROV_AES_GCM_CTX *actx = (PROV_AES_GCM_CTX *)ctx; /* view ctx as the AES-specific context */
++    AES_KEY *ks = &actx->ks.ks;
++    /* NOTE(review): key and keylen are not named below; presumably the macro picks them up by name -- confirm */
++    GCM_HW_SET_KEY_CTR_FN(ks, aes_p8_set_encrypt_key, aes_p8_encrypt,
++                          aes_p8_ctr32_encrypt_blocks);
++    return 1; /* unconditional success */
++}
++
++/* Bulk routines called by aes_p10_gcm_crypt() below; same prototypes as in aes_platform.h (presumably the aes-gcm-ppc.pl assembly -- confirm). */
++extern size_t ppc_aes_gcm_encrypt(const unsigned char *in, unsigned char *out, size_t len,
++                                  const void *key, unsigned char ivec[16], u64 *Xi);
++extern size_t ppc_aes_gcm_decrypt(const unsigned char *in, unsigned char *out, size_t len,
++                                  const void *key, unsigned char ivec[16], u64 *Xi);
++/* Assemble buf[0..3] into a u32, with buf[0] as the most significant byte. */
++static inline u32 UTO32(unsigned char *buf)
++{
++    return ((((((u32) buf[0] << 8) | buf[1]) << 8) | buf[2]) << 8) | buf[3];
++}
++/* Add n to the big-endian 32-bit value stored at buf[0..3], write the sum back, and return it. */
++static inline u32 add32TOU(unsigned char buf[4], u32 n)
++{
++    /* Assumes u32 is an unsigned 32-bit type, so the sum wraps modulo 2^32. */
++    const u32 sum = UTO32(buf) + n;
++    int i;
++
++    /* Store the sum back, most significant byte first. */
++    for (i = 0; i < 4; i++)
++        buf[i] = (unsigned char) (sum >> (24 - 8 * i));
++
++    return sum;
++}
++/* Drive the bulk AES-GCM routine in pieces that each stop at a 32-bit counter wrap; returns the summed byte counts. */
++static size_t aes_p10_gcm_crypt(const unsigned char *in, unsigned char *out, size_t len,
++                                const void *key, unsigned char ivec[16], u64 *Xi, int encrypt)
++{
++    size_t s = 0;       /* byte count reported by one bulk call; size_t to match its return type */
++    size_t ndone = 0;   /* running total; size_t so totals above INT_MAX are not truncated */
++    int ctr_reset = 0;  /* set when the current piece ends exactly at a counter wrap */
++    u64 blocks_unused;
++    u64 nb = len / 16;  /* whole 16-byte blocks only; a trailing partial block is not processed here */
++    u64 next_ctr = 0;
++    unsigned char ctr_saved[12];
++
++    memcpy(ctr_saved, ivec, 12);    /* leading 12 bytes of ivec, restored after every bulk call */
++
++    while (nb) {    /* one pass per stretch of counter values that ends at a wrap or at the input's end */
++        blocks_unused = (u64) 0xffffffffU + 1 - (u64) UTO32 (ivec + 12);   /* blocks left before ivec[12..15] wraps */
++        if (nb > blocks_unused) {
++            len = blocks_unused * 16;   /* only go up to the wrap on this pass */
++            nb -= blocks_unused;
++            next_ctr = blocks_unused;
++            ctr_reset = 1;
++        } else {
++            len = nb * 16;              /* everything that remains fits before the wrap */
++            next_ctr = nb;
++            nb = 0;
++        }
++
++        s = encrypt ? ppc_aes_gcm_encrypt(in, out, len, key, ivec, Xi)
++                    : ppc_aes_gcm_decrypt(in, out, len, key, ivec, Xi);
++
++        /* add counter to ivec (the u32 cast makes a full 2^32-block pass add 0) */
++        add32TOU(ivec + 12, (u32) next_ctr);
++        if (ctr_reset) {    /* more input follows: step past the piece just handled */
++            ctr_reset = 0;
++            in += len;
++            out += len;
++        }
++        memcpy(ivec, ctr_saved, 12);    /* put the saved 12 bytes back (presumably in case the bulk call changed them) */
++        ndone += s;
++    }
++
++    return ndone;
++}
++/* Encrypt entry point (aes_platform.h maps AES_gcm_encrypt here): forwards to aes_p10_gcm_crypt() with encrypt = 1. */
++size_t ppc_aes_gcm_encrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
++                                const void *key, unsigned char ivec[16], u64 *Xi)
++{
++    return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 1);
++}
++/* Decrypt entry point (aes_platform.h maps AES_gcm_decrypt here): forwards to aes_p10_gcm_crypt() with encrypt = 0. */
++size_t ppc_aes_gcm_decrypt_wrap(const unsigned char *in, unsigned char *out, size_t len,
++                                const void *key, unsigned char ivec[16], u64 *Xi)
++{
++    return aes_p10_gcm_crypt(in, out, len, key, ivec, Xi, 0);
++}
++
++/* GCM method table for PPC: PPC-specific key setup, the shared ossl_gcm_* and generic handlers for every other slot. */
++static const PROV_GCM_HW aes_ppc_gcm = {
++    aes_ppc_gcm_initkey,
++    ossl_gcm_setiv,
++    ossl_gcm_aad_update,
++    generic_aes_gcm_cipher_update,
++    ossl_gcm_cipher_final,
++    ossl_gcm_one_shot
++};
++/* Returns the PPC table when PPC_AES_GCM_CAPABLE is nonzero, otherwise the generic aes_gcm table; keybits is not consulted. */
++const PROV_GCM_HW *ossl_prov_aes_hw_gcm(size_t keybits)
++{
++    return PPC_AES_GCM_CAPABLE ? &aes_ppc_gcm : &aes_gcm;
++}
++
diff --git a/0072-ChaCha20-performance-optimizations-for-ppc64le.patch b/0072-ChaCha20-performance-optimizations-for-ppc64le.patch
new file mode 100644
index 0000000..527b901
--- /dev/null
+++ b/0072-ChaCha20-performance-optimizations-for-ppc64le.patch
@@ -0,0 +1,1493 @@
+Upstream-Status: Backport [
+    https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149,
+    https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa,
+    hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447
+]
+diff --git a/crypto/chacha/asm/chachap10-ppc.pl b/crypto/chacha/asm/chachap10-ppc.pl
+new file mode 100755
+index 0000000..36e9a8d
+--- /dev/null
++++ b/crypto/chacha/asm/chachap10-ppc.pl
+@@ -0,0 +1,1288 @@
++#! /usr/bin/env perl
++# Copyright 2016-2020 The OpenSSL Project Authors. All Rights Reserved.
++#
++# Licensed under the Apache License 2.0 (the "License").  You may not use
++# this file except in compliance with the License.  You can obtain a copy
++# in the file LICENSE in the source distribution or at
++# https://www.openssl.org/source/license.html
++
++#
++# ====================================================================
++# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
++# project. The module is, however, dual licensed under OpenSSL and
++# CRYPTOGAMS licenses depending on where you obtain it. For further
++# details see http://www.openssl.org/~appro/cryptogams/.
++# ====================================================================
++#
++# October 2015
++#
++# ChaCha20 for PowerPC/AltiVec.
++#
++# June 2018
++#
++# Add VSX 2.07 code path. Original 3xAltiVec+1xIALU is well-suited for
++# processors that can't issue more than one vector instruction per
++# cycle. But POWER8 (and POWER9) can issue a pair, and vector-only 4x
++# interleave would perform better. Incidentally PowerISA 2.07 (first
++# implemented by POWER8) defined new usable instructions, hence 4xVSX
++# code path...
++#
++# Performance in cycles per byte out of large buffer.
++#
++#			IALU/gcc-4.x    3xAltiVec+1xIALU	4xVSX
++#
++# Freescale e300	13.6/+115%	-			-
++# PPC74x0/G4e		6.81/+310%	3.81			-
++# PPC970/G5		9.29/+160%	?			-
++# POWER7		8.62/+61%	3.35			-
++# POWER8		8.70/+51%	2.91			2.09
++# POWER9		8.80/+29%	4.44(*)			2.45(**)
++#
++# (*)	this is trade-off result, it's possible to improve it, but
++#	then it would negatively affect all others;
++# (**)	POWER9 seems to be "allergic" to mixing vector and integer
++#	instructions, which is why switch to vector-only code pays
++#	off that much;
++
++# $output is the last argument if it looks like a file (it has an extension)
++# $flavour is the first argument if it doesn't look like a file
++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
++
++if ($flavour =~ /64/) {			# 64-bit flavour: 8-byte slots, stdu/ld/std/cmpld
++	$SIZE_T	=8;
++	$LRSAVE	=2*$SIZE_T;
++	$STU	="stdu";
++	$POP	="ld";
++	$PUSH	="std";
++	$UCMP	="cmpld";
++} elsif ($flavour =~ /32/) {		# 32-bit flavour: 4-byte slots, stwu/lwz/stw/cmplw
++	$SIZE_T	=4;
++	$LRSAVE	=$SIZE_T;
++	$STU	="stwu";
++	$POP	="lwz";
++	$PUSH	="stw";
++	$UCMP	="cmplw";
++} else { die "nonsense $flavour"; }	# the flavour must mention 64 or 32
++
++$LITTLE_ENDIAN = ($flavour=~/le$/) ? 1 : 0;	# set when the flavour name ends in "le"
++
++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;		# directory part of this script's own path
++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
++die "can't locate ppc-xlate.pl";
++
++open STDOUT,"| $^X $xlate $flavour \"$output\""	# everything printed from here on is piped through ppc-xlate.pl
++    or die "can't call $xlate: $!";
++
++$LOCALS=6*$SIZE_T;
++$FRAME=$LOCALS+64+18*$SIZE_T;	# 64 is for local variables; both routines below declare their own "my $FRAME"
++
++sub AUTOLOAD()		# catch-all thunk: an undefined &mnemonic(args) call appends that instruction to $code
++{ (my $insn = $AUTOLOAD) =~ s/.*:://;			# drop the package qualifier
++    $insn =~ s/_/\./; $code .= join("\t", "", $insn, join(',', @_)) . "\n";	# first "_" becomes "."
++}
++
++my $sp = "r1";		# stack pointer: the frames below are pushed and popped through it
++
++my ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));	# r3..r7, in this order
++
++
++{{{
++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
++    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3) = map("v$_",(0..15));	# the 16 state rows live in v0-v15
++my @K = map("v$_",(16..19));		# sigma, two key vectors, counter block (see the loads in the routine)
++my $CTR = "v26";			# splatted counter word plus a per-lane vector
++my ($xt0,$xt1,$xt2,$xt3) = map("v$_",(27..30));
++my ($sixteen,$twelve,$eight,$seven) = ($xt0,$xt1,$xt2,$xt3);	# rotate counts share registers with the temporaries
++my $beperm = "v31";			# byte-swap permute; only the be? lines set it up and use it
++
++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));	# index operands: literal 0 and r8-r10 (loaded with 16/32/48)
++
++my $FRAME=$LOCALS+64+7*16;	# 7*16: six 16-byte save slots for v26-v31, presumably plus 16 bytes of alignment slack
++
++
++sub VSX_lane_ROUND_4x {	# returns perlasm call strings (eval'ed by the caller) for four interleaved quarter-rounds
++my ($a0,$b0,$c0,$d0)=@_;	# row indices of the first quarter-round
++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));	# next quarter-round: same group of four, index+1 mod 4
++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
++my @x=map("\"v$_\"",(0..15));	# register names pre-quoted for the eval
++
++	(
++	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# Q1: a += b
++	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
++	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
++	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
++	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",	# d ^= a
++	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
++	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
++	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
++	"&vrlw		(@x[$d0],@x[$d0],'$sixteen')",	# d <<<= 16
++	 "&vrlw		(@x[$d1],@x[$d1],'$sixteen')",
++	  "&vrlw	(@x[$d2],@x[$d2],'$sixteen')",
++	   "&vrlw	(@x[$d3],@x[$d3],'$sixteen')",
++
++	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",	# c += d
++	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
++	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
++	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
++	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",	# b ^= c
++	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
++	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
++	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
++	"&vrlw		(@x[$b0],@x[$b0],'$twelve')",	# b <<<= 12
++	 "&vrlw		(@x[$b1],@x[$b1],'$twelve')",
++	  "&vrlw	(@x[$b2],@x[$b2],'$twelve')",
++	   "&vrlw	(@x[$b3],@x[$b3],'$twelve')",
++
++	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# a += b
++	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",
++	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",
++	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",
++	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",	# d ^= a
++	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
++	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
++	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
++	"&vrlw		(@x[$d0],@x[$d0],'$eight')",	# d <<<= 8
++	 "&vrlw		(@x[$d1],@x[$d1],'$eight')",
++	  "&vrlw	(@x[$d2],@x[$d2],'$eight')",
++	   "&vrlw	(@x[$d3],@x[$d3],'$eight')",
++
++	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",	# c += d
++	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
++	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
++	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
++	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",	# b ^= c
++	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
++	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
++	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
++	"&vrlw		(@x[$b0],@x[$b0],'$seven')",	# b <<<= 7
++	 "&vrlw		(@x[$b1],@x[$b1],'$seven')",
++	  "&vrlw	(@x[$b2],@x[$b2],'$seven')",
++	   "&vrlw	(@x[$b3],@x[$b3],'$seven')"
++	);
++}
++
++$code.=<<___;
++
++.globl	.ChaCha20_ctr32_vsx_p10
++.align	5
++.ChaCha20_ctr32_vsx_p10:
++	${UCMP}i $len,255			# inputs longer than 255 bytes go to the 8x routine
++	bgt 	ChaCha20_ctr32_vsx_8x
++	$STU	$sp,-$FRAME($sp)
++	mflr	r0
++	li	r10,`15+$LOCALS+64`
++	li	r11,`31+$LOCALS+64`
++	mfspr	r12,256
++	stvx	v26,r10,$sp
++	addi	r10,r10,32
++	stvx	v27,r11,$sp
++	addi	r11,r11,32
++	stvx	v28,r10,$sp
++	addi	r10,r10,32
++	stvx	v29,r11,$sp
++	addi	r11,r11,32
++	stvx	v30,r10,$sp
++	stvx	v31,r11,$sp
++	stw	r12,`$FRAME-4`($sp)		# save vrsave
++	li	r12,-4096+63
++	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
++	mtspr	256,r12				# preserve 26 AltiVec registers (mask flags v0-v19 and v26-v31)
++
++	bl	Lconsts				# returns pointer Lsigma in r12
++	lvx_4w	@K[0],0,r12			# load sigma
++	addi	r12,r12,0x70			# r12 now addresses the table read as [smashed] sigma below
++	li	$x10,16
++	li	$x20,32
++	li	$x30,48
++	li	r11,64
++
++	lvx_4w	@K[1],0,$key			# load key
++	lvx_4w	@K[2],$x10,$key
++	lvx_4w	@K[3],0,$ctr			# load counter
++
++	vxor	$xt0,$xt0,$xt0
++	lvx_4w	$xt1,r11,r12			# vector at r12+64, added to the splatted counter below
++	vspltw	$CTR,@K[3],0
++	vsldoi	@K[3],@K[3],$xt0,4
++	vsldoi	@K[3],$xt0,@K[3],12		# clear @K[3].word[0]
++	vadduwm	$CTR,$CTR,$xt1
++
++	be?lvsl	$beperm,0,$x10			# 0x00..0f
++	be?vspltisb $xt0,3			# 0x03..03
++	be?vxor	$beperm,$beperm,$xt0		# swap bytes within words
++
++	li	r0,10				# inner loop counter
++	mtctr	r0
++	b	Loop_outer_vsx
++
++.align	5
++Loop_outer_vsx:
++	lvx	$xa0,$x00,r12			# load [smashed] sigma
++	lvx	$xa1,$x10,r12
++	lvx	$xa2,$x20,r12
++	lvx	$xa3,$x30,r12
++
++	vspltw	$xb0,@K[1],0			# smash the key
++	vspltw	$xb1,@K[1],1
++	vspltw	$xb2,@K[1],2
++	vspltw	$xb3,@K[1],3
++
++	vspltw	$xc0,@K[2],0
++	vspltw	$xc1,@K[2],1
++	vspltw	$xc2,@K[2],2
++	vspltw	$xc3,@K[2],3
++
++	vmr	$xd0,$CTR			# smash the counter
++	vspltw	$xd1,@K[3],1
++	vspltw	$xd2,@K[3],2
++	vspltw	$xd3,@K[3],3
++
++	vspltisw $sixteen,-16			# synthesize constants
++	vspltisw $twelve,12
++	vspltisw $eight,8
++	vspltisw $seven,7
++
++Loop_vsx_4x:
++___
++	foreach (&VSX_lane_ROUND_4x(0, 4, 8,12)) { eval; }	# quarter-rounds on rows (0,4,8,12) and their +1 neighbours
++	foreach (&VSX_lane_ROUND_4x(0, 5,10,15)) { eval; }	# quarter-rounds on rows (0,5,10,15) and their rotated neighbours
++$code.=<<___;
++
++	bdnz	Loop_vsx_4x			# CTR was loaded with 10, so the round pair runs ten times
++
++	vadduwm	$xd0,$xd0,$CTR			# counter row gets its input added here, before the transpose
++
++	vmrgew	$xt0,$xa0,$xa1			# transpose data
++	vmrgew	$xt1,$xa2,$xa3
++	vmrgow	$xa0,$xa0,$xa1
++	vmrgow	$xa2,$xa2,$xa3
++	vmrgew	$xt2,$xb0,$xb1
++	vmrgew	$xt3,$xb2,$xb3
++	vpermdi	$xa1,$xa0,$xa2,0b00
++	vpermdi	$xa3,$xa0,$xa2,0b11
++	vpermdi	$xa0,$xt0,$xt1,0b00
++	vpermdi	$xa2,$xt0,$xt1,0b11
++
++	vmrgow	$xb0,$xb0,$xb1
++	vmrgow	$xb2,$xb2,$xb3
++	vmrgew	$xt0,$xc0,$xc1
++	vmrgew	$xt1,$xc2,$xc3
++	vpermdi	$xb1,$xb0,$xb2,0b00
++	vpermdi	$xb3,$xb0,$xb2,0b11
++	vpermdi	$xb0,$xt2,$xt3,0b00
++	vpermdi	$xb2,$xt2,$xt3,0b11
++
++	vmrgow	$xc0,$xc0,$xc1
++	vmrgow	$xc2,$xc2,$xc3
++	vmrgew	$xt2,$xd0,$xd1
++	vmrgew	$xt3,$xd2,$xd3
++	vpermdi	$xc1,$xc0,$xc2,0b00
++	vpermdi	$xc3,$xc0,$xc2,0b11
++	vpermdi	$xc0,$xt0,$xt1,0b00
++	vpermdi	$xc2,$xt0,$xt1,0b11
++
++	vmrgow	$xd0,$xd0,$xd1
++	vmrgow	$xd2,$xd2,$xd3
++	vspltisw $xt0,4
++	vadduwm  $CTR,$CTR,$xt0		# next counter value
++	vpermdi	$xd1,$xd0,$xd2,0b00
++	vpermdi	$xd3,$xd0,$xd2,0b11
++	vpermdi	$xd0,$xt2,$xt3,0b00
++	vpermdi	$xd2,$xt2,$xt3,0b11
++
++	vadduwm	$xa0,$xa0,@K[0]			# first block: add the input state
++	vadduwm	$xb0,$xb0,@K[1]
++	vadduwm	$xc0,$xc0,@K[2]
++	vadduwm	$xd0,$xd0,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40			# under 64 bytes left: finish byte-wise in the tail
++	blt	Ltail_vsx
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx			# flags are still from the compare above: exactly 64 bytes were left
++
++	vadduwm	$xa0,$xa1,@K[0]			# second block
++	vadduwm	$xb0,$xb1,@K[1]
++	vadduwm	$xc0,$xc1,@K[2]
++	vadduwm	$xd0,$xd1,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx
++
++	vadduwm	$xa0,$xa2,@K[0]			# third block
++	vadduwm	$xb0,$xb2,@K[1]
++	vadduwm	$xc0,$xc2,@K[2]
++	vadduwm	$xd0,$xd2,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx
++
++	vadduwm	$xa0,$xa3,@K[0]			# fourth block
++	vadduwm	$xb0,$xb3,@K[1]
++	vadduwm	$xc0,$xc3,@K[2]
++	vadduwm	$xd0,$xd3,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	mtctr	r0				# r0 still holds 10: rearm the round counter
++	bne	Loop_outer_vsx			# more input left: produce the next four blocks
++
++Ldone_vsx:
++	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
++	li	r10,`15+$LOCALS+64`
++	li	r11,`31+$LOCALS+64`
++	$POP	r0, `$FRAME+$LRSAVE`($sp)
++	mtspr	256,r12				# restore vrsave
++	lvx	v26,r10,$sp
++	addi	r10,r10,32
++	lvx	v27,r11,$sp
++	addi	r11,r11,32
++	lvx	v28,r10,$sp
++	addi	r10,r10,32
++	lvx	v29,r11,$sp
++	addi	r11,r11,32
++	lvx	v30,r10,$sp
++	lvx	v31,r11,$sp
++	mtlr	r0
++	addi	$sp,$sp,$FRAME
++	blr
++
++.align	4
++Ltail_vsx:
++	addi	r11,$sp,$LOCALS
++	mtctr	$len				# remaining byte count drives the byte loop
++	stvx_4w	$xa0,$x00,r11			# offload block to stack
++	stvx_4w	$xb0,$x10,r11
++	stvx_4w	$xc0,$x20,r11
++	stvx_4w	$xd0,$x30,r11
++	subi	r12,r11,1			# prepare for *++ptr
++	subi	$inp,$inp,1
++	subi	$out,$out,1
++
++Loop_tail_vsx:
++	lbzu	r6,1(r12)
++	lbzu	r7,1($inp)
++	xor	r6,r6,r7
++	stbu	r6,1($out)
++	bdnz	Loop_tail_vsx
++
++	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
++	stvx_4w	$K[0],$x10,r11
++	stvx_4w	$K[0],$x20,r11
++	stvx_4w	$K[0],$x30,r11
++
++	b	Ldone_vsx
++	.long	0
++	.byte	0,12,0x04,1,0x80,0,5,0
++	.long	0
++.size	.ChaCha20_ctr32_vsx_p10,.-.ChaCha20_ctr32_vsx_p10
++___
++}}}
++
++## This is the 8-blocks-in-parallel implementation. The heart of the ChaCha round uses vector instructions that have access to
++# VSR[32+X]. To process 8 blocks in parallel we use all 32 of those registers to hold the state of the 8 blocks.
++# We need to set a few register values aside so that VSR[32+X] can serve the vector instructions used in the round and hold intermediate values.
++# We use VSR[0]-VSR[31] to hold those set-aside values while the 8 blocks are processed in parallel.
++#
++{{{
++#### ($out,$inp,$len,$key,$ctr) = map("r$_",(3..7));
++my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
++    $xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3,
++    $xa4,$xa5,$xa6,$xa7, $xb4,$xb5,$xb6,$xb7,
++    $xc4,$xc5,$xc6,$xc7, $xd4,$xd5,$xd6,$xd7) = map("v$_",(0..31));	# blocks 0-3 in v0-v15, blocks 4-7 in v16-v31
++my ($xcn4,$xcn5,$xcn6,$xcn7, $xdn4,$xdn5,$xdn6,$xdn7) = map("v$_",(8..15));	# same registers as xc0-xd3; the second half's c/d rows are reloaded here
++my ($xan0,$xbn0,$xcn0,$xdn0) = map("v$_",(0..3));	# same registers as xa0-xa3
++my @K = map("v$_",27,(24..26));	# K[0]=v27, K[1..3]=v24..v26: overlaps xc4-xc7
++my ($xt0,$xt1,$xt2,$xt3,$xt4) = map("v$_",23,(28..31));	# temporaries overlap xb7 (v23) and xd4-xd7 (v28-v31)
++my $xr0 = "v4";		# shares v4 with xb0
++my $CTR0 = "v22";	# shares v22 with xb6
++my $CTR1 = "v5";	# shares v5 with xb1
++my $beperm = "v31";	# shares v31 with xd7 and xt4
++my ($x00,$x10,$x20,$x30) = (0, map("r$_",(8..10)));	# index operands: literal 0 and r8-r10
++my ($xv0,$xv1,$xv2,$xv3,$xv4,$xv5,$xv6,$xv7) = map("v$_",(0..7));	# NOTE(review): as vxxlor/vxxlorc operands these appear to name VSR0-VSR26 rather than vector registers -- confirm in ppc-xlate.pl
++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("v$_",(8..17));
++my ($xv18,$xv19,$xv20,$xv21) = map("v$_",(18..21));
++my ($xv22,$xv23,$xv24,$xv25,$xv26) = map("v$_",(22..26));
++
++my $FRAME=$LOCALS+64+9*16;	# 9*16: eight 16-byte save slots for v24-v31, presumably plus 16 bytes of alignment slack
++
++sub VSX_lane_ROUND_8x {	# returns perlasm call strings (eval'ed by the caller) for eight interleaved quarter-rounds
++my ($a0,$b0,$c0,$d0,$a4,$b4,$c4,$d4)=@_;	# first quarter-round of blocks 0-3 and of blocks 4-7
++my ($a1,$b1,$c1,$d1)=map(($_&~3)+(($_+1)&3),($a0,$b0,$c0,$d0));	# next quarter-round: same group of four, index+1 mod 4
++my ($a2,$b2,$c2,$d2)=map(($_&~3)+(($_+1)&3),($a1,$b1,$c1,$d1));
++my ($a3,$b3,$c3,$d3)=map(($_&~3)+(($_+1)&3),($a2,$b2,$c2,$d2));
++my ($a5,$b5,$c5,$d5)=map(($_&~3)+(($_+1)&3),($a4,$b4,$c4,$d4));
++my ($a6,$b6,$c6,$d6)=map(($_&~3)+(($_+1)&3),($a5,$b5,$c5,$d5));
++my ($a7,$b7,$c7,$d7)=map(($_&~3)+(($_+1)&3),($a6,$b6,$c6,$d6));
++my ($xv8,$xv9,$xv10,$xv11,$xv12,$xv13,$xv14,$xv15,$xv16,$xv17) = map("\"v$_\"",(8..17));	# pre-quoted; only used as vxxlor/vxxlorc operands
++my @x=map("\"v$_\"",(0..31));	# register names pre-quoted for the eval
++
++	(
++	"&vxxlor        ($xv15 ,@x[$c7],@x[$c7])",      # park @x[$c7] in slot 15 (NOTE(review): presumably VSR15 -- vxxlor/vxxlorc are defined outside this file)
++	"&vxxlorc       (@x[$c7], $xv9,$xv9)",		# c7 <- slot 9: the rotate-by-16 count saved by the routine's setup
++
++	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# Q1
++	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",	# Q2
++	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",	# Q3
++	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",	# Q4
++	"&vadduwm	(@x[$a4],@x[$a4],@x[$b4])",	# Q1
++	 "&vadduwm	(@x[$a5],@x[$a5],@x[$b5])",	# Q2
++	  "&vadduwm	(@x[$a6],@x[$a6],@x[$b6])",	# Q3
++	   "&vadduwm	(@x[$a7],@x[$a7],@x[$b7])",	# Q4
++
++	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",	# d ^= a
++	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
++	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
++	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
++	"&vxor		(@x[$d4],@x[$d4],@x[$a4])",
++	 "&vxor		(@x[$d5],@x[$d5],@x[$a5])",
++	  "&vxor	(@x[$d6],@x[$d6],@x[$a6])",
++	   "&vxor	(@x[$d7],@x[$d7],@x[$a7])",
++
++	"&vrlw		(@x[$d0],@x[$d0],@x[$c7])",	# d <<<= 16 (count is temporarily in c7)
++	 "&vrlw		(@x[$d1],@x[$d1],@x[$c7])",
++	  "&vrlw	(@x[$d2],@x[$d2],@x[$c7])",
++	   "&vrlw	(@x[$d3],@x[$d3],@x[$c7])",
++	"&vrlw		(@x[$d4],@x[$d4],@x[$c7])",
++	 "&vrlw		(@x[$d5],@x[$d5],@x[$c7])",
++	  "&vrlw	(@x[$d6],@x[$d6],@x[$c7])",
++	   "&vrlw	(@x[$d7],@x[$d7],@x[$c7])",
++
++	"&vxxlor        ($xv13 ,@x[$a7],@x[$a7])",	# park a7 in slot 13
++	"&vxxlorc       (@x[$c7], $xv15,$xv15)",	# c7 <- slot 15: the real c7 is back
++	"&vxxlorc       (@x[$a7], $xv10,$xv10)",	# a7 <- slot 10: the rotate-by-12 count
++
++	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",	# c += d
++	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
++	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
++	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
++	"&vadduwm	(@x[$c4],@x[$c4],@x[$d4])",
++	 "&vadduwm	(@x[$c5],@x[$c5],@x[$d5])",
++	  "&vadduwm	(@x[$c6],@x[$c6],@x[$d6])",
++	   "&vadduwm	(@x[$c7],@x[$c7],@x[$d7])",
++
++	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",	# b ^= c
++	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
++	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
++	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
++	"&vxor		(@x[$b4],@x[$b4],@x[$c4])",
++	 "&vxor		(@x[$b5],@x[$b5],@x[$c5])",
++	  "&vxor	(@x[$b6],@x[$b6],@x[$c6])",
++	   "&vxor	(@x[$b7],@x[$b7],@x[$c7])",
++
++	"&vrlw		(@x[$b0],@x[$b0],@x[$a7])",	# b <<<= 12 (count is temporarily in a7)
++	 "&vrlw		(@x[$b1],@x[$b1],@x[$a7])",
++	  "&vrlw	(@x[$b2],@x[$b2],@x[$a7])",
++	   "&vrlw	(@x[$b3],@x[$b3],@x[$a7])",
++	"&vrlw		(@x[$b4],@x[$b4],@x[$a7])",
++	 "&vrlw		(@x[$b5],@x[$b5],@x[$a7])",
++	  "&vrlw	(@x[$b6],@x[$b6],@x[$a7])",
++	   "&vrlw	(@x[$b7],@x[$b7],@x[$a7])",
++
++	"&vxxlorc       (@x[$a7], $xv13,$xv13)",	# a7 <- slot 13: the real a7 is back
++	"&vxxlor	($xv15 ,@x[$c7],@x[$c7])",	# park c7 in slot 15 again
++	"&vxxlorc       (@x[$c7], $xv11,$xv11)",	# c7 <- slot 11: the rotate-by-8 count
++
++
++	"&vadduwm	(@x[$a0],@x[$a0],@x[$b0])",	# a += b
++	 "&vadduwm	(@x[$a1],@x[$a1],@x[$b1])",
++	  "&vadduwm	(@x[$a2],@x[$a2],@x[$b2])",
++	   "&vadduwm	(@x[$a3],@x[$a3],@x[$b3])",
++	"&vadduwm	(@x[$a4],@x[$a4],@x[$b4])",
++	 "&vadduwm	(@x[$a5],@x[$a5],@x[$b5])",
++	  "&vadduwm	(@x[$a6],@x[$a6],@x[$b6])",
++	   "&vadduwm	(@x[$a7],@x[$a7],@x[$b7])",
++
++	"&vxor		(@x[$d0],@x[$d0],@x[$a0])",	# d ^= a
++	 "&vxor		(@x[$d1],@x[$d1],@x[$a1])",
++	  "&vxor	(@x[$d2],@x[$d2],@x[$a2])",
++	   "&vxor	(@x[$d3],@x[$d3],@x[$a3])",
++	"&vxor		(@x[$d4],@x[$d4],@x[$a4])",
++	 "&vxor		(@x[$d5],@x[$d5],@x[$a5])",
++	  "&vxor	(@x[$d6],@x[$d6],@x[$a6])",
++	   "&vxor	(@x[$d7],@x[$d7],@x[$a7])",
++
++	"&vrlw		(@x[$d0],@x[$d0],@x[$c7])",	# d <<<= 8 (count is temporarily in c7)
++	 "&vrlw		(@x[$d1],@x[$d1],@x[$c7])",
++	  "&vrlw	(@x[$d2],@x[$d2],@x[$c7])",
++	   "&vrlw	(@x[$d3],@x[$d3],@x[$c7])",
++	"&vrlw		(@x[$d4],@x[$d4],@x[$c7])",
++	 "&vrlw		(@x[$d5],@x[$d5],@x[$c7])",
++	  "&vrlw	(@x[$d6],@x[$d6],@x[$c7])",
++	   "&vrlw	(@x[$d7],@x[$d7],@x[$c7])",
++
++	"&vxxlorc       (@x[$c7], $xv15,$xv15)",	# c7 <- slot 15: the real c7 is back
++	"&vxxlor        ($xv13 ,@x[$a7],@x[$a7])",	# park a7 in slot 13 again
++	"&vxxlorc       (@x[$a7], $xv12,$xv12)",	# a7 <- slot 12: the rotate-by-7 count
++
++	"&vadduwm	(@x[$c0],@x[$c0],@x[$d0])",	# c += d
++	 "&vadduwm	(@x[$c1],@x[$c1],@x[$d1])",
++	  "&vadduwm	(@x[$c2],@x[$c2],@x[$d2])",
++	   "&vadduwm	(@x[$c3],@x[$c3],@x[$d3])",
++	"&vadduwm	(@x[$c4],@x[$c4],@x[$d4])",
++	 "&vadduwm	(@x[$c5],@x[$c5],@x[$d5])",
++	  "&vadduwm	(@x[$c6],@x[$c6],@x[$d6])",
++	   "&vadduwm	(@x[$c7],@x[$c7],@x[$d7])",
++	"&vxor		(@x[$b0],@x[$b0],@x[$c0])",	# b ^= c
++	 "&vxor		(@x[$b1],@x[$b1],@x[$c1])",
++	  "&vxor	(@x[$b2],@x[$b2],@x[$c2])",
++	   "&vxor	(@x[$b3],@x[$b3],@x[$c3])",
++	"&vxor		(@x[$b4],@x[$b4],@x[$c4])",
++	 "&vxor		(@x[$b5],@x[$b5],@x[$c5])",
++	  "&vxor	(@x[$b6],@x[$b6],@x[$c6])",
++	   "&vxor	(@x[$b7],@x[$b7],@x[$c7])",
++	"&vrlw		(@x[$b0],@x[$b0],@x[$a7])",	# b <<<= 7 (count is temporarily in a7)
++	 "&vrlw		(@x[$b1],@x[$b1],@x[$a7])",
++	  "&vrlw	(@x[$b2],@x[$b2],@x[$a7])",
++	   "&vrlw	(@x[$b3],@x[$b3],@x[$a7])",
++	"&vrlw		(@x[$b4],@x[$b4],@x[$a7])",
++	 "&vrlw		(@x[$b5],@x[$b5],@x[$a7])",
++	  "&vrlw	(@x[$b6],@x[$b6],@x[$a7])",
++	   "&vrlw	(@x[$b7],@x[$b7],@x[$a7])",
++
++	"&vxxlorc       (@x[$a7], $xv13,$xv13)",	# a7 <- slot 13: the real a7 is back before returning
++	);
++}
++
++$code.=<<___;
++
++.globl	.ChaCha20_ctr32_vsx_8x
++.align	5
++.ChaCha20_ctr32_vsx_8x:
++	$STU	$sp,-$FRAME($sp)
++	mflr	r0
++	li	r10,`15+$LOCALS+64`
++	li	r11,`31+$LOCALS+64`
++	mfspr	r12,256
++	stvx	v24,r10,$sp
++	addi	r10,r10,32
++	stvx	v25,r11,$sp
++	addi	r11,r11,32
++	stvx	v26,r10,$sp
++	addi	r10,r10,32
++	stvx	v27,r11,$sp
++	addi	r11,r11,32
++	stvx	v28,r10,$sp
++	addi	r10,r10,32
++	stvx	v29,r11,$sp
++	addi	r11,r11,32
++	stvx	v30,r10,$sp
++	stvx	v31,r11,$sp
++	stw	r12,`$FRAME-4`($sp)		# save vrsave
++	li	r12,-4096+63
++	$PUSH	r0, `$FRAME+$LRSAVE`($sp)
++	mtspr	256,r12				# preserve 29 AltiVec registers
++
++	bl	Lconsts				# returns pointer Lsigma in r12
++
++	lvx_4w	@K[0],0,r12			# load sigma
++	addi	r12,r12,0x70
++	li	$x10,16
++	li	$x20,32
++	li	$x30,48
++	li	r11,64
++
++	vspltisw $xa4,-16			# synthesize constants
++	vspltisw $xb4,12			# synthesize constants
++	vspltisw $xc4,8			# synthesize constants
++	vspltisw $xd4,7			# synthesize constants
++
++	lvx	$xa0,$x00,r12			# load [smashed] sigma
++	lvx	$xa1,$x10,r12
++	lvx	$xa2,$x20,r12
++	lvx	$xa3,$x30,r12
++
++	vxxlor	$xv9   ,$xa4,$xa4               #save shift val in vr9-12
++	vxxlor	$xv10  ,$xb4,$xb4
++	vxxlor	$xv11  ,$xc4,$xc4
++	vxxlor	$xv12  ,$xd4,$xd4
++	vxxlor	$xv22  ,$xa0,$xa0               #save sigma in vr22-25
++	vxxlor	$xv23  ,$xa1,$xa1
++	vxxlor	$xv24  ,$xa2,$xa2
++	vxxlor	$xv25  ,$xa3,$xa3
++
++	lvx_4w	@K[1],0,$key			# load key
++	lvx_4w	@K[2],$x10,$key
++	lvx_4w	@K[3],0,$ctr			# load counter
++	vspltisw $xt3,4
++
++
++	vxor	$xt2,$xt2,$xt2
++	lvx_4w	$xt1,r11,r12
++	vspltw	$xa2,@K[3],0			#save the original count after spltw
++	vsldoi	@K[3],@K[3],$xt2,4
++	vsldoi	@K[3],$xt2,@K[3],12		# clear @K[3].word[0]
++	vadduwm	$xt1,$xa2,$xt1
++	vadduwm $xt3,$xt1,$xt3     		# next counter value
++	vspltw	$xa0,@K[2],2                    # save the K[2] spltw 2 and save v8.
++
++	be?lvsl	  $beperm,0,$x10			# 0x00..0f
++	be?vspltisb $xt0,3			# 0x03..03
++	be?vxor   $beperm,$beperm,$xt0		# swap bytes within words
++	be?vxxlor $xv26 ,$beperm,$beperm
++
++	vxxlor	$xv0 ,@K[0],@K[0]               # K0,k1,k2 to vr0,1,2
++	vxxlor	$xv1 ,@K[1],@K[1]
++	vxxlor	$xv2 ,@K[2],@K[2]
++	vxxlor	$xv3 ,@K[3],@K[3]
++	vxxlor	$xv4 ,$xt1,$xt1                #CTR ->4, CTR+4-> 5
++	vxxlor	$xv5 ,$xt3,$xt3
++	vxxlor	$xv8 ,$xa0,$xa0
++
++	li	r0,10				# inner loop counter
++	mtctr	r0
++	b	Loop_outer_vsx_8x
++
++.align	5
++Loop_outer_vsx_8x:
++	vxxlorc	$xa0,$xv22,$xv22	        # load [smashed] sigma
++	vxxlorc	$xa1,$xv23,$xv23
++	vxxlorc	$xa2,$xv24,$xv24
++	vxxlorc	$xa3,$xv25,$xv25
++	vxxlorc	$xa4,$xv22,$xv22
++	vxxlorc	$xa5,$xv23,$xv23
++	vxxlorc	$xa6,$xv24,$xv24
++	vxxlorc	$xa7,$xv25,$xv25
++
++	vspltw	$xb0,@K[1],0			# smash the key
++	vspltw	$xb1,@K[1],1
++	vspltw	$xb2,@K[1],2
++	vspltw	$xb3,@K[1],3
++	vspltw	$xb4,@K[1],0			# smash the key
++	vspltw	$xb5,@K[1],1
++	vspltw	$xb6,@K[1],2
++	vspltw	$xb7,@K[1],3
++
++	vspltw	$xc0,@K[2],0
++	vspltw	$xc1,@K[2],1
++	vspltw	$xc2,@K[2],2
++	vspltw	$xc3,@K[2],3
++	vspltw	$xc4,@K[2],0
++	vspltw	$xc7,@K[2],3
++	vspltw	$xc5,@K[2],1
++
++	vxxlorc	$xd0,$xv4,$xv4			# smash the counter
++	vspltw	$xd1,@K[3],1
++	vspltw	$xd2,@K[3],2
++	vspltw	$xd3,@K[3],3
++	vxxlorc	$xd4,$xv5,$xv5			# smash the counter
++	vspltw	$xd5,@K[3],1
++	vspltw	$xd6,@K[3],2
++	vspltw	$xd7,@K[3],3
++	vxxlorc	$xc6,$xv8,$xv8                  #copy of vlspt k[2],2 is in v8.v26 ->k[3] so need to wait until k3 is done
++
++Loop_vsx_8x:
++___
++	foreach (&VSX_lane_ROUND_8x(0,4, 8,12,16,20,24,28)) { eval; }
++	foreach (&VSX_lane_ROUND_8x(0,5,10,15,16,21,26,31)) { eval; }
++$code.=<<___;
++
++	bdnz	        Loop_vsx_8x
++	vxxlor	        $xv13 ,$xd4,$xd4                # save the register vr24-31
++	vxxlor	        $xv14 ,$xd5,$xd5                #
++	vxxlor	        $xv15 ,$xd6,$xd6                #
++	vxxlor	        $xv16 ,$xd7,$xd7                #
++
++	vxxlor	        $xv18 ,$xc4,$xc4                #
++	vxxlor	        $xv19 ,$xc5,$xc5                #
++	vxxlor	        $xv20 ,$xc6,$xc6                #
++	vxxlor	        $xv21 ,$xc7,$xc7                #
++
++	vxxlor	        $xv6  ,$xb6,$xb6                # save vr23, so we get 8 regs
++	vxxlor	        $xv7  ,$xb7,$xb7                # save vr23, so we get 8 regs
++	be?vxxlorc      $beperm,$xv26,$xv26             # copy back the the beperm.
++
++	vxxlorc	   @K[0],$xv0,$xv0                #27
++	vxxlorc	   @K[1],$xv1,$xv1 		  #24
++	vxxlorc	   @K[2],$xv2,$xv2		  #25
++	vxxlorc	   @K[3],$xv3,$xv3		  #26
++	vxxlorc	   $CTR0,$xv4,$xv4
++###changing to vertical
++
++	vmrgew	$xt0,$xa0,$xa1			# transpose data
++	vmrgew	$xt1,$xa2,$xa3
++	vmrgow	$xa0,$xa0,$xa1
++	vmrgow	$xa2,$xa2,$xa3
++
++	vmrgew	$xt2,$xb0,$xb1
++	vmrgew	$xt3,$xb2,$xb3
++	vmrgow	$xb0,$xb0,$xb1
++	vmrgow	$xb2,$xb2,$xb3
++
++	vadduwm	$xd0,$xd0,$CTR0
++
++	vpermdi	$xa1,$xa0,$xa2,0b00
++	vpermdi	$xa3,$xa0,$xa2,0b11
++	vpermdi	$xa0,$xt0,$xt1,0b00
++	vpermdi	$xa2,$xt0,$xt1,0b11
++	vpermdi	$xb1,$xb0,$xb2,0b00
++	vpermdi	$xb3,$xb0,$xb2,0b11
++	vpermdi	$xb0,$xt2,$xt3,0b00
++	vpermdi	$xb2,$xt2,$xt3,0b11
++
++	vmrgew	$xt0,$xc0,$xc1
++	vmrgew	$xt1,$xc2,$xc3
++	vmrgow	$xc0,$xc0,$xc1
++	vmrgow	$xc2,$xc2,$xc3
++	vmrgew	$xt2,$xd0,$xd1
++	vmrgew	$xt3,$xd2,$xd3
++	vmrgow	$xd0,$xd0,$xd1
++	vmrgow	$xd2,$xd2,$xd3
++
++	vpermdi	$xc1,$xc0,$xc2,0b00
++	vpermdi	$xc3,$xc0,$xc2,0b11
++	vpermdi	$xc0,$xt0,$xt1,0b00
++	vpermdi	$xc2,$xt0,$xt1,0b11
++	vpermdi	$xd1,$xd0,$xd2,0b00
++	vpermdi	$xd3,$xd0,$xd2,0b11
++	vpermdi	$xd0,$xt2,$xt3,0b00
++	vpermdi	$xd2,$xt2,$xt3,0b11
++
++	vspltisw $xt0,8
++	vadduwm  $CTR0,$CTR0,$xt0		# next counter value
++	vxxlor	 $xv4 ,$CTR0,$CTR0	        #CTR+4-> 5
++
++	vadduwm	$xa0,$xa0,@K[0]
++	vadduwm	$xb0,$xb0,@K[1]
++	vadduwm	$xc0,$xc0,@K[2]
++	vadduwm	$xd0,$xd0,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xa0,$xa1,@K[0]
++	vadduwm	$xb0,$xb1,@K[1]
++	vadduwm	$xc0,$xc1,@K[2]
++	vadduwm	$xd0,$xd1,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xa0,$xa2,@K[0]
++	vadduwm	$xb0,$xb2,@K[1]
++	vadduwm	$xc0,$xc2,@K[2]
++	vadduwm	$xd0,$xd2,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xa0,$xa3,@K[0]
++	vadduwm	$xb0,$xb3,@K[1]
++	vadduwm	$xc0,$xc3,@K[2]
++	vadduwm	$xd0,$xd3,@K[3]
++
++	be?vperm $xa0,$xa0,$xa0,$beperm
++	be?vperm $xb0,$xb0,$xb0,$beperm
++	be?vperm $xc0,$xc0,$xc0,$beperm
++	be?vperm $xd0,$xd0,$xd0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x
++
++	lvx_4w	$xt0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xt0,$xt0,$xa0
++	vxor	$xt1,$xt1,$xb0
++	vxor	$xt2,$xt2,$xc0
++	vxor	$xt3,$xt3,$xd0
++
++	stvx_4w	$xt0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++#blk4-7: 24:31 remain the same as we can use the same logic above . Reg a4-b7 remain same.Load c4,d7--> position 8-15.we can reuse vr24-31.
++#VR0-3 : are used to load temp value, vr4 --> as xr0 instead of xt0.
++
++	vxxlorc	   $CTR1 ,$xv5,$xv5
++
++	vxxlorc	   $xcn4 ,$xv18,$xv18
++	vxxlorc	   $xcn5 ,$xv19,$xv19
++	vxxlorc	   $xcn6 ,$xv20,$xv20
++	vxxlorc	   $xcn7 ,$xv21,$xv21
++
++	vxxlorc	   $xdn4 ,$xv13,$xv13
++	vxxlorc	   $xdn5 ,$xv14,$xv14
++	vxxlorc	   $xdn6 ,$xv15,$xv15
++	vxxlorc	   $xdn7 ,$xv16,$xv16
++	vadduwm	   $xdn4,$xdn4,$CTR1
++
++	vxxlorc	   $xb6 ,$xv6,$xv6
++	vxxlorc	   $xb7 ,$xv7,$xv7
++#use xa1->xr0, as xt0...in the block 4-7
++
++	vmrgew	$xr0,$xa4,$xa5			# transpose data
++	vmrgew	$xt1,$xa6,$xa7
++	vmrgow	$xa4,$xa4,$xa5
++	vmrgow	$xa6,$xa6,$xa7
++	vmrgew	$xt2,$xb4,$xb5
++	vmrgew	$xt3,$xb6,$xb7
++	vmrgow	$xb4,$xb4,$xb5
++	vmrgow	$xb6,$xb6,$xb7
++
++	vpermdi	$xa5,$xa4,$xa6,0b00
++	vpermdi	$xa7,$xa4,$xa6,0b11
++	vpermdi	$xa4,$xr0,$xt1,0b00
++	vpermdi	$xa6,$xr0,$xt1,0b11
++	vpermdi	$xb5,$xb4,$xb6,0b00
++	vpermdi	$xb7,$xb4,$xb6,0b11
++	vpermdi	$xb4,$xt2,$xt3,0b00
++	vpermdi	$xb6,$xt2,$xt3,0b11
++
++	vmrgew	$xr0,$xcn4,$xcn5
++	vmrgew	$xt1,$xcn6,$xcn7
++	vmrgow	$xcn4,$xcn4,$xcn5
++	vmrgow	$xcn6,$xcn6,$xcn7
++	vmrgew	$xt2,$xdn4,$xdn5
++	vmrgew	$xt3,$xdn6,$xdn7
++	vmrgow	$xdn4,$xdn4,$xdn5
++	vmrgow	$xdn6,$xdn6,$xdn7
++
++	vpermdi	$xcn5,$xcn4,$xcn6,0b00
++	vpermdi	$xcn7,$xcn4,$xcn6,0b11
++	vpermdi	$xcn4,$xr0,$xt1,0b00
++	vpermdi	$xcn6,$xr0,$xt1,0b11
++	vpermdi	$xdn5,$xdn4,$xdn6,0b00
++	vpermdi	$xdn7,$xdn4,$xdn6,0b11
++	vpermdi	$xdn4,$xt2,$xt3,0b00
++	vpermdi	$xdn6,$xt2,$xt3,0b11
++
++	vspltisw $xr0,8
++	vadduwm  $CTR1,$CTR1,$xr0		# next counter value
++	vxxlor	 $xv5 ,$CTR1,$CTR1	        #CTR+4-> 5
++
++	vadduwm	$xan0,$xa4,@K[0]
++	vadduwm	$xbn0,$xb4,@K[1]
++	vadduwm	$xcn0,$xcn4,@K[2]
++	vadduwm	$xdn0,$xdn4,@K[3]
++
++	be?vperm $xan0,$xa4,$xa4,$beperm
++	be?vperm $xbn0,$xb4,$xb4,$beperm
++	be?vperm $xcn0,$xcn4,$xcn4,$beperm
++	be?vperm $xdn0,$xdn4,$xdn4,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x_1
++
++	lvx_4w	$xr0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xr0,$xr0,$xan0
++	vxor	$xt1,$xt1,$xbn0
++	vxor	$xt2,$xt2,$xcn0
++	vxor	$xt3,$xt3,$xdn0
++
++	stvx_4w	$xr0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xan0,$xa5,@K[0]
++	vadduwm	$xbn0,$xb5,@K[1]
++	vadduwm	$xcn0,$xcn5,@K[2]
++	vadduwm	$xdn0,$xdn5,@K[3]
++
++	be?vperm $xan0,$xan0,$xan0,$beperm
++	be?vperm $xbn0,$xbn0,$xbn0,$beperm
++	be?vperm $xcn0,$xcn0,$xcn0,$beperm
++	be?vperm $xdn0,$xdn0,$xdn0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x_1
++
++	lvx_4w	$xr0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xr0,$xr0,$xan0
++	vxor	$xt1,$xt1,$xbn0
++	vxor	$xt2,$xt2,$xcn0
++	vxor	$xt3,$xt3,$xdn0
++
++	stvx_4w	$xr0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xan0,$xa6,@K[0]
++	vadduwm	$xbn0,$xb6,@K[1]
++	vadduwm	$xcn0,$xcn6,@K[2]
++	vadduwm	$xdn0,$xdn6,@K[3]
++
++	be?vperm $xan0,$xan0,$xan0,$beperm
++	be?vperm $xbn0,$xbn0,$xbn0,$beperm
++	be?vperm $xcn0,$xcn0,$xcn0,$beperm
++	be?vperm $xdn0,$xdn0,$xdn0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x_1
++
++	lvx_4w	$xr0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xr0,$xr0,$xan0
++	vxor	$xt1,$xt1,$xbn0
++	vxor	$xt2,$xt2,$xcn0
++	vxor	$xt3,$xt3,$xdn0
++
++	stvx_4w	$xr0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	vadduwm	$xan0,$xa7,@K[0]
++	vadduwm	$xbn0,$xb7,@K[1]
++	vadduwm	$xcn0,$xcn7,@K[2]
++	vadduwm	$xdn0,$xdn7,@K[3]
++
++	be?vperm $xan0,$xan0,$xan0,$beperm
++	be?vperm $xbn0,$xbn0,$xbn0,$beperm
++	be?vperm $xcn0,$xcn0,$xcn0,$beperm
++	be?vperm $xdn0,$xdn0,$xdn0,$beperm
++
++	${UCMP}i $len,0x40
++	blt	Ltail_vsx_8x_1
++
++	lvx_4w	$xr0,$x00,$inp
++	lvx_4w	$xt1,$x10,$inp
++	lvx_4w	$xt2,$x20,$inp
++	lvx_4w	$xt3,$x30,$inp
++
++	vxor	$xr0,$xr0,$xan0
++	vxor	$xt1,$xt1,$xbn0
++	vxor	$xt2,$xt2,$xcn0
++	vxor	$xt3,$xt3,$xdn0
++
++	stvx_4w	$xr0,$x00,$out
++	stvx_4w	$xt1,$x10,$out
++	addi	$inp,$inp,0x40
++	stvx_4w	$xt2,$x20,$out
++	subi	$len,$len,0x40
++	stvx_4w	$xt3,$x30,$out
++	addi	$out,$out,0x40
++	beq	Ldone_vsx_8x
++
++	mtctr	r0
++	bne	Loop_outer_vsx_8x
++
++Ldone_vsx_8x:
++	lwz	r12,`$FRAME-4`($sp)		# pull vrsave
++	li	r10,`15+$LOCALS+64`
++	li	r11,`31+$LOCALS+64`
++	$POP	r0, `$FRAME+$LRSAVE`($sp)
++	mtspr	256,r12				# restore vrsave
++	lvx	v24,r10,$sp
++	addi	r10,r10,32
++	lvx	v25,r11,$sp
++	addi	r11,r11,32
++	lvx	v26,r10,$sp
++	addi	r10,r10,32
++	lvx	v27,r11,$sp
++	addi	r11,r11,32
++	lvx	v28,r10,$sp
++	addi	r10,r10,32
++	lvx	v29,r11,$sp
++	addi	r11,r11,32
++	lvx	v30,r10,$sp
++	lvx	v31,r11,$sp
++	mtlr	r0
++	addi	$sp,$sp,$FRAME
++	blr
++
++.align	4
++Ltail_vsx_8x:
++	addi	r11,$sp,$LOCALS
++	mtctr	$len
++	stvx_4w	$xa0,$x00,r11			# offload block to stack
++	stvx_4w	$xb0,$x10,r11
++	stvx_4w	$xc0,$x20,r11
++	stvx_4w	$xd0,$x30,r11
++	subi	r12,r11,1			# prepare for *++ptr
++	subi	$inp,$inp,1
++	subi	$out,$out,1
++	bl      Loop_tail_vsx_8x
++Ltail_vsx_8x_1:
++	addi	r11,$sp,$LOCALS
++	mtctr	$len
++	stvx_4w	$xan0,$x00,r11			# offload block to stack
++	stvx_4w	$xbn0,$x10,r11
++	stvx_4w	$xcn0,$x20,r11
++	stvx_4w	$xdn0,$x30,r11
++	subi	r12,r11,1			# prepare for *++ptr
++	subi	$inp,$inp,1
++	subi	$out,$out,1
++        bl      Loop_tail_vsx_8x
++
++Loop_tail_vsx_8x:
++	lbzu	r6,1(r12)
++	lbzu	r7,1($inp)
++	xor	r6,r6,r7
++	stbu	r6,1($out)
++	bdnz	Loop_tail_vsx_8x
++
++	stvx_4w	$K[0],$x00,r11			# wipe copy of the block
++	stvx_4w	$K[0],$x10,r11
++	stvx_4w	$K[0],$x20,r11
++	stvx_4w	$K[0],$x30,r11
++
++	b	Ldone_vsx_8x
++	.long	0
++	.byte	0,12,0x04,1,0x80,0,5,0
++	.long	0
++.size	.ChaCha20_ctr32_vsx_8x,.-.ChaCha20_ctr32_vsx_8x
++___
++}}}
++
++
++$code.=<<___;
++.align	5
++Lconsts:
++	mflr	r0
++	bcl	20,31,\$+4
++	mflr	r12	#vvvvv distance between "." and Lsigma
++	addi	r12,r12,`64-8`	# r12 = Lsigma: LR held the address of the mflr above, Lsigma is 64-8 bytes past it
++	mtlr	r0
++	blr
++	.long	0
++	.byte	0,12,0x14,0,0,0,0,0
++	.space	`64-9*4`
++Lsigma:
++	.long   0x61707865,0x3320646e,0x79622d32,0x6b206574	# "expand 32-byte k" as four little-endian words
++	.long	1,0,0,0
++	.long	2,0,0,0
++	.long	3,0,0,0
++	.long	4,0,0,0
++___
++$code.=<<___ 	if ($LITTLE_ENDIAN);
++	.long	0x0e0f0c0d,0x0a0b0809,0x06070405,0x02030001
++	.long	0x0d0e0f0c,0x090a0b08,0x05060704,0x01020300
++___
++$code.=<<___ 	if (!$LITTLE_ENDIAN);	# flipped words
++	.long	0x02030001,0x06070405,0x0a0b0809,0x0e0f0c0d
++	.long	0x01020300,0x05060704,0x090a0b08,0x0d0e0f0c
++___
++$code.=<<___;
++	.long	0x61707865,0x61707865,0x61707865,0x61707865	# the four sigma words again, each repeated four times
++	.long	0x3320646e,0x3320646e,0x3320646e,0x3320646e
++	.long	0x79622d32,0x79622d32,0x79622d32,0x79622d32
++	.long	0x6b206574,0x6b206574,0x6b206574,0x6b206574
++	.long	0,1,2,3
++        .long   0x03020100,0x07060504,0x0b0a0908,0x0f0e0d0c
++.asciz  "ChaCha20 for PowerPC/AltiVec, CRYPTOGAMS by <appro\@openssl.org>"
++.align	2
++___
++
++foreach (split("\n",$code)) {	# post-process the generated code line by line
++	s/\`([^\`]*)\`/eval $1/ge;	# replace each backquoted expression with its eval result
++
++	# instructions prefixed with 'be?', 'le?' or a bare '?' are
++	# endian-specific and need to be adjusted accordingly...
++	if ($flavour !~ /le$/) {	# big-endian
++	    s/be\?//		or	# keep big-endian-only lines
++	    s/le\?/#le#/	or	# comment out little-endian-only lines
++	    s/\?lvsr/lvsl/	or
++	    s/\?lvsl/lvsr/	or
++	    s/\?(vperm\s+v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+,\s*)(v[0-9]+)/$1$3$2$4/ or	# swap the two source operands
++	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 16-$3/;	# vrldoi vD,vA,N is emitted as vsldoi vD,vA,vA,16-N
++	} else {			# little-endian
++	    s/le\?//		or	# keep little-endian-only lines
++	    s/be\?/#be#/	or	# comment out big-endian-only lines
++	    s/\?([a-z]+)/$1/	or	# drop the bare '?' marker
++	    s/vrldoi(\s+v[0-9]+,\s*)(v[0-9]+,)\s*([0-9]+)/vsldoi$1$2$2 $3/;	# vrldoi vD,vA,N is emitted as vsldoi vD,vA,vA,N
++	}
++
++	print $_,"\n";
++}
++
++close STDOUT or die "error closing STDOUT: $!";
+diff --git a/crypto/chacha/build.info b/crypto/chacha/build.info
+index c12cb9c..2a819b2 100644
+--- a/crypto/chacha/build.info
++++ b/crypto/chacha/build.info
+@@ -12,7 +12,7 @@ IF[{- !$disabled{asm} -}]
+   $CHACHAASM_armv4=chacha-armv4.S
+   $CHACHAASM_aarch64=chacha-armv8.S
+ 
+-  $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s
++  $CHACHAASM_ppc32=chacha_ppc.c chacha-ppc.s chachap10-ppc.s
+   $CHACHAASM_ppc64=$CHACHAASM_ppc32
+ 
+   $CHACHAASM_c64xplus=chacha-c64xplus.s
+@@ -29,6 +29,7 @@ SOURCE[../../libcrypto]=$CHACHAASM
+ GENERATE[chacha-x86.s]=asm/chacha-x86.pl
+ GENERATE[chacha-x86_64.s]=asm/chacha-x86_64.pl
+ GENERATE[chacha-ppc.s]=asm/chacha-ppc.pl
++GENERATE[chachap10-ppc.s]=asm/chachap10-ppc.pl
+ GENERATE[chacha-armv4.S]=asm/chacha-armv4.pl
+ INCLUDE[chacha-armv4.o]=..
+ GENERATE[chacha-armv8.S]=asm/chacha-armv8.pl
+diff --git a/crypto/chacha/chacha_ppc.c b/crypto/chacha/chacha_ppc.c
+index 5319040..f99cca8 100644
+--- a/crypto/chacha/chacha_ppc.c
++++ b/crypto/chacha/chacha_ppc.c
+@@ -23,13 +23,18 @@ void ChaCha20_ctr32_vmx(unsigned char *out, const unsigned char *inp,
+ void ChaCha20_ctr32_vsx(unsigned char *out, const unsigned char *inp,
+                         size_t len, const unsigned int key[8],
+                         const unsigned int counter[4]);
++void ChaCha20_ctr32_vsx_p10(unsigned char *out, const unsigned char *inp,
++                        size_t len, const unsigned int key[8],
++                        const unsigned int counter[4]);
+ void ChaCha20_ctr32(unsigned char *out, const unsigned char *inp,
+                     size_t len, const unsigned int key[8],
+                     const unsigned int counter[4])
+ {
+-    OPENSSL_ppccap_P & PPC_CRYPTO207
+-        ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
+-        : OPENSSL_ppccap_P & PPC_ALTIVEC
+-            ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
+-            : ChaCha20_ctr32_int(out, inp, len, key, counter);
++    OPENSSL_ppccap_P & PPC_BRD31
++        ? ChaCha20_ctr32_vsx_p10(out, inp, len, key, counter)
++        :OPENSSL_ppccap_P & PPC_CRYPTO207
++            ? ChaCha20_ctr32_vsx(out, inp, len, key, counter)
++            : OPENSSL_ppccap_P & PPC_ALTIVEC
++                 ? ChaCha20_ctr32_vmx(out, inp, len, key, counter)
++                 : ChaCha20_ctr32_int(out, inp, len, key, counter);
+ }
+diff --git a/crypto/perlasm/ppc-xlate.pl b/crypto/perlasm/ppc-xlate.pl
+index 2ee4440..4590340 100755
+--- a/crypto/perlasm/ppc-xlate.pl
++++ b/crypto/perlasm/ppc-xlate.pl
+@@ -293,6 +293,14 @@ my $vpermdi	= sub {				# xxpermdi
+     $dm = oct($dm) if ($dm =~ /^0/);
+     "	.long	".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|($dm<<8)|(10<<3)|7;
+ };
++my $vxxlor	= sub {				# xxlor, low bits 0b110: sources are VRs, target is VSR0-31
++    my ($f, $vrt, $vra, $vrb) = @_;
++    "	.long	".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|6;
++};
++my $vxxlorc	= sub {				# also xxlor (same XO 146), low bits 0b001: sources are VSR0-31, target is a VR
++    my ($f, $vrt, $vra, $vrb) = @_;
++    "	.long	".sprintf "0x%X",(60<<26)|($vrt<<21)|($vra<<16)|($vrb<<11)|(146<<3)|1;
++};
+ 
+ # PowerISA 2.07 stuff
+ sub vcrypto_op {
+@@ -377,6 +385,15 @@ my $addex = sub {
+ };
+ my $vmsumudm	= sub { vfour_vsr(@_, 35); };
+ 
++# PowerISA 3.1 stuff
++my $brd = sub {				# brd RA,RS: primary opcode 31, extended opcode 187
++    my ($f, $ra, $rs) = @_;
++    "  .long   ".sprintf "0x%X",(31<<26)|($rs<<21)|($ra<<16)|(187<<1);
++};
++my $vsrq	= sub { vcrypto_op(@_, 517); };
++
++
++
+ while($line=<>) {
+ 
+     $line =~ s|[#!;].*$||;	# get rid of asm-style comments...
+diff --git a/crypto/ppccap.c b/crypto/ppccap.c
+index 8bcfed2..664627c 100644
+--- a/crypto/ppccap.c
++++ b/crypto/ppccap.c
+@@ -45,6 +45,7 @@ void OPENSSL_ppc64_probe(void);
+ void OPENSSL_altivec_probe(void);
+ void OPENSSL_crypto207_probe(void);
+ void OPENSSL_madd300_probe(void);
++void OPENSSL_brd31_probe(void);
+ 
+ long OPENSSL_rdtsc_mftb(void);
+ long OPENSSL_rdtsc_mfspr268(void);
+@@ -117,16 +118,21 @@ static unsigned long getauxval(unsigned long key)
+ #endif
+ 
+ /* I wish <sys/auxv.h> was universally available */
+-#define HWCAP                   16      /* AT_HWCAP */
++#ifndef AT_HWCAP
++# define AT_HWCAP               16      /* AT_HWCAP */
++#endif
+ #define HWCAP_PPC64             (1U << 30)
+ #define HWCAP_ALTIVEC           (1U << 28)
+ #define HWCAP_FPU               (1U << 27)
+ #define HWCAP_POWER6_EXT        (1U << 9)
+ #define HWCAP_VSX               (1U << 7)
+ 
+-#define HWCAP2                  26      /* AT_HWCAP2 */
++#ifndef AT_HWCAP2
++# define AT_HWCAP2              26      /* AT_HWCAP2 */
++#endif
+ #define HWCAP_VEC_CRYPTO        (1U << 25)
+ #define HWCAP_ARCH_3_00         (1U << 23)
++#define HWCAP_ARCH_3_1          (1U << 18)
+ 
+ # if defined(__GNUC__) && __GNUC__>=2
+ __attribute__ ((constructor))
+@@ -187,6 +193,9 @@ void OPENSSL_cpuid_setup(void)
+     if (__power_set(0xffffffffU<<17))           /* POWER9 and later */
+         OPENSSL_ppccap_P |= PPC_MADD300;
+ 
++    if (__power_set(0xffffffffU<<18))           /* POWER10 and later */
++        OPENSSL_ppccap_P |= PPC_BRD31;
++
+     return;
+ # endif
+ #endif
+@@ -215,8 +224,8 @@ void OPENSSL_cpuid_setup(void)
+ 
+ #ifdef OSSL_IMPLEMENT_GETAUXVAL
+     {
+-        unsigned long hwcap = getauxval(HWCAP);
+-        unsigned long hwcap2 = getauxval(HWCAP2);
++        unsigned long hwcap = getauxval(AT_HWCAP);
++        unsigned long hwcap2 = getauxval(AT_HWCAP2);
+ 
+         if (hwcap & HWCAP_FPU) {
+             OPENSSL_ppccap_P |= PPC_FPU;
+@@ -242,6 +251,10 @@ void OPENSSL_cpuid_setup(void)
+         if (hwcap2 & HWCAP_ARCH_3_00) {
+             OPENSSL_ppccap_P |= PPC_MADD300;
+         }
++
++        if (hwcap2 & HWCAP_ARCH_3_1) {
++            OPENSSL_ppccap_P |= PPC_BRD31;
++        }
+     }
+ #endif
+ 
+@@ -263,7 +276,7 @@ void OPENSSL_cpuid_setup(void)
+     sigaction(SIGILL, &ill_act, &ill_oact);
+ 
+ #ifndef OSSL_IMPLEMENT_GETAUXVAL
+-    if (sigsetjmp(ill_jmp,1) == 0) {
++    if (sigsetjmp(ill_jmp, 1) == 0) {
+         OPENSSL_fpu_probe();
+         OPENSSL_ppccap_P |= PPC_FPU;
+ 
+diff --git a/crypto/ppccpuid.pl b/crypto/ppccpuid.pl
+index c6555df..706164a 100755
+--- a/crypto/ppccpuid.pl
++++ b/crypto/ppccpuid.pl
+@@ -81,6 +81,17 @@ $code=<<___;
+ 	.long	0
+ 	.byte	0,12,0x14,0,0,0,0,0
+ 
++.globl	.OPENSSL_brd31_probe
++.align	4
++.OPENSSL_brd31_probe:
++	xor	r0,r0,r0	# r0 = 0
++	brd	r3,r0		# PowerISA 3.1 instruction. NOTE(review): presumably faults on CPUs without it - confirm how the probe is called
++	blr
++	.long	0
++	.byte	0,12,0x14,0,0,0,0,0
++.size	.OPENSSL_brd31_probe,.-.OPENSSL_brd31_probe
++
++
+ .globl	.OPENSSL_wipe_cpu
+ .align	4
+ .OPENSSL_wipe_cpu:
+diff --git a/include/crypto/ppc_arch.h b/include/crypto/ppc_arch.h
+index 3b3ce4b..fcc846c 100644
+--- a/include/crypto/ppc_arch.h
++++ b/include/crypto/ppc_arch.h
+@@ -24,5 +24,6 @@ extern unsigned int OPENSSL_ppccap_P;
+ # define PPC_MADD300     (1<<4)
+ # define PPC_MFTB        (1<<5)
+ # define PPC_MFSPR268    (1<<6)
++# define PPC_BRD31       (1<<7)
+ 
+ #endif
diff --git a/openssl.spec b/openssl.spec
index 38e2e94..0ba4ba0 100644
--- a/openssl.spec
+++ b/openssl.spec
@@ -140,6 +140,13 @@ Patch68: 0068-CVE-2022-2068.patch
 Patch69: 0069-CVE-2022-2097.patch
 # https://github.com/openssl/openssl/commit/edceec7fe0c9a5534ae155c8398c63dd7dd95483
 Patch70: 0070-EVP_PKEY_Q_keygen-Call-OPENSSL_init_crypto-to-init-s.patch
+# https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c
+# https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd
+Patch71: 0071-AES-GCM-performance-optimization.patch
+# https://github.com/openssl/openssl/commit/f596bbe4da779b56eea34d96168b557d78e1149
+# https://github.com/openssl/openssl/commit/7e1f3ffcc5bc15fb9a12b9e3bb202f544c6ed5aa
+# hunks in crypto/ppccap.c from https://github.com/openssl/openssl/commit/f5485b97b6c9977c0d39c7669b9f97a879312447
+Patch72: 0072-ChaCha20-performance-optimizations-for-ppc64le.patch
 
 License: ASL 2.0
 URL: http://www.openssl.org/
@@ -474,6 +481,10 @@ install -m644 %{SOURCE9} \
 - Fix segfault in EVP_PKEY_Q_keygen() when OpenSSL was not previously
   initialized.
   Resolves: rhbz#2103289
+- Improve AES-GCM performance on Power9 and Power10 ppc64le
+  Resolves: rhbz#2051312
+- Improve ChaCha20 performance on Power10 ppc64le
+  Resolves: rhbz#2051312
 
 * Tue Jul 05 2022 Clemens Lang <cllang@redhat.com> - 1:3.0.1-37
 - CVE-2022-2097: AES OCB fails to encrypt some bytes on 32-bit x86