forked from rpms/openssl
		
	Backport of ppc64le Montgomery multiply enhancement
Resolves: rhbz#2130708
This commit is contained in:
		
							parent
							
								
									c29e183891
								
							
						
					
					
						commit
						657265459d
					
				
							
								
								
									
										703
									
								
								0067-ppc64le-Montgomery-multiply.patch
									
									
									
									
									
										Normal file
									
								
							
							
						
						
									
										703
									
								
								0067-ppc64le-Montgomery-multiply.patch
									
									
									
									
									
										Normal file
									
								
							| @ -0,0 +1,703 @@ | ||||
| From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001 | ||||
| From: Rohan McLure <rohanmclure@linux.ibm.com> | ||||
| Date: Mon, 27 Jun 2022 12:14:55 +1000 | ||||
| Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC | ||||
|  Montgomery Multiplication"" | ||||
| 
 | ||||
| This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e. | ||||
| ---
 | ||||
|  crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++ | ||||
|  crypto/bn/bn_ppc.c                |  15 + | ||||
|  crypto/bn/build.info              |   3 +- | ||||
|  3 files changed, 598 insertions(+), 1 deletion(-) | ||||
| 
 | ||||
| diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| index e69de29bb2d1..0fb397bc5f12 100755
 | ||||
| --- a/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| @@ -0,0 +1,581 @@
 | ||||
| +#! /usr/bin/env perl
 | ||||
| +# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved.
 | ||||
| +#
 | ||||
| +# Licensed under the Apache License 2.0 (the "License").  You may not use
 | ||||
| +# this file except in compliance with the License.  You can obtain a copy
 | ||||
| +# in the file LICENSE in the source distribution or at
 | ||||
| +# https://www.openssl.org/source/license.html
 | ||||
| +
 | ||||
| +# ====================================================================
 | ||||
| +# Written by Amitay Isaacs <amitay@ozlabs.org>, Martin Schwenke
 | ||||
| +# <martin@meltin.net> & Alastair D'Silva <alastair@d-silva.org> for
 | ||||
| +# the OpenSSL project.
 | ||||
| +# ====================================================================
 | ||||
| +
 | ||||
| +#
 | ||||
| +# Fixed length (n=6), unrolled PPC Montgomery Multiplication
 | ||||
| +#
 | ||||
| +
 | ||||
| +# 2021
 | ||||
| +#
 | ||||
| +# Although this is a generic implementation for unrolling Montgomery
 | ||||
| +# Multiplication for arbitrary values of n, this is currently only
 | ||||
| +# used for n = 6 to improve the performance of ECC p384.
 | ||||
| +#
 | ||||
| +# Unrolling allows intermediate results to be stored in registers,
 | ||||
| +# rather than on the stack, improving performance by ~7% compared to
 | ||||
| +# the existing PPC assembly code.
 | ||||
| +#
 | ||||
| +# The ISA 3.0 implementation uses combination multiply/add
 | ||||
| +# instructions (maddld, maddhdu) to improve performance by an
 | ||||
| +# additional ~10% on Power 9.
 | ||||
| +#
 | ||||
| +# Finally, saving non-volatile registers into volatile vector
 | ||||
| +# registers instead of onto the stack saves a little more.
 | ||||
| +#
 | ||||
| +# On a Power 9 machine we see an overall improvement of ~18%.
 | ||||
| +#
 | ||||
| +
 | ||||
| +use strict;
 | ||||
| +use warnings;
 | ||||
| +
 | ||||
| +my ($flavour, $output, $dir, $xlate);
 | ||||
| +
 | ||||
| +# $output is the last argument if it looks like a file (it has an extension)
 | ||||
| +# $flavour is the first argument if it doesn't look like a file
 | ||||
| +$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef;
 | ||||
| +$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef;
 | ||||
| +
 | ||||
| +$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
 | ||||
| +( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or
 | ||||
| +( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or
 | ||||
| +die "can't locate ppc-xlate.pl";
 | ||||
| +
 | ||||
| +open STDOUT,"| $^X $xlate $flavour \"$output\""
 | ||||
| +    or die "can't call $xlate: $!";
 | ||||
| +
 | ||||
| +if ($flavour !~ /64/) {
 | ||||
| +	die "bad flavour ($flavour) - only ppc64 permitted";
 | ||||
| +}
 | ||||
| +
 | ||||
| +my $SIZE_T= 8;
 | ||||
| +
 | ||||
| +# Registers are global so the code is remotely readable
 | ||||
| +
 | ||||
| +# Parameters for Montgomery multiplication
 | ||||
| +my $sp	= "r1";
 | ||||
| +my $toc	= "r2";
 | ||||
| +my $rp	= "r3";
 | ||||
| +my $ap	= "r4";
 | ||||
| +my $bp	= "r5";
 | ||||
| +my $np	= "r6";
 | ||||
| +my $n0	= "r7";
 | ||||
| +my $num	= "r8";
 | ||||
| +
 | ||||
| +my $i	= "r9";
 | ||||
| +my $c0	= "r10";
 | ||||
| +my $bp0	= "r11";
 | ||||
| +my $bpi	= "r11";
 | ||||
| +my $bpj	= "r11";
 | ||||
| +my $tj	= "r12";
 | ||||
| +my $apj	= "r12";
 | ||||
| +my $npj	= "r12";
 | ||||
| +my $lo	= "r14";
 | ||||
| +my $c1	= "r14";
 | ||||
| +
 | ||||
| +# Non-volatile registers used for tp[i]
 | ||||
| +#
 | ||||
| +# 12 registers are available but the limit on unrolling is 10,
 | ||||
| +# since registers from $tp[0] to $tp[$n+1] are used.
 | ||||
| +my @tp = ("r20" .. "r31");
 | ||||
| +
 | ||||
| +# volatile VSRs for saving non-volatile GPRs - faster than stack
 | ||||
| +my @vsrs = ("v32" .. "v46");
 | ||||
| +
 | ||||
| +package Mont;
 | ||||
| +
 | ||||
| +sub new($$)
 | ||||
| +{
 | ||||
| +	my ($class, $n) = @_;
 | ||||
| +
 | ||||
| +	if ($n > 10) {
 | ||||
| +		die "Can't unroll for BN length ${n} (maximum 10)"
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	my $self = {
 | ||||
| +		code => "",
 | ||||
| +		n => $n,
 | ||||
| +	};
 | ||||
| +	bless $self, $class;
 | ||||
| +
 | ||||
| +	return $self;
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub add_code($$)
 | ||||
| +{
 | ||||
| +	my ($self, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->{code} .= $c;
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_code($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	return $self->{code};
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_function_name($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	return "bn_mul_mont_fixed_n" . $self->{n};
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_label($$)
 | ||||
| +{
 | ||||
| +	my ($self, $l) = @_;
 | ||||
| +
 | ||||
| +	return "L" . $l . "_" . $self->{n};
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_labels($@)
 | ||||
| +{
 | ||||
| +	my ($self, @labels) = @_;
 | ||||
| +
 | ||||
| +	my %out = ();
 | ||||
| +
 | ||||
| +	foreach my $l (@labels) {
 | ||||
| +		$out{"$l"} = $self->get_label("$l");
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	return \%out;
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub nl($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code("\n");
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub copy_result($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	my ($n) = $self->{n};
 | ||||
| +
 | ||||
| +	for (my $j = 0; $j < $n; $j++) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	std		$tp[$j],`$j*$SIZE_T`($rp)
 | ||||
| +___
 | ||||
| +	}
 | ||||
| +
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub mul_mont_fixed($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	my ($n) = $self->{n};
 | ||||
| +	my $fname = $self->get_function_name();
 | ||||
| +	my $label = $self->get_labels("outer", "enter", "sub", "copy", "end");
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +
 | ||||
| +.globl	.${fname}
 | ||||
| +.align	5
 | ||||
| +.${fname}:
 | ||||
| +
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->save_registers();
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	ld		$n0,0($n0)
 | ||||
| +
 | ||||
| +	ld		$bp0,0($bp)
 | ||||
| +
 | ||||
| +	ld		$apj,0($ap)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->mul_c_0($tp[0], $apj, $bp0, $c0);
 | ||||
| +
 | ||||
| +	for (my $j = 1; $j < $n - 1; $j++) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	ld		$apj,`$j*$SIZE_T`($ap)
 | ||||
| +___
 | ||||
| +		$self->mul($tp[$j], $apj, $bp0, $c0);
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	ld		$apj,`($n-1)*$SIZE_T`($ap)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0);
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	li		$tp[$n+1],0
 | ||||
| +
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	li		$i,0
 | ||||
| +	mtctr		$num
 | ||||
| +	b		$label->{"enter"}
 | ||||
| +
 | ||||
| +.align	4
 | ||||
| +$label->{"outer"}:
 | ||||
| +	ldx		$bpi,$bp,$i
 | ||||
| +
 | ||||
| +	ld		$apj,0($ap)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0);
 | ||||
| +
 | ||||
| +	for (my $j = 1; $j < $n; $j++) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	ld		$apj,`$j*$SIZE_T`($ap)
 | ||||
| +___
 | ||||
| +		$self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0);
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	addc		$tp[$n],$tp[$n],$c0
 | ||||
| +	addze		$tp[$n+1],$tp[$n+1]
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +.align	4
 | ||||
| +$label->{"enter"}:
 | ||||
| +	mulld		$bpi,$tp[0],$n0
 | ||||
| +
 | ||||
| +	ld		$npj,0($np)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0);
 | ||||
| +
 | ||||
| +	for (my $j = 1; $j < $n; $j++) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	ld		$npj,`$j*$SIZE_T`($np)
 | ||||
| +___
 | ||||
| +		$self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0);
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	addc		$tp[$n-1],$tp[$n],$c0
 | ||||
| +	addze		$tp[$n],$tp[$n+1]
 | ||||
| +
 | ||||
| +	addi		$i,$i,$SIZE_T
 | ||||
| +	bdnz		$label->{"outer"}
 | ||||
| +
 | ||||
| +	and.		$tp[$n],$tp[$n],$tp[$n]
 | ||||
| +	bne		$label->{"sub"}
 | ||||
| +
 | ||||
| +	cmpld	$tp[$n-1],$npj
 | ||||
| +	blt		$label->{"copy"}
 | ||||
| +
 | ||||
| +$label->{"sub"}:
 | ||||
| +___
 | ||||
| +
 | ||||
| +	#
 | ||||
| +	# Reduction
 | ||||
| +	#
 | ||||
| +
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	ld		$bpj,`0*$SIZE_T`($np)
 | ||||
| +	subfc		$c1,$bpj,$tp[0]
 | ||||
| +	std		$c1,`0*$SIZE_T`($rp)
 | ||||
| +
 | ||||
| +___
 | ||||
| +	for (my $j = 1; $j < $n - 1; $j++) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	ld		$bpj,`$j*$SIZE_T`($np)
 | ||||
| +	subfe		$c1,$bpj,$tp[$j]
 | ||||
| +	std		$c1,`$j*$SIZE_T`($rp)
 | ||||
| +
 | ||||
| +___
 | ||||
| +	}
 | ||||
| +
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	subfe		$c1,$npj,$tp[$n-1]
 | ||||
| +	std		$c1,`($n-1)*$SIZE_T`($rp)
 | ||||
| +
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	addme.		$tp[$n],$tp[$n]
 | ||||
| +	beq		$label->{"end"}
 | ||||
| +
 | ||||
| +$label->{"copy"}:
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->copy_result();
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +
 | ||||
| +$label->{"end"}:
 | ||||
| +___
 | ||||
| +
 | ||||
| +	$self->restore_registers();
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	li		r3,1
 | ||||
| +	blr
 | ||||
| +.size .${fname},.-.${fname}
 | ||||
| +___
 | ||||
| +
 | ||||
| +}
 | ||||
| +
 | ||||
| +package Mont::GPR;
 | ||||
| +
 | ||||
| +our @ISA = ('Mont');
 | ||||
| +
 | ||||
| +sub new($$)
 | ||||
| +{
 | ||||
| +    my ($class, $n) = @_;
 | ||||
| +
 | ||||
| +    return $class->SUPER::new($n);
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub save_registers($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	my $n = $self->{n};
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	std	$lo,-8($sp)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	for (my $j = 0; $j <= $n+1; $j++) {
 | ||||
| +		$self->{code}.=<<___;
 | ||||
| +	std	$tp[$j],-`($j+2)*8`($sp)
 | ||||
| +___
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub restore_registers($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	my $n = $self->{n};
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	ld	$lo,-8($sp)
 | ||||
| +___
 | ||||
| +
 | ||||
| +	for (my $j = 0; $j <= $n+1; $j++) {
 | ||||
| +		$self->{code}.=<<___;
 | ||||
| +	ld	$tp[$j],-`($j+2)*8`($sp)
 | ||||
| +___
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->{code} .=<<___;
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Direct translation of C mul()
 | ||||
| +sub mul($$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld		$lo,$a,$w
 | ||||
| +	addc		$r,$lo,$c
 | ||||
| +	mulhdu		$c,$a,$w
 | ||||
| +	addze		$c,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like mul() but $c is ignored as an input - an optimisation to save a
 | ||||
| +# preliminary instruction that would set input $c to 0
 | ||||
| +sub mul_c_0($$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld		$r,$a,$w
 | ||||
| +	mulhdu		$c,$a,$w
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like mul() but does not to the final addition of CA into $c - an
 | ||||
| +# optimisation to save an instruction
 | ||||
| +sub mul_last($$$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r1, $r2, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld		$lo,$a,$w
 | ||||
| +	addc		$r1,$lo,$c
 | ||||
| +	mulhdu		$c,$a,$w
 | ||||
| +
 | ||||
| +	addze		$r2,$c
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like C mul_add() but allow $r_out and $r_in to be different
 | ||||
| +sub mul_add($$$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld		$lo,$a,$w
 | ||||
| +	addc		$lo,$lo,$c
 | ||||
| +	mulhdu		$c,$a,$w
 | ||||
| +	addze		$c,$c
 | ||||
| +	addc		$r_out,$r_in,$lo
 | ||||
| +	addze		$c,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like mul_add() but $c is ignored as an input - an optimisation to save a
 | ||||
| +# preliminary instruction that would set input $c to 0
 | ||||
| +sub mul_add_c_0($$$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld		$lo,$a,$w
 | ||||
| +	addc		$r_out,$r_in,$lo
 | ||||
| +	mulhdu		$c,$a,$w
 | ||||
| +	addze		$c,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +package Mont::GPR_300;
 | ||||
| +
 | ||||
| +our @ISA = ('Mont::GPR');
 | ||||
| +
 | ||||
| +sub new($$)
 | ||||
| +{
 | ||||
| +	my ($class, $n) = @_;
 | ||||
| +
 | ||||
| +	my $mont = $class->SUPER::new($n);
 | ||||
| +
 | ||||
| +	return $mont;
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_function_name($)
 | ||||
| +{
 | ||||
| +	my ($self) = @_;
 | ||||
| +
 | ||||
| +	return "bn_mul_mont_300_fixed_n" . $self->{n};
 | ||||
| +}
 | ||||
| +
 | ||||
| +sub get_label($$)
 | ||||
| +{
 | ||||
| +	my ($self, $l) = @_;
 | ||||
| +
 | ||||
| +	return "L" . $l . "_300_" . $self->{n};
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Direct translation of C mul()
 | ||||
| +sub mul($$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r, $a, $w, $c, $last) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	maddld		$r,$a,$w,$c
 | ||||
| +	maddhdu		$c,$a,$w,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Save the last carry as the final entry
 | ||||
| +sub mul_last($$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r1, $r2, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	maddld		$r1,$a,$w,$c
 | ||||
| +	maddhdu		$r2,$a,$w,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like mul() but $c is ignored as an input - an optimisation to save a
 | ||||
| +# preliminary instruction that would set input $c to 0
 | ||||
| +sub mul_c_0($$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	mulld          $r,$a,$w
 | ||||
| +	mulhdu          $c,$a,$w
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like C mul_add() but allow $r_out and $r_in to be different
 | ||||
| +sub mul_add($$$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	maddld		$lo,$a,$w,$c
 | ||||
| +	maddhdu		$c,$a,$w,$c
 | ||||
| +	addc		$r_out,$r_in,$lo
 | ||||
| +	addze		$c,$c
 | ||||
| +
 | ||||
| +___
 | ||||
| +}
 | ||||
| +
 | ||||
| +# Like mul_add() but $c is ignored as an input - an optimisation to save a
 | ||||
| +# preliminary instruction that would set input $c to 0
 | ||||
| +sub mul_add_c_0($$$$$$)
 | ||||
| +{
 | ||||
| +	my ($self, $r_out, $r_in, $a, $w, $c) = @_;
 | ||||
| +
 | ||||
| +	$self->add_code(<<___);
 | ||||
| +	maddld		$lo,$a,$w,$r_in
 | ||||
| +	maddhdu		$c,$a,$w,$r_in
 | ||||
| +___
 | ||||
| +
 | ||||
| +	if ($r_out ne $lo) {
 | ||||
| +		$self->add_code(<<___);
 | ||||
| +	mr			$r_out,$lo
 | ||||
| +___
 | ||||
| +	}
 | ||||
| +
 | ||||
| +	$self->nl();
 | ||||
| +}
 | ||||
| +
 | ||||
| +
 | ||||
| +package main;
 | ||||
| +
 | ||||
| +my $code;
 | ||||
| +
 | ||||
| +$code.=<<___;
 | ||||
| +.machine "any"
 | ||||
| +.text
 | ||||
| +___
 | ||||
| +
 | ||||
| +my $mont;
 | ||||
| +
 | ||||
| +$mont = new Mont::GPR(6);
 | ||||
| +$mont->mul_mont_fixed();
 | ||||
| +$code .= $mont->get_code();
 | ||||
| +
 | ||||
| +$mont = new Mont::GPR_300(6);
 | ||||
| +$mont->mul_mont_fixed();
 | ||||
| +$code .= $mont->get_code();
 | ||||
| +
 | ||||
| +$code =~ s/\`([^\`]*)\`/eval $1/gem;
 | ||||
| +
 | ||||
| +$code.=<<___;
 | ||||
| +.asciz  "Montgomery Multiplication for PPC by <amitay\@ozlabs.org>, <alastair\@d-silva.org>"
 | ||||
| +___
 | ||||
| +
 | ||||
| +print $code;
 | ||||
| +close STDOUT or die "error closing STDOUT: $!";
 | ||||
| diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c
 | ||||
| index 3ee76ea96574..1e9421bee213 100644
 | ||||
| --- a/crypto/bn/bn_ppc.c
 | ||||
| +++ b/crypto/bn/bn_ppc.c
 | ||||
| @@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 | ||||
|                          const BN_ULONG *np, const BN_ULONG *n0, int num); | ||||
|      int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, | ||||
|                            const BN_ULONG *np, const BN_ULONG *n0, int num); | ||||
| +    int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
 | ||||
| +                             const BN_ULONG *bp, const BN_ULONG *np,
 | ||||
| +                             const BN_ULONG *n0, int num);
 | ||||
| +    int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap,
 | ||||
| +                                 const BN_ULONG *bp, const BN_ULONG *np,
 | ||||
| +                                 const BN_ULONG *n0, int num);
 | ||||
|   | ||||
|      if (num < 4) | ||||
|          return 0; | ||||
| @@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
 | ||||
|       * no opportunity to figure it out... | ||||
|       */ | ||||
|   | ||||
| +#if defined(_ARCH_PPC64) && !defined(__ILP32__)
 | ||||
| +    if (num == 6) {
 | ||||
| +        if (OPENSSL_ppccap_P & PPC_MADD300)
 | ||||
| +            return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num);
 | ||||
| +        else
 | ||||
| +            return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num);
 | ||||
| +    }
 | ||||
| +#endif
 | ||||
| +
 | ||||
|      return bn_mul_mont_int(rp, ap, bp, np, n0, num); | ||||
|  } | ||||
| diff --git a/crypto/bn/build.info b/crypto/bn/build.info
 | ||||
| index 4f8d0689b5ea..987a70ae263b 100644
 | ||||
| --- a/crypto/bn/build.info
 | ||||
| +++ b/crypto/bn/build.info
 | ||||
| @@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}]
 | ||||
|   | ||||
|    $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s | ||||
|    $BNDEF_ppc32=OPENSSL_BN_ASM_MONT | ||||
| -  $BNASM_ppc64=$BNASM_ppc32
 | ||||
| +  $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s
 | ||||
|    $BNDEF_ppc64=$BNDEF_ppc32 | ||||
|   | ||||
|    $BNASM_c64xplus=asm/bn-c64xplus.asm | ||||
| @@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl
 | ||||
|  GENERATE[bn-ppc.s]=asm/ppc.pl | ||||
|  GENERATE[ppc-mont.s]=asm/ppc-mont.pl | ||||
|  GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl | ||||
| +GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl
 | ||||
|   | ||||
|  GENERATE[alpha-mont.S]=asm/alpha-mont.pl | ||||
|   | ||||
| 
 | ||||
| From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001 | ||||
| From: Rohan McLure <rohanmclure@linux.ibm.com> | ||||
| Date: Thu, 30 Jun 2022 16:21:06 +1000 | ||||
| Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9 | ||||
| 
 | ||||
| In the reference C implementation in bn_asm.c, tp[num + 1] contains the | ||||
| carry bit for accumulations into tp[num]. tp[num + 1] is only ever | ||||
| assigned, never itself incremented. | ||||
| ---
 | ||||
|  crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++-- | ||||
|  1 file changed, 4 insertions(+), 2 deletions(-) | ||||
| 
 | ||||
| diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| index 0fb397bc5f12..e27d0ad93d85 100755
 | ||||
| --- a/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| +++ b/crypto/bn/asm/ppc64-mont-fixed.pl
 | ||||
| @@ -63,6 +63,7 @@
 | ||||
|  # Registers are global so the code is remotely readable | ||||
|   | ||||
|  # Parameters for Montgomery multiplication | ||||
| +my $ze	= "r0";
 | ||||
|  my $sp	= "r1"; | ||||
|  my $toc	= "r2"; | ||||
|  my $rp	= "r3"; | ||||
| @@ -192,6 +193,7 @@ ($)
 | ||||
|  	$self->save_registers(); | ||||
|   | ||||
|  	$self->add_code(<<___); | ||||
| +	li		$ze,0
 | ||||
|  	ld		$n0,0($n0) | ||||
|   | ||||
|  	ld		$bp0,0($bp) | ||||
| @@ -242,7 +244,7 @@ ($)
 | ||||
|   | ||||
|  	$self->add_code(<<___); | ||||
|  	addc		$tp[$n],$tp[$n],$c0 | ||||
| -	addze		$tp[$n+1],$tp[$n+1]
 | ||||
| +	addze		$tp[$n+1],$ze
 | ||||
|  ___ | ||||
|   | ||||
|  	$self->add_code(<<___); | ||||
| @@ -272,7 +274,7 @@ ($)
 | ||||
|  	and.		$tp[$n],$tp[$n],$tp[$n] | ||||
|  	bne		$label->{"sub"} | ||||
|   | ||||
| -	cmpld	$tp[$n-1],$npj
 | ||||
| +	cmpld		$tp[$n-1],$npj
 | ||||
|  	blt		$label->{"copy"} | ||||
|   | ||||
|  $label->{"sub"}: | ||||
| @ -109,6 +109,9 @@ Patch60: 0060-FIPS-KAT-signature-tests.patch | ||||
| # https://bugzilla.redhat.com/show_bug.cgi?id=2087147 | ||||
| Patch61: 0061-Deny-SHA-1-signature-verification-in-FIPS-provider.patch | ||||
| Patch62: 0062-fips-Expose-a-FIPS-indicator.patch | ||||
| # https://bugzilla.redhat.com/show_bug.cgi?id=2130708 | ||||
| # https://github.com/openssl/openssl/pull/18883 | ||||
| Patch67: 0067-ppc64le-Montgomery-multiply.patch | ||||
| # https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c | ||||
| # https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd | ||||
| Patch71: 0071-AES-GCM-performance-optimization.patch | ||||
| @ -488,6 +491,8 @@ install -m644 %{SOURCE9} \ | ||||
|   Resolves: rhbz#2093804 | ||||
| - Adjusting include for the FIPS_mode macro | ||||
|   Resolves: rhbz#2083879 | ||||
| - Backport of ppc64le Montgomery multiply enhancement | ||||
|   Resolves: rhbz#2130708 | ||||
| 
 | ||||
| * Tue Nov 22 2022 Dmitry Belyavskiy <dbelyavs@redhat.com> - 1:3.0.7-1 | ||||
| - Rebasing to OpenSSL 3.0.7 | ||||
|  | ||||
		Loading…
	
		Reference in New Issue
	
	Block a user