diff --git a/0067-ppc64le-Montgomery-multiply.patch b/0067-ppc64le-Montgomery-multiply.patch new file mode 100644 index 0000000..36c0222 --- /dev/null +++ b/0067-ppc64le-Montgomery-multiply.patch @@ -0,0 +1,703 @@ +From 33ffd36afa7594aeb958a925f521cb287ca850c8 Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Mon, 27 Jun 2022 12:14:55 +1000 +Subject: [PATCH 1/2] Revert "Revert "bn: Add fixed length (n=6), unrolled PPC + Montgomery Multiplication"" + +This reverts commit 712d9cc90e355b2c98a959d4e9398610d2269c9e. +--- + crypto/bn/asm/ppc64-mont-fixed.pl | 581 ++++++++++++++++++++++++++++++ + crypto/bn/bn_ppc.c | 15 + + crypto/bn/build.info | 3 +- + 3 files changed, 598 insertions(+), 1 deletion(-) + +diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl +index e69de29bb2d1..0fb397bc5f12 100755 +--- a/crypto/bn/asm/ppc64-mont-fixed.pl ++++ b/crypto/bn/asm/ppc64-mont-fixed.pl +@@ -0,0 +1,581 @@ ++#! /usr/bin/env perl ++# Copyright 2021-2022 The OpenSSL Project Authors. All Rights Reserved. ++# ++# Licensed under the Apache License 2.0 (the "License"). You may not use ++# this file except in compliance with the License. You can obtain a copy ++# in the file LICENSE in the source distribution or at ++# https://www.openssl.org/source/license.html ++ ++# ==================================================================== ++# Written by Amitay Isaacs , Martin Schwenke ++# & Alastair D'Silva for ++# the OpenSSL project. ++# ==================================================================== ++ ++# ++# Fixed length (n=6), unrolled PPC Montgomery Multiplication ++# ++ ++# 2021 ++# ++# Although this is a generic implementation for unrolling Montgomery ++# Multiplication for arbitrary values of n, this is currently only ++# used for n = 6 to improve the performance of ECC p384. ++# ++# Unrolling allows intermediate results to be stored in registers, ++# rather than on the stack, improving performance by ~7% compared to ++# the existing PPC assembly code. ++# ++# The ISA 3.0 implementation uses combination multiply/add ++# instructions (maddld, maddhdu) to improve performance by an ++# additional ~10% on Power 9. ++# ++# Finally, saving non-volatile registers into volatile vector ++# registers instead of onto the stack saves a little more. ++# ++# On a Power 9 machine we see an overall improvement of ~18%. ++# ++ ++use strict; ++use warnings; ++ ++my ($flavour, $output, $dir, $xlate); ++ ++# $output is the last argument if it looks like a file (it has an extension) ++# $flavour is the first argument if it doesn't look like a file ++$output = $#ARGV >= 0 && $ARGV[$#ARGV] =~ m|\.\w+$| ? pop : undef; ++$flavour = $#ARGV >= 0 && $ARGV[0] !~ m|\.| ? shift : undef; ++ ++$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; ++( $xlate="${dir}ppc-xlate.pl" and -f $xlate ) or ++( $xlate="${dir}../../perlasm/ppc-xlate.pl" and -f $xlate) or ++die "can't locate ppc-xlate.pl"; ++ ++open STDOUT,"| $^X $xlate $flavour \"$output\"" ++ or die "can't call $xlate: $!"; ++ ++if ($flavour !~ /64/) { ++ die "bad flavour ($flavour) - only ppc64 permitted"; ++} ++ ++my $SIZE_T= 8; ++ ++# Registers are global so the code is remotely readable ++ ++# Parameters for Montgomery multiplication ++my $sp = "r1"; ++my $toc = "r2"; ++my $rp = "r3"; ++my $ap = "r4"; ++my $bp = "r5"; ++my $np = "r6"; ++my $n0 = "r7"; ++my $num = "r8"; ++ ++my $i = "r9"; ++my $c0 = "r10"; ++my $bp0 = "r11"; ++my $bpi = "r11"; ++my $bpj = "r11"; ++my $tj = "r12"; ++my $apj = "r12"; ++my $npj = "r12"; ++my $lo = "r14"; ++my $c1 = "r14"; ++ ++# Non-volatile registers used for tp[i] ++# ++# 12 registers are available but the limit on unrolling is 10, ++# since registers from $tp[0] to $tp[$n+1] are used. ++my @tp = ("r20" .. "r31"); ++ ++# volatile VSRs for saving non-volatile GPRs - faster than stack ++my @vsrs = ("v32" .. "v46"); ++ ++package Mont; ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ if ($n > 10) { ++ die "Can't unroll for BN length ${n} (maximum 10)" ++ } ++ ++ my $self = { ++ code => "", ++ n => $n, ++ }; ++ bless $self, $class; ++ ++ return $self; ++} ++ ++sub add_code($$) ++{ ++ my ($self, $c) = @_; ++ ++ $self->{code} .= $c; ++} ++ ++sub get_code($) ++{ ++ my ($self) = @_; ++ ++ return $self->{code}; ++} ++ ++sub get_function_name($) ++{ ++ my ($self) = @_; ++ ++ return "bn_mul_mont_fixed_n" . $self->{n}; ++} ++ ++sub get_label($$) ++{ ++ my ($self, $l) = @_; ++ ++ return "L" . $l . "_" . $self->{n}; ++} ++ ++sub get_labels($@) ++{ ++ my ($self, @labels) = @_; ++ ++ my %out = (); ++ ++ foreach my $l (@labels) { ++ $out{"$l"} = $self->get_label("$l"); ++ } ++ ++ return \%out; ++} ++ ++sub nl($) ++{ ++ my ($self) = @_; ++ ++ $self->add_code("\n"); ++} ++ ++sub copy_result($) ++{ ++ my ($self) = @_; ++ ++ my ($n) = $self->{n}; ++ ++ for (my $j = 0; $j < $n; $j++) { ++ $self->add_code(<<___); ++ std $tp[$j],`$j*$SIZE_T`($rp) ++___ ++ } ++ ++} ++ ++sub mul_mont_fixed($) ++{ ++ my ($self) = @_; ++ ++ my ($n) = $self->{n}; ++ my $fname = $self->get_function_name(); ++ my $label = $self->get_labels("outer", "enter", "sub", "copy", "end"); ++ ++ $self->add_code(<<___); ++ ++.globl .${fname} ++.align 5 ++.${fname}: ++ ++___ ++ ++ $self->save_registers(); ++ ++ $self->add_code(<<___); ++ ld $n0,0($n0) ++ ++ ld $bp0,0($bp) ++ ++ ld $apj,0($ap) ++___ ++ ++ $self->mul_c_0($tp[0], $apj, $bp0, $c0); ++ ++ for (my $j = 1; $j < $n - 1; $j++) { ++ $self->add_code(<<___); ++ ld $apj,`$j*$SIZE_T`($ap) ++___ ++ $self->mul($tp[$j], $apj, $bp0, $c0); ++ } ++ ++ $self->add_code(<<___); ++ ld $apj,`($n-1)*$SIZE_T`($ap) ++___ ++ ++ $self->mul_last($tp[$n-1], $tp[$n], $apj, $bp0, $c0); ++ ++ $self->add_code(<<___); ++ li $tp[$n+1],0 ++ ++___ ++ ++ $self->add_code(<<___); ++ li $i,0 ++ mtctr $num ++ b $label->{"enter"} ++ ++.align 4 ++$label->{"outer"}: ++ ldx $bpi,$bp,$i ++ ++ ld $apj,0($ap) ++___ ++ ++ $self->mul_add_c_0($tp[0], $tp[0], $apj, $bpi, $c0); ++ ++ for (my $j = 1; $j < $n; $j++) { ++ $self->add_code(<<___); ++ ld $apj,`$j*$SIZE_T`($ap) ++___ ++ $self->mul_add($tp[$j], $tp[$j], $apj, $bpi, $c0); ++ } ++ ++ $self->add_code(<<___); ++ addc $tp[$n],$tp[$n],$c0 ++ addze $tp[$n+1],$tp[$n+1] ++___ ++ ++ $self->add_code(<<___); ++.align 4 ++$label->{"enter"}: ++ mulld $bpi,$tp[0],$n0 ++ ++ ld $npj,0($np) ++___ ++ ++ $self->mul_add_c_0($lo, $tp[0], $bpi, $npj, $c0); ++ ++ for (my $j = 1; $j < $n; $j++) { ++ $self->add_code(<<___); ++ ld $npj,`$j*$SIZE_T`($np) ++___ ++ $self->mul_add($tp[$j-1], $tp[$j], $npj, $bpi, $c0); ++ } ++ ++ $self->add_code(<<___); ++ addc $tp[$n-1],$tp[$n],$c0 ++ addze $tp[$n],$tp[$n+1] ++ ++ addi $i,$i,$SIZE_T ++ bdnz $label->{"outer"} ++ ++ and. $tp[$n],$tp[$n],$tp[$n] ++ bne $label->{"sub"} ++ ++ cmpld $tp[$n-1],$npj ++ blt $label->{"copy"} ++ ++$label->{"sub"}: ++___ ++ ++ # ++ # Reduction ++ # ++ ++ $self->add_code(<<___); ++ ld $bpj,`0*$SIZE_T`($np) ++ subfc $c1,$bpj,$tp[0] ++ std $c1,`0*$SIZE_T`($rp) ++ ++___ ++ for (my $j = 1; $j < $n - 1; $j++) { ++ $self->add_code(<<___); ++ ld $bpj,`$j*$SIZE_T`($np) ++ subfe $c1,$bpj,$tp[$j] ++ std $c1,`$j*$SIZE_T`($rp) ++ ++___ ++ } ++ ++ $self->add_code(<<___); ++ subfe $c1,$npj,$tp[$n-1] ++ std $c1,`($n-1)*$SIZE_T`($rp) ++ ++___ ++ ++ $self->add_code(<<___); ++ addme. $tp[$n],$tp[$n] ++ beq $label->{"end"} ++ ++$label->{"copy"}: ++___ ++ ++ $self->copy_result(); ++ ++ $self->add_code(<<___); ++ ++$label->{"end"}: ++___ ++ ++ $self->restore_registers(); ++ ++ $self->add_code(<<___); ++ li r3,1 ++ blr ++.size .${fname},.-.${fname} ++___ ++ ++} ++ ++package Mont::GPR; ++ ++our @ISA = ('Mont'); ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ return $class->SUPER::new($n); ++} ++ ++sub save_registers($) ++{ ++ my ($self) = @_; ++ ++ my $n = $self->{n}; ++ ++ $self->add_code(<<___); ++ std $lo,-8($sp) ++___ ++ ++ for (my $j = 0; $j <= $n+1; $j++) { ++ $self->{code}.=<<___; ++ std $tp[$j],-`($j+2)*8`($sp) ++___ ++ } ++ ++ $self->add_code(<<___); ++ ++___ ++} ++ ++sub restore_registers($) ++{ ++ my ($self) = @_; ++ ++ my $n = $self->{n}; ++ ++ $self->add_code(<<___); ++ ld $lo,-8($sp) ++___ ++ ++ for (my $j = 0; $j <= $n+1; $j++) { ++ $self->{code}.=<<___; ++ ld $tp[$j],-`($j+2)*8`($sp) ++___ ++ } ++ ++ $self->{code} .=<<___; ++ ++___ ++} ++ ++# Direct translation of C mul() ++sub mul($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r,$lo,$c ++ mulhdu $c,$a,$w ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_c_0($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $r,$a,$w ++ mulhdu $c,$a,$w ++ ++___ ++} ++ ++# Like mul() but does not to the final addition of CA into $c - an ++# optimisation to save an instruction ++sub mul_last($$$$$$) ++{ ++ my ($self, $r1, $r2, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r1,$lo,$c ++ mulhdu $c,$a,$w ++ ++ addze $r2,$c ++___ ++} ++ ++# Like C mul_add() but allow $r_out and $r_in to be different ++sub mul_add($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $lo,$lo,$c ++ mulhdu $c,$a,$w ++ addze $c,$c ++ addc $r_out,$r_in,$lo ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul_add() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_add_c_0($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $lo,$a,$w ++ addc $r_out,$r_in,$lo ++ mulhdu $c,$a,$w ++ addze $c,$c ++ ++___ ++} ++ ++package Mont::GPR_300; ++ ++our @ISA = ('Mont::GPR'); ++ ++sub new($$) ++{ ++ my ($class, $n) = @_; ++ ++ my $mont = $class->SUPER::new($n); ++ ++ return $mont; ++} ++ ++sub get_function_name($) ++{ ++ my ($self) = @_; ++ ++ return "bn_mul_mont_300_fixed_n" . $self->{n}; ++} ++ ++sub get_label($$) ++{ ++ my ($self, $l) = @_; ++ ++ return "L" . $l . "_300_" . $self->{n}; ++} ++ ++# Direct translation of C mul() ++sub mul($$$$$) ++{ ++ my ($self, $r, $a, $w, $c, $last) = @_; ++ ++ $self->add_code(<<___); ++ maddld $r,$a,$w,$c ++ maddhdu $c,$a,$w,$c ++ ++___ ++} ++ ++# Save the last carry as the final entry ++sub mul_last($$$$$) ++{ ++ my ($self, $r1, $r2, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $r1,$a,$w,$c ++ maddhdu $r2,$a,$w,$c ++ ++___ ++} ++ ++# Like mul() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_c_0($$$$$) ++{ ++ my ($self, $r, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ mulld $r,$a,$w ++ mulhdu $c,$a,$w ++ ++___ ++} ++ ++# Like C mul_add() but allow $r_out and $r_in to be different ++sub mul_add($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $lo,$a,$w,$c ++ maddhdu $c,$a,$w,$c ++ addc $r_out,$r_in,$lo ++ addze $c,$c ++ ++___ ++} ++ ++# Like mul_add() but $c is ignored as an input - an optimisation to save a ++# preliminary instruction that would set input $c to 0 ++sub mul_add_c_0($$$$$$) ++{ ++ my ($self, $r_out, $r_in, $a, $w, $c) = @_; ++ ++ $self->add_code(<<___); ++ maddld $lo,$a,$w,$r_in ++ maddhdu $c,$a,$w,$r_in ++___ ++ ++ if ($r_out ne $lo) { ++ $self->add_code(<<___); ++ mr $r_out,$lo ++___ ++ } ++ ++ $self->nl(); ++} ++ ++ ++package main; ++ ++my $code; ++ ++$code.=<<___; ++.machine "any" ++.text ++___ ++ ++my $mont; ++ ++$mont = new Mont::GPR(6); ++$mont->mul_mont_fixed(); ++$code .= $mont->get_code(); ++ ++$mont = new Mont::GPR_300(6); ++$mont->mul_mont_fixed(); ++$code .= $mont->get_code(); ++ ++$code =~ s/\`([^\`]*)\`/eval $1/gem; ++ ++$code.=<<___; ++.asciz "Montgomery Multiplication for PPC by , " ++___ ++ ++print $code; ++close STDOUT or die "error closing STDOUT: $!"; +diff --git a/crypto/bn/bn_ppc.c b/crypto/bn/bn_ppc.c +index 3ee76ea96574..1e9421bee213 100644 +--- a/crypto/bn/bn_ppc.c ++++ b/crypto/bn/bn_ppc.c +@@ -19,6 +19,12 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, int num); + int bn_mul4x_mont_int(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + const BN_ULONG *np, const BN_ULONG *n0, int num); ++ int bn_mul_mont_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, ++ const BN_ULONG *bp, const BN_ULONG *np, ++ const BN_ULONG *n0, int num); ++ int bn_mul_mont_300_fixed_n6(BN_ULONG *rp, const BN_ULONG *ap, ++ const BN_ULONG *bp, const BN_ULONG *np, ++ const BN_ULONG *n0, int num); + + if (num < 4) + return 0; +@@ -34,5 +40,14 @@ int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp, + * no opportunity to figure it out... + */ + ++#if defined(_ARCH_PPC64) && !defined(__ILP32__) ++ if (num == 6) { ++ if (OPENSSL_ppccap_P & PPC_MADD300) ++ return bn_mul_mont_300_fixed_n6(rp, ap, bp, np, n0, num); ++ else ++ return bn_mul_mont_fixed_n6(rp, ap, bp, np, n0, num); ++ } ++#endif ++ + return bn_mul_mont_int(rp, ap, bp, np, n0, num); + } +diff --git a/crypto/bn/build.info b/crypto/bn/build.info +index 4f8d0689b5ea..987a70ae263b 100644 +--- a/crypto/bn/build.info ++++ b/crypto/bn/build.info +@@ -79,7 +79,7 @@ IF[{- !$disabled{asm} -}] + + $BNASM_ppc32=bn_ppc.c bn-ppc.s ppc-mont.s + $BNDEF_ppc32=OPENSSL_BN_ASM_MONT +- $BNASM_ppc64=$BNASM_ppc32 ++ $BNASM_ppc64=$BNASM_ppc32 ppc64-mont-fixed.s + $BNDEF_ppc64=$BNDEF_ppc32 + + $BNASM_c64xplus=asm/bn-c64xplus.asm +@@ -173,6 +173,7 @@ GENERATE[parisc-mont.s]=asm/parisc-mont.pl + GENERATE[bn-ppc.s]=asm/ppc.pl + GENERATE[ppc-mont.s]=asm/ppc-mont.pl + GENERATE[ppc64-mont.s]=asm/ppc64-mont.pl ++GENERATE[ppc64-mont-fixed.s]=asm/ppc64-mont-fixed.pl + + GENERATE[alpha-mont.S]=asm/alpha-mont.pl + + +From 01ebad0d6e3a09bc9e32350b402901471610a3dc Mon Sep 17 00:00:00 2001 +From: Rohan McLure +Date: Thu, 30 Jun 2022 16:21:06 +1000 +Subject: [PATCH 2/2] Fix unrolled montgomery multiplication for POWER9 + +In the reference C implementation in bn_asm.c, tp[num + 1] contains the +carry bit for accumulations into tp[num]. tp[num + 1] is only ever +assigned, never itself incremented. +--- + crypto/bn/asm/ppc64-mont-fixed.pl | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/crypto/bn/asm/ppc64-mont-fixed.pl b/crypto/bn/asm/ppc64-mont-fixed.pl +index 0fb397bc5f12..e27d0ad93d85 100755 +--- a/crypto/bn/asm/ppc64-mont-fixed.pl ++++ b/crypto/bn/asm/ppc64-mont-fixed.pl +@@ -63,6 +63,7 @@ + # Registers are global so the code is remotely readable + + # Parameters for Montgomery multiplication ++my $ze = "r0"; + my $sp = "r1"; + my $toc = "r2"; + my $rp = "r3"; +@@ -192,6 +193,7 @@ ($) + $self->save_registers(); + + $self->add_code(<<___); ++ li $ze,0 + ld $n0,0($n0) + + ld $bp0,0($bp) +@@ -242,7 +244,7 @@ ($) + + $self->add_code(<<___); + addc $tp[$n],$tp[$n],$c0 +- addze $tp[$n+1],$tp[$n+1] ++ addze $tp[$n+1],$ze + ___ + + $self->add_code(<<___); +@@ -272,7 +274,7 @@ ($) + and. $tp[$n],$tp[$n],$tp[$n] + bne $label->{"sub"} + +- cmpld $tp[$n-1],$npj ++ cmpld $tp[$n-1],$npj + blt $label->{"copy"} + + $label->{"sub"}: diff --git a/openssl.spec b/openssl.spec index eeaf6b2..1a2b7e5 100644 --- a/openssl.spec +++ b/openssl.spec @@ -109,6 +109,9 @@ Patch60: 0060-FIPS-KAT-signature-tests.patch # https://bugzilla.redhat.com/show_bug.cgi?id=2087147 Patch61: 0061-Deny-SHA-1-signature-verification-in-FIPS-provider.patch Patch62: 0062-fips-Expose-a-FIPS-indicator.patch +# https://bugzilla.redhat.com/show_bug.cgi?id=2130708 +# https://github.com/openssl/openssl/pull/18883 +Patch67: 0067-ppc64le-Montgomery-multiply.patch # https://github.com/openssl/openssl/commit/44a563dde1584cd9284e80b6e45ee5019be8d36c # https://github.com/openssl/openssl/commit/345c99b6654b8313c792d54f829943068911ddbd Patch71: 0071-AES-GCM-performance-optimization.patch @@ -488,6 +491,8 @@ install -m644 %{SOURCE9} \ Resolves: rhbz#2093804 - Adjusting include for the FIPS_mode macro Resolves: rhbz#2083879 +- Backport of ppc64le Montgomery multiply enhancement + Resolves: rhbz#2130708 * Tue Nov 22 2022 Dmitry Belyavskiy - 1:3.0.7-1 - Rebasing to OpenSSL 3.0.7