diff --git a/SOURCES/nettle-3.4.1-ecdsa-verify.patch b/SOURCES/nettle-3.4.1-ecdsa-verify.patch
new file mode 100644
index 0000000..c46f185
--- /dev/null
+++ b/SOURCES/nettle-3.4.1-ecdsa-verify.patch
@@ -0,0 +1,109 @@
+From 932ea29845da1ae350d9c056cb2cb0379a66d642 Mon Sep 17 00:00:00 2001
+From: Daiki Ueno <dueno@redhat.com>
+Date: Tue, 30 Mar 2021 09:22:47 +0200
+Subject: [PATCH] Port upstream hardening of EC scalar multiplication
+
+Some internal functions used in point multiplications are known to
+misbehave if the scalar is out-of-range.  This performs canonical
+reduction on scalars, before point multiplication.
+
+Signed-off-by: Daiki Ueno <dueno@redhat.com>
+---
+ ecc-ecdsa-sign.c   |  7 +++++--
+ ecc-ecdsa-verify.c | 14 ++++++++++++--
+ eddsa-hash.c       |  9 +++++++--
+ 3 files changed, 24 insertions(+), 6 deletions(-)
+
+diff --git a/ecc-ecdsa-sign.c b/ecc-ecdsa-sign.c
+index 3b9e9cc1..45062528 100644
+--- a/ecc-ecdsa-sign.c
++++ b/ecc-ecdsa-sign.c
+@@ -62,6 +62,8 @@ ecc_ecdsa_sign (const struct ecc_curve *ecc,
+ 		mp_limb_t *rp, mp_limb_t *sp,
+ 		mp_limb_t *scratch)
+ {
++  mp_limb_t cy;
++
+ #define P	    scratch
+ #define kinv	    scratch                /* Needs 5*ecc->p.size for computation */
+ #define hp	    (scratch  + ecc->p.size) /* NOTE: ecc->p.size + 1 limbs! */
+@@ -91,8 +93,9 @@ ecc_ecdsa_sign (const struct ecc_curve *ecc,
+   ecc_modq_mul (ecc, tp, zp, rp);
+   ecc_modq_add (ecc, hp, hp, tp);
+   ecc_modq_mul (ecc, tp, hp, kinv);
+-
+-  mpn_copyi (sp, tp, ecc->p.size);
++  /* Ensure canonical reduction. */
++  cy = mpn_sub_n (sp, tp, ecc->q.m, ecc->q.size);
++  cnd_copy (cy, sp, tp, ecc->q.size);
+ #undef P
+ #undef hp
+ #undef kinv
+diff --git a/ecc-ecdsa-verify.c b/ecc-ecdsa-verify.c
+index d7f5b684..6b8acb07 100644
+--- a/ecc-ecdsa-verify.c
++++ b/ecc-ecdsa-verify.c
+@@ -75,6 +75,8 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
+ 		  const mp_limb_t *rp, const mp_limb_t *sp,
+ 		  mp_limb_t *scratch)
+ {
++  mp_limb_t cy;
++
+   /* Procedure, according to RFC 6090, "KT-I". q denotes the group
+      order.
+ 
+@@ -98,6 +100,7 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
+ #define P1 (scratch + 4*ecc->p.size)
+ #define sinv (scratch)
+ #define hp (scratch + ecc->p.size)
++#define tp (scratch + 4*ecc->p.size)
+ 
+   if (! (ecdsa_in_range (ecc, rp)
+ 	 && ecdsa_in_range (ecc, sp)))
+@@ -112,10 +115,16 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
+ 
+   /* u1 = h / s, P1 = u1 * G */
+   ecc_hash (&ecc->q, hp, length, digest);
+-  ecc_modq_mul (ecc, u1, hp, sinv);
++  ecc_modq_mul (ecc, tp, hp, sinv);
++  /* Ensure canonical reduction. */
++  cy = mpn_sub_n (u1, tp, ecc->q.m, ecc->q.size);
++  cnd_copy (cy, u1, tp, ecc->q.size);
+ 
+   /* u2 = r / s, P2 = u2 * Y */
+-  ecc_modq_mul (ecc, u2, rp, sinv);
++  ecc_modq_mul (ecc, hp, rp, sinv);
++  /* Ensure canonical reduction. */
++  cy = mpn_sub_n (u2, hp, ecc->q.m, ecc->q.size);
++  cnd_copy (cy, u2, hp, ecc->q.size);
+ 
+    /* Total storage: 5*ecc->p.size + ecc->mul_itch */
+   ecc->mul (ecc, P2, u2, pp, u2 + ecc->p.size);
+@@ -154,4 +163,5 @@ ecc_ecdsa_verify (const struct ecc_curve *ecc,
+ #undef u2
+ #undef hp
+ #undef u1
++#undef tp
+ }
+diff --git a/eddsa-hash.c b/eddsa-hash.c
+index 4fb79f1b..53c6fc49 100644
+--- a/eddsa-hash.c
++++ b/eddsa-hash.c
+@@ -45,7 +45,12 @@ void
+ _eddsa_hash (const struct ecc_modulo *m,
+ 	     mp_limb_t *rp, const uint8_t *digest)
+ {
++  mp_limb_t cy;
++
+   size_t nbytes = 1 + m->bit_size / 8;
+-  mpn_set_base256_le (rp, 2*m->size, digest, 2*nbytes);
+-  m->mod (m, rp);
++  mpn_set_base256_le (rp + m->size, 2*m->size, digest, 2*nbytes);
++  m->mod (m, rp + m->size);
++  /* Ensure canonical reduction. */
++  cy = mpn_sub_n (rp, rp + m->size, m->m, m->size);
++  cnd_copy (cy, rp, rp + m->size, m->size);
+ }
+-- 
+2.30.2
+
diff --git a/SOURCES/nettle-3.4.1-powerpc64-aes-asm.patch b/SOURCES/nettle-3.4.1-powerpc64-aes-asm.patch
new file mode 100644
index 0000000..8bcdbe7
--- /dev/null
+++ b/SOURCES/nettle-3.4.1-powerpc64-aes-asm.patch
@@ -0,0 +1,1142 @@
+diff --git a/Makefile.in b/Makefile.in
+index b43e494f..ec46a9df 100644
+--- a/Makefile.in
++++ b/Makefile.in
+@@ -189,7 +189,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
+ 		  ed25519-sha512-pubkey.c \
+ 		  ed25519-sha512-sign.c ed25519-sha512-verify.c
+ 
+-OPT_SOURCES = fat-x86_64.c fat-arm.c mini-gmp.c
++OPT_SOURCES = fat-arm.c fat-ppc.c fat-x86_64.c mini-gmp.c
+ 
+ HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h \
+ 	  base16.h base64.h bignum.h buffer.h camellia.h cast128.h \
+@@ -573,7 +573,8 @@ distdir: $(DISTFILES)
+ 	done
+ 	set -e; for d in sparc32 sparc64 x86 \
+ 		x86_64 x86_64/aesni x86_64/fat \
+-		arm arm/neon arm/v6 arm/fat ; do \
++		arm arm/neon arm/v6 arm/fat \
++		powerpc64 powerpc64/p8 powerpc64/fat ; do \
+ 	  mkdir "$(distdir)/$$d" ; \
+ 	  find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
+ 	    -exec cp '{}' "$(distdir)/$$d" ';' ; \
+diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c
+index 709c52f9..9e8cf34a 100644
+--- a/aes-decrypt-internal.c
++++ b/aes-decrypt-internal.c
+@@ -40,6 +40,16 @@
+ #include "aes-internal.h"
+ #include "macros.h"
+ 
++/* For fat builds */
++#if HAVE_NATIVE_aes_decrypt
++void
++_nettle_aes_decrypt_c(unsigned rounds, const uint32_t *keys,
++    const struct aes_table *T,
++    size_t length, uint8_t *dst,
++    const uint8_t *src);
++#define _nettle_aes_decrypt _nettle_aes_decrypt_c
++#endif
++
+ void
+ _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
+ 		    const struct aes_table *T,
+diff --git a/aes-encrypt-internal.c b/aes-encrypt-internal.c
+index 9f61386d..ad17e6c1 100644
+--- a/aes-encrypt-internal.c
++++ b/aes-encrypt-internal.c
+@@ -40,6 +40,16 @@
+ #include "aes-internal.h"
+ #include "macros.h"
+ 
++/* For fat builds */
++#if HAVE_NATIVE_aes_encrypt
++void
++_nettle_aes_encrypt_c(unsigned rounds, const uint32_t *keys,
++    const struct aes_table *T,
++    size_t length, uint8_t *dst,
++    const uint8_t *src);
++#define _nettle_aes_encrypt _nettle_aes_encrypt_c
++#endif
++
+ void
+ _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
+ 		    const struct aes_table *T,
+diff --git a/asm.m4 b/asm.m4
+index ee377a78..59d64098 100644
+--- a/asm.m4
++++ b/asm.m4
+@@ -51,6 +51,14 @@ define(<ALIGN>,
+ <.align ifelse(ALIGN_LOG,yes,<m4_log2($1)>,$1)
+ >)
+ 
++define(<IF_BE>, <ifelse(
++WORDS_BIGENDIAN,yes,<$1>,
++WORDS_BIGENDIAN,no,<$2>,
++<errprint(<Unsupported endianness value>,WORDS_BIGENDIAN,<
++>)
++  m4exit(1)>)>)
++define(<IF_LE>, <IF_BE(<$2>, <$1>)>)
++
+ dnl Struct defining macros
+ 
+ dnl STRUCTURE(prefix) 
+diff --git a/config.m4.in b/config.m4.in
+index 666e34b8..e480334d 100644
+--- a/config.m4.in
++++ b/config.m4.in
+@@ -9,6 +9,7 @@ define(<W64_ABI>, <@W64_ABI@>)dnl
+ define(<RODATA>, <@ASM_RODATA@>)dnl
+ define(<ASM_X86_ENDBR>,<@ASM_X86_ENDBR@>)dnl
+ define(<ASM_X86_MARK_CET_ALIGN>,<@ASM_X86_MARK_CET_ALIGN@>)dnl
++define(<WORDS_BIGENDIAN>, <@ASM_WORDS_BIGENDIAN@>)dnl
+ divert(1)
+ @ASM_X86_MARK_CET@
+ @ASM_MARK_NOEXEC_STACK@
+diff --git a/configure.ac b/configure.ac
+index 090e43a4..788e6842 100644
+--- a/configure.ac
++++ b/configure.ac
+@@ -85,6 +85,10 @@ AC_ARG_ENABLE(x86-aesni,
+   AC_HELP_STRING([--enable-x86-aesni], [Enable x86_64 aes instructions. (default=no)]),,
+   [enable_x86_aesni=no])
+ 
++AC_ARG_ENABLE(power-crypto-ext,
++  AC_HELP_STRING([--enable-power-crypto-ext], [Enable POWER crypto extensions. (default=no)]),,
++  [enable_power_crypto_ext=no])
++
+ AC_ARG_ENABLE(mini-gmp,
+   AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
+   [enable_mini_gmp=no])
+@@ -201,7 +205,11 @@ LSH_FUNC_STRERROR
+ # getenv_secure is used for fat overrides,
+ # getline is used in the testsuite
+ AC_CHECK_FUNCS(secure_getenv getline)
+-AC_C_BIGENDIAN
++
++ASM_WORDS_BIGENDIAN=unknown
++AC_C_BIGENDIAN([AC_DEFINE([WORDS_BIGENDIAN], 1)
++		ASM_WORDS_BIGENDIAN=yes],
++	[ASM_WORDS_BIGENDIAN=no])
+ 
+ LSH_GCC_ATTRIBUTES
+ 
+@@ -310,6 +318,17 @@ case "$host_cpu" in
+     AC_TRY_COMPILE([
+ #if defined(__sgi) && defined(__LP64__)
+ #error 64-bit mips
++#endif
++    ], [], [
++      ABI=32
++    ], [
++      ABI=64
++    ])
++    ;;
++  *powerpc64*)
++    AC_TRY_COMPILE([
++#if defined(__PPC64__)
++#error 64-bit powerpc
+ #endif
+     ], [], [
+       ABI=32
+@@ -422,6 +441,18 @@ if test "x$enable_assembler" = xyes ; then
+ 	esac
+       fi
+       ;;
++    *powerpc64*)
++      if test "$ABI" = 64 ; then
++	asm_path="powerpc64"
++	if test "x$enable_fat" = xyes ; then
++	  asm_path="powerpc64/fat $asm_path"
++	  OPT_NETTLE_SOURCES="fat-ppc.c $OPT_NETTLE_SOURCES"
++	elif test "x$enable_power_crypto_ext" = xyes ; then
++          asm_path="powerpc64/p8 $asm_path"
++	fi
++      fi
++      ;;
++
+     *)
+       enable_assembler=no
+       ;;
+@@ -544,6 +575,8 @@ AC_SUBST([IF_ASM])
+ AH_VERBATIM([HAVE_NATIVE],
+ [/* Define to 1 each of the following for which a native (ie. CPU specific)
+     implementation of the corresponding routine exists.  */
++#undef HAVE_NATIVE_aes_decrypt
++#undef HAVE_NATIVE_aes_encrypt
+ #undef HAVE_NATIVE_ecc_192_modp
+ #undef HAVE_NATIVE_ecc_192_redc
+ #undef HAVE_NATIVE_ecc_224_modp
+@@ -857,6 +890,7 @@ AC_SUBST(ASM_TYPE_PROGBITS)
+ AC_SUBST(ASM_MARK_NOEXEC_STACK)
+ AC_SUBST(ASM_ALIGN_LOG)
+ AC_SUBST(W64_ABI)
++AC_SUBST(ASM_WORDS_BIGENDIAN)
+ AC_SUBST(EMULATOR)
+ AC_SUBST(ASM_X86_ENDBR)
+ AC_SUBST(ASM_X86_MARK_CET)
+diff --git a/fat-ppc.c b/fat-ppc.c
+new file mode 100644
+index 00000000..7198e2dd
+--- /dev/null
++++ b/fat-ppc.c
+@@ -0,0 +1,129 @@
++/* fat-ppc.c
++
++   Copyright (C) 2020 Mamone Tarsha
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++*/
++
++#define _GNU_SOURCE
++
++#if HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <assert.h>
++#include <stdio.h>
++#include <stdlib.h>
++#include <string.h>
++#if defined(__FreeBSD__) && __FreeBSD__ < 12
++#include <sys/sysctl.h>
++#else
++#include <sys/auxv.h>
++#endif
++
++#include "nettle-types.h"
++
++#include "aes-internal.h"
++#include "gcm.h"
++#include "fat-setup.h"
++
+/* Defined in arch/powerpc/include/uapi/asm/cputable.h in the Linux kernel */
++#ifndef PPC_FEATURE2_VEC_CRYPTO
++#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
++#endif
++
++struct ppc_features
++{
++  int have_crypto_ext;
++};
++
+static void
+get_ppc_features (struct ppc_features *features)
+{ /* Fill *FEATURES from the platform's HWCAP2 capability word. */
+  unsigned long hwcap2 = 0; /* remains 0 (no features) if a query below stores nothing */
+#if defined(__FreeBSD__)
+#if __FreeBSD__ < 12
+  size_t len = sizeof(hwcap2);
+  sysctlbyname("hw.cpu_features2", &hwcap2, &len, NULL, 0); /* return value is not checked */
+#else /* __FreeBSD__ >= 12 */
+  elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2)); /* return value is not checked */
+#endif /* __FreeBSD__ < 12 */
+#else /* not FreeBSD */
+  hwcap2 = getauxval(AT_HWCAP2);
+#endif /* __FreeBSD__ */
+  features->have_crypto_ext =
+   (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0; /* 1 only if every bit of the mask is set */
+}
++
++DECLARE_FAT_FUNC(_nettle_aes_encrypt, aes_crypt_internal_func)
++DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, c)
++DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, ppc64)
++
++DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
++DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
++DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
++
+static void CONSTRUCTOR
+fat_init (void)
+{ /* Point the AES function vectors at the variant matching the detected CPU. */
+  struct ppc_features features;
+  int verbose;
+
+  get_ppc_features (&features);
+
+  verbose = getenv (ENV_VERBOSE) != NULL; /* variable present => report choices on stderr */
+  if (verbose)
+    fprintf (stderr, "libnettle: cpu features: %s\n",
+     features.have_crypto_ext ? "crypto extensions" : "");
+
+  if (features.have_crypto_ext)
+  { /* Crypto extensions reported: use the _ppc64 assembly variants. */
+     if (verbose)
+        fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
+     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
+     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
+  }
+  else
+  { /* Otherwise fall back to the _c (C implementation) variants. */
+     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
+     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
+  }
+}
++
++DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
++ (unsigned rounds, const uint32_t *keys,
++ const struct aes_table *T,
++ size_t length, uint8_t *dst,
++ const uint8_t *src),
++ (rounds, keys, T, length, dst, src))
++
++DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
++ (unsigned rounds, const uint32_t *keys,
++ const struct aes_table *T,
++ size_t length, uint8_t *dst,
++ const uint8_t *src),
++ (rounds, keys, T, length, dst, src))
+diff --git a/powerpc64/fat/aes-decrypt-internal-2.asm b/powerpc64/fat/aes-decrypt-internal-2.asm
+new file mode 100644
+index 00000000..3a4e08c2
+--- /dev/null
++++ b/powerpc64/fat/aes-decrypt-internal-2.asm
+@@ -0,0 +1,37 @@
++C powerpc64/fat/aes-decrypt-internal-2.asm
++
++
++ifelse(<
++   Copyright (C) 2020 Mamone Tarsha
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++dnl PROLOGUE(_nettle_aes_decrypt) picked up by configure
++
++define(<fat_transform>, <$1_ppc64>)
++include_src(<powerpc64/p8/aes-decrypt-internal.asm>)
+diff --git a/powerpc64/fat/aes-encrypt-internal-2.asm b/powerpc64/fat/aes-encrypt-internal-2.asm
+new file mode 100644
+index 00000000..42126e4f
+--- /dev/null
++++ b/powerpc64/fat/aes-encrypt-internal-2.asm
+@@ -0,0 +1,37 @@
++C powerpc64/fat/aes-encrypt-internal-2.asm
++
++
++ifelse(<
++   Copyright (C) 2020 Mamone Tarsha
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++dnl PROLOGUE(_nettle_aes_encrypt) picked up by configure
++
++define(<fat_transform>, <$1_ppc64>)
++include_src(<powerpc64/p8/aes-encrypt-internal.asm>)
+diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
+new file mode 100644
+index 00000000..b76bb8b1
+--- /dev/null
++++ b/powerpc64/machine.m4
+@@ -0,0 +1,36 @@
++define(<PROLOGUE>,
++<.globl C_NAME($1)
++DECLARE_FUNC(C_NAME($1))
++ifelse(WORDS_BIGENDIAN,no,
++<ifdef(<FUNC_ALIGN>,<.align FUNC_ALIGN>)
++C_NAME($1):
++addis 2,12,(.TOC.-C_NAME($1))@ha
++addi 2,2,(.TOC.-C_NAME($1))@l
++.localentry C_NAME($1), .-C_NAME($1)>,
++<.section ".opd","aw"
++.align 3
++C_NAME($1):
++.quad .C_NAME($1),.TOC.@tocbase,0
++.previous
++ifdef(<FUNC_ALIGN>,<.align FUNC_ALIGN>)
++.C_NAME($1):>)
++undefine(<FUNC_ALIGN>)>)
++
++define(<EPILOGUE>,
++<ifelse(WORDS_BIGENDIAN,no,
++<.size C_NAME($1), . - C_NAME($1)>,
++<.size .C_NAME($1), . - .C_NAME($1)
++.size C_NAME($1), . - .C_NAME($1)>)>)
++
++C Get vector-scalar register from vector register
++C VSR(VR)
++define(<VSR>,<32+$1>)
++
++C Load the quadword in DATA_SRC storage into
++C VEC_DST. GPR is general-purpose register
++C used to obtain the effective address of
++C DATA_SRC storage.
++C DATA_LOAD_VEC(VEC_DST, DATA_SRC, GPR)
++define(<DATA_LOAD_VEC>,
++<ld $3,$2@got(2)
++lvx $1,0,$3>)
+diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
+new file mode 100644
+index 00000000..bfedb32b
+--- /dev/null
++++ b/powerpc64/p8/aes-decrypt-internal.asm
+@@ -0,0 +1,356 @@
++C powerpc64/p8/aes-decrypt-internal.asm
++
++ifelse(<
++   Copyright (C) 2020 Mamone Tarsha
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++C Register usage:
++
++define(<SP>, <1>)
++define(<TOCP>, <2>)
++
++define(<ROUNDS>, <3>)
++define(<KEYS>, <4>)
++define(<LENGTH>, <6>)
++define(<DST>, <7>)
++define(<SRC>, <8>)
++
++define(<swap_mask>, <0>)
++
++define(<K>, <1>)
++define(<S0>, <2>)
++define(<S1>, <3>)
++define(<S2>, <4>)
++define(<S3>, <5>)
++define(<S4>, <6>)
++define(<S5>, <7>)
++define(<S6>, <8>)
++define(<S7>, <9>)
++
++C ZERO vector register is used in place of RoundKey
++C for vncipher instruction because the order of InvMixColumns
++C and Xor processes are flipped in that instruction.
++C The Xor process with RoundKey is executed afterward.
++define(<ZERO>, <10>)
++
++.file "aes-decrypt-internal.asm"
++
++.text
++
+ C _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
+ C       const struct aes_table *T,
+ C       size_t length, uint8_t *dst,
+ C       const uint8_t *src)
++
++define(<FUNC_ALIGN>, <5>)
++PROLOGUE(_nettle_aes_decrypt)
++ vxor ZERO,ZERO,ZERO
++
++ DATA_LOAD_VEC(swap_mask,.swap_mask,5)
++
++ subi ROUNDS,ROUNDS,1
++ srdi LENGTH,LENGTH,4
++
++ srdi 5,LENGTH,3 #8x loop count
++ cmpldi 5,0
++ beq L4x
++
++ std 25,-56(SP);
++ std 26,-48(SP);
++ std 27,-40(SP);
++ std 28,-32(SP);
++ std 29,-24(SP);
++ std 30,-16(SP);
++ std 31,-8(SP);
++
++ li 25,0x10
++ li 26,0x20
++ li 27,0x30
++ li 28,0x40
++ li 29,0x50
++ li 30,0x60
++ li 31,0x70
++
++.align 5
++Lx8_loop:
++ lxvd2x VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ lxvd2x VSR(S1),25,SRC
++ lxvd2x VSR(S2),26,SRC
++ lxvd2x VSR(S3),27,SRC
++ lxvd2x VSR(S4),28,SRC
++ lxvd2x VSR(S5),29,SRC
++ lxvd2x VSR(S6),30,SRC
++ lxvd2x VSR(S7),31,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask
++ vperm S4,S4,S4,swap_mask
++ vperm S5,S5,S5,swap_mask
++ vperm S6,S6,S6,swap_mask
++ vperm S7,S7,S7,swap_mask>)
++
++ vxor S0,S0,K
++ vxor S1,S1,K
++ vxor S2,S2,K
++ vxor S3,S3,K
++ vxor S4,S4,K
++ vxor S5,S5,K
++ vxor S6,S6,K
++ vxor S7,S7,K
++
++ mtctr ROUNDS
++ li 10,0x10
++.align 5
++L8x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vncipher S0,S0,ZERO
++ vncipher S1,S1,ZERO
++ vncipher S2,S2,ZERO
++ vncipher S3,S3,ZERO
++ vncipher S4,S4,ZERO
++ vncipher S5,S5,ZERO
++ vncipher S6,S6,ZERO
++ vncipher S7,S7,ZERO
++ vxor S0,S0,K
++ vxor S1,S1,K
++ vxor S2,S2,K
++ vxor S3,S3,K
++ vxor S4,S4,K
++ vxor S5,S5,K
++ vxor S6,S6,K
++ vxor S7,S7,K
++ addi 10,10,0x10
++ bdnz L8x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vncipherlast S0,S0,K
++ vncipherlast S1,S1,K
++ vncipherlast S2,S2,K
++ vncipherlast S3,S3,K
++ vncipherlast S4,S4,K
++ vncipherlast S5,S5,K
++ vncipherlast S6,S6,K
++ vncipherlast S7,S7,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask
++ vperm S4,S4,S4,swap_mask
++ vperm S5,S5,S5,swap_mask
++ vperm S6,S6,S6,swap_mask
++ vperm S7,S7,S7,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ stxvd2x VSR(S1),25,DST
++ stxvd2x VSR(S2),26,DST
++ stxvd2x VSR(S3),27,DST
++ stxvd2x VSR(S4),28,DST
++ stxvd2x VSR(S5),29,DST
++ stxvd2x VSR(S6),30,DST
++ stxvd2x VSR(S7),31,DST
++
++ addi SRC,SRC,0x80
++ addi DST,DST,0x80
++ subic. 5,5,1
++ bne Lx8_loop
++
++ ld 25,-56(SP);
++ ld 26,-48(SP);
++ ld 27,-40(SP);
++ ld 28,-32(SP);
++ ld 29,-24(SP);
++ ld 30,-16(SP);
++ ld 31,-8(SP);
++
++ clrldi LENGTH,LENGTH,61
++
++L4x:
++ srdi   5,LENGTH,2
++ cmpldi   5,0
++ beq   L2x
++
++ lxvd2x   VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ li  9,0x10
++ lxvd2x VSR(S1),9,SRC
++ addi   9,9,0x10
++ lxvd2x VSR(S2),9,SRC
++ addi   9,9,0x10
++ lxvd2x VSR(S3),9,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask>)
++
++ vxor S0,S0,K
++ vxor S1,S1,K
++ vxor S2,S2,K
++ vxor S3,S3,K
++
++ mtctr ROUNDS
++ li 10,0x10
++.align 5
++L4x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vncipher S0,S0,ZERO
++ vncipher S1,S1,ZERO
++ vncipher S2,S2,ZERO
++ vncipher S3,S3,ZERO
++ vxor   S0,S0,K
++ vxor  S1,S1,K
++ vxor   S2,S2,K
++ vxor   S3,S3,K
++ addi   10,10,0x10
++ bdnz  L4x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vncipherlast S0,S0,K
++ vncipherlast S1,S1,K
++ vncipherlast S2,S2,K
++ vncipherlast S3,S3,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ li  9,0x10
++ stxvd2x VSR(S1),9,DST
++ addi   9,9,0x10
++ stxvd2x VSR(S2),9,DST
++ addi  9,9,0x10
++ stxvd2x VSR(S3),9,DST
++
++ addi   SRC,SRC,0x40
++ addi   DST,DST,0x40
++
++ clrldi LENGTH,LENGTH,62
++
++L2x:
++ srdi  5,LENGTH,1
++ cmpldi  5,0
++ beq   L1x
++
++ lxvd2x VSR(K),0,KEYS
++ vperm K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ li   9,0x10
++ lxvd2x VSR(S1),9,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask>)
++
++ vxor  S0,S0,K
++ vxor   S1,S1,K
++
++ mtctr   ROUNDS
++ li  10,0x10
++.align 5
++L2x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vncipher S0,S0,ZERO
++ vncipher S1,S1,ZERO
++ vxor  S0,S0,K
++ vxor  S1,S1,K
++ addi   10,10,0x10
++ bdnz   L2x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vncipherlast S0,S0,K
++ vncipherlast S1,S1,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ li  9,0x10
++ stxvd2x VSR(S1),9,DST
++
++ addi   SRC,SRC,0x20
++ addi   DST,DST,0x20
++
++ clrldi LENGTH,LENGTH,63
++
++L1x:
++ cmpldi LENGTH,0
++ beq   Ldone
++
++ lxvd2x VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask>)
++
++ vxor   S0,S0,K
++
++ mtctr   ROUNDS
++ li   10,0x10
++.align 5
++L1x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vncipher S0,S0,ZERO
++ vxor   S0,S0,K
++ addi   10,10,0x10
++ bdnz   L1x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vncipherlast S0,S0,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++
++Ldone:
++ blr
++EPILOGUE(_nettle_aes_decrypt)
++
++ .data
++ .align 4
++.swap_mask:
++IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
++IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
+diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
+new file mode 100644
+index 00000000..67c7e597
+--- /dev/null
++++ b/powerpc64/p8/aes-encrypt-internal.asm
+@@ -0,0 +1,333 @@
++C powerpc64/p8/aes-encrypt-internal.asm
++
++ifelse(<
++   Copyright (C) 2020 Mamone Tarsha
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++C Register usage:
++
++define(<SP>, <1>)
++define(<TOCP>, <2>)
++
++define(<ROUNDS>, <3>)
++define(<KEYS>, <4>)
++define(<LENGTH>, <6>)
++define(<DST>, <7>)
++define(<SRC>, <8>)
++
++define(<swap_mask>, <0>)
++
++define(<K>, <1>)
++define(<S0>, <2>)
++define(<S1>, <3>)
++define(<S2>, <4>)
++define(<S3>, <5>)
++define(<S4>, <6>)
++define(<S5>, <7>)
++define(<S6>, <8>)
++define(<S7>, <9>)
++
++.file "aes-encrypt-internal.asm"
++
++.text
++
+ C _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
+ C       const struct aes_table *T,
+ C       size_t length, uint8_t *dst,
+ C       const uint8_t *src)
++
++define(<FUNC_ALIGN>, <5>)
++PROLOGUE(_nettle_aes_encrypt)
++ DATA_LOAD_VEC(swap_mask,.swap_mask,5)
++
++ subi ROUNDS,ROUNDS,1
++ srdi LENGTH,LENGTH,4
++
++ srdi 5,LENGTH,3 #8x loop count
++ cmpldi 5,0
++ beq L4x
++
++ std 25,-56(SP);
++ std 26,-48(SP);
++ std 27,-40(SP);
++ std 28,-32(SP);
++ std 29,-24(SP);
++ std 30,-16(SP);
++ std 31,-8(SP);
++
++ li 25,0x10
++ li 26,0x20
++ li 27,0x30
++ li 28,0x40
++ li 29,0x50
++ li 30,0x60
++ li 31,0x70
++
++.align 5
++Lx8_loop:
++ lxvd2x VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ lxvd2x VSR(S1),25,SRC
++ lxvd2x VSR(S2),26,SRC
++ lxvd2x VSR(S3),27,SRC
++ lxvd2x VSR(S4),28,SRC
++ lxvd2x VSR(S5),29,SRC
++ lxvd2x VSR(S6),30,SRC
++ lxvd2x VSR(S7),31,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask
++ vperm S4,S4,S4,swap_mask
++ vperm S5,S5,S5,swap_mask
++ vperm S6,S6,S6,swap_mask
++ vperm S7,S7,S7,swap_mask>)
++
++ vxor S0,S0,K
++ vxor S1,S1,K
++ vxor S2,S2,K
++ vxor S3,S3,K
++ vxor S4,S4,K
++ vxor S5,S5,K
++ vxor S6,S6,K
++ vxor S7,S7,K
++
++ mtctr ROUNDS
++ li 10,0x10
++.align 5
++L8x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vcipher S0,S0,K
++ vcipher S1,S1,K
++ vcipher S2,S2,K
++ vcipher S3,S3,K
++ vcipher S4,S4,K
++ vcipher S5,S5,K
++ vcipher S6,S6,K
++ vcipher S7,S7,K
++ addi 10,10,0x10
++ bdnz L8x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vcipherlast S0,S0,K
++ vcipherlast S1,S1,K
++ vcipherlast S2,S2,K
++ vcipherlast S3,S3,K
++ vcipherlast S4,S4,K
++ vcipherlast S5,S5,K
++ vcipherlast S6,S6,K
++ vcipherlast S7,S7,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask
++ vperm S4,S4,S4,swap_mask
++ vperm S5,S5,S5,swap_mask
++ vperm S6,S6,S6,swap_mask
++ vperm S7,S7,S7,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ stxvd2x VSR(S1),25,DST
++ stxvd2x VSR(S2),26,DST
++ stxvd2x VSR(S3),27,DST
++ stxvd2x VSR(S4),28,DST
++ stxvd2x VSR(S5),29,DST
++ stxvd2x VSR(S6),30,DST
++ stxvd2x VSR(S7),31,DST
++
++ addi SRC,SRC,0x80
++ addi DST,DST,0x80
++ subic. 5,5,1
++ bne Lx8_loop
++
++ ld 25,-56(SP);
++ ld 26,-48(SP);
++ ld 27,-40(SP);
++ ld 28,-32(SP);
++ ld 29,-24(SP);
++ ld 30,-16(SP);
++ ld 31,-8(SP);
++
++ clrldi LENGTH,LENGTH,61
++
++L4x:
++ srdi   5,LENGTH,2
++ cmpldi   5,0
++ beq   L2x
++
++ lxvd2x   VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ li  9,0x10
++ lxvd2x VSR(S1),9,SRC
++ addi   9,9,0x10
++ lxvd2x VSR(S2),9,SRC
++ addi   9,9,0x10
++ lxvd2x VSR(S3),9,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask>)
++
++ vxor S0,S0,K
++ vxor S1,S1,K
++ vxor S2,S2,K
++ vxor S3,S3,K
++
++ mtctr ROUNDS
++ li 10,0x10
++.align 5
++L4x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vcipher S0,S0,K
++ vcipher S1,S1,K
++ vcipher S2,S2,K
++ vcipher S3,S3,K
++ addi   10,10,0x10
++ bdnz  L4x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm   K,K,K,swap_mask
++ vcipherlast S0,S0,K
++ vcipherlast S1,S1,K
++ vcipherlast S2,S2,K
++ vcipherlast S3,S3,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask
++ vperm S2,S2,S2,swap_mask
++ vperm S3,S3,S3,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ li  9,0x10
++ stxvd2x VSR(S1),9,DST
++ addi   9,9,0x10
++ stxvd2x VSR(S2),9,DST
++ addi  9,9,0x10
++ stxvd2x VSR(S3),9,DST
++
++ addi   SRC,SRC,0x40
++ addi   DST,DST,0x40
++
++ clrldi LENGTH,LENGTH,62
++
++L2x:
++ srdi  5,LENGTH,1
++ cmpldi  5,0
++ beq   L1x
++
++ lxvd2x VSR(K),0,KEYS
++ vperm K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++ li   9,0x10
++ lxvd2x VSR(S1),9,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask>)
++
++ vxor  S0,S0,K
++ vxor   S1,S1,K
++
++ mtctr   ROUNDS
++ li  10,0x10
++.align 5
++L2x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vcipher S0,S0,K
++ vcipher S1,S1,K
++ addi   10,10,0x10
++ bdnz   L2x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vcipherlast S0,S0,K
++ vcipherlast S1,S1,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask
++ vperm S1,S1,S1,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++ li  9,0x10
++ stxvd2x VSR(S1),9,DST
++
++ addi   SRC,SRC,0x20
++ addi   DST,DST,0x20
++
++ clrldi LENGTH,LENGTH,63
++
++L1x:
++ cmpldi LENGTH,0
++ beq   Ldone
++
++ lxvd2x VSR(K),0,KEYS
++ vperm   K,K,K,swap_mask
++
++ lxvd2x VSR(S0),0,SRC
++
++IF_LE(<vperm S0,S0,S0,swap_mask>)
++
++ vxor   S0,S0,K
++
++ mtctr   ROUNDS
++ li   10,0x10
++.align 5
++L1x_round_loop:
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vcipher S0,S0,K
++ addi   10,10,0x10
++ bdnz   L1x_round_loop
++
++ lxvd2x VSR(K),10,KEYS
++ vperm  K,K,K,swap_mask
++ vcipherlast S0,S0,K
++
++IF_LE(<vperm S0,S0,S0,swap_mask>)
++
++ stxvd2x VSR(S0),0,DST
++
++Ldone:
++ blr
++EPILOGUE(_nettle_aes_encrypt)
++
++ .data
++ .align 4
++.swap_mask:
++IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
++IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
diff --git a/SOURCES/nettle-3.4.1-powerpc64-ghash-asm.patch b/SOURCES/nettle-3.4.1-powerpc64-ghash-asm.patch
new file mode 100644
index 0000000..255adbd
--- /dev/null
+++ b/SOURCES/nettle-3.4.1-powerpc64-ghash-asm.patch
@@ -0,0 +1,1519 @@
+diff -up ./configure.ac.ghash ./configure.ac
+--- ./configure.ac.ghash	2021-07-14 14:11:58.126891572 +0200
++++ ./configure.ac	2021-07-14 14:11:58.130891552 +0200
+@@ -211,6 +211,22 @@ AC_C_BIGENDIAN([AC_DEFINE([WORDS_BIGENDI
+ 		ASM_WORDS_BIGENDIAN=yes],
+ 	[ASM_WORDS_BIGENDIAN=no])
+ 
++AC_CACHE_CHECK([for __builtin_bswap64],
++		nettle_cv_c_builtin_bswap64,
++[AC_TRY_LINK([
++#include <stdint.h>
++],[
++uint64_t x = 17;
++uint64_t y = __builtin_bswap64(x);
++],
++nettle_cv_c_builtin_bswap64=yes,
++nettle_cv_c_builtin_bswap64=no)])
++
++AH_TEMPLATE([HAVE_BUILTIN_BSWAP64], [Define if __builtin_bswap64 is available])
++if test "x$nettle_cv_c_builtin_bswap64" = "xyes" ; then
++  AC_DEFINE(HAVE_BUILTIN_BSWAP64)
++fi
++
+ LSH_GCC_ATTRIBUTES
+ 
+ # According to Simon Josefsson, looking for uint32_t and friends in
+@@ -472,7 +488,7 @@ asm_replace_list="aes-encrypt-internal.a
+ 		sha3-permute.asm umac-nh.asm umac-nh-n.asm machine.m4"
+ 
+ # Assembler files which generate additional object files if they are used.
+-asm_nettle_optional_list="gcm-hash8.asm cpuid.asm \
++asm_nettle_optional_list="gcm-hash.asm gcm-hash8.asm cpuid.asm \
+   aes-encrypt-internal-2.asm aes-decrypt-internal-2.asm memxor-2.asm \
+   salsa20-core-internal-2.asm sha1-compress-2.asm sha256-compress-2.asm \
+   sha3-permute-2.asm sha512-compress-2.asm \
+@@ -588,6 +604,10 @@ AH_VERBATIM([HAVE_NATIVE],
+ #undef HAVE_NATIVE_ecc_384_redc
+ #undef HAVE_NATIVE_ecc_521_modp
+ #undef HAVE_NATIVE_ecc_521_redc
++#undef HAVE_NATIVE_gcm_init_key
++#undef HAVE_NATIVE_fat_gcm_init_key
++#undef HAVE_NATIVE_gcm_hash
++#undef HAVE_NATIVE_fat_gcm_hash
+ #undef HAVE_NATIVE_gcm_hash8
+ #undef HAVE_NATIVE_salsa20_core
+ #undef HAVE_NATIVE_sha1_compress
+diff -up ./ctr16.c.ghash ./ctr16.c
+--- ./ctr16.c.ghash	2021-07-14 14:11:58.130891552 +0200
++++ ./ctr16.c	2021-07-14 14:11:58.130891552 +0200
+@@ -0,0 +1,106 @@
++/* ctr16.c
++
++   Cipher counter mode, optimized for 16-byte blocks.
++
++   Copyright (C) 2005-2018 Niels Möller
++   Copyright (C) 2018 Red Hat, Inc.
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++*/
++
++#if HAVE_CONFIG_H
++# include "config.h"
++#endif
++
++#include <assert.h>
++
++#include "ctr.h"
++
++#include "ctr-internal.h"
++#include "memxor.h"
++#include "nettle-internal.h"
++
++#define MIN(a,b) (((a) < (b)) ? (a) : (b))
++
++void
++_ctr_crypt16(const void *ctx, nettle_cipher_func *f,
++	     nettle_fill16_func *fill, uint8_t *ctr,
++	     size_t length, uint8_t *dst,
++	     const uint8_t *src)
++{ /* CTR for 16-byte blocks: fill writes counter blocks, f transforms them. */
++  if (dst != src && !((uintptr_t) dst % sizeof(uint64_t)))
++    { /* Not in place and dst is uint64_t-aligned: dst doubles as counter buffer. */
++      size_t blocks = length / 16u;
++      size_t done;
++      fill (ctr, blocks, (union nettle_block16 *) dst);
++
++      done = blocks * 16; /* bytes covered by whole blocks */
++      f(ctx, done, dst, dst);
++      memxor (dst, src, done);
++
++      length -= done;
++      if (length > 0)
++	{ /* Left-over partial block */
++	  union nettle_block16 block;
++	  dst += done;
++	  src += done;
++	  assert (length < 16);
++	  /* Use fill, to update ctr value in the same way in all cases. */
++	  fill (ctr, 1, &block);
++	  f (ctx, 16, block.b, block.b);
++	  memxor3 (dst, src, block.b, length);
++	}
++    }
++  else
++    {
++      /* Construct an aligned buffer of consecutive counter values, of
++	 size at most CTR_BUFFER_LIMIT. */
++      TMP_DECL(buffer, union nettle_block16, CTR_BUFFER_LIMIT / 16);
++      size_t blocks = (length + 15) / 16u; /* rounds up: includes a partial block */
++      size_t i; /* byte offset into src and dst */
++      TMP_ALLOC(buffer, MIN(blocks, CTR_BUFFER_LIMIT / 16));
++      /* Each iteration handles one full buffer of CTR_BUFFER_LIMIT bytes. */
++      for (i = 0; blocks >= CTR_BUFFER_LIMIT / 16;
++	   i += CTR_BUFFER_LIMIT, blocks -= CTR_BUFFER_LIMIT / 16)
++	{
++	  fill (ctr, CTR_BUFFER_LIMIT / 16, buffer);
++	  f(ctx, CTR_BUFFER_LIMIT, buffer->b, buffer->b);
++	  if (length - i < CTR_BUFFER_LIMIT)
++	    goto done; /* final buffer ends in a partial block */
++	  memxor3 (dst + i, src + i, buffer->b, CTR_BUFFER_LIMIT);
++	}
++
++      if (blocks > 0)
++	{
++	  assert (length - i < CTR_BUFFER_LIMIT);
++	  fill (ctr, blocks, buffer);
++	  f(ctx, blocks * 16, buffer->b, buffer->b);
++	done: /* also entered from the loop above; xor only the bytes that remain */
++	  memxor3 (dst + i, src + i, buffer->b, length - i);
++	}
++    }
++}
+diff -up ./ctr.c.ghash ./ctr.c
+--- ./ctr.c.ghash	2018-12-04 21:56:05.000000000 +0100
++++ ./ctr.c	2021-07-14 14:13:07.714539484 +0200
+@@ -41,11 +41,83 @@
+ 
+ #include "ctr.h"
+ 
++#include "ctr-internal.h"
+ #include "macros.h"
+ #include "memxor.h"
+ #include "nettle-internal.h"
+ 
+-#define NBLOCKS 4
++#define MIN(a,b) (((a) < (b)) ? (a) : (b))
++
++/* The 'u64' member has been added to union nettle_block16 in the
++   public header (nettle-types.h).  Use _Static_assert to check that
++   adding it did not change the alignment of the union. */
++union nettle_block16_
++{
++  uint8_t b[16];
++  unsigned long w[16 / sizeof(unsigned long)];
++};
++_Static_assert(__alignof(union nettle_block16_) == __alignof(union nettle_block16),
++	       "nettle_block16 alignment should be preserved");
++
++static size_t
++ctr_fill (size_t block_size, uint8_t *ctr, size_t length, uint8_t *buffer)
++{ /* Copy ctr into buffer once for every whole block that fits in length. */
++  size_t i;
++  for (i = 0; i + block_size <= length; i += block_size)
++    {
++      memcpy (buffer + i, ctr, block_size);
++      INCREMENT(block_size, ctr); /* step ctr before the next copy */
++    }
++  return i; /* bytes written: a multiple of block_size, at most length */
++}
++
++#if WORDS_BIGENDIAN
++# define USE_CTR_CRYPT16 1 /* ctr_crypt passes 16-byte blocks to _ctr_crypt16 */
++static nettle_fill16_func ctr_fill16;
++static void
++ctr_fill16(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
++{ /* Write `blocks` successive counter values, then store the next one in ctr. */
++  uint64_t hi, lo;
++  size_t i;
++  hi = READ_UINT64(ctr);
++  lo = READ_UINT64(ctr + 8);
++
++  for (i = 0; i < blocks; i++)
++    {
++      buffer[i].u64[0] = hi;
++      buffer[i].u64[1] = lo;
++      hi += !(++lo); /* carry into hi when lo wraps to 0 */
++    }
++  WRITE_UINT64(ctr, hi);
++  WRITE_UINT64(ctr + 8, lo);
++}
++#else /* !WORDS_BIGENDIAN */
++# if HAVE_BUILTIN_BSWAP64
++#  define USE_CTR_CRYPT16 1 /* ctr_crypt passes 16-byte blocks to _ctr_crypt16 */
++static nettle_fill16_func ctr_fill16;
++static void
++ctr_fill16(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
++{ /* Write `blocks` successive counter values, then store the next one in ctr. */
++  uint64_t hi, lo;
++  size_t i;
++  /* Read hi in native endianness */
++  hi = LE_READ_UINT64(ctr);
++  lo = READ_UINT64(ctr + 8); /* presumably a big-endian read, so ++lo counts correctly */
++
++  for (i = 0; i < blocks; i++)
++    {
++      buffer[i].u64[0] = hi;
++      buffer[i].u64[1] = __builtin_bswap64(lo); /* swap lo for the native u64 store */
++      if (!++lo) /* lo wrapped to 0: carry into hi */
++	hi = __builtin_bswap64(__builtin_bswap64(hi) + 1); /* add 1 to the byte-swapped value */
++    }
++  LE_WRITE_UINT64(ctr, hi);
++  WRITE_UINT64(ctr + 8, lo);
++}
++# else /* ! HAVE_BUILTIN_BSWAP64 */
++#  define USE_CTR_CRYPT16 0 /* no ctr_fill16: ctr_crypt keeps to its generic path */
++# endif
++#endif /* !WORDS_BIGENDIAN */
+ 
+ void
+ ctr_crypt(const void *ctx, nettle_cipher_func *f,
+@@ -53,84 +125,64 @@ ctr_crypt(const void *ctx, nettle_cipher
+ 	  size_t length, uint8_t *dst,
+ 	  const uint8_t *src)
+ {
+-  if (src != dst)
++#if USE_CTR_CRYPT16
++  if (block_size == 16)
+     {
+-      if (length == block_size)
+-	{
+-	  f(ctx, block_size, dst, ctr);
+-	  INCREMENT(block_size, ctr);
+-	  memxor(dst, src, block_size);
+-	}
+-      else
++      _ctr_crypt16(ctx, f, ctr_fill16, ctr, length, dst, src);
++      return;
++    }
++#endif
++
++  if(src != dst)
++    {
++      size_t filled = ctr_fill (block_size, ctr, length, dst);
++
++      f(ctx, filled, dst, dst);
++      memxor(dst, src, filled);
++
++      if (filled < length)
+ 	{
+-	  size_t left;
+-	  uint8_t *p;	  
++	  TMP_DECL(block, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
++	  TMP_ALLOC(block, block_size);
+ 
+-	  for (p = dst, left = length;
+-	       left >= block_size;
+-	       left -= block_size, p += block_size)
+-	    {
+-	      memcpy (p, ctr, block_size);
+-	      INCREMENT(block_size, ctr);
+-	    }
+-
+-	  f(ctx, length - left, dst, dst);
+-	  memxor(dst, src, length - left);
+-
+-	  if (left)
+-	    {
+-	      TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
+-	      TMP_ALLOC(buffer, block_size);
+-
+-	      f(ctx, block_size, buffer, ctr);
+-	      INCREMENT(block_size, ctr);
+-	      memxor3(dst + length - left, src + length - left, buffer, left);
+-	    }
++	  f(ctx, block_size, block, ctr);
++	  INCREMENT(block_size, ctr);
++	  memxor3(dst + filled, src + filled, block, length - filled);
+ 	}
+     }
+   else
+     {
+-      if (length > block_size)
+-	{
+-	  TMP_DECL(buffer, uint8_t, NBLOCKS * NETTLE_MAX_CIPHER_BLOCK_SIZE);
+-	  size_t chunk = NBLOCKS * block_size;
++      /* For in-place CTR, construct a buffer of consecutive counter
++	 values, of size at most CTR_BUFFER_LIMIT. */
++      TMP_DECL(buffer, uint8_t, CTR_BUFFER_LIMIT);
++
++      size_t buffer_size;
++      if (length < block_size)
++	buffer_size = block_size;
++      else if (length <= CTR_BUFFER_LIMIT)
++	buffer_size = length;
++      else
++	buffer_size = CTR_BUFFER_LIMIT;
+ 
+-	  TMP_ALLOC(buffer, chunk);
++      TMP_ALLOC(buffer, buffer_size);
+ 
+-	  for (; length >= chunk;
+-	       length -= chunk, src += chunk, dst += chunk)
+-	    {
+-	      unsigned n;
+-	      uint8_t *p;	  
+-	      for (n = 0, p = buffer; n < NBLOCKS; n++, p += block_size)
+-		{
+-		  memcpy (p, ctr, block_size);
+-		  INCREMENT(block_size, ctr);
+-		}
+-	      f(ctx, chunk, buffer, buffer);
+-	      memxor(dst, buffer, chunk);
+-	    }
+-
+-	  if (length > 0)
+-	    {
+-	      /* Final, possibly partial, blocks */
+-	      for (chunk = 0; chunk < length; chunk += block_size)
+-		{
+-		  memcpy (buffer + chunk, ctr, block_size);
+-		  INCREMENT(block_size, ctr);
+-		}
+-	      f(ctx, chunk, buffer, buffer);
+-	      memxor3(dst, src, buffer, length);
+-	    }
++      while (length >= block_size)
++	{
++	  size_t filled
++	    = ctr_fill (block_size, ctr, MIN(buffer_size, length), buffer);
++	  assert (filled > 0);
++	  f(ctx, filled, buffer, buffer);
++	  memxor(dst, buffer, filled);
++	  length -= filled;
++	  dst += filled;
+ 	}
+-      else if (length > 0)
+-      	{
+-	  TMP_DECL(buffer, uint8_t, NETTLE_MAX_CIPHER_BLOCK_SIZE);
+-	  TMP_ALLOC(buffer, block_size);
+ 
++      /* Final, possibly partial, block. */
++      if (length > 0)
++	{
+ 	  f(ctx, block_size, buffer, ctr);
+ 	  INCREMENT(block_size, ctr);
+-	  memxor3(dst, src, buffer, length);
++	  memxor(dst, buffer, length);
+ 	}
+     }
+ }
+diff -up ./ctr-internal.h.ghash ./ctr-internal.h
+--- ./ctr-internal.h.ghash	2021-07-14 14:11:58.130891552 +0200
++++ ./ctr-internal.h	2021-07-14 14:11:58.130891552 +0200
+@@ -0,0 +1,56 @@
++/* ctr-internal.h
++
++   Copyright (C) 2018 Niels Möller
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++*/
++
++#ifndef NETTLE_CTR_INTERNAL_H_INCLUDED
++#define NETTLE_CTR_INTERNAL_H_INCLUDED
++
++#include "nettle-types.h"
++
++/* Name mangling */
++#define _ctr_crypt16 _nettle_ctr_crypt16
++
++/* Size limit for temporary stack buffers. */
++#define CTR_BUFFER_LIMIT 512
++
++/* Fill BUFFER (n blocks) with incrementing CTR values. It would be
++   nice if CTR was always 64-bit aligned, but it isn't when called
++   from ctr_crypt. */
++typedef void
++nettle_fill16_func(uint8_t *ctr, size_t n, union nettle_block16 *buffer);
++
++void
++_ctr_crypt16(const void *ctx, nettle_cipher_func *f,
++	     nettle_fill16_func *fill, uint8_t *ctr,
++	     size_t length, uint8_t *dst,
++	     const uint8_t *src);
++
++
++#endif /* NETTLE_CTR_INTERNAL_H_INCLUDED */
+diff -up ./fat-ppc.c.ghash ./fat-ppc.c
+--- ./fat-ppc.c.ghash	2021-07-14 14:11:58.126891572 +0200
++++ ./fat-ppc.c	2021-07-14 14:11:58.130891552 +0200
+@@ -49,6 +49,7 @@
+ 
+ #include "aes-internal.h"
+ #include "gcm.h"
++#include "gcm-internal.h"
+ #include "fat-setup.h"
+ 
+ /* Define from arch/powerpc/include/uapi/asm/cputable.h in Linux kernel */
+@@ -87,6 +88,16 @@ DECLARE_FAT_FUNC(_nettle_aes_decrypt, ae
+ DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
+ DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+ 
++#if GCM_TABLE_BITS == 8
++DECLARE_FAT_FUNC(_nettle_gcm_init_key, gcm_init_key_func)
++DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, c)
++DECLARE_FAT_FUNC_VAR(gcm_init_key, gcm_init_key_func, ppc64)
++
++DECLARE_FAT_FUNC(_nettle_gcm_hash, gcm_hash_func)
++DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, c)
++DECLARE_FAT_FUNC_VAR(gcm_hash, gcm_hash_func, ppc64)
++#endif /* GCM_TABLE_BITS == 8 */
++
+ static void CONSTRUCTOR
+ fat_init (void)
+ {
+@@ -101,17 +112,29 @@ fat_init (void)
+      features.have_crypto_ext ? "crypto extensions" : "");
+ 
+   if (features.have_crypto_ext)
+-  {
+-     if (verbose)
+-        fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
+-     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
+-     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
+-  }
++    {
++      if (verbose)
++	fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
++      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
++      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
++#if GCM_TABLE_BITS == 8
++      /* Always switch _nettle_gcm_init_key_vec and _nettle_gcm_hash_vec
++         together: the two must agree on the table layout, e.g. the
++         gcm_key table filled by _nettle_gcm_init_key_c() is
++         incompatible with _nettle_gcm_hash_ppc64(). */
++      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_ppc64;
++      _nettle_gcm_hash_vec = _nettle_gcm_hash_ppc64;
++#endif /* GCM_TABLE_BITS == 8 */
++    }
+   else
+-  {
+-     _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
+-     _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
+-  }
++    {
++      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
++      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
++#if GCM_TABLE_BITS == 8
++      _nettle_gcm_init_key_vec = _nettle_gcm_init_key_c;
++      _nettle_gcm_hash_vec = _nettle_gcm_hash_c;
++#endif /* GCM_TABLE_BITS == 8 */
++    }
+ }
+ 
+ DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
+@@ -127,3 +150,14 @@ DEFINE_FAT_FUNC(_nettle_aes_decrypt, voi
+  size_t length, uint8_t *dst,
+  const uint8_t *src),
+  (rounds, keys, T, length, dst, src))
++
++#if GCM_TABLE_BITS == 8
++DEFINE_FAT_FUNC(_nettle_gcm_init_key, void,
++		(union nettle_block16 *table),
++		(table))
++
++DEFINE_FAT_FUNC(_nettle_gcm_hash, void,
++		(const struct gcm_key *key, union nettle_block16 *x,
++		 size_t length, const uint8_t *data),
++		(key, x, length, data))
++#endif /* GCM_TABLE_BITS == 8 */
+diff -up ./fat-setup.h.ghash ./fat-setup.h
+--- ./fat-setup.h.ghash	2018-12-04 21:56:06.000000000 +0100
++++ ./fat-setup.h	2021-07-14 14:11:58.130891552 +0200
+@@ -159,6 +159,11 @@ typedef void aes_crypt_internal_func (un
+ 				      size_t length, uint8_t *dst,
+ 				      const uint8_t *src);
+ 
++typedef void gcm_init_key_func (union nettle_block16 *table);
++
++typedef void gcm_hash_func (const struct gcm_key *key, union nettle_block16 *x,
++			    size_t length, const uint8_t *data);
++
+ typedef void *(memxor_func)(void *dst, const void *src, size_t n);
+ 
+ typedef void salsa20_core_func (uint32_t *dst, const uint32_t *src, unsigned rounds);
+diff -up ./gcm.c.ghash ./gcm.c
+--- ./gcm.c.ghash	2018-12-04 21:56:05.000000000 +0100
++++ ./gcm.c	2021-07-14 14:11:58.131891547 +0200
+@@ -6,8 +6,9 @@
+    See also the gcm paper at
+    http://www.cryptobarn.com/papers/gcm-spec.pdf.
+ 
+-   Copyright (C) 2011, 2013 Niels Möller
+    Copyright (C) 2011 Katholieke Universiteit Leuven
++   Copyright (C) 2011, 2013, 2018 Niels Möller
++   Copyright (C) 2018 Red Hat, Inc.
+    
+    Contributed by Nikos Mavrogiannopoulos
+ 
+@@ -48,9 +49,11 @@
+ 
+ #include "gcm.h"
+ 
++#include "gcm-internal.h"
+ #include "memxor.h"
+ #include "nettle-internal.h"
+ #include "macros.h"
++#include "ctr-internal.h"
+ 
+ #define GHASH_POLYNOMIAL 0xE1UL
+ 
+@@ -112,7 +115,17 @@ gcm_gf_shift (union nettle_block16 *r, c
+ #endif /* ! WORDS_BIGENDIAN */
+ }
+ 
+-#if GCM_TABLE_BITS == 0
++#if GCM_TABLE_BITS != 8
++/* The native implementations (currently ppc64 only) depend on the
++   GCM_TABLE_BITS == 8 layout */
++#undef HAVE_NATIVE_gcm_hash
++#undef HAVE_NATIVE_gcm_init_key
++#undef HAVE_NATIVE_fat_gcm_hash
++#undef HAVE_NATIVE_fat_gcm_init_key
++#endif
++
++#if !HAVE_NATIVE_gcm_hash
++# if GCM_TABLE_BITS == 0
+ /* Sets x <- x * y mod r, using the plain bitwise algorithm from the
+    specification. y may be shorter than a full block, missing bytes
+    are assumed zero. */
+@@ -140,15 +153,15 @@ gcm_gf_mul (union nettle_block16 *x, con
+     }
+   memcpy (x->b, Z.b, sizeof(Z));
+ }
+-#else /* GCM_TABLE_BITS != 0 */
++# else /* GCM_TABLE_BITS != 0 */
+ 
+-# if WORDS_BIGENDIAN
+-#  define W(left,right) (0x##left##right)
+-# else
+-#  define W(left,right) (0x##right##left)
+-# endif
++#  if WORDS_BIGENDIAN
++#   define W(left,right) (0x##left##right)
++#  else
++#   define W(left,right) (0x##right##left)
++#  endif
+ 
+-# if GCM_TABLE_BITS == 4
++#  if GCM_TABLE_BITS == 4
+ static const uint16_t
+ shift_table[0x10] = {
+   W(00,00),W(1c,20),W(38,40),W(24,60),W(70,80),W(6c,a0),W(48,c0),W(54,e0),
+@@ -177,26 +190,13 @@ gcm_gf_shift_4(union nettle_block16 *x)
+ #  error Unsupported word size. */
+ #endif
+ #else /* ! WORDS_BIGENDIAN */
+-# if SIZEOF_LONG == 4
+-#define RSHIFT_WORD(x) \
+-  ((((x) & 0xf0f0f0f0UL) >> 4)			\
+-   | (((x) & 0x000f0f0f) << 12))
+-  reduce = shift_table[(w[3] >> 24) & 0xf];
+-  w[3] = RSHIFT_WORD(w[3]) | ((w[2] >> 20) & 0xf0);
+-  w[2] = RSHIFT_WORD(w[2]) | ((w[1] >> 20) & 0xf0);
+-  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 20) & 0xf0);
+-  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
+-# elif SIZEOF_LONG == 8
+-#define RSHIFT_WORD(x) \
+-  ((((x) & 0xf0f0f0f0f0f0f0f0UL) >> 4) \
+-   | (((x) & 0x000f0f0f0f0f0f0fUL) << 12))
+-  reduce = shift_table[(w[1] >> 56) & 0xf];
+-  w[1] = RSHIFT_WORD(w[1]) | ((w[0] >> 52) & 0xf0);
+-  w[0] = RSHIFT_WORD(w[0]) ^ reduce;
+-# else
+-#  error Unsupported word size. */
+-# endif
+-# undef RSHIFT_WORD
++# define RSHIFT_WORD_4(x) \
++  ((((x) & UINT64_C(0xf0f0f0f0f0f0f0f0)) >> 4) \
++   | (((x) & UINT64_C(0x000f0f0f0f0f0f0f)) << 12))
++  reduce = shift_table[(u64[1] >> 56) & 0xf];
++  u64[1] = RSHIFT_WORD_4(u64[1]) | ((u64[0] >> 52) & 0xf0);
++  u64[0] = RSHIFT_WORD_4(u64[0]) ^ reduce;
++# undef RSHIFT_WORD_4
+ #endif /* ! WORDS_BIGENDIAN */
+ }
+ 
+@@ -219,10 +219,10 @@ gcm_gf_mul (union nettle_block16 *x, con
+     }
+   memcpy (x->b, Z.b, sizeof(Z));
+ }
+-# elif GCM_TABLE_BITS == 8
+-#  if HAVE_NATIVE_gcm_hash8
++#  elif GCM_TABLE_BITS == 8
++#   if HAVE_NATIVE_gcm_hash8
+ 
+-#define gcm_hash _nettle_gcm_hash8
++#define _nettle_gcm_hash _nettle_gcm_hash8
+ void
+ _nettle_gcm_hash8 (const struct gcm_key *key, union nettle_block16 *x,
+ 		   size_t length, const uint8_t *data);
+@@ -317,18 +317,46 @@ gcm_gf_mul (union nettle_block16 *x, con
+   gcm_gf_shift_8(&Z);
+   gcm_gf_add(x, &Z, &table[x->b[0]]);
+ }
+-#  endif /* ! HAVE_NATIVE_gcm_hash8 */
+-# else /* GCM_TABLE_BITS != 8 */
+-#  error Unsupported table size. 
+-# endif /* GCM_TABLE_BITS != 8 */
++#   endif /* ! HAVE_NATIVE_gcm_hash8 */
++#  else /* GCM_TABLE_BITS != 8 */
++#   error Unsupported table size.
++#  endif /* GCM_TABLE_BITS != 8 */
++
++#  undef W
++# endif /* GCM_TABLE_BITS != 0 */
++#endif /* !HAVE_NATIVE_gcm_hash */
+ 
+-#undef W
+-
+-#endif /* GCM_TABLE_BITS */
+ 
+ /* Increment the rightmost 32 bits. */
+ #define INC32(block) INCREMENT(4, (block.b) + GCM_BLOCK_SIZE - 4)
+ 
++#if !HAVE_NATIVE_gcm_init_key
++# if !HAVE_NATIVE_fat_gcm_init_key /* same guard as the _c prototype in gcm-internal.h */
++#  define _nettle_gcm_init_key _nettle_gcm_init_key_c
++static /* file-local unless the fat dispatcher needs to reference it */
++# endif
++void
++_nettle_gcm_init_key_c(union nettle_block16 *table)
++{ /* Derive the other table entries from the one gcm_set_key stored before the call. */
++#if GCM_TABLE_BITS
++  /* Middle element if GCM_TABLE_BITS > 0, otherwise the first
++     element */
++  unsigned i = (1<<GCM_TABLE_BITS)/2;
++
++  /* Algorithm 3 from the gcm paper. First do powers of two, then do
++     the rest by adding. */
++  while (i /= 2)
++    gcm_gf_shift(&table[i], &table[2*i]);
++  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
++    {
++      unsigned j;
++      for (j = 1; j < i; j++)
++        gcm_gf_add(&table[i+j], &table[i], &table[j]);
++    }
++#endif
++}
++#endif /* !HAVE_NATIVE_gcm_init_key */
++
+ /* Initialization of GCM.
+  * @ctx: The context of GCM
+  * @cipher: The context of the underlying block cipher
+@@ -345,25 +373,18 @@ gcm_set_key(struct gcm_key *key,
+   /* H */  
+   memset(key->h[0].b, 0, GCM_BLOCK_SIZE);
+   f (cipher, GCM_BLOCK_SIZE, key->h[i].b, key->h[0].b);
+-  
+-#if GCM_TABLE_BITS
+-  /* Algorithm 3 from the gcm paper. First do powers of two, then do
+-     the rest by adding. */
+-  while (i /= 2)
+-    gcm_gf_shift(&key->h[i], &key->h[2*i]);
+-  for (i = 2; i < 1<<GCM_TABLE_BITS; i *= 2)
+-    {
+-      unsigned j;
+-      for (j = 1; j < i; j++)
+-	gcm_gf_add(&key->h[i+j], &key->h[i],&key->h[j]);
+-    }
+-#endif
++
++  _nettle_gcm_init_key(key->h);
+ }
+ 
+-#ifndef gcm_hash
+-static void
+-gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
+-	 size_t length, const uint8_t *data)
++#if !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8)
++# if !HAVE_NATIVE_fat_gcm_hash
++#  define _nettle_gcm_hash _nettle_gcm_hash_c
++static
++# endif
++void
++_nettle_gcm_hash_c(const struct gcm_key *key, union nettle_block16 *x,
++		   size_t length, const uint8_t *data)
+ {
+   for (; length >= GCM_BLOCK_SIZE;
+        length -= GCM_BLOCK_SIZE, data += GCM_BLOCK_SIZE)
+@@ -377,7 +398,7 @@ gcm_hash(const struct gcm_key *key, unio
+       gcm_gf_mul (x, key->h);
+     }
+ }
+-#endif /* !gcm_hash */
++#endif /* !(HAVE_NATIVE_gcm_hash || HAVE_NATIVE_gcm_hash8) */
+ 
+ static void
+ gcm_hash_sizes(const struct gcm_key *key, union nettle_block16 *x,
+@@ -391,7 +412,7 @@ gcm_hash_sizes(const struct gcm_key *key
+   WRITE_UINT64 (buffer, auth_size);
+   WRITE_UINT64 (buffer + 8, data_size);
+ 
+-  gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
++  _nettle_gcm_hash(key, x, GCM_BLOCK_SIZE, buffer);
+ }
+ 
+ /* NOTE: The key is needed only if length != GCM_IV_SIZE */
+@@ -410,7 +431,7 @@ gcm_set_iv(struct gcm_ctx *ctx, const st
+   else
+     {
+       memset(ctx->iv.b, 0, GCM_BLOCK_SIZE);
+-      gcm_hash(key, &ctx->iv, length, iv);
++      _nettle_gcm_hash(key, &ctx->iv, length, iv);
+       gcm_hash_sizes(key, &ctx->iv, 0, length);
+     }
+ 
+@@ -429,47 +450,68 @@ gcm_update(struct gcm_ctx *ctx, const st
+   assert(ctx->auth_size % GCM_BLOCK_SIZE == 0);
+   assert(ctx->data_size == 0);
+ 
+-  gcm_hash(key, &ctx->x, length, data);
++  _nettle_gcm_hash(key, &ctx->x, length, data);
+ 
+   ctx->auth_size += length;
+ }
+ 
++static nettle_fill16_func gcm_fill;
++#if WORDS_BIGENDIAN
+ static void
+-gcm_crypt(struct gcm_ctx *ctx, const void *cipher, nettle_cipher_func *f,
+-	  size_t length, uint8_t *dst, const uint8_t *src)
++gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
+ {
+-  uint8_t buffer[GCM_BLOCK_SIZE];
++  uint64_t hi, mid;
++  uint32_t lo;
++  size_t i;
++  hi = READ_UINT64(ctr);
++  mid = (uint64_t) READ_UINT32(ctr + 8) << 32;
++  lo = READ_UINT32(ctr + 12);
+ 
+-  if (src != dst)
++  for (i = 0; i < blocks; i++)
+     {
+-      for (; length >= GCM_BLOCK_SIZE;
+-           (length -= GCM_BLOCK_SIZE,
+-	    src += GCM_BLOCK_SIZE, dst += GCM_BLOCK_SIZE))
+-        {
+-          f (cipher, GCM_BLOCK_SIZE, dst, ctx->ctr.b);
+-          memxor (dst, src, GCM_BLOCK_SIZE);
+-          INC32 (ctx->ctr);
+-        }
++      buffer[i].u64[0] = hi;
++      buffer[i].u64[1] = mid + lo++;
+     }
+-  else
++  WRITE_UINT32(ctr + 12, lo);
++
++}
++#elif HAVE_BUILTIN_BSWAP64
++/* Assume __builtin_bswap32 is also available */
++static void
++gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
++{
++  uint64_t hi, mid;
++  uint32_t lo;
++  size_t i;
++  hi = LE_READ_UINT64(ctr);
++  mid = LE_READ_UINT32(ctr + 8);
++  lo = READ_UINT32(ctr + 12);
++
++  for (i = 0; i < blocks; i++)
+     {
+-      for (; length >= GCM_BLOCK_SIZE;
+-           (length -= GCM_BLOCK_SIZE,
+-	    src += GCM_BLOCK_SIZE, dst += GCM_BLOCK_SIZE))
+-        {
+-          f (cipher, GCM_BLOCK_SIZE, buffer, ctx->ctr.b);
+-          memxor3 (dst, src, buffer, GCM_BLOCK_SIZE);
+-          INC32 (ctx->ctr);
+-        }
++      buffer[i].u64[0] = hi;
++      buffer[i].u64[1] = mid + ((uint64_t)__builtin_bswap32(lo) << 32);
++      lo++;
+     }
+-  if (length > 0)
++  WRITE_UINT32(ctr + 12, lo);
++}
++#else
++static void
++gcm_fill(uint8_t *ctr, size_t blocks, union nettle_block16 *buffer)
++{
++  uint32_t c;
++
++  c = READ_UINT32(ctr + GCM_BLOCK_SIZE - 4);
++
++  for (; blocks-- > 0; buffer++, c++)
+     {
+-      /* A final partial block */
+-      f (cipher, GCM_BLOCK_SIZE, buffer, ctx->ctr.b);
+-      memxor3 (dst, src, buffer, length);
+-      INC32 (ctx->ctr);
++      memcpy(buffer->b, ctr, GCM_BLOCK_SIZE - 4);
++      WRITE_UINT32(buffer->b + GCM_BLOCK_SIZE - 4, c);
+     }
++
++  WRITE_UINT32(ctr + GCM_BLOCK_SIZE - 4, c);
+ }
++#endif
+ 
+ void
+ gcm_encrypt (struct gcm_ctx *ctx, const struct gcm_key *key,
+@@ -478,8 +520,8 @@ gcm_encrypt (struct gcm_ctx *ctx, const
+ {
+   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
+ 
+-  gcm_crypt(ctx, cipher, f, length, dst, src);
+-  gcm_hash(key, &ctx->x, length, dst);
++  _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
++  _nettle_gcm_hash(key, &ctx->x, length, dst);
+ 
+   ctx->data_size += length;
+ }
+@@ -491,8 +533,8 @@ gcm_decrypt(struct gcm_ctx *ctx, const s
+ {
+   assert(ctx->data_size % GCM_BLOCK_SIZE == 0);
+ 
+-  gcm_hash(key, &ctx->x, length, src);
+-  gcm_crypt(ctx, cipher, f, length, dst, src);
++  _nettle_gcm_hash(key, &ctx->x, length, src);
++  _ctr_crypt16(cipher, f, gcm_fill, ctx->ctr.b, length, dst, src);
+ 
+   ctx->data_size += length;
+ }
+diff -up ./gcm-internal.h.ghash ./gcm-internal.h
+--- ./gcm-internal.h.ghash	2021-07-14 14:11:58.131891547 +0200
++++ ./gcm-internal.h	2021-07-14 14:11:58.131891547 +0200
+@@ -0,0 +1,54 @@
++/* gcm-internal.h
++
++   Copyright (C) 2020 Niels Möller
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++*/
++
++#ifndef NETTLE_GCM_INTERNAL_H_INCLUDED
++#define NETTLE_GCM_INTERNAL_H_INCLUDED
++
++/* Functions available only in some configurations */
++void
++_nettle_gcm_init_key (union nettle_block16 *table);
++
++void
++_nettle_gcm_hash(const struct gcm_key *key, union nettle_block16 *x,
++		 size_t length, const uint8_t *data);
++
++#if HAVE_NATIVE_fat_gcm_init_key
++void
++_nettle_gcm_init_key_c (union nettle_block16 *table);
++#endif
++
++#if HAVE_NATIVE_fat_gcm_hash
++void
++_nettle_gcm_hash_c (const struct gcm_key *key, union nettle_block16 *x,
++		    size_t length, const uint8_t *data);
++#endif
++
++#endif /* NETTLE_GCM_INTERNAL_H_INCLUDED */
+diff -up ./Makefile.in.ghash ./Makefile.in
+--- ./Makefile.in.ghash	2021-07-14 14:11:58.124891582 +0200
++++ ./Makefile.in	2021-07-14 14:11:58.131891547 +0200
+@@ -96,7 +96,7 @@ nettle_SOURCES = aes-decrypt-internal.c
+ 		 chacha-crypt.c chacha-core-internal.c \
+ 		 chacha-poly1305.c chacha-poly1305-meta.c \
+ 		 chacha-set-key.c chacha-set-nonce.c \
+-		 ctr.c des.c des3.c des-compat.c \
++		 ctr.c ctr16.c des.c des3.c des-compat.c \
+ 		 eax.c eax-aes128.c eax-aes128-meta.c \
+ 		 gcm.c gcm-aes.c \
+ 		 gcm-aes128.c gcm-aes128-meta.c \
+@@ -233,6 +233,8 @@ DISTFILES = $(SOURCES) $(HEADERS) getopt
+ 	cast128_sboxes.h desinfo.h desCode.h \
+ 	memxor-internal.h nettle-internal.h nettle-write.h \
+ 	rsa-internal.h \
++	ctr-internal.h \
++	gcm-internal.h \
+ 	gmp-glue.h ecc-internal.h fat-setup.h \
+ 	mini-gmp.h asm.m4 \
+ 	nettle.texinfo nettle.info nettle.html nettle.pdf sha-example.c
+diff -up ./nettle-types.h.ghash ./nettle-types.h
+--- ./nettle-types.h.ghash	2018-12-04 21:56:06.000000000 +0100
++++ ./nettle-types.h	2021-07-14 14:11:58.131891547 +0200
+@@ -48,6 +48,7 @@ union nettle_block16
+ {
+   uint8_t b[16];
+   unsigned long w[16 / sizeof(unsigned long)];
++  uint64_t u64[2];
+ };
+ 
+ /* Randomness. Used by key generation and dsa signature creation. */
+diff -up ./powerpc64/fat/gcm-hash.asm.ghash ./powerpc64/fat/gcm-hash.asm
+--- ./powerpc64/fat/gcm-hash.asm.ghash	2021-07-14 14:11:58.131891547 +0200
++++ ./powerpc64/fat/gcm-hash.asm	2021-07-14 14:11:58.131891547 +0200
+@@ -0,0 +1,39 @@
++C powerpc64/fat/gcm-hash.asm
++
++
++ifelse(<
++   Copyright (C) 2020 Mamone Tarsha
++
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++dnl picked up by configure
++dnl PROLOGUE(_nettle_fat_gcm_init_key)
++dnl PROLOGUE(_nettle_fat_gcm_hash)
++
++define(<fat_transform>, <$1_ppc64>)
++include_src(<powerpc64/p8/gcm-hash.asm>)
+diff -up ./powerpc64/p8/gcm-hash.asm.ghash ./powerpc64/p8/gcm-hash.asm
+--- ./powerpc64/p8/gcm-hash.asm.ghash	2021-07-14 14:11:58.131891547 +0200
++++ ./powerpc64/p8/gcm-hash.asm	2021-07-14 14:11:58.131891547 +0200
+@@ -0,0 +1,499 @@
++C powerpc64/p8/gcm-hash.asm
++
++ifelse(<
++   Copyright (C) 2020 Niels Möller and Mamone Tarsha
++   This file is part of GNU Nettle.
++
++   GNU Nettle is free software: you can redistribute it and/or
++   modify it under the terms of either:
++
++     * the GNU Lesser General Public License as published by the Free
++       Software Foundation; either version 3 of the License, or (at your
++       option) any later version.
++
++   or
++
++     * the GNU General Public License as published by the Free
++       Software Foundation; either version 2 of the License, or (at your
++       option) any later version.
++
++   or both in parallel, as here.
++
++   GNU Nettle is distributed in the hope that it will be useful,
++   but WITHOUT ANY WARRANTY; without even the implied warranty of
++   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
++   General Public License for more details.
++
++   You should have received copies of the GNU General Public License and
++   the GNU Lesser General Public License along with this program.  If
++   not, see http://www.gnu.org/licenses/.
++>)
++
++C gcm_set_key() assigns H value in the middle element of the table
++define(<H_Idx>, <128>)
++
++C Register usage:
++
++define(<SP>, <1>)
++define(<TOCP>, <2>)
++
++define(<TABLE>, <3>)
++
++define(<ZERO>, <0>)
++define(<B1>, <1>)
++define(<EMSB>, <16>)
++define(<POLY>, <17>)
++define(<POLY_L>, <1>)
++
++define(<H>, <2>)
++define(<H2>, <3>)
++define(<H3>, <4>)
++define(<H4>, <5>)
++define(<H1M>, <6>)
++define(<H1L>, <7>)
++define(<H2M>, <8>)
++define(<H2L>, <9>)
++define(<Hl>, <10>)
++define(<Hm>, <11>)
++define(<Hp>, <12>)
++define(<Hl2>, <13>)
++define(<Hm2>, <14>)
++define(<Hp2>, <15>)
++define(<R>, <13>)
++define(<F>, <14>)
++define(<T>, <15>)
++define(<R2>, <16>)
++define(<F2>, <17>)
++define(<T2>, <18>)
++
++define(<LE_TEMP>, <18>)
++define(<LE_MASK>, <19>)
++
++.file "gcm-hash.asm"
++
++.text
++
++    C void gcm_init_key (union gcm_block *table)
++
++C This function populates the gcm table as the following layout
++C *******************************************************************************
++C | H1M = (H1 div x⁶⁴)||((H1 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
++C | H1L = (H1 mod x⁶⁴)||(((H1 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H1 div x⁶⁴) |
++C |                                                                             |
++C | H2M = (H2 div x⁶⁴)||((H2 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
++C | H2L = (H2 mod x⁶⁴)||(((H2 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H2 div x⁶⁴) |
++C |                                                                             |
++C | H3M = (H3 div x⁶⁴)||((H3 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
++C | H3L = (H3 mod x⁶⁴)||(((H3 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H3 div x⁶⁴) |
++C |                                                                             |
++C | H4M = (H4 div x⁶⁴)||((H4 mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷)) div x⁶⁴              |
++C | H4L = (H4 mod x⁶⁴)||(((H4 mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)) mod x⁶⁴) + (H4 div x⁶⁴) |
++C *******************************************************************************
++
++define(<FUNC_ALIGN>, <5>)
++PROLOGUE(_nettle_gcm_init_key)
++    DATA_LOAD_VEC(POLY,.polynomial,7)           C 0xC2000000000000000000000000000001
++IF_LE(<
++    li             8,0
++    lvsl           LE_MASK,0,8                  C 0x000102030405060708090A0B0C0D0E0F
++    vspltisb       LE_TEMP,0x07                  C 0x07070707070707070707070707070707
++    vxor           LE_MASK,LE_MASK,LE_TEMP       C 0x07060504030201000F0E0D0C0B0A0908
++>)
++
++    C 'H' is assigned by gcm_set_key() to the middle element of the table
++    li             10,H_Idx*16
++    lxvd2x         VSR(H),10,TABLE              C load 'H'
++    C byte-reverse of each doubleword permuting on little-endian mode
++IF_LE(<
++    vperm          H,H,H,LE_MASK
++>)
++
++    C --- calculate H = H << 1 mod P(X), P(X) = (x¹²⁸+x¹²⁷+x¹²⁶+x¹²¹+1) ---
++
++    vupkhsb        EMSB,H                        C extend most significant bit to first byte
++    vspltisb       B1,1                          C 0x01010101010101010101010101010101
++    vspltb         EMSB,EMSB,0                   C first byte quadword-extend
++    vsl            H,H,B1                        C H = H << 1
++    vand           EMSB,EMSB,POLY                C EMSB &= 0xC2000000000000000000000000000001
++    vxor           ZERO,ZERO,ZERO                C 0x00000000000000000000000000000000
++    vxor           H,H,EMSB                      C H ^= EMSB
++
++    C --- calculate H^2 = H*H ---
++
++    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY) C 0x0000000000000000C200000000000000
++
++    C --- Hp = (H mod x⁶⁴) / x⁶⁴ mod P(X) ---
++    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) mod P(X), deg(Hp) ≤ 127 ---
++    C --- Hp = (H mod x⁶⁴) × (x⁶⁴+x⁶³+x⁶²+x⁵⁷) ---
++    vpmsumd        Hp,H,POLY_L                   C Hp = (H mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
++    xxswapd        VSR(Hm),VSR(H)
++    xxmrgld        VSR(Hl),VSR(H),VSR(ZERO)      C Hl = (H mod x⁶⁴) × x⁶⁴
++    vxor           Hm,Hm,Hp                      C Hm = Hm + Hp
++    vxor           Hl,Hl,Hp                      C Hl = Hl + Hp
++    xxmrgld        VSR(H1L),VSR(H),VSR(Hm)       C H1L = (H mod x⁶⁴)||(Hl mod x⁶⁴)
++    xxmrghd        VSR(H1M),VSR(H),VSR(Hl)       C H1M = (H div x⁶⁴)||(Hl div x⁶⁴)
++
++    vpmsumd        F,H1L,H                       C F = (H1Lh × Hh) + (H1Ll × Hl)
++    vpmsumd        R,H1M,H                       C R = (H1Mh × Hh) + (H1Ml × Hl)
++
++    C --- reduction ---
++    vpmsumd        T,F,POLY_L                    C T = (F mod x⁶⁴) × (x⁶³+x⁶²+x⁵⁷)
++    xxswapd        VSR(H2),VSR(F)
++    vxor           R,R,T                         C R = R + T
++    vxor           H2,R,H2
++
++    xxmrgld        VSR(Hl),VSR(H2),VSR(ZERO)
++    xxswapd        VSR(Hm),VSR(H2)
++    vpmsumd        Hp,H2,POLY_L
++    vxor           Hl,Hl,Hp
++    vxor           Hm,Hm,Hp
++    xxmrghd        VSR(H2M),VSR(H2),VSR(Hl)
++    xxmrgld        VSR(H2L),VSR(H2),VSR(Hm)
++
++    C store H1M, H1L, H2M, H2L
++    li             8,1*16
++    li             9,2*16
++    li             10,3*16
++    stxvd2x        VSR(H1M),0,TABLE
++    stxvd2x        VSR(H1L),8,TABLE
++    stxvd2x        VSR(H2M),9,TABLE
++    stxvd2x        VSR(H2L),10,TABLE
++
++    C --- calculate H^3 = H^1*H^2, H^4 = H^2*H^2 ---
++
++    vpmsumd        F,H1L,H2
++    vpmsumd        F2,H2L,H2
++    vpmsumd        R,H1M,H2
++    vpmsumd        R2,H2M,H2
++
++    vpmsumd        T,F,POLY_L
++    vpmsumd        T2,F2,POLY_L
++    xxswapd        VSR(H3),VSR(F)
++    xxswapd        VSR(H4),VSR(F2)
++    vxor           R,R,T
++    vxor           R2,R2,T2
++    vxor           H3,R,H3
++    vxor           H4,R2,H4
++
++    xxmrgld        VSR(Hl),VSR(H3),VSR(ZERO)
++    xxmrgld        VSR(Hl2),VSR(H4),VSR(ZERO)
++    xxswapd        VSR(Hm),VSR(H3)
++    xxswapd        VSR(Hm2),VSR(H4)
++    vpmsumd        Hp,H3,POLY_L
++    vpmsumd        Hp2,H4,POLY_L
++    vxor           Hl,Hl,Hp
++    vxor           Hl2,Hl2,Hp2
++    vxor           Hm,Hm,Hp
++    vxor           Hm2,Hm2,Hp2
++    xxmrghd        VSR(H1M),VSR(H3),VSR(Hl)
++    xxmrghd        VSR(H2M),VSR(H4),VSR(Hl2)
++    xxmrgld        VSR(H1L),VSR(H3),VSR(Hm)
++    xxmrgld        VSR(H2L),VSR(H4),VSR(Hm2)
++
++    C store H3M, H3L, H4M, H4L
++    li             7,4*16
++    li             8,5*16
++    li             9,6*16
++    li             10,7*16
++    stxvd2x        VSR(H1M),7,TABLE
++    stxvd2x        VSR(H1L),8,TABLE
++    stxvd2x        VSR(H2M),9,TABLE
++    stxvd2x        VSR(H2L),10,TABLE
++
++    blr
++EPILOGUE(_nettle_gcm_init_key)
++
++define(<TABLE>, <3>)
++define(<X>, <4>)
++define(<LENGTH>, <5>)
++define(<DATA>, <6>)
++
++define(<ZERO>, <16>)
++define(<POLY>, <17>)
++define(<POLY_L>, <0>)
++
++define(<D>, <1>)
++define(<C0>, <2>)
++define(<C1>, <3>)
++define(<C2>, <4>)
++define(<C3>, <5>)
++define(<H1M>, <6>)
++define(<H1L>, <7>)
++define(<H2M>, <8>)
++define(<H2L>, <9>)
++define(<H3M>, <10>)
++define(<H3L>, <11>)
++define(<H4M>, <12>)
++define(<H4L>, <13>)
++define(<R>, <14>)
++define(<F>, <15>)
++define(<R2>, <16>)
++define(<F2>, <17>)
++define(<T>, <18>)
++define(<R3>, <20>)
++define(<F3>, <21>)
++define(<R4>, <22>)
++define(<F4>, <23>)
++
++define(<LE_TEMP>, <18>)
++define(<LE_MASK>, <19>)
++
++    C void gcm_hash (const struct gcm_key *key, union gcm_block *x,
++    C                size_t length, const uint8_t *data)
++
++define(<FUNC_ALIGN>, <5>)
++PROLOGUE(_nettle_gcm_hash)
++    vxor           ZERO,ZERO,ZERO
++    DATA_LOAD_VEC(POLY,.polynomial,7)
++IF_LE(<
++    li             8,0
++    lvsl           LE_MASK,0,8
++    vspltisb       LE_TEMP,0x07
++    vxor           LE_MASK,LE_MASK,LE_TEMP
++>)
++    xxmrghd        VSR(POLY_L),VSR(ZERO),VSR(POLY)
++
++    lxvd2x         VSR(D),0,X                    C load 'X' pointer
++    C byte-reverse of each doubleword permuting on little-endian mode
++IF_LE(<
++    vperm          D,D,D,LE_MASK
++>)
++
++    C --- process 4 blocks '128-bit each' per one loop ---
++
++    srdi.          7,LENGTH,6                   C 4-blocks loop count 'LENGTH / (4 * 16)'
++    beq            L2x
++
++    mtctr          7                            C assign counter register to loop count
++
++    C store non-volatile vector registers
++    addi           8,SP,-64
++    stvx           20,0,8
++    addi           8,8,16
++    stvx           21,0,8
++    addi           8,8,16
++    stvx           22,0,8
++    addi           8,8,16
++    stvx           23,0,8
++
++    C load table elements
++    li             8,1*16
++    li             9,2*16
++    li             10,3*16
++    lxvd2x         VSR(H1M),0,TABLE
++    lxvd2x         VSR(H1L),8,TABLE
++    lxvd2x         VSR(H2M),9,TABLE
++    lxvd2x         VSR(H2L),10,TABLE
++    li             7,4*16
++    li             8,5*16
++    li             9,6*16
++    li             10,7*16
++    lxvd2x         VSR(H3M),7,TABLE
++    lxvd2x         VSR(H3L),8,TABLE
++    lxvd2x         VSR(H4M),9,TABLE
++    lxvd2x         VSR(H4L),10,TABLE
++
++    li             8,0x10
++    li             9,0x20
++    li             10,0x30
++.align 5
++L4x_loop:
++    C input loading
++    lxvd2x         VSR(C0),0,DATA                C load C0
++    lxvd2x         VSR(C1),8,DATA               C load C1
++    lxvd2x         VSR(C2),9,DATA               C load C2
++    lxvd2x         VSR(C3),10,DATA              C load C3
++
++IF_LE(<
++    vperm          C0,C0,C0,LE_MASK
++    vperm          C1,C1,C1,LE_MASK
++    vperm          C2,C2,C2,LE_MASK
++    vperm          C3,C3,C3,LE_MASK
++>)
++
++    C previous digest combining
++    vxor           C0,C0,D
++
++    C polynomial multiplication
++    vpmsumd        F2,H3L,C1
++    vpmsumd        R2,H3M,C1
++    vpmsumd        F3,H2L,C2
++    vpmsumd        R3,H2M,C2
++    vpmsumd        F4,H1L,C3
++    vpmsumd        R4,H1M,C3
++    vpmsumd        F,H4L,C0
++    vpmsumd        R,H4M,C0
++
++    C deferred recombination of partial products
++    vxor           F3,F3,F4
++    vxor           R3,R3,R4
++    vxor           F,F,F2
++    vxor           R,R,R2
++    vxor           F,F,F3
++    vxor           R,R,R3
++
++    C reduction
++    vpmsumd        T,F,POLY_L
++    xxswapd        VSR(D),VSR(F)
++    vxor           R,R,T
++    vxor           D,R,D
++
++    addi           DATA,DATA,0x40
++    bdnz           L4x_loop
++
++    C restore non-volatile vector registers
++    addi           8,SP,-64
++    lvx            20,0,8
++    addi           8,8,16
++    lvx            21,0,8
++    addi           8,8,16
++    lvx            22,0,8
++    addi           8,8,16
++    lvx            23,0,8
++
++    clrldi         LENGTH,LENGTH,58              C 'set the high-order 58 bits to zeros'
++L2x:
++    C --- process 2 blocks ---
++
++    srdi.          7,LENGTH,5                   C 'LENGTH / (2 * 16)'
++    beq            L1x
++
++    C load table elements
++    li             8,1*16
++    li             9,2*16
++    li             10,3*16
++    lxvd2x         VSR(H1M),0,TABLE
++    lxvd2x         VSR(H1L),8,TABLE
++    lxvd2x         VSR(H2M),9,TABLE
++    lxvd2x         VSR(H2L),10,TABLE
++
++    C input loading
++    li             10,0x10
++    lxvd2x         VSR(C0),0,DATA                C load C0
++    lxvd2x         VSR(C1),10,DATA              C load C1
++
++IF_LE(<
++    vperm          C0,C0,C0,LE_MASK
++    vperm          C1,C1,C1,LE_MASK
++>)
++
++    C previous digest combining
++    vxor           C0,C0,D
++
++    C polynomial multiplication
++    vpmsumd        F2,H1L,C1
++    vpmsumd        R2,H1M,C1
++    vpmsumd        F,H2L,C0
++    vpmsumd        R,H2M,C0
++
++    C deferred recombination of partial products
++    vxor           F,F,F2
++    vxor           R,R,R2
++
++    C reduction
++    vpmsumd        T,F,POLY_L
++    xxswapd        VSR(D),VSR(F)
++    vxor           R,R,T
++    vxor           D,R,D
++
++    addi           DATA,DATA,0x20
++    clrldi         LENGTH,LENGTH,59              C 'set the high-order 59 bits to zeros'
++L1x:
++    C --- process 1 block ---
++
++    srdi.          7,LENGTH,4                   C 'LENGTH / (1 * 16)'
++    beq            Lmod
++
++    C load table elements
++    li             8,1*16
++    lxvd2x         VSR(H1M),0,TABLE
++    lxvd2x         VSR(H1L),8,TABLE
++
++    C input loading
++    lxvd2x         VSR(C0),0,DATA                C load C0
++
++IF_LE(<
++    vperm          C0,C0,C0,LE_MASK
++>)
++
++    C previous digest combining
++    vxor           C0,C0,D
++
++    C polynomial multiplication
++    vpmsumd        F,H1L,C0
++    vpmsumd        R,H1M,C0
++
++    C reduction
++    vpmsumd        T,F,POLY_L
++    xxswapd        VSR(D),VSR(F)
++    vxor           R,R,T
++    vxor           D,R,D
++
++    addi           DATA,DATA,0x10
++    clrldi         LENGTH,LENGTH,60              C 'set the high-order 60 bits to zeros'
++Lmod:
++    C --- process the modulo bytes, padding the low-order bytes with zeros ---
++
++    cmpldi         LENGTH,0
++    beq            Ldone
++
++    C load table elements
++    li             8,1*16
++    lxvd2x         VSR(H1M),0,TABLE
++    lxvd2x         VSR(H1L),8,TABLE
++
++    C push every modulo byte to the stack and load them with padding into vector register
++    vxor           ZERO,ZERO,ZERO
++    addi           8,SP,-16
++    stvx           ZERO,0,8
++Lstb_loop:
++    subic.         LENGTH,LENGTH,1
++    lbzx           7,LENGTH,DATA
++    stbx           7,LENGTH,8
++    bne            Lstb_loop
++    lxvd2x         VSR(C0),0,8
++
++IF_LE(<
++    vperm          C0,C0,C0,LE_MASK
++>)
++
++    C previous digest combining
++    vxor           C0,C0,D
++
++    C polynomial multiplication
++    vpmsumd        F,H1L,C0
++    vpmsumd        R,H1M,C0
++
++    C reduction
++    vpmsumd        T,F,POLY_L
++    xxswapd        VSR(D),VSR(F)
++    vxor           R,R,T
++    vxor           D,R,D
++
++Ldone:
++    C byte-reverse of each doubleword permuting on little-endian mode
++IF_LE(<
++    vperm          D,D,D,LE_MASK
++>)
++    stxvd2x        VSR(D),0,X                    C store digest 'D'
++
++    blr
++EPILOGUE(_nettle_gcm_hash)
++
++.data
++    C 0xC2000000000000000000000000000001
++.polynomial:
++.align 4
++IF_BE(<
++.byte 0xC2
++.rept 14
++.byte 0x00
++.endr
++.byte 0x01
++>,<
++.byte 0x01
++.rept 14
++.byte 0x00
++.endr
++.byte 0xC2
++>)
diff --git a/SOURCES/nettle-3.4.1-rsa-decrypt.patch b/SOURCES/nettle-3.4.1-rsa-decrypt.patch
new file mode 100644
index 0000000..ecfba91
--- /dev/null
+++ b/SOURCES/nettle-3.4.1-rsa-decrypt.patch
@@ -0,0 +1,609 @@
+From 5646ca77ee92de0ae33e7d2e0a3383c61a4091ed Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
+Date: Thu, 6 May 2021 21:30:23 +0200
+Subject: [PATCH 1/4] Add check that message length to _pkcs1_sec_decrypt is
+ valid.
+
+* pkcs1-sec-decrypt.c (_pkcs1_sec_decrypt): Check that message
+length is valid, for given key size.
+* testsuite/rsa-sec-decrypt-test.c (test_main): Add test cases for
+calls to rsa_sec_decrypt specifying a too large message length.
+
+(cherry picked from commit 7616541e6eff73353bf682c62e3a68e4fe696707)
+---
+ ChangeLog                        |  8 ++++++++
+ pkcs1-sec-decrypt.c              |  4 +++-
+ testsuite/rsa-sec-decrypt-test.c | 17 ++++++++++++++++-
+ 3 files changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/ChangeLog b/ChangeLog
+index 4c7338a1..7cd0455e 100644
+--- a/ChangeLog
++++ b/ChangeLog
+@@ -1,3 +1,11 @@
++2021-05-06  Niels Möller  <nisse@lysator.liu.se>
++
++	Bug fixes merged from from 3.7.3 release (starting from 2021-05-06).
++	* pkcs1-sec-decrypt.c (_pkcs1_sec_decrypt): Check that message
++	length is valid, for given key size.
++	* testsuite/rsa-sec-decrypt-test.c (test_main): Add test cases for
++	calls to rsa_sec_decrypt specifying a too large message length.
++
+ 2018-12-04  Niels Möller  <nisse@lysator.liu.se>
+ 
+ 	* Released nettle-3.4.1.
+diff --git a/pkcs1-sec-decrypt.c b/pkcs1-sec-decrypt.c
+index 722044b0..02fd07e1 100644
+--- a/pkcs1-sec-decrypt.c
++++ b/pkcs1-sec-decrypt.c
+@@ -64,7 +64,9 @@ _pkcs1_sec_decrypt (size_t length, uint8_t *message,
+   volatile int ok;
+   size_t i, t;
+ 
+-  assert (padded_message_length >= length);
++  /* Message independent branch */
++  if (length + 11 > padded_message_length)
++    return 0;
+ 
+   t = padded_message_length - length - 1;
+ 
+diff --git a/testsuite/rsa-sec-decrypt-test.c b/testsuite/rsa-sec-decrypt-test.c
+index 64f0b13c..4a9f301b 100644
+--- a/testsuite/rsa-sec-decrypt-test.c
++++ b/testsuite/rsa-sec-decrypt-test.c
+@@ -55,6 +55,7 @@ rsa_decrypt_for_test(const struct rsa_public_key *pub,
+ #endif
+ 
+ #define PAYLOAD_SIZE 50
++#define DECRYPTED_SIZE 256
+ void
+ test_main(void)
+ {
+@@ -63,7 +64,7 @@ test_main(void)
+   struct knuth_lfib_ctx random_ctx;
+ 
+   uint8_t plaintext[PAYLOAD_SIZE];
+-  uint8_t decrypted[PAYLOAD_SIZE];
++  uint8_t decrypted[DECRYPTED_SIZE];
+   uint8_t verifybad[PAYLOAD_SIZE];
+   unsigned n_size = 1024;
+   mpz_t gibberish;
+@@ -98,6 +99,20 @@ test_main(void)
+                                     PAYLOAD_SIZE, decrypted, gibberish) == 1);
+       ASSERT (MEMEQ (PAYLOAD_SIZE, plaintext, decrypted));
+ 
++      ASSERT (pub.size > 10);
++      ASSERT (pub.size <= DECRYPTED_SIZE);
++
++      /* Check that too large message length is rejected, largest
++	 valid size is pub.size - 11. */
++      ASSERT (!rsa_decrypt_for_test (&pub, &key, &random_ctx,
++				     (nettle_random_func *) knuth_lfib_random,
++				     pub.size - 10, decrypted, gibberish));
++
++      /* This case used to result in arithmetic underflow and a crash. */
++      ASSERT (!rsa_decrypt_for_test (&pub, &key, &random_ctx,
++				     (nettle_random_func *) knuth_lfib_random,
++				     pub.size, decrypted, gibberish));
++
+       /* bad one */
+       memcpy(decrypted, verifybad, PAYLOAD_SIZE);
+       nettle_mpz_random_size(garbage, &random_ctx,
+-- 
+2.31.1
+
+
+From 743cdf38353f6dd5d3d91eadc769106cfc116301 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
+Date: Tue, 8 Jun 2021 21:30:48 +0200
+Subject: [PATCH 2/4] Fix comment typos.
+
+(cherry picked from commit 0a714543136de97c7fd34f1c6ac1592dc5036879)
+---
+ pkcs1-sec-decrypt.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/pkcs1-sec-decrypt.c b/pkcs1-sec-decrypt.c
+index 02fd07e1..a7f85c2e 100644
+--- a/pkcs1-sec-decrypt.c
++++ b/pkcs1-sec-decrypt.c
+@@ -102,8 +102,8 @@ _pkcs1_sec_decrypt_variable(size_t *length, uint8_t *message,
+ 
+   /* length is discovered in a side-channel silent way.
+    * not_found goes to 0 when the terminator is found.
+-   * offset strts at 3 as it includes the terminator and
+-   * the fomat bytes already */
++   * offset starts at 3 as it includes the terminator and
++   * the format bytes already */
+   offset = 3;
+   for (i = 2; i < padded_message_length; i++)
+     {
+-- 
+2.31.1
+
+
+From dfce46c4540d2abf040073070cff15f9d1708050 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
+Date: Tue, 8 Jun 2021 21:31:39 +0200
+Subject: [PATCH 3/4] Change _rsa_sec_compute_root_tr to take a fix input size.
+
+Improves consistency with _rsa_sec_compute_root, and fixes zero-input bug.
+
+(cherry picked from commit 485b5e2820a057e873b1ba812fdb39cae4adf98c)
+---
+ ChangeLog                    | 17 +++++++++-
+ rsa-decrypt-tr.c             |  7 ++---
+ rsa-internal.h               |  4 +--
+ rsa-sec-decrypt.c            |  9 ++++--
+ rsa-sign-tr.c                | 61 +++++++++++++++++-------------------
+ testsuite/rsa-encrypt-test.c | 14 ++++++++-
+ 6 files changed, 69 insertions(+), 43 deletions(-)
+
+diff --git a/ChangeLog b/ChangeLog
+index 7cd0455e..ae660fc0 100644
+--- a/ChangeLog
++++ b/ChangeLog
+@@ -1,6 +1,21 @@
+-2021-05-06  Niels Möller  <nisse@lysator.liu.se>
++2021-05-14  Niels Möller  <nisse@lysator.liu.se>
+ 
+ 	Bug fixes merged from from 3.7.3 release (starting from 2021-05-06).
++	* rsa-sign-tr.c (rsa_sec_blind): Delete mn argument.
++	(_rsa_sec_compute_root_tr): Delete mn argument, instead require
++	that input size matches key size. Rearrange use of temporary
++	storage, to support in-place operation, x == m. Update all
++	callers.
++
++	* rsa-decrypt-tr.c (rsa_decrypt_tr): Make zero-padded copy of
++	input, for calling _rsa_sec_compute_root_tr.
++	* rsa-sec-decrypt.c (rsa_sec_decrypt): Likewise.
++
++	* testsuite/rsa-encrypt-test.c (test_main): Test calling all of
++	rsa_decrypt, rsa_decrypt_tr, and rsa_sec_decrypt with zero input.
++
++2021-05-06  Niels Möller  <nisse@lysator.liu.se>
++
+ 	* pkcs1-sec-decrypt.c (_pkcs1_sec_decrypt): Check that message
+ 	length is valid, for given key size.
+ 	* testsuite/rsa-sec-decrypt-test.c (test_main): Add test cases for
+diff --git a/rsa-decrypt-tr.c b/rsa-decrypt-tr.c
+index 5dfb91b1..c118e852 100644
+--- a/rsa-decrypt-tr.c
++++ b/rsa-decrypt-tr.c
+@@ -52,14 +52,13 @@ rsa_decrypt_tr(const struct rsa_public_key *pub,
+   mp_size_t key_limb_size;
+   int res;
+ 
+-  key_limb_size = NETTLE_OCTET_SIZE_TO_LIMB_SIZE(key->size);
++  key_limb_size = mpz_size(pub->n);
+ 
+   TMP_GMP_ALLOC (m, key_limb_size);
+   TMP_GMP_ALLOC (em, key->size);
++  mpz_limbs_copy(m, gibberish, key_limb_size);
+ 
+-  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, m,
+-				  mpz_limbs_read(gibberish),
+-				  mpz_size(gibberish));
++  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, m, m);
+ 
+   mpn_get_base256 (em, key->size, m, key_limb_size);
+ 
+diff --git a/rsa-internal.h b/rsa-internal.h
+index bd667bc2..64a7edf6 100644
+--- a/rsa-internal.h
++++ b/rsa-internal.h
+@@ -53,12 +53,12 @@ _rsa_sec_compute_root(const struct rsa_private_key *key,
+                       mp_limb_t *scratch);
+ 
+ /* Safe side-channel silent variant, using RSA blinding, and checking the
+- * result after CRT. */
++ * result after CRT. In-place calls, with x == m, are allowed. */
+ int
+ _rsa_sec_compute_root_tr(const struct rsa_public_key *pub,
+ 			 const struct rsa_private_key *key,
+ 			 void *random_ctx, nettle_random_func *random,
+-			 mp_limb_t *x, const mp_limb_t *m, size_t mn);
++			 mp_limb_t *x, const mp_limb_t *m);
+ 
+ /* additional resistance to memory access side-channel attacks.
+  * Note: message buffer is returned unchanged on error */
+diff --git a/rsa-sec-decrypt.c b/rsa-sec-decrypt.c
+index e6a4b267..633a6852 100644
+--- a/rsa-sec-decrypt.c
++++ b/rsa-sec-decrypt.c
+@@ -57,9 +57,12 @@ rsa_sec_decrypt(const struct rsa_public_key *pub,
+   TMP_GMP_ALLOC (m, mpz_size(pub->n));
+   TMP_GMP_ALLOC (em, key->size);
+ 
+-  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, m,
+-				  mpz_limbs_read(gibberish),
+-				  mpz_size(gibberish));
++  /* We need a copy because m can be shorter than key_size,
++   * but _rsa_sec_compute_root_tr expects all inputs to be
++   * normalized to a key_size long buffer length */
++  mpz_limbs_copy(m, gibberish, mpz_size(pub->n));
++
++  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, m, m);
+ 
+   mpn_get_base256 (em, key->size, m, mpz_size(pub->n));
+ 
+diff --git a/rsa-sign-tr.c b/rsa-sign-tr.c
+index 59c9bd07..141a52c7 100644
+--- a/rsa-sign-tr.c
++++ b/rsa-sign-tr.c
+@@ -131,35 +131,34 @@ int
+ _rsa_sec_compute_root_tr(const struct rsa_public_key *pub,
+ 			 const struct rsa_private_key *key,
+ 			 void *random_ctx, nettle_random_func *random,
+-			 mp_limb_t *x, const mp_limb_t *m, size_t mn)
++			 mp_limb_t *x, const mp_limb_t *m)
+ {
++  mp_size_t nn;
+   mpz_t mz;
+   mpz_t xz;
+   int res;
+ 
+-  mpz_init(mz);
+   mpz_init(xz);
+ 
+-  mpn_copyi(mpz_limbs_write(mz, mn), m, mn);
+-  mpz_limbs_finish(mz, mn);
++  nn = mpz_size (pub->n);
+ 
+-  res = rsa_compute_root_tr(pub, key, random_ctx, random, xz, mz);
++  res = rsa_compute_root_tr(pub, key, random_ctx, random, xz,
++			    mpz_roinit_n(mz, m, nn));
+ 
+   if (res)
+-    mpz_limbs_copy(x, xz, mpz_size(pub->n));
++    mpz_limbs_copy(x, xz, nn);
+ 
+-  mpz_clear(mz);
+   mpz_clear(xz);
+   return res;
+ }
+ #else
+ /* Blinds m, by computing c = m r^e (mod n), for a random r. Also
+-   returns the inverse (ri), for use by rsa_unblind. */
++   returns the inverse (ri), for use by rsa_unblind. Must have c != m,
++   no in-place operation.*/
+ static void
+ rsa_sec_blind (const struct rsa_public_key *pub,
+                void *random_ctx, nettle_random_func *random,
+-               mp_limb_t *c, mp_limb_t *ri, const mp_limb_t *m,
+-               mp_size_t mn)
++               mp_limb_t *c, mp_limb_t *ri, const mp_limb_t *m)
+ {
+   const mp_limb_t *ep = mpz_limbs_read (pub->e);
+   const mp_limb_t *np = mpz_limbs_read (pub->n);
+@@ -177,15 +176,15 @@ rsa_sec_blind (const struct rsa_public_key *pub,
+ 
+   /* c = m*(r^e) mod n */
+   itch = mpn_sec_powm_itch(nn, ebn, nn);
+-  i2 = mpn_sec_mul_itch(nn, mn);
++  i2 = mpn_sec_mul_itch(nn, nn);
+   itch = MAX(itch, i2);
+-  i2 = mpn_sec_div_r_itch(nn + mn, nn);
++  i2 = mpn_sec_div_r_itch(2*nn, nn);
+   itch = MAX(itch, i2);
+   i2 = mpn_sec_invert_itch(nn);
+   itch = MAX(itch, i2);
+ 
+-  TMP_GMP_ALLOC (tp, nn + mn + itch);
+-  scratch = tp + nn + mn;
++  TMP_GMP_ALLOC (tp, 2*nn  + itch);
++  scratch = tp + 2*nn;
+ 
+   /* ri = r^(-1) */
+   do
+@@ -198,9 +197,8 @@ rsa_sec_blind (const struct rsa_public_key *pub,
+   while (!mpn_sec_invert (ri, tp, np, nn, 2 * nn * GMP_NUMB_BITS, scratch));
+ 
+   mpn_sec_powm (c, rp, nn, ep, ebn, np, nn, scratch);
+-  /* normally mn == nn, but m can be smaller in some cases */
+-  mpn_sec_mul (tp, c, nn, m, mn, scratch);
+-  mpn_sec_div_r (tp, nn + mn, np, nn, scratch);
++  mpn_sec_mul (tp, c, nn, m, nn, scratch);
++  mpn_sec_div_r (tp, 2*nn, np, nn, scratch);
+   mpn_copyi(c, tp, nn);
+ 
+   TMP_GMP_FREE (r);
+@@ -208,7 +206,7 @@ rsa_sec_blind (const struct rsa_public_key *pub,
+   TMP_GMP_FREE (tp);
+ }
+ 
+-/* m = c ri mod n */
++/* m = c ri mod n. Allows x == c. */
+ static void
+ rsa_sec_unblind (const struct rsa_public_key *pub,
+                  mp_limb_t *x, mp_limb_t *ri, const mp_limb_t *c)
+@@ -298,7 +296,7 @@ int
+ _rsa_sec_compute_root_tr(const struct rsa_public_key *pub,
+ 			 const struct rsa_private_key *key,
+ 			 void *random_ctx, nettle_random_func *random,
+-			 mp_limb_t *x, const mp_limb_t *m, size_t mn)
++			 mp_limb_t *x, const mp_limb_t *m)
+ {
+   TMP_GMP_DECL (c, mp_limb_t);
+   TMP_GMP_DECL (ri, mp_limb_t);
+@@ -306,7 +304,7 @@ _rsa_sec_compute_root_tr(const struct rsa_public_key *pub,
+   size_t key_limb_size;
+   int ret;
+ 
+-  key_limb_size = NETTLE_OCTET_SIZE_TO_LIMB_SIZE(key->size);
++  key_limb_size = mpz_size(pub->n);
+ 
+   /* mpz_powm_sec handles only odd moduli. If p, q or n is even, the
+      key is invalid and rejected by rsa_private_key_prepare. However,
+@@ -320,19 +318,18 @@ _rsa_sec_compute_root_tr(const struct rsa_public_key *pub,
+     }
+ 
+   assert(mpz_size(pub->n) == key_limb_size);
+-  assert(mn <= key_limb_size);
+ 
+   TMP_GMP_ALLOC (c, key_limb_size);
+   TMP_GMP_ALLOC (ri, key_limb_size);
+   TMP_GMP_ALLOC (scratch, _rsa_sec_compute_root_itch(key));
+ 
+-  rsa_sec_blind (pub, random_ctx, random, x, ri, m, mn);
++  rsa_sec_blind (pub, random_ctx, random, c, ri, m);
+ 
+-  _rsa_sec_compute_root(key, c, x, scratch);
++  _rsa_sec_compute_root(key, x, c, scratch);
+ 
+-  ret = rsa_sec_check_root(pub, c, x);
++  ret = rsa_sec_check_root(pub, x, c);
+ 
+-  rsa_sec_unblind(pub, x, ri, c);
++  rsa_sec_unblind(pub, x, ri, x);
+ 
+   cnd_mpn_zero(1 - ret, x, key_limb_size);
+ 
+@@ -356,17 +353,17 @@ rsa_compute_root_tr(const struct rsa_public_key *pub,
+ 		    mpz_t x, const mpz_t m)
+ {
+   TMP_GMP_DECL (l, mp_limb_t);
++  mp_size_t nn = mpz_size(pub->n);
+   int res;
+ 
+-  mp_size_t l_size = NETTLE_OCTET_SIZE_TO_LIMB_SIZE(key->size);
+-  TMP_GMP_ALLOC (l, l_size);
++  TMP_GMP_ALLOC (l, nn);
++  mpz_limbs_copy(l, m, nn);
+ 
+-  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, l,
+-				  mpz_limbs_read(m), mpz_size(m));
++  res = _rsa_sec_compute_root_tr (pub, key, random_ctx, random, l, l);
+   if (res) {
+-    mp_limb_t *xp = mpz_limbs_write (x, l_size);
+-    mpn_copyi (xp, l, l_size);
+-    mpz_limbs_finish (x, l_size);
++    mp_limb_t *xp = mpz_limbs_write (x, nn);
++    mpn_copyi (xp, l, nn);
++    mpz_limbs_finish (x, nn);
+   }
+ 
+   TMP_GMP_FREE (l);
+diff --git a/testsuite/rsa-encrypt-test.c b/testsuite/rsa-encrypt-test.c
+index 87525f78..d3bc374b 100644
+--- a/testsuite/rsa-encrypt-test.c
++++ b/testsuite/rsa-encrypt-test.c
+@@ -19,6 +19,7 @@ test_main(void)
+   uint8_t after;
+ 
+   mpz_t gibberish;
++  mpz_t zero;
+ 
+   rsa_private_key_init(&key);
+   rsa_public_key_init(&pub);
+@@ -101,6 +102,17 @@ test_main(void)
+   ASSERT(decrypted[decrypted_length] == after);
+   ASSERT(decrypted[0] == 'A');
+ 
++  /* Test zero input. */
++  mpz_init_set_ui (zero, 0);
++  decrypted_length = msg_length;
++  ASSERT(!rsa_decrypt(&key, &decrypted_length, decrypted, zero));
++  ASSERT(!rsa_decrypt_tr(&pub, &key,
++			 &lfib, (nettle_random_func *) knuth_lfib_random,
++			 &decrypted_length, decrypted, zero));
++  ASSERT(!rsa_sec_decrypt(&pub, &key,
++			  &lfib, (nettle_random_func *) knuth_lfib_random,
++			  decrypted_length, decrypted, zero));
++  ASSERT(decrypted_length == msg_length);
+ 
+   /* Test invalid key. */
+   mpz_add_ui (key.q, key.q, 2);
+@@ -112,6 +124,6 @@ test_main(void)
+   rsa_private_key_clear(&key);
+   rsa_public_key_clear(&pub);
+   mpz_clear(gibberish);
++  mpz_clear(zero);
+   free(decrypted);
+ }
+-  
+-- 
+2.31.1
+
+
+From f601611b3c315aba373c0ab2ddf24772e88c1b3e Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Niels=20M=C3=B6ller?= <nisse@lysator.liu.se>
+Date: Tue, 8 Jun 2021 21:32:38 +0200
+Subject: [PATCH 4/4] Add input check to rsa_decrypt family of functions.
+
+(cherry picked from commit 0ad0b5df315665250dfdaa4a1e087f4799edaefe)
+---
+ ChangeLog                    | 10 +++++++++-
+ rsa-decrypt-tr.c             |  4 ++++
+ rsa-decrypt.c                | 10 ++++++++++
+ rsa-sec-decrypt.c            |  4 ++++
+ rsa.h                        |  5 +++--
+ testsuite/rsa-encrypt-test.c | 38 ++++++++++++++++++++++++++++++------
+ 6 files changed, 62 insertions(+), 9 deletions(-)
+
+diff --git a/ChangeLog b/ChangeLog
+index ae660fc0..27f022db 100644
+--- a/ChangeLog
++++ b/ChangeLog
+@@ -1,6 +1,14 @@
+-2021-05-14  Niels Möller  <nisse@lysator.liu.se>
++2021-05-17  Niels Möller  <nisse@lysator.liu.se>
+ 
+ 	Bug fixes merged from from 3.7.3 release (starting from 2021-05-06).
++	* rsa-decrypt-tr.c (rsa_decrypt_tr): Check up-front that input is
++	in range.
++	* rsa-sec-decrypt.c (rsa_sec_decrypt): Likewise.
++	* rsa-decrypt.c (rsa_decrypt): Likewise.
++	* testsuite/rsa-encrypt-test.c (test_main): Add tests with input > n.
++
++2021-05-14  Niels Möller  <nisse@lysator.liu.se>
++
+ 	* rsa-sign-tr.c (rsa_sec_blind): Delete mn argument.
+ 	(_rsa_sec_compute_root_tr): Delete mn argument, instead require
+ 	that input size matches key size. Rearrange use of temporary
+diff --git a/rsa-decrypt-tr.c b/rsa-decrypt-tr.c
+index c118e852..1ba3d286 100644
+--- a/rsa-decrypt-tr.c
++++ b/rsa-decrypt-tr.c
+@@ -52,6 +52,10 @@ rsa_decrypt_tr(const struct rsa_public_key *pub,
+   mp_size_t key_limb_size;
+   int res;
+ 
++  /* First check that input is in range. */
++  if (mpz_sgn (gibberish) < 0 || mpz_cmp (gibberish, pub->n) >= 0)
++    return 0;
++
+   key_limb_size = mpz_size(pub->n);
+ 
+   TMP_GMP_ALLOC (m, key_limb_size);
+diff --git a/rsa-decrypt.c b/rsa-decrypt.c
+index 7681439d..540d8baa 100644
+--- a/rsa-decrypt.c
++++ b/rsa-decrypt.c
+@@ -48,6 +48,16 @@ rsa_decrypt(const struct rsa_private_key *key,
+   int res;
+ 
+   mpz_init(m);
++
++  /* First check that input is in range. Since we don't have the
++     public key available here, we need to reconstruct n. */
++  mpz_mul (m, key->p, key->q);
++  if (mpz_sgn (gibberish) < 0 || mpz_cmp (gibberish, m) >= 0)
++    {
++      mpz_clear (m);
++      return 0;
++    }
++
+   rsa_compute_root(key, m, gibberish);
+ 
+   res = pkcs1_decrypt (key->size, m, length, message);
+diff --git a/rsa-sec-decrypt.c b/rsa-sec-decrypt.c
+index 633a6852..53113c69 100644
+--- a/rsa-sec-decrypt.c
++++ b/rsa-sec-decrypt.c
+@@ -54,6 +54,10 @@ rsa_sec_decrypt(const struct rsa_public_key *pub,
+   TMP_GMP_DECL (em, uint8_t);
+   int res;
+ 
++  /* First check that input is in range. */
++  if (mpz_sgn (gibberish) < 0 || mpz_cmp (gibberish, pub->n) >= 0)
++    return 0;
++
+   TMP_GMP_ALLOC (m, mpz_size(pub->n));
+   TMP_GMP_ALLOC (em, key->size);
+ 
+diff --git a/rsa.h b/rsa.h
+index 0aac6a26..54c35688 100644
+--- a/rsa.h
++++ b/rsa.h
+@@ -433,13 +433,14 @@ rsa_sec_decrypt(const struct rsa_public_key *pub,
+ 	        size_t length, uint8_t *message,
+ 	        const mpz_t gibberish);
+ 
+-/* Compute x, the e:th root of m. Calling it with x == m is allowed. */
++/* Compute x, the e:th root of m. Calling it with x == m is allowed.
++   It is required that 0 <= m < n. */
+ void
+ rsa_compute_root(const struct rsa_private_key *key,
+ 		 mpz_t x, const mpz_t m);
+ 
+ /* Safer variant, using RSA blinding, and checking the result after
+-   CRT. */
++   CRT. It is required that 0 <= m < n. */
+ int
+ rsa_compute_root_tr(const struct rsa_public_key *pub,
+ 		    const struct rsa_private_key *key,
+diff --git a/testsuite/rsa-encrypt-test.c b/testsuite/rsa-encrypt-test.c
+index d3bc374b..d1a440f6 100644
+--- a/testsuite/rsa-encrypt-test.c
++++ b/testsuite/rsa-encrypt-test.c
+@@ -19,11 +19,12 @@ test_main(void)
+   uint8_t after;
+ 
+   mpz_t gibberish;
+-  mpz_t zero;
++  mpz_t bad_input;
+ 
+   rsa_private_key_init(&key);
+   rsa_public_key_init(&pub);
+   mpz_init(gibberish);
++  mpz_init(bad_input);
+ 
+   knuth_lfib_init(&lfib, 17);
+   
+@@ -103,15 +104,40 @@ test_main(void)
+   ASSERT(decrypted[0] == 'A');
+ 
+   /* Test zero input. */
+-  mpz_init_set_ui (zero, 0);
++  mpz_set_ui (bad_input, 0);
+   decrypted_length = msg_length;
+-  ASSERT(!rsa_decrypt(&key, &decrypted_length, decrypted, zero));
++  ASSERT(!rsa_decrypt(&key, &decrypted_length, decrypted, bad_input));
+   ASSERT(!rsa_decrypt_tr(&pub, &key,
+ 			 &lfib, (nettle_random_func *) knuth_lfib_random,
+-			 &decrypted_length, decrypted, zero));
++			 &decrypted_length, decrypted, bad_input));
+   ASSERT(!rsa_sec_decrypt(&pub, &key,
+ 			  &lfib, (nettle_random_func *) knuth_lfib_random,
+-			  decrypted_length, decrypted, zero));
++			  decrypted_length, decrypted, bad_input));
++  ASSERT(decrypted_length == msg_length);
++
++  /* Test input that is slightly larger than n */
++  mpz_add(bad_input, gibberish, pub.n);
++  decrypted_length = msg_length;
++  ASSERT(!rsa_decrypt(&key, &decrypted_length, decrypted, bad_input));
++  ASSERT(!rsa_decrypt_tr(&pub, &key,
++			 &lfib, (nettle_random_func *) knuth_lfib_random,
++			 &decrypted_length, decrypted, bad_input));
++  ASSERT(!rsa_sec_decrypt(&pub, &key,
++			  &lfib, (nettle_random_func *) knuth_lfib_random,
++			  decrypted_length, decrypted, bad_input));
++  ASSERT(decrypted_length == msg_length);
++
++  /* Test input that is considerably larger than n */
++  mpz_mul_2exp (bad_input, pub.n, 100);
++  mpz_add (bad_input, bad_input, gibberish);
++  decrypted_length = msg_length;
++  ASSERT(!rsa_decrypt(&key, &decrypted_length, decrypted, bad_input));
++  ASSERT(!rsa_decrypt_tr(&pub, &key,
++			 &lfib, (nettle_random_func *) knuth_lfib_random,
++			 &decrypted_length, decrypted, bad_input));
++  ASSERT(!rsa_sec_decrypt(&pub, &key,
++			  &lfib, (nettle_random_func *) knuth_lfib_random,
++			  decrypted_length, decrypted, bad_input));
+   ASSERT(decrypted_length == msg_length);
+ 
+   /* Test invalid key. */
+@@ -124,6 +150,6 @@ test_main(void)
+   rsa_private_key_clear(&key);
+   rsa_public_key_clear(&pub);
+   mpz_clear(gibberish);
+-  mpz_clear(zero);
++  mpz_clear(bad_input);
+   free(decrypted);
+ }
+-- 
+2.31.1
+
diff --git a/SPECS/nettle.spec b/SPECS/nettle.spec
index 814ab18..5bde11d 100644
--- a/SPECS/nettle.spec
+++ b/SPECS/nettle.spec
@@ -2,7 +2,7 @@
 
 Name:           nettle
 Version:        3.4.1
-Release:        2%{?dist}
+Release:        7%{?dist}
 Summary:        A low-level cryptographic library
 
 Group:          Development/Libraries
@@ -13,6 +13,11 @@ Source0:	%{name}-%{version}-hobbled.tar.xz
 Patch0:		nettle-3.3-remove-ecc-testsuite.patch
 Patch1:		nettle-3.4-annocheck.patch
 Patch2:		nettle-3.4.1-enable-intel-cet.patch
+# https://lists.lysator.liu.se/pipermail/nettle-bugs/2021/009458.html
+Patch3:		nettle-3.4.1-ecdsa-verify.patch
+Patch4:		nettle-3.4.1-powerpc64-aes-asm.patch
+Patch5:		nettle-3.4.1-powerpc64-ghash-asm.patch
+Patch6:		nettle-3.4.1-rsa-decrypt.patch
 
 BuildRequires:  gcc
 BuildRequires:  gmp-devel, m4
@@ -54,6 +59,10 @@ sed 's/ecc-224.c//g' -i Makefile.in
 %patch0 -p1
 %patch1 -p1
 %patch2 -p1
+%patch3 -p1
+%patch4 -p1
+%patch5 -p1
+%patch6 -p1
 
 %build
 autoreconf -ifv
@@ -126,6 +135,22 @@ fi
 
 
 %changelog
+* Wed Jul 14 2021 Daiki Ueno <dueno@redhat.com> - 3.4.1-7
+- Backport fix for CVE-2021-3580 from upstream 3.7.3 release (#1967990)
+
+* Wed Jul 14 2021 Daiki Ueno <dueno@redhat.com> - 3.4.1-6
+- Enable CTR mode optimization when the block size is 16
+
+* Wed Jun 30 2021 Daiki Ueno <dueno@redhat.com> - 3.4.1-5
+- Backport powerpc64 optimization patches from upstream (#1855228)
+  Patch from Christopher M. Riedl.
+
+* Wed Apr  7 2021 Daiki Ueno <dueno@redhat.com> - 3.4.1-4
+- Fix patch application
+
+* Tue Mar 30 2021 Daiki Ueno <dueno@redhat.com> - 3.4.1-3
+- Port fixes for potential miscalculation in ecdsa_verify (#1942925)
+
 * Fri May 15 2020 Anderson Sasaki <ansasaki@redhat.com> - 3.4.1-2
 - Enable Intel CET support (#1737542)