diff --git a/Makefile.in b/Makefile.in
index b43e494f..ec46a9df 100644
--- a/Makefile.in
+++ b/Makefile.in
@@ -189,7 +189,7 @@ hogweed_SOURCES = sexp.c sexp-format.c \
 	ed25519-sha512-pubkey.c \
 	ed25519-sha512-sign.c ed25519-sha512-verify.c
 
-OPT_SOURCES = fat-x86_64.c fat-arm.c mini-gmp.c
+OPT_SOURCES = fat-arm.c fat-ppc.c fat-x86_64.c mini-gmp.c
 
 HEADERS = aes.h arcfour.h arctwo.h asn1.h blowfish.h \
 	  base16.h base64.h bignum.h buffer.h camellia.h cast128.h \
@@ -573,7 +573,8 @@ distdir: $(DISTFILES)
 	done
 	set -e; for d in sparc32 sparc64 x86 \
 		x86_64 x86_64/aesni x86_64/fat \
-		arm arm/neon arm/v6 arm/fat ; do \
+		arm arm/neon arm/v6 arm/fat \
+		powerpc64 powerpc64/p8 powerpc64/fat ; do \
 	  mkdir "$(distdir)/$$d" ; \
 	  find "$(srcdir)/$$d" -maxdepth 1 '(' -name '*.asm' -o -name '*.m4' ')' \
 	    -exec cp '{}' "$(distdir)/$$d" ';' ; \
diff --git a/aes-decrypt-internal.c b/aes-decrypt-internal.c
index 709c52f9..9e8cf34a 100644
--- a/aes-decrypt-internal.c
+++ b/aes-decrypt-internal.c
@@ -40,6 +40,16 @@
 #include "aes-internal.h"
 #include "macros.h"
 
+/* For fat builds */
+#if HAVE_NATIVE_aes_decrypt
+void
+_nettle_aes_decrypt_c(unsigned rounds, const uint32_t *keys,
+		      const struct aes_table *T,
+		      size_t length, uint8_t *dst,
+		      const uint8_t *src);
+#define _nettle_aes_decrypt _nettle_aes_decrypt_c
+#endif
+
 void
 _nettle_aes_decrypt(unsigned rounds, const uint32_t *keys,
 		    const struct aes_table *T,
diff --git a/aes-encrypt-internal.c b/aes-encrypt-internal.c
index 9f61386d..ad17e6c1 100644
--- a/aes-encrypt-internal.c
+++ b/aes-encrypt-internal.c
@@ -40,6 +40,16 @@
 #include "aes-internal.h"
 #include "macros.h"
 
+/* For fat builds */
+#if HAVE_NATIVE_aes_encrypt
+void
+_nettle_aes_encrypt_c(unsigned rounds, const uint32_t *keys,
+		      const struct aes_table *T,
+		      size_t length, uint8_t *dst,
+		      const uint8_t *src);
+#define _nettle_aes_encrypt _nettle_aes_encrypt_c
+#endif
+
 void
 _nettle_aes_encrypt(unsigned rounds, const uint32_t *keys,
 		    const struct aes_table *T,
diff --git a/asm.m4 b/asm.m4
index ee377a78..59d64098 100644
--- a/asm.m4
+++ b/asm.m4
@@ -51,6 +51,14 @@
 define(<ALIGN>, <.align ifelse(ALIGN_LOG,yes,<m4_log2($1)>,$1)
 >)
 
+define(<IF_BE>, <ifelse(
+WORDS_BIGENDIAN,yes,<$1>,
+WORDS_BIGENDIAN,no,<$2>,
+<errprint(__file__:__line__:,<Unsupported endianness value >,WORDS_BIGENDIAN,<
+>)
+  m4exit(1)>)>)
+define(<IF_LE>, <IF_BE(<$2>, <$1>)>)
+
 dnl Struct defining macros
 
 dnl STRUCTURE(prefix)
diff --git a/config.m4.in b/config.m4.in
index 666e34b8..e480334d 100644
--- a/config.m4.in
+++ b/config.m4.in
@@ -9,6 +9,7 @@ define(<W64_ABI>, <@W64_ABI@>)dnl
 define(<RODATA>, <@ASM_RODATA@>)dnl
 define(<ASM_X86_ENDBR>,<@ASM_X86_ENDBR@>)dnl
 define(<ASM_X86_MARK_CET_ALIGN>,<@ASM_X86_MARK_CET_ALIGN@>)dnl
+define(<WORDS_BIGENDIAN>, <@ASM_WORDS_BIGENDIAN@>)dnl
 divert(1)
 @ASM_X86_MARK_CET@
 @ASM_MARK_NOEXEC_STACK@
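The two aes-*-internal.c hunks above implement the usual fat-build rename: when configure selects a native implementation (HAVE_NATIVE_aes_*), the portable C code is compiled under a "_c"-suffixed name, and the real symbol becomes a wrapper that dispatches through a function pointer chosen at startup. A minimal sketch of that pattern, with hypothetical names (crypt_c, crypt_asm, crypt_vec) rather than Nettle's actual fat-setup.h machinery:

  #include <stddef.h>
  #include <stdint.h>

  typedef void crypt_func (size_t length, uint8_t *dst, const uint8_t *src);

  /* Portable C code; in a fat build it is emitted under a "_c" name. */
  static void
  crypt_c (size_t length, uint8_t *dst, const uint8_t *src)
  {
    while (length--)
      *dst++ = *src++;	/* stands in for the generic cipher code */
  }

  /* Optimized variant; only safe to call on CPUs with the extension. */
  static void
  crypt_asm (size_t length, uint8_t *dst, const uint8_t *src)
  {
    crypt_c (length, dst, src);	/* placeholder body */
  }

  /* Selected once at load time, then called through on every use. */
  static crypt_func *crypt_vec = crypt_c;

  void
  crypt (size_t length, uint8_t *dst, const uint8_t *src)
  {
    crypt_vec (length, dst, src);
  }

The indirection costs one function-pointer call per invocation; the selection logic itself lives in fat-ppc.c further down.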
diff --git a/configure.ac b/configure.ac
index 090e43a4..788e6842 100644
--- a/configure.ac
+++ b/configure.ac
@@ -85,6 +85,10 @@ AC_ARG_ENABLE(x86-aesni,
   AC_HELP_STRING([--enable-x86-aesni], [Enable x86_64 aes instructions. (default=no)]),,
   [enable_x86_aesni=no])
 
+AC_ARG_ENABLE(power-crypto-ext,
+  AC_HELP_STRING([--enable-power-crypto-ext], [Enable POWER crypto extensions. (default=no)]),,
+  [enable_power_crypto_ext=no])
+
 AC_ARG_ENABLE(mini-gmp,
   AC_HELP_STRING([--enable-mini-gmp], [Enable mini-gmp, used instead of libgmp.]),,
   [enable_mini_gmp=no])
@@ -201,7 +205,11 @@ LSH_FUNC_STRERROR
 # getenv_secure is used for fat overrides,
 # getline is used in the testsuite
 AC_CHECK_FUNCS(secure_getenv getline)
-AC_C_BIGENDIAN
+
+ASM_WORDS_BIGENDIAN=unknown
+AC_C_BIGENDIAN([AC_DEFINE([WORDS_BIGENDIAN], 1)
+		ASM_WORDS_BIGENDIAN=yes],
+	       [ASM_WORDS_BIGENDIAN=no])
 
 LSH_GCC_ATTRIBUTES
 
@@ -310,6 +318,17 @@ case "$host_cpu" in
       AC_TRY_COMPILE([
 #if defined(__sgi) && defined(__LP64__)
 #error 64-bit mips
+#endif
+      ], [], [
+	ABI=32
+      ], [
+	ABI=64
+      ])
+      ;;
+    *powerpc64*)
+      AC_TRY_COMPILE([
+#if defined(__PPC64__)
+#error 64-bit powerpc
 #endif
       ], [], [
 	ABI=32
@@ -422,6 +441,18 @@ if test "x$enable_assembler" = xyes ; then
 	  esac
 	fi
 	;;
+      *powerpc64*)
+	if test "$ABI" = 64 ; then
+	  asm_path="powerpc64"
+	  if test "x$enable_fat" = xyes ; then
+	    asm_path="powerpc64/fat $asm_path"
+	    OPT_NETTLE_SOURCES="fat-ppc.c $OPT_NETTLE_SOURCES"
+	  elif test "x$enable_power_crypto_ext" = xyes ; then
+	    asm_path="powerpc64/p8 $asm_path"
+	  fi
+	fi
+	;;
       *)
 	enable_assembler=no
 	;;
@@ -544,6 +575,8 @@ AC_SUBST([IF_ASM])
 AH_VERBATIM([HAVE_NATIVE],
 [/* Define to 1 each of the following for which a native (ie. CPU specific)
     implementation of the corresponding routine exists. */
+#undef HAVE_NATIVE_aes_decrypt
+#undef HAVE_NATIVE_aes_encrypt
 #undef HAVE_NATIVE_ecc_192_modp
 #undef HAVE_NATIVE_ecc_192_redc
 #undef HAVE_NATIVE_ecc_224_modp
@@ -857,6 +890,7 @@ AC_SUBST(ASM_TYPE_PROGBITS)
 AC_SUBST(ASM_MARK_NOEXEC_STACK)
 AC_SUBST(ASM_ALIGN_LOG)
 AC_SUBST(W64_ABI)
+AC_SUBST(ASM_WORDS_BIGENDIAN)
 AC_SUBST(EMULATOR)
 AC_SUBST(ASM_X86_ENDBR)
 AC_SUBST(ASM_X86_MARK_CET)
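The AC_C_BIGENDIAN hunk records the target byte order twice: as WORDS_BIGENDIAN in config.h for C code, and as ASM_WORDS_BIGENDIAN, which config.m4.in (above) forwards to the assembler sources so the new IF_BE/IF_LE macros can select code at assembly time. For illustration only, a runtime check equivalent to what the configure test decides at build time (a hypothetical standalone program, not part of the patch):

  #include <stdio.h>
  #include <stdint.h>

  /* Runtime equivalent of AC_C_BIGENDIAN's build-time answer. */
  static int
  is_big_endian (void)
  {
    const uint32_t probe = 0x01020304;
    /* Big-endian targets store the most significant byte first. */
    return *(const unsigned char *) &probe == 0x01;
  }

  int
  main (void)
  {
    printf ("ASM_WORDS_BIGENDIAN=%s\n", is_big_endian () ? "yes" : "no");
    return 0;
  }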
diff --git a/fat-ppc.c b/fat-ppc.c
new file mode 100644
index 00000000..7198e2dd
--- /dev/null
+++ b/fat-ppc.c
@@ -0,0 +1,129 @@
+/* fat-ppc.c
+
+   Copyright (C) 2020 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+*/
+
+#define _GNU_SOURCE
+
+#if HAVE_CONFIG_H
+# include "config.h"
+#endif
+
+#include <assert.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#if defined(__FreeBSD__) && __FreeBSD__ < 12
+#include <sys/sysctl.h>
+#else
+#include <sys/auxv.h>
+#endif
+
+#include "nettle-types.h"
+
+#include "aes-internal.h"
+#include "gcm.h"
+#include "fat-setup.h"
+
+/* Defined in arch/powerpc/include/uapi/asm/cputable.h in the Linux kernel. */
+#ifndef PPC_FEATURE2_VEC_CRYPTO
+#define PPC_FEATURE2_VEC_CRYPTO 0x02000000
+#endif
+
+struct ppc_features
+{
+  int have_crypto_ext;
+};
+
+static void
+get_ppc_features (struct ppc_features *features)
+{
+  unsigned long hwcap2 = 0;
+#if defined(__FreeBSD__)
+#if __FreeBSD__ < 12
+  size_t len = sizeof(hwcap2);
+  sysctlbyname("hw.cpu_features2", &hwcap2, &len, NULL, 0);
+#else
+  elf_aux_info(AT_HWCAP2, &hwcap2, sizeof(hwcap2));
+#endif
+#else
+  hwcap2 = getauxval(AT_HWCAP2);
+#endif
+  features->have_crypto_ext =
+    (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) == PPC_FEATURE2_VEC_CRYPTO ? 1 : 0;
+}
+
+DECLARE_FAT_FUNC(_nettle_aes_encrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, c)
+DECLARE_FAT_FUNC_VAR(aes_encrypt, aes_crypt_internal_func, ppc64)
+
+DECLARE_FAT_FUNC(_nettle_aes_decrypt, aes_crypt_internal_func)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, c)
+DECLARE_FAT_FUNC_VAR(aes_decrypt, aes_crypt_internal_func, ppc64)
+
+static void CONSTRUCTOR
+fat_init (void)
+{
+  struct ppc_features features;
+  int verbose;
+
+  get_ppc_features (&features);
+
+  verbose = getenv (ENV_VERBOSE) != NULL;
+  if (verbose)
+    fprintf (stderr, "libnettle: cpu features: %s\n",
+	     features.have_crypto_ext ? "crypto extensions" : "");
+
+  if (features.have_crypto_ext)
+    {
+      if (verbose)
+	fprintf (stderr, "libnettle: enabling arch 2.07 code.\n");
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_ppc64;
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_ppc64;
+    }
+  else
+    {
+      _nettle_aes_encrypt_vec = _nettle_aes_encrypt_c;
+      _nettle_aes_decrypt_vec = _nettle_aes_decrypt_c;
+    }
+}
+
+DEFINE_FAT_FUNC(_nettle_aes_encrypt, void,
+		(unsigned rounds, const uint32_t *keys,
+		 const struct aes_table *T,
+		 size_t length, uint8_t *dst,
+		 const uint8_t *src),
+		(rounds, keys, T, length, dst, src))
+
+DEFINE_FAT_FUNC(_nettle_aes_decrypt, void,
+		(unsigned rounds, const uint32_t *keys,
+		 const struct aes_table *T,
+		 size_t length, uint8_t *dst,
+		 const uint8_t *src),
+		(rounds, keys, T, length, dst, src))
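get_ppc_features() keys the choice on the PPC_FEATURE2_VEC_CRYPTO bit of AT_HWCAP2, with a sysctl fallback for FreeBSD before 12. The same probe can be run standalone on ppc64 GNU/Linux; a minimal sketch assuming glibc's getauxval (not part of the patch):

  #include <stdio.h>
  #include <sys/auxv.h>

  /* From arch/powerpc/include/uapi/asm/cputable.h in the Linux kernel. */
  #ifndef PPC_FEATURE2_VEC_CRYPTO
  #define PPC_FEATURE2_VEC_CRYPTO 0x02000000
  #endif

  int
  main (void)
  {
    unsigned long hwcap2 = getauxval (AT_HWCAP2);
    printf ("POWER crypto extension (vcipher/vncipher): %s\n",
            (hwcap2 & PPC_FEATURE2_VEC_CRYPTO) ? "available" : "not available");
    return 0;
  }

At runtime, setting the environment variable named by ENV_VERBOSE (NETTLE_FAT_VERBOSE in fat-setup.h) makes fat_init() report which code path it picked, as the fprintf calls above show.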
diff --git a/powerpc64/fat/aes-decrypt-internal-2.asm b/powerpc64/fat/aes-decrypt-internal-2.asm
new file mode 100644
index 00000000..3a4e08c2
--- /dev/null
+++ b/powerpc64/fat/aes-decrypt-internal-2.asm
@@ -0,0 +1,37 @@
+C powerpc64/fat/aes-decrypt-internal-2.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_aes_decrypt) picked up by configure
+
+define(<fat_transform>, <$1_ppc64>)
+include_src(<powerpc64/p8/aes-decrypt-internal.asm>)
diff --git a/powerpc64/fat/aes-encrypt-internal-2.asm b/powerpc64/fat/aes-encrypt-internal-2.asm
new file mode 100644
index 00000000..42126e4f
--- /dev/null
+++ b/powerpc64/fat/aes-encrypt-internal-2.asm
@@ -0,0 +1,37 @@
+C powerpc64/fat/aes-encrypt-internal-2.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+dnl PROLOGUE(_nettle_aes_encrypt) picked up by configure
+
+define(<fat_transform>, <$1_ppc64>)
+include_src(<powerpc64/p8/aes-encrypt-internal.asm>)
diff --git a/powerpc64/machine.m4 b/powerpc64/machine.m4
new file mode 100644
index 00000000..b76bb8b1
--- /dev/null
+++ b/powerpc64/machine.m4
@@ -0,0 +1,36 @@
+define(<PROLOGUE>,
+<.globl C_NAME($1)
+DECLARE_FUNC(C_NAME($1))
+ifelse(WORDS_BIGENDIAN,no,
+<ifdef(<FUNC_ALIGN>,<.align FUNC_ALIGN>)
+C_NAME($1):
+addis 2,12,(.TOC.-C_NAME($1))@ha
+addi 2,2,(.TOC.-C_NAME($1))@l
+.localentry C_NAME($1), .-C_NAME($1)>,
+<.section ".opd","aw"
+.align 3
+C_NAME($1):
+.quad .C_NAME($1),.TOC.@tocbase,0
+.previous
+ifdef(<FUNC_ALIGN>,<.align FUNC_ALIGN>)
+.C_NAME($1):>)
+undefine(<FUNC_ALIGN>)>)
+
+define(<EPILOGUE>,
+<ifelse(WORDS_BIGENDIAN,no,
+<.size C_NAME($1), . - C_NAME($1)>,
+<.size .C_NAME($1), . - .C_NAME($1)
+.size C_NAME($1), . - C_NAME($1)>)>)
+
+C Get the vector-scalar register number corresponding to a
+C vector register.
+C VSR(VR)
+define(<VSR>,<32+$1>)
+
+C Load the quadword at DATA_SRC into VEC_DST.  GPR is a
+C general-purpose register used to obtain the effective
+C address of DATA_SRC.
+C DATA_LOAD_VEC(VEC_DST, DATA_SRC, GPR)
+define(<DATA_LOAD_VEC>,
+<ld $3,$2@got(TOCP)
+lvx $1,0,$3>)
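machine.m4 centralizes two PowerPC conventions: PROLOGUE/EPILOGUE emit either the ELFv2 dual entry point with .localentry (the little-endian ABI) or the ELFv1 .opd function descriptor (big-endian), and VSR() converts a vector register number into a VSX register number. The latter works because of the POWER register file layout; an equivalent C macro, for illustration only:

  /* POWER ISA: the 64 VSX registers alias the older register files;
     VSR 0-31 overlap the floating-point registers and VSR 32-63
     overlap the VMX/AltiVec registers.  VSX loads such as lxvd2x
     name a VSR, while AES instructions such as vcipher name a VR,
     hence the fixed offset of 32. */
  #define VSR(vr) (32 + (vr))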
diff --git a/powerpc64/p8/aes-decrypt-internal.asm b/powerpc64/p8/aes-decrypt-internal.asm
new file mode 100644
index 00000000..bfedb32b
--- /dev/null
+++ b/powerpc64/p8/aes-decrypt-internal.asm
@@ -0,0 +1,356 @@
+C powerpc64/p8/aes-decrypt-internal.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+
+C The ZERO vector register is used in place of RoundKey for vncipher,
+C because that instruction flips the order of the InvMixColumns and
+C AddRoundKey (xor) steps.  The xor with RoundKey is therefore done
+C separately, afterwards.
+define(<ZERO>, <10>)
+
+.file "aes-decrypt-internal.asm"
+
+.text
+
+ C _aes_decrypt(unsigned rounds, const uint32_t *keys,
+ C              const struct aes_table *T,
+ C              size_t length, uint8_t *dst,
+ C              uint8_t *src)
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_aes_decrypt)
+ vxor ZERO,ZERO,ZERO
+
+ DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+
+ subi ROUNDS,ROUNDS,1
+ srdi LENGTH,LENGTH,4
+
+ srdi 5,LENGTH,3 #8x loop count
+ cmpldi 5,0
+ beq L4x
+
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 25,0x10
+ li 26,0x20
+ li 27,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+.align 5
+Lx8_loop:
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L8x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vncipher S4,S4,ZERO
+ vncipher S5,S5,ZERO
+ vncipher S6,S6,ZERO
+ vncipher S7,S7,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+ addi 10,10,0x10
+ bdnz L8x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+ vncipherlast S2,S2,K
+ vncipherlast S3,S3,K
+ vncipherlast S4,S4,K
+ vncipherlast S5,S5,K
+ vncipherlast S6,S6,K
+ vncipherlast S7,S7,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
+
+ addi SRC,SRC,0x80
+ addi DST,DST,0x80
+ subic. 5,5,1
+ bne Lx8_loop
+
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi LENGTH,LENGTH,61
+
+L4x:
+ srdi 5,LENGTH,2
+ cmpldi 5,0
+ beq L2x
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ li 9,0x10
+ lxvd2x VSR(S1),9,SRC
+ addi 9,9,0x10
+ lxvd2x VSR(S2),9,SRC
+ addi 9,9,0x10
+ lxvd2x VSR(S3),9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L4x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vncipher S2,S2,ZERO
+ vncipher S3,S3,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ addi 10,10,0x10
+ bdnz L4x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+ vncipherlast S2,S2,K
+ vncipherlast S3,S3,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ li 9,0x10
+ stxvd2x VSR(S1),9,DST
+ addi 9,9,0x10
+ stxvd2x VSR(S2),9,DST
+ addi 9,9,0x10
+ stxvd2x VSR(S3),9,DST
+
+ addi SRC,SRC,0x40
+ addi DST,DST,0x40
+
+ clrldi LENGTH,LENGTH,62
+
+L2x:
+ srdi 5,LENGTH,1
+ cmpldi 5,0
+ beq L1x
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ li 9,0x10
+ lxvd2x VSR(S1),9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L2x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vncipher S1,S1,ZERO
+ vxor S0,S0,K
+ vxor S1,S1,K
+ addi 10,10,0x10
+ bdnz L2x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+ vncipherlast S1,S1,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ li 9,0x10
+ stxvd2x VSR(S1),9,DST
+
+ addi SRC,SRC,0x20
+ addi DST,DST,0x20
+
+ clrldi LENGTH,LENGTH,63
+
+L1x:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+ vxor S0,S0,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L1x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipher S0,S0,ZERO
+ vxor S0,S0,K
+ addi 10,10,0x10
+ bdnz L1x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vncipherlast S0,S0,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+
+Ldone:
+ blr
+EPILOGUE(_nettle_aes_decrypt)
+
+ .data
+ .align 4
+.swap_mask:
+IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
+IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
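The encrypt counterpart below shares the same skeleton: the byte length is converted to a 16-byte block count, then blocks are peeled off in groups of eight, four, two and one, exactly as the label/mask pairs (Lx8_loop with clrldi ...,61, L4x with ...,62, L2x with ...,63, L1x) suggest. A rough, compilable C outline with hypothetical process_N() helpers standing in for the unrolled assembly sections:

  #include <stddef.h>
  #include <stdint.h>

  static void process_8 (const uint8_t **src, uint8_t **dst) { *src += 128; *dst += 128; }
  static void process_4 (const uint8_t **src, uint8_t **dst) { *src += 64;  *dst += 64; }
  static void process_2 (const uint8_t **src, uint8_t **dst) { *src += 32;  *dst += 32; }
  static void process_1 (const uint8_t **src, uint8_t **dst) { *src += 16;  *dst += 16; }

  void
  crypt_blocks (size_t length, uint8_t *dst, const uint8_t *src)
  {
    size_t blocks = length >> 4;              /* srdi LENGTH,LENGTH,4 */

    for (size_t i = blocks >> 3; i > 0; i--)  /* Lx8_loop */
      process_8 (&src, &dst);
    blocks &= 7;                              /* clrldi LENGTH,LENGTH,61 */

    if (blocks >> 2)                          /* L4x */
      process_4 (&src, &dst);
    blocks &= 3;                              /* clrldi LENGTH,LENGTH,62 */

    if (blocks >> 1)                          /* L2x */
      process_2 (&src, &dst);
    blocks &= 1;                              /* clrldi LENGTH,LENGTH,63 */

    if (blocks)                               /* L1x */
      process_1 (&src, &dst);
  }

Unrolling eight blocks per iteration keeps eight independent vcipher/vncipher dependency chains in flight, which is what makes the wide path worth the extra register saves.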
diff --git a/powerpc64/p8/aes-encrypt-internal.asm b/powerpc64/p8/aes-encrypt-internal.asm
new file mode 100644
index 00000000..67c7e597
--- /dev/null
+++ b/powerpc64/p8/aes-encrypt-internal.asm
@@ -0,0 +1,333 @@
+C powerpc64/p8/aes-encrypt-internal.asm
+
+ifelse(<
+   Copyright (C) 2020 Mamone Tarsha
+   This file is part of GNU Nettle.
+
+   GNU Nettle is free software: you can redistribute it and/or
+   modify it under the terms of either:
+
+     * the GNU Lesser General Public License as published by the Free
+       Software Foundation; either version 3 of the License, or (at your
+       option) any later version.
+
+   or
+
+     * the GNU General Public License as published by the Free
+       Software Foundation; either version 2 of the License, or (at your
+       option) any later version.
+
+   or both in parallel, as here.
+
+   GNU Nettle is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   General Public License for more details.
+
+   You should have received copies of the GNU General Public License and
+   the GNU Lesser General Public License along with this program.  If
+   not, see http://www.gnu.org/licenses/.
+>)
+
+C Register usage:
+
+define(<SP>, <1>)
+define(<TOCP>, <2>)
+
+define(<ROUNDS>, <3>)
+define(<KEYS>, <4>)
+define(<LENGTH>, <6>)
+define(<DST>, <7>)
+define(<SRC>, <8>)
+
+define(<swap_mask>, <0>)
+
+define(<K>, <1>)
+define(<S0>, <2>)
+define(<S1>, <3>)
+define(<S2>, <4>)
+define(<S3>, <5>)
+define(<S4>, <6>)
+define(<S5>, <7>)
+define(<S6>, <8>)
+define(<S7>, <9>)
+
+.file "aes-encrypt-internal.asm"
+
+.text
+
+ C _aes_encrypt(unsigned rounds, const uint32_t *keys,
+ C              const struct aes_table *T,
+ C              size_t length, uint8_t *dst,
+ C              uint8_t *src)
+
+define(<FUNC_ALIGN>, <5>)
+PROLOGUE(_nettle_aes_encrypt)
+ DATA_LOAD_VEC(swap_mask,.swap_mask,5)
+
+ subi ROUNDS,ROUNDS,1
+ srdi LENGTH,LENGTH,4
+
+ srdi 5,LENGTH,3 #8x loop count
+ cmpldi 5,0
+ beq L4x
+
+ std 25,-56(SP);
+ std 26,-48(SP);
+ std 27,-40(SP);
+ std 28,-32(SP);
+ std 29,-24(SP);
+ std 30,-16(SP);
+ std 31,-8(SP);
+
+ li 25,0x10
+ li 26,0x20
+ li 27,0x30
+ li 28,0x40
+ li 29,0x50
+ li 30,0x60
+ li 31,0x70
+
+.align 5
+Lx8_loop:
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ lxvd2x VSR(S1),25,SRC
+ lxvd2x VSR(S2),26,SRC
+ lxvd2x VSR(S3),27,SRC
+ lxvd2x VSR(S4),28,SRC
+ lxvd2x VSR(S5),29,SRC
+ lxvd2x VSR(S6),30,SRC
+ lxvd2x VSR(S7),31,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+ vxor S4,S4,K
+ vxor S5,S5,K
+ vxor S6,S6,K
+ vxor S7,S7,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L8x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ vcipher S4,S4,K
+ vcipher S5,S5,K
+ vcipher S6,S6,K
+ vcipher S7,S7,K
+ addi 10,10,0x10
+ bdnz L8x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+ vcipherlast S4,S4,K
+ vcipherlast S5,S5,K
+ vcipherlast S6,S6,K
+ vcipherlast S7,S7,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask
+ vperm S4,S4,S4,swap_mask
+ vperm S5,S5,S5,swap_mask
+ vperm S6,S6,S6,swap_mask
+ vperm S7,S7,S7,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ stxvd2x VSR(S1),25,DST
+ stxvd2x VSR(S2),26,DST
+ stxvd2x VSR(S3),27,DST
+ stxvd2x VSR(S4),28,DST
+ stxvd2x VSR(S5),29,DST
+ stxvd2x VSR(S6),30,DST
+ stxvd2x VSR(S7),31,DST
+
+ addi SRC,SRC,0x80
+ addi DST,DST,0x80
+ subic. 5,5,1
+ bne Lx8_loop
+
+ ld 25,-56(SP);
+ ld 26,-48(SP);
+ ld 27,-40(SP);
+ ld 28,-32(SP);
+ ld 29,-24(SP);
+ ld 30,-16(SP);
+ ld 31,-8(SP);
+
+ clrldi LENGTH,LENGTH,61
+
+L4x:
+ srdi 5,LENGTH,2
+ cmpldi 5,0
+ beq L2x
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ li 9,0x10
+ lxvd2x VSR(S1),9,SRC
+ addi 9,9,0x10
+ lxvd2x VSR(S2),9,SRC
+ addi 9,9,0x10
+ lxvd2x VSR(S3),9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+ vxor S2,S2,K
+ vxor S3,S3,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L4x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ vcipher S2,S2,K
+ vcipher S3,S3,K
+ addi 10,10,0x10
+ bdnz L4x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+ vcipherlast S2,S2,K
+ vcipherlast S3,S3,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask
+ vperm S2,S2,S2,swap_mask
+ vperm S3,S3,S3,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ li 9,0x10
+ stxvd2x VSR(S1),9,DST
+ addi 9,9,0x10
+ stxvd2x VSR(S2),9,DST
+ addi 9,9,0x10
+ stxvd2x VSR(S3),9,DST
+
+ addi SRC,SRC,0x40
+ addi DST,DST,0x40
+
+ clrldi LENGTH,LENGTH,62
+
+L2x:
+ srdi 5,LENGTH,1
+ cmpldi 5,0
+ beq L1x
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+ li 9,0x10
+ lxvd2x VSR(S1),9,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask>)
+
+ vxor S0,S0,K
+ vxor S1,S1,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L2x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ vcipher S1,S1,K
+ addi 10,10,0x10
+ bdnz L2x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+ vcipherlast S1,S1,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask
+ vperm S1,S1,S1,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+ li 9,0x10
+ stxvd2x VSR(S1),9,DST
+
+ addi SRC,SRC,0x20
+ addi DST,DST,0x20
+
+ clrldi LENGTH,LENGTH,63
+
+L1x:
+ cmpldi LENGTH,0
+ beq Ldone
+
+ lxvd2x VSR(K),0,KEYS
+ vperm K,K,K,swap_mask
+
+ lxvd2x VSR(S0),0,SRC
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+ vxor S0,S0,K
+
+ mtctr ROUNDS
+ li 10,0x10
+.align 5
+L1x_round_loop:
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipher S0,S0,K
+ addi 10,10,0x10
+ bdnz L1x_round_loop
+
+ lxvd2x VSR(K),10,KEYS
+ vperm K,K,K,swap_mask
+ vcipherlast S0,S0,K
+
+IF_LE(<vperm S0,S0,S0,swap_mask>)
+
+ stxvd2x VSR(S0),0,DST
+
+Ldone:
+ blr
+EPILOGUE(_nettle_aes_encrypt)
+
+ .data
+ .align 4
+.swap_mask:
+IF_LE(<.byte 8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7>)
+IF_BE(<.byte 3,2,1,0,7,6,5,4,11,10,9,8,15,14,13,12>)
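A note on the .swap_mask constant that closes both files: on little-endian targets, lxvd2x/stxvd2x transfer the two 8-byte doublewords of a vector in swapped order (the familiar lxvd2x + xxswapd idiom), so the IF_LE vperm with indices 8..15,0..7 swaps them back before the AES instructions see the state; on big-endian, only the round keys need the within-word byte reversal of the IF_BE mask. A small C demonstration of applying the little-endian table (simplified single-vector permute, hypothetical helper, not part of the patch):

  #include <stdio.h>
  #include <stdint.h>

  /* Apply a vperm-style byte selection table to one 16-byte vector. */
  static void
  permute16 (const uint8_t mask[16], const uint8_t in[16], uint8_t out[16])
  {
    for (int i = 0; i < 16; i++)
      out[i] = in[mask[i]];
  }

  int
  main (void)
  {
    /* The IF_LE swap_mask: exchange the two 8-byte doublewords. */
    static const uint8_t swap_mask[16] =
      { 8, 9, 10, 11, 12, 13, 14, 15, 0, 1, 2, 3, 4, 5, 6, 7 };
    uint8_t block[16], fixed[16];

    for (int i = 0; i < 16; i++)
      block[i] = i;
    permute16 (swap_mask, block, fixed);

    for (int i = 0; i < 16; i++)
      printf ("%d%c", fixed[i], i == 15 ? '\n' : ' ');
    /* Prints: 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 */
    return 0;
  }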