1334 lines
41 KiB
Diff
1334 lines
41 KiB
Diff
|
diff --git a/AUTHORS b/AUTHORS
|
||
|
index ee336b2e..77055c25 100644
|
||
|
--- a/AUTHORS
|
||
|
+++ b/AUTHORS
|
||
|
@@ -29,6 +29,7 @@ List of Copyright holders
|
||
|
Copyright (C) 1996-1999 Peter Gutmann, Paul Kendall, and Chris Wedgwood
|
||
|
Copyright (C) 1996-2006 Peter Gutmann, Matt Thomlinson and Blake Coverett
|
||
|
Copyright (C) 2003 Nikos Mavroyanopoulos
|
||
|
+ Copyright (c) 2006 CRYPTOGAMS
|
||
|
Copyright (C) 2006-2007 NTT (Nippon Telegraph and Telephone Corporation)
|
||
|
Copyright (C) 2012-2019 g10 Code GmbH
|
||
|
Copyright (C) 2012 Simon Josefsson, Niels Möller
|
||
|
diff --git a/LICENSES b/LICENSES
|
||
|
index f6733a69..c19284e2 100644
|
||
|
--- a/LICENSES
|
||
|
+++ b/LICENSES
|
||
|
@@ -54,7 +54,6 @@ with any binary distributions derived from the GNU C Library.
|
||
|
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
#+end_quote
|
||
|
|
||
|
-
|
||
|
For files:
|
||
|
- random/jitterentropy-base.c
|
||
|
- random/jitterentropy.h
|
||
|
@@ -99,6 +98,48 @@ with any binary distributions derived from the GNU C Library.
|
||
|
* DAMAGE.
|
||
|
#+end_quote
|
||
|
|
||
|
+ For files:
|
||
|
+ - cipher/cipher-gcm-ppc.c
|
||
|
+
|
||
|
+#+begin_quote
|
||
|
+ Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
|
||
|
+ All rights reserved.
|
||
|
+
|
||
|
+ Redistribution and use in source and binary forms, with or without
|
||
|
+ modification, are permitted provided that the following conditions
|
||
|
+ are met:
|
||
|
+
|
||
|
+ * Redistributions of source code must retain copyright notices,
|
||
|
+ this list of conditions and the following disclaimer.
|
||
|
+
|
||
|
+ * Redistributions in binary form must reproduce the above
|
||
|
+ copyright notice, this list of conditions and the following
|
||
|
+ disclaimer in the documentation and/or other materials
|
||
|
+ provided with the distribution.
|
||
|
+
|
||
|
+ * Neither the name of the CRYPTOGAMS nor the names of its
|
||
|
+ copyright holder and contributors may be used to endorse or
|
||
|
+ promote products derived from this software without specific
|
||
|
+ prior written permission.
|
||
|
+
|
||
|
+ ALTERNATIVELY, provided that this notice is retained in full, this
|
||
|
+ product may be distributed under the terms of the GNU General Public
|
||
|
+ License (GPL), in which case the provisions of the GPL apply INSTEAD OF
|
||
|
+ those given above.
|
||
|
+
|
||
|
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
|
||
|
+ "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||
|
+ LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||
|
+ A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||
|
+ OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
|
+ SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||
|
+ LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||
|
+ DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||
|
+ THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
+ (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
+#+end_quote
|
||
|
+
|
||
|
* X License
|
||
|
|
||
|
For files:
|
||
|
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
|
||
|
index 1728e9f9..ab5d2a38 100644
|
||
|
--- a/cipher/Makefile.am
|
||
|
+++ b/cipher/Makefile.am
|
||
|
@@ -66,6 +66,7 @@ blowfish.c blowfish-amd64.S blowfish-arm.S \
|
||
|
cast5.c cast5-amd64.S cast5-arm.S \
|
||
|
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
|
||
|
chacha20-armv7-neon.S \
|
||
|
+cipher-gcm-ppc.c \
|
||
|
crc.c \
|
||
|
crc-intel-pclmul.c crc-ppc.c \
|
||
|
des.c des-amd64.S \
|
||
|
@@ -165,3 +166,9 @@ crc-ppc.o: $(srcdir)/crc-ppc.c Makefile
|
||
|
|
||
|
crc-ppc.lo: $(srcdir)/crc-ppc.c Makefile
|
||
|
`echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< `
|
||
|
+
|
||
|
+cipher-gcm-ppc.o: $(srcdir)/cipher-gcm-ppc.c Makefile
|
||
|
+ `echo $(COMPILE) $(ppc_vcrypto_cflags) -c $< `
|
||
|
+
|
||
|
+cipher-gcm-ppc.lo: $(srcdir)/cipher-gcm-ppc.c Makefile
|
||
|
+ `echo $(LTCOMPILE) $(ppc_vcrypto_cflags) -c $< `
|
||
|
diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c
|
||
|
new file mode 100644
|
||
|
index 00000000..ed27ef15
|
||
|
--- /dev/null
|
||
|
+++ b/cipher/cipher-gcm-ppc.c
|
||
|
@@ -0,0 +1,510 @@
|
||
|
+/* cipher-gcm-ppc.c - Power 8 vpmsum accelerated Galois Counter Mode
|
||
|
+ * implementation
|
||
|
+ * Copyright (C) 2019 Shawn Landden <shawn@git.icu>
|
||
|
+ *
|
||
|
+ * This file is part of Libgcrypt.
|
||
|
+ *
|
||
|
+ * Libgcrypt is free software; you can redistribute it and/or modify
|
||
|
+ * it under the terms of the GNU Lesser General Public License as
|
||
|
+ * published by the Free Software Foundation; either version 2.1 of
|
||
|
+ * the License, or (at your option) any later version.
|
||
|
+ *
|
||
|
+ * Libgcrypt is distributed in the hope that it will be useful,
|
||
|
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||
|
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||
|
+ * GNU Lesser General Public License for more details.
|
||
|
+ *
|
||
|
+ * You should have received a copy of the GNU Lesser General Public
|
||
|
+ * License along with this program; if not, see <http://www.gnu.org/licenses/>.
|
||
|
+ *
|
||
|
+ * Based on GHASH implementation by Andy Polyakov from CRYPTOGAMS
|
||
|
+ * distribution (ppc/ghashp8-ppc.pl). Specifically, it uses his register
|
||
|
+ * allocation (which then defers to your compiler's register allocation),
|
||
|
+ * instead of re-implementing Gerald Estrin's Scheme of parallelized
|
||
|
+ * multiplication of polynomials, as I did not understand this algorithm at
|
||
|
+ * the time.
|
||
|
+ *
|
||
|
+ * Original copyright license follows:
|
||
|
+ *
|
||
|
+ * Copyright (c) 2006, CRYPTOGAMS by <appro@openssl.org>
|
||
|
+ * All rights reserved.
|
||
|
+ *
|
||
|
+ * Redistribution and use in source and binary forms, with or without
|
||
|
+ * modification, are permitted provided that the following conditions
|
||
|
+ * are met:
|
||
|
+ *
|
||
|
+ * * Redistributions of source code must retain copyright notices,
|
||
|
+ * this list of conditions and the following disclaimer.
|
||
|
+ *
|
||
|
+ * * Redistributions in binary form must reproduce the above
|
||
|
+ * copyright notice, this list of conditions and the following
|
||
|
+ * disclaimer in the documentation and/or other materials
|
||
|
+ * provided with the distribution.
|
||
|
+ *
|
||
|
+ * * Neither the name of the CRYPTOGAMS nor the names of its
|
||
|
+ * copyright holder and contributors may be used to endorse or
|
||
|
+ * promote products derived from this software without specific
|
||
|
+ * prior written permission.
|
||
|
+ *
|
||
|
+ * ALTERNATIVELY, provided that this notice is retained in full, this
|
||
|
+ * product may be distributed under the terms of the GNU General Public
|
||
|
+ * License (GPL), in which case the provisions of the GPL apply INSTEAD OF
|
||
|
+ * those given above.
|
||
|
+ *
|
||
|
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDER AND CONTRIBUTORS
|
||
|
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
|
||
|
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
|
||
|
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
|
||
|
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
|
||
|
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
|
||
|
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
|
||
|
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
|
||
|
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
|
||
|
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
||
|
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||
|
+ *
|
||
|
+ * SPDX-License-Identifier: (BSD-3-Clause OR GPL-2.0-only)
|
||
|
+ */
|
||
|
+
|
||
|
+#include <config.h>
|
||
|
+#include <stdio.h>
|
||
|
+#include <stdlib.h>
|
||
|
+#include <string.h>
|
||
|
+#include <errno.h>
|
||
|
+#include <stdint.h>
|
||
|
+
|
||
|
+#include "g10lib.h"
|
||
|
+#include "cipher.h"
|
||
|
+#include "bufhelp.h"
|
||
|
+#include "./cipher-internal.h"
|
||
|
+
|
||
|
+#ifdef GCM_USE_PPC_VPMSUM
|
||
|
+
|
||
|
+#include <altivec.h>
|
||
|
+
|
||
|
+#define ALWAYS_INLINE inline __attribute__((always_inline))
|
||
|
+#define NO_INSTRUMENT_FUNCTION __attribute__((no_instrument_function))
|
||
|
+
|
||
|
+#define ASM_FUNC_ATTR NO_INSTRUMENT_FUNCTION
|
||
|
+#define ASM_FUNC_ATTR_INLINE ASM_FUNC_ATTR ALWAYS_INLINE
|
||
|
+
|
||
|
+typedef vector unsigned char vector16x_u8;
|
||
|
+typedef vector signed char vector16x_s8;
|
||
|
+typedef vector unsigned long long vector2x_u64;
|
||
|
+typedef vector unsigned long long block;
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_vpmsumd(block a, block b)
|
||
|
+{
|
||
|
+ block r;
|
||
|
+ __asm__("vpmsumd %0, %1, %2"
|
||
|
+ : "=v" (r)
|
||
|
+ : "v" (a), "v" (b));
|
||
|
+ return r;
|
||
|
+}
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_swap_u64(block a)
|
||
|
+{
|
||
|
+ __asm__("xxswapd %x0, %x1"
|
||
|
+ : "=wa" (a)
|
||
|
+ : "wa" (a));
|
||
|
+ return a;
|
||
|
+}
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_rot_block_left(block a)
|
||
|
+{
|
||
|
+ block zero = {0, 0};
|
||
|
+ block mask = {2, 0};
|
||
|
+ return __builtin_shuffle(a, zero, mask);
|
||
|
+}
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_rot_block_right(block a)
|
||
|
+{
|
||
|
+ block zero = {0, 0};
|
||
|
+ block mask = {1, 2};
|
||
|
+ return __builtin_shuffle(a, zero, mask);
|
||
|
+}
|
||
|
+
|
||
|
+/* vsl is a slightly strange function in the way the shift is passed... */
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_ashl_128(block a, vector16x_u8 shift)
|
||
|
+{
|
||
|
+ block r;
|
||
|
+ __asm__("vsl %0, %1, %2"
|
||
|
+ : "=v" (r)
|
||
|
+ : "v" (a), "v" (shift));
|
||
|
+ return r;
|
||
|
+}
|
||
|
+
|
||
|
+#define ALIGNED_LOAD(in_ptr) \
|
||
|
+ (vec_aligned_ld (0, (const unsigned char *)(in_ptr)))
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+vec_aligned_ld(unsigned long offset, const unsigned char *ptr)
|
||
|
+{
|
||
|
+#ifndef WORDS_BIGENDIAN
|
||
|
+ block vec;
|
||
|
+ __asm__ ("lvx %0,%1,%2\n\t"
|
||
|
+ : "=v" (vec)
|
||
|
+ : "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
+ return vec;
|
||
|
+#else
|
||
|
+ return vec_vsx_ld (offset, ptr);
|
||
|
+#endif
|
||
|
+}
|
||
|
+
|
||
|
+#define STORE_TABLE(gcm_table, slot, vec) \
|
||
|
+ vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table));
|
||
|
+
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE void
|
||
|
+vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr)
|
||
|
+{
|
||
|
+#ifndef WORDS_BIGENDIAN
|
||
|
+ __asm__ ("stvx %0,%1,%2\n\t"
|
||
|
+ :
|
||
|
+ : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
+#else
|
||
|
+ vec_vsx_st ((vector16x_u8)vec, offset, ptr);
|
||
|
+#endif
|
||
|
+}
|
||
|
+
|
||
|
+#define VEC_LOAD_BE(in_ptr, bswap_const) \
|
||
|
+ (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const))
|
||
|
+
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+vec_load_be(unsigned long offset, const unsigned char *ptr,
|
||
|
+ vector unsigned char be_bswap_const)
|
||
|
+{
|
||
|
+#ifndef WORDS_BIGENDIAN
|
||
|
+ block vec;
|
||
|
+ /* GCC vec_vsx_ld is generating two instructions on little-endian. Use
|
||
|
+ * lxvw4x directly instead. */
|
||
|
+ __asm__ ("lxvw4x %x0,%1,%2\n\t"
|
||
|
+ : "=wa" (vec)
|
||
|
+ : "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
+ __asm__ ("vperm %0,%1,%1,%2\n\t"
|
||
|
+ : "=v" (vec)
|
||
|
+ : "v" (vec), "v" (be_bswap_const));
|
||
|
+ return vec;
|
||
|
+#else
|
||
|
+ (void)be_bswap_const;
|
||
|
+ return vec_vsx_ld (offset, ptr);
|
||
|
+#endif
|
||
|
+}
|
||
|
+
|
||
|
+/* Power ghash based on papers:
|
||
|
+ "The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega
|
||
|
+ "Intel® Carry-Less Multiplication Instruction and its Usage for Computing
|
||
|
+ the GCM Mode - Rev 2.01"; Shay Gueron, Michael E. Kounavis.
|
||
|
+
|
||
|
+ After saving the magic c2 constant and pre-formatted version of the key,
|
||
|
+ we pre-process the key for parallel hashing. This takes advantage of the
|
||
|
+ identity of addition over a galois field being identical to XOR, and thus
|
||
|
+ can be parallelized (S 2.2, page 3). We multiply and add (galois field
|
||
|
+ versions) the key over multiple iterations and save the result. This can
|
||
|
+ later be galois added (XORed) with parallel processed input (Estrin's
|
||
|
+ Scheme).
|
||
|
+
|
||
|
+ The ghash "key" is a salt. */
|
||
|
+void ASM_FUNC_ATTR
|
||
|
+_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
|
||
|
+{
|
||
|
+ vector16x_u8 bswap_const =
|
||
|
+ { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
|
||
|
+ vector16x_u8 c2 =
|
||
|
+ { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 };
|
||
|
+ block T0, T1, T2;
|
||
|
+ block C2, H, H1, H1l, H1h, H2, H2l, H2h;
|
||
|
+ block H3l, H3, H3h, H4l, H4, H4h, T3, T4;
|
||
|
+ vector16x_s8 most_sig_of_H, t7, carry;
|
||
|
+ vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||
|
+
|
||
|
+ H = VEC_LOAD_BE(gcm_key, bswap_const);
|
||
|
+ most_sig_of_H = vec_splat((vector16x_s8)H, 15);
|
||
|
+ t7 = vec_splat_s8(7);
|
||
|
+ carry = most_sig_of_H >> t7;
|
||
|
+ carry &= c2; /* only interested in certain carries. */
|
||
|
+ H1 = asm_ashl_128(H, one);
|
||
|
+ H1 ^= (block)carry; /* complete the <<< 1 */
|
||
|
+
|
||
|
+ T1 = asm_swap_u64 (H1);
|
||
|
+ H1l = asm_rot_block_right (T1);
|
||
|
+ H1h = asm_rot_block_left (T1);
|
||
|
+ C2 = asm_rot_block_right ((block)c2);
|
||
|
+
|
||
|
+ STORE_TABLE (gcm_table, 0, C2);
|
||
|
+ STORE_TABLE (gcm_table, 1, H1l);
|
||
|
+ STORE_TABLE (gcm_table, 2, T1);
|
||
|
+ STORE_TABLE (gcm_table, 3, H1h);
|
||
|
+
|
||
|
+ /* pre-process coefficients for Gerald Estrin's scheme for parallel
|
||
|
+ * multiplication of polynomials
|
||
|
+ */
|
||
|
+ H2l = asm_vpmsumd (H1l, H1); /* do not need to mask in
|
||
|
+ because 0 * anything -> 0 */
|
||
|
+ H2 = asm_vpmsumd (T1, H1);
|
||
|
+ H2h = asm_vpmsumd (H1h, H1);
|
||
|
+
|
||
|
+ /* reduce 1 */
|
||
|
+ T0 = asm_vpmsumd (H2l, C2);
|
||
|
+
|
||
|
+ H2l ^= asm_rot_block_left (H2);;
|
||
|
+ H2h ^= asm_rot_block_right (H2);
|
||
|
+ H2l = asm_swap_u64 (H2l);
|
||
|
+ H2l ^= T0;
|
||
|
+ /* reduce 2 */
|
||
|
+ T0 = asm_swap_u64 (H2l);
|
||
|
+ H2l = asm_vpmsumd (H2l, C2);
|
||
|
+ H2 = H2l ^ H2h ^ T0;
|
||
|
+
|
||
|
+ T2 = asm_swap_u64 (H2);
|
||
|
+ H2l = asm_rot_block_right (T2);
|
||
|
+ H2h = asm_rot_block_left (T2);
|
||
|
+
|
||
|
+ STORE_TABLE (gcm_table, 4, H2l);
|
||
|
+ STORE_TABLE (gcm_table, 5, T2);
|
||
|
+ STORE_TABLE (gcm_table, 6, H2h);
|
||
|
+
|
||
|
+ H3l = asm_vpmsumd (H2l, H1);
|
||
|
+ H4l = asm_vpmsumd (H2l, H2);
|
||
|
+ H3 = asm_vpmsumd (T2, H1);
|
||
|
+ H4 = asm_vpmsumd (T2, H2);
|
||
|
+ H3h = asm_vpmsumd (H2h, H1);
|
||
|
+ H4h = asm_vpmsumd (H2h, H2);
|
||
|
+
|
||
|
+ T3 = asm_vpmsumd (H3l, C2);
|
||
|
+ T4 = asm_vpmsumd (H4l, C2);
|
||
|
+
|
||
|
+ H3l ^= asm_rot_block_left (H3);
|
||
|
+ H3h ^= asm_rot_block_right (H3);
|
||
|
+ H4l ^= asm_rot_block_left (H4);
|
||
|
+ H4h ^= asm_rot_block_right (H4);
|
||
|
+
|
||
|
+ H3 = asm_swap_u64 (H3l);
|
||
|
+ H4 = asm_swap_u64 (H4l);
|
||
|
+
|
||
|
+ H3 ^= T3;
|
||
|
+ H4 ^= T4;
|
||
|
+
|
||
|
+ /* We could have also b64 switched reduce and reduce2, however as we are
|
||
|
+ using the unrotated H and H2 above to vpmsum, this is marginally better. */
|
||
|
+ T3 = asm_swap_u64 (H3);
|
||
|
+ T4 = asm_swap_u64 (H4);
|
||
|
+
|
||
|
+ H3 = asm_vpmsumd (H3, C2);
|
||
|
+ H4 = asm_vpmsumd (H4, C2);
|
||
|
+
|
||
|
+ T3 ^= H3h;
|
||
|
+ T4 ^= H4h;
|
||
|
+ H3 ^= T3;
|
||
|
+ H4 ^= T4;
|
||
|
+ H3 = asm_swap_u64 (H3);
|
||
|
+ H4 = asm_swap_u64 (H4);
|
||
|
+
|
||
|
+ H3l = asm_rot_block_right (H3);
|
||
|
+ H3h = asm_rot_block_left (H3);
|
||
|
+ H4l = asm_rot_block_right (H4);
|
||
|
+ H4h = asm_rot_block_left (H4);
|
||
|
+
|
||
|
+ STORE_TABLE (gcm_table, 7, H3l);
|
||
|
+ STORE_TABLE (gcm_table, 8, H3);
|
||
|
+ STORE_TABLE (gcm_table, 9, H3h);
|
||
|
+ STORE_TABLE (gcm_table, 10, H4l);
|
||
|
+ STORE_TABLE (gcm_table, 11, H4);
|
||
|
+ STORE_TABLE (gcm_table, 12, H4h);
|
||
|
+}
|
||
|
+
|
||
|
+ASM_FUNC_ATTR_INLINE
|
||
|
+block
|
||
|
+vec_perm2(block l, block r, vector16x_u8 perm) {
|
||
|
+ block ret;
|
||
|
+ __asm__ ("vperm %0,%1,%2,%3\n\t"
|
||
|
+ : "=v" (ret)
|
||
|
+ : "v" (l), "v" (r), "v" (perm));
|
||
|
+ return ret;
|
||
|
+}
|
||
|
+
|
||
|
+void ASM_FUNC_ATTR
|
||
|
+_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table,
|
||
|
+ const byte *const buf, const size_t nblocks)
|
||
|
+{
|
||
|
+ /* This const is strange, it is reversing the bytes, and also reversing
|
||
|
+ the u32s that get switched by lxvw4x and it also addresses bytes big-endian,
|
||
|
+ and is here due to lack of proper peep-hole optimization. */
|
||
|
+ vector16x_u8 bswap_const =
|
||
|
+ { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
|
||
|
+ vector16x_u8 bswap_8_const =
|
||
|
+ { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
|
||
|
+ block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl;
|
||
|
+ block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur;
|
||
|
+ size_t blocks_remaining = nblocks, off = 0;
|
||
|
+ size_t not_multiple_of_four;
|
||
|
+ block t0;
|
||
|
+
|
||
|
+ cur = vec_load_be (0, result, bswap_const);
|
||
|
+
|
||
|
+ c2 = vec_aligned_ld (0, gcm_table);
|
||
|
+ H0l = vec_aligned_ld (16, gcm_table);
|
||
|
+ H0m = vec_aligned_ld (32, gcm_table);
|
||
|
+ H0h = vec_aligned_ld (48, gcm_table);
|
||
|
+
|
||
|
+ for (not_multiple_of_four = nblocks % 4; not_multiple_of_four;
|
||
|
+ not_multiple_of_four--)
|
||
|
+ {
|
||
|
+ in = vec_load_be (off, buf, bswap_const);
|
||
|
+ off += 16;
|
||
|
+ blocks_remaining--;
|
||
|
+ cur ^= in;
|
||
|
+
|
||
|
+ Hl = asm_vpmsumd (cur, H0l);
|
||
|
+ Hm = asm_vpmsumd (cur, H0m);
|
||
|
+ Hh = asm_vpmsumd (cur, H0h);
|
||
|
+
|
||
|
+ t0 = asm_vpmsumd (Hl, c2);
|
||
|
+
|
||
|
+ Hl ^= asm_rot_block_left (Hm);
|
||
|
+
|
||
|
+ Hm_right = asm_rot_block_right (Hm);
|
||
|
+ Hh ^= Hm_right;
|
||
|
+ Hl_rotate = asm_swap_u64 (Hl);
|
||
|
+ Hl_rotate ^= t0;
|
||
|
+ Hl = asm_swap_u64 (Hl_rotate);
|
||
|
+ Hl_rotate = asm_vpmsumd (Hl_rotate, c2);
|
||
|
+ Hl ^= Hh;
|
||
|
+ Hl ^= Hl_rotate;
|
||
|
+
|
||
|
+ cur = Hl;
|
||
|
+ }
|
||
|
+
|
||
|
+ if (blocks_remaining > 0)
|
||
|
+ {
|
||
|
+ vector16x_u8 hiperm =
|
||
|
+ {
|
||
|
+ 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
|
||
|
+ 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0
|
||
|
+ };
|
||
|
+ vector16x_u8 loperm =
|
||
|
+ {
|
||
|
+ 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
|
||
|
+ 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8
|
||
|
+ };
|
||
|
+ block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate;
|
||
|
+ block H21l, H21h, merge_l, merge_h;
|
||
|
+
|
||
|
+ H2m = vec_aligned_ld (48 + 32, gcm_table);
|
||
|
+ H3l = vec_aligned_ld (48 * 2 + 16, gcm_table);
|
||
|
+ H3m = vec_aligned_ld (48 * 2 + 32, gcm_table);
|
||
|
+ H3h = vec_aligned_ld (48 * 2 + 48, gcm_table);
|
||
|
+ H4l = vec_aligned_ld (48 * 3 + 16, gcm_table);
|
||
|
+ H4m = vec_aligned_ld (48 * 3 + 32, gcm_table);
|
||
|
+ H4h = vec_aligned_ld (48 * 3 + 48, gcm_table);
|
||
|
+
|
||
|
+ in0 = vec_load_be (off, buf, bswap_const);
|
||
|
+ in1 = vec_load_be (off + 16, buf, bswap_const);
|
||
|
+ in2 = vec_load_be (off + 32, buf, bswap_const);
|
||
|
+ in3 = vec_load_be (off + 48, buf, bswap_const);
|
||
|
+ blocks_remaining -= 4;
|
||
|
+ off += 64;
|
||
|
+
|
||
|
+ Xh = in0 ^ cur;
|
||
|
+
|
||
|
+ Xl1 = asm_vpmsumd (in1, H3l);
|
||
|
+ Xm1 = asm_vpmsumd (in1, H3m);
|
||
|
+ Xh1 = asm_vpmsumd (in1, H3h);
|
||
|
+
|
||
|
+ H21l = vec_perm2 (H2m, H0m, hiperm);
|
||
|
+ H21h = vec_perm2 (H2m, H0m, loperm);
|
||
|
+ merge_l = vec_perm2 (in2, in3, loperm);
|
||
|
+ merge_h = vec_perm2 (in2, in3, hiperm);
|
||
|
+
|
||
|
+ Xm2 = asm_vpmsumd (in2, H2m);
|
||
|
+ Xl3 = asm_vpmsumd (merge_l, H21l);
|
||
|
+ Xm3 = asm_vpmsumd (in3, H0m);
|
||
|
+ Xh3 = asm_vpmsumd (merge_h, H21h);
|
||
|
+
|
||
|
+ Xm2 ^= Xm1;
|
||
|
+ Xl3 ^= Xl1;
|
||
|
+ Xm3 ^= Xm2;
|
||
|
+ Xh3 ^= Xh1;
|
||
|
+
|
||
|
+ /* Gerald Estrin's scheme for parallel multiplication of polynomials */
|
||
|
+ for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64)
|
||
|
+ {
|
||
|
+ in0 = vec_load_be (off, buf, bswap_const);
|
||
|
+ in1 = vec_load_be (off + 16, buf, bswap_const);
|
||
|
+ in2 = vec_load_be (off + 32, buf, bswap_const);
|
||
|
+ in3 = vec_load_be (off + 48, buf, bswap_const);
|
||
|
+
|
||
|
+ Xl = asm_vpmsumd (Xh, H4l);
|
||
|
+ Xm = asm_vpmsumd (Xh, H4m);
|
||
|
+ Xh = asm_vpmsumd (Xh, H4h);
|
||
|
+ Xl1 = asm_vpmsumd (in1, H3l);
|
||
|
+ Xm1 = asm_vpmsumd (in1, H3m);
|
||
|
+ Xh1 = asm_vpmsumd (in1, H3h);
|
||
|
+
|
||
|
+ Xl ^= Xl3;
|
||
|
+ Xm ^= Xm3;
|
||
|
+ Xh ^= Xh3;
|
||
|
+ merge_l = vec_perm2 (in2, in3, loperm);
|
||
|
+ merge_h = vec_perm2 (in2, in3, hiperm);
|
||
|
+
|
||
|
+ t0 = asm_vpmsumd (Xl, c2);
|
||
|
+ Xl3 = asm_vpmsumd (merge_l, H21l);
|
||
|
+ Xh3 = asm_vpmsumd (merge_h, H21h);
|
||
|
+
|
||
|
+ Xl ^= asm_rot_block_left (Xm);
|
||
|
+ Xh ^= asm_rot_block_right (Xm);
|
||
|
+
|
||
|
+ Xl = asm_swap_u64 (Xl);
|
||
|
+ Xl ^= t0;
|
||
|
+
|
||
|
+ Xl_rotate = asm_swap_u64 (Xl);
|
||
|
+ Xm2 = asm_vpmsumd (in2, H2m);
|
||
|
+ Xm3 = asm_vpmsumd (in3, H0m);
|
||
|
+ Xl = asm_vpmsumd (Xl, c2);
|
||
|
+
|
||
|
+ Xl3 ^= Xl1;
|
||
|
+ Xh3 ^= Xh1;
|
||
|
+ Xh ^= in0;
|
||
|
+ Xm2 ^= Xm1;
|
||
|
+ Xh ^= Xl_rotate;
|
||
|
+ Xm3 ^= Xm2;
|
||
|
+ Xh ^= Xl;
|
||
|
+ }
|
||
|
+
|
||
|
+ Xl = asm_vpmsumd (Xh, H4l);
|
||
|
+ Xm = asm_vpmsumd (Xh, H4m);
|
||
|
+ Xh = asm_vpmsumd (Xh, H4h);
|
||
|
+
|
||
|
+ Xl ^= Xl3;
|
||
|
+ Xm ^= Xm3;
|
||
|
+
|
||
|
+ t0 = asm_vpmsumd (Xl, c2);
|
||
|
+
|
||
|
+ Xh ^= Xh3;
|
||
|
+ Xl ^= asm_rot_block_left (Xm);
|
||
|
+ Xh ^= asm_rot_block_right (Xm);
|
||
|
+
|
||
|
+ Xl = asm_swap_u64 (Xl);
|
||
|
+ Xl ^= t0;
|
||
|
+
|
||
|
+ Xl_rotate = asm_swap_u64 (Xl);
|
||
|
+ Xl = asm_vpmsumd (Xl, c2);
|
||
|
+ Xl_rotate ^= Xh;
|
||
|
+ Xl ^= Xl_rotate;
|
||
|
+
|
||
|
+ cur = Xl;
|
||
|
+ }
|
||
|
+
|
||
|
+ cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const);
|
||
|
+ STORE_TABLE (result, 0, cur);
|
||
|
+}
|
||
|
+
|
||
|
+#endif /* GCM_USE_PPC_VPMSUM */
|
||
|
diff --git a/cipher/cipher-gcm.c b/cipher/cipher-gcm.c
|
||
|
index 32ec9fa0..b84a0698 100644
|
||
|
--- a/cipher/cipher-gcm.c
|
||
|
+++ b/cipher/cipher-gcm.c
|
||
|
@@ -61,6 +61,28 @@ ghash_armv8_ce_pmull (gcry_cipher_hd_t c, byte *result, const byte *buf,
|
||
|
|
||
|
#endif
|
||
|
|
||
|
+#ifdef GCM_USE_PPC_VPMSUM
|
||
|
+extern void _gcry_ghash_setup_ppc_vpmsum (void *gcm_table, void *gcm_key);
|
||
|
+
|
||
|
+/* result is 128-bits */
|
||
|
+extern unsigned int _gcry_ghash_ppc_vpmsum (byte *result, void *gcm_table,
|
||
|
+ const byte *buf, size_t nblocks);
|
||
|
+
|
||
|
+static void
|
||
|
+ghash_setup_ppc_vpmsum (gcry_cipher_hd_t c)
|
||
|
+{
|
||
|
+ _gcry_ghash_setup_ppc_vpmsum(c->u_mode.gcm.gcm_table, c->u_mode.gcm.u_ghash_key.key);
|
||
|
+}
|
||
|
+
|
||
|
+static unsigned int
|
||
|
+ghash_ppc_vpmsum (gcry_cipher_hd_t c, byte *result, const byte *buf,
|
||
|
+ size_t nblocks)
|
||
|
+{
|
||
|
+ _gcry_ghash_ppc_vpmsum(result, c->u_mode.gcm.gcm_table, buf,
|
||
|
+ nblocks);
|
||
|
+ return 0;
|
||
|
+}
|
||
|
+#endif /* GCM_USE_PPC_VPMSUM */
|
||
|
|
||
|
#ifdef GCM_USE_TABLES
|
||
|
static const u16 gcmR[256] = {
|
||
|
@@ -403,7 +425,8 @@ ghash_internal (gcry_cipher_hd_t c, byte *result, const byte *buf,
|
||
|
static void
|
||
|
setupM (gcry_cipher_hd_t c)
|
||
|
{
|
||
|
-#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL)
|
||
|
+#if defined(GCM_USE_INTEL_PCLMUL) || defined(GCM_USE_ARM_PMULL) || \
|
||
|
+ defined(GCM_USE_S390X_CRYPTO) || defined(GCM_USE_PPC_VPMSUM)
|
||
|
unsigned int features = _gcry_get_hw_features ();
|
||
|
#endif
|
||
|
|
||
|
@@ -423,7 +446,24 @@ setupM (gcry_cipher_hd_t c)
|
||
|
ghash_setup_armv8_ce_pmull (c);
|
||
|
}
|
||
|
#endif
|
||
|
- else
|
||
|
+#ifdef GCM_USE_PPC_VPMSUM
|
||
|
+ else if (features & HWF_PPC_VCRYPTO)
|
||
|
+ {
|
||
|
+ c->u_mode.gcm.ghash_fn = ghash_ppc_vpmsum;
|
||
|
+ ghash_setup_ppc_vpmsum (c);
|
||
|
+ }
|
||
|
+#endif
|
||
|
+#ifdef GCM_USE_S390X_CRYPTO
|
||
|
+ else if (features & HWF_S390X_MSA)
|
||
|
+ {
|
||
|
+ if (kimd_query () & km_function_to_mask (KMID_FUNCTION_GHASH))
|
||
|
+ {
|
||
|
+ c->u_mode.gcm.ghash_fn = ghash_s390x_kimd;
|
||
|
+ }
|
||
|
+ }
|
||
|
+#endif
|
||
|
+
|
||
|
+ if (c->u_mode.gcm.ghash_fn == NULL)
|
||
|
{
|
||
|
c->u_mode.gcm.ghash_fn = ghash_internal;
|
||
|
fillM (c);
|
||
|
diff --git a/cipher/cipher-internal.h b/cipher/cipher-internal.h
|
||
|
index a95e084b..a5fd3097 100644
|
||
|
--- a/cipher/cipher-internal.h
|
||
|
+++ b/cipher/cipher-internal.h
|
||
|
@@ -87,6 +87,18 @@
|
||
|
#endif /* GCM_USE_ARM_PMULL */
|
||
|
|
||
|
|
||
|
+/* GCM_USE_PPC_VPMSUM indicates whether to compile GCM with PPC Power 8
|
||
|
+ * polynomial multiplication instruction. */
|
||
|
+#undef GCM_USE_PPC_VPMSUM
|
||
|
+#if defined(GCM_USE_TABLES)
|
||
|
+#if defined(ENABLE_PPC_CRYPTO_SUPPORT) && defined(__powerpc64__) && \
|
||
|
+ !defined(WORDS_BIGENDIAN) && defined(HAVE_COMPATIBLE_CC_PPC_ALTIVEC) && \
|
||
|
+ defined(HAVE_GCC_INLINE_ASM_PPC_ALTIVEC) && __GNUC__ >= 4
|
||
|
+# define GCM_USE_PPC_VPMSUM 1
|
||
|
+# define NEED_16BYTE_ALIGNED_CONTEXT 1 /* this also aligns gcm_table */
|
||
|
+#endif
|
||
|
+#endif /* GCM_USE_PPC_VPMSUM */
|
||
|
+
|
||
|
typedef unsigned int (*ghash_fn_t) (gcry_cipher_hd_t c, byte *result,
|
||
|
const byte *buf, size_t nblocks);
|
||
|
|
||
|
@@ -277,9 +289,6 @@ struct gcry_cipher_handle
|
||
|
unsigned char key[MAX_BLOCKSIZE];
|
||
|
} u_ghash_key;
|
||
|
|
||
|
- /* GHASH implementation in use. */
|
||
|
- ghash_fn_t ghash_fn;
|
||
|
-
|
||
|
/* Pre-calculated table for GCM. */
|
||
|
#ifdef GCM_USE_TABLES
|
||
|
#if (SIZEOF_UNSIGNED_LONG == 8 || defined(__x86_64__))
|
||
|
@@ -290,6 +299,9 @@ struct gcry_cipher_handle
|
||
|
u32 gcm_table[4 * 16];
|
||
|
#endif
|
||
|
#endif
|
||
|
+
|
||
|
+ /* GHASH implementation in use. */
|
||
|
+ ghash_fn_t ghash_fn;
|
||
|
} gcm;
|
||
|
|
||
|
/* Mode specific storage for OCB mode. */
|
||
|
diff --git a/configure.ac b/configure.ac
|
||
|
index be35ce42..202ac888 100644
|
||
|
--- a/configure.ac
|
||
|
+++ b/configure.ac
|
||
|
@@ -2752,6 +2752,25 @@ case "${host}" in
|
||
|
;;
|
||
|
esac
|
||
|
|
||
|
+# Arch specific GCM implementations
|
||
|
+case "${host}" in
|
||
|
+ powerpc64le-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ ;;
|
||
|
+ powerpc64-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ ;;
|
||
|
+ powerpc-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ ;;
|
||
|
+esac
|
||
|
+
|
||
|
+LIST_MEMBER(sm3, $enabled_digests)
|
||
|
+if test "$found" = "1" ; then
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS sm3.lo"
|
||
|
+ AC_DEFINE(USE_SM3, 1, [Defined if this module should be included])
|
||
|
+fi
|
||
|
+
|
||
|
LIST_MEMBER(scrypt, $enabled_kdfs)
|
||
|
if test "$found" = "1" ; then
|
||
|
GCRYPT_KDFS="$GCRYPT_KDFS scrypt.lo"
|
||
|
diff --git a/tests/basic.c b/tests/basic.c
|
||
|
index 0bd80201..06808d4a 100644
|
||
|
--- a/tests/basic.c
|
||
|
+++ b/tests/basic.c
|
||
|
@@ -1553,6 +1553,22 @@ _check_gcm_cipher (unsigned int step)
|
||
|
"\x0f\xc0\xc3\xb7\x80\xf2\x44\x45\x2d\xa3\xeb\xf1\xc5\xd8\x2c\xde"
|
||
|
"\xa2\x41\x89\x97\x20\x0e\xf8\x2e\x44\xae\x7e\x3f",
|
||
|
"\xa4\x4a\x82\x66\xee\x1c\x8e\xb0\xc8\xb5\xd4\xcf\x5a\xe9\xf1\x9a" },
|
||
|
+ { GCRY_CIPHER_AES256,
|
||
|
+ "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08"
|
||
|
+ "\xfe\xff\xe9\x92\x86\x65\x73\x1c\x6d\x6a\x8f\x94\x67\x30\x83\x08",
|
||
|
+ "\xca\xfe\xba\xbe\xfa\xce\xdb\xad\xde\xca\xf8\x88", 12,
|
||
|
+ "\xfe\xed\xfa\xce\xde\xad\xbe\xef\xfe\xed\xfa\xce\xde\xad\xbe\xef"
|
||
|
+ "\xab\xad\xda\xd2", 20,
|
||
|
+ "\xd9\x31\x32\x25\xf8\x84\x06\xe5\xa5\x59\x09\xc5\xaf\xf5\x26\x9a"
|
||
|
+ "\x86\xa7\xa9\x53\x15\x34\xf7\xda\x2e\x4c\x30\x3d\x8a\x31\x8a\x72"
|
||
|
+ "\x1c\x3c\x0c\x95\x95\x68\x09\x53\x2f\xcf\x0e\x24\x49\xa6\xb5\x25"
|
||
|
+ "\xb1\x6a\xed\xf5\xaa\x0d\xe6\x57\xba\x63\x7b\x39",
|
||
|
+ 60,
|
||
|
+ "\x52\x2d\xc1\xf0\x99\x56\x7d\x07\xf4\x7f\x37\xa3\x2a\x84\x42\x7d"
|
||
|
+ "\x64\x3a\x8c\xdc\xbf\xe5\xc0\xc9\x75\x98\xa2\xbd\x25\x55\xd1\xaa"
|
||
|
+ "\x8c\xb0\x8e\x48\x59\x0d\xbb\x3d\xa7\xb0\x8b\x10\x56\x82\x88\x38"
|
||
|
+ "\xc5\xf6\x1e\x63\x93\xba\x7a\x0a\xbc\xc9\xf6\x62",
|
||
|
+ "\x76\xfc\x6e\xce\x0f\x4e\x17\x68\xcd\xdf\x88\x53\xbb\x2d\x55\x1b" },
|
||
|
/* Test vectors for overflowing CTR. */
|
||
|
/* After setiv, ctr_low: 0xffffffff */
|
||
|
{ GCRY_CIPHER_AES256,
|
||
|
|
||
|
diff --git a/cipher/cipher-gcm-ppc.c b/cipher/cipher-gcm-ppc.c
|
||
|
index ed27ef15..2f60c09d 100644
|
||
|
--- a/cipher/cipher-gcm-ppc.c
|
||
|
+++ b/cipher/cipher-gcm-ppc.c
|
||
|
@@ -93,112 +93,157 @@ typedef vector signed char vector16x_s8;
|
||
|
typedef vector unsigned long long vector2x_u64;
|
||
|
typedef vector unsigned long long block;
|
||
|
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_xor(block a, block b)
|
||
|
+{
|
||
|
+ block r;
|
||
|
+ __asm__ volatile ("xxlxor %x0, %x1, %x2"
|
||
|
+ : "=wa" (r)
|
||
|
+ : "wa" (a), "wa" (b));
|
||
|
+ return r;
|
||
|
+}
|
||
|
+
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
asm_vpmsumd(block a, block b)
|
||
|
{
|
||
|
block r;
|
||
|
- __asm__("vpmsumd %0, %1, %2"
|
||
|
- : "=v" (r)
|
||
|
- : "v" (a), "v" (b));
|
||
|
+ __asm__ volatile ("vpmsumd %0, %1, %2"
|
||
|
+ : "=v" (r)
|
||
|
+ : "v" (a), "v" (b));
|
||
|
return r;
|
||
|
}
|
||
|
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
asm_swap_u64(block a)
|
||
|
{
|
||
|
- __asm__("xxswapd %x0, %x1"
|
||
|
- : "=wa" (a)
|
||
|
- : "wa" (a));
|
||
|
- return a;
|
||
|
+ block r;
|
||
|
+ __asm__ volatile ("xxswapd %x0, %x1"
|
||
|
+ : "=wa" (r)
|
||
|
+ : "wa" (a));
|
||
|
+ return r;
|
||
|
}
|
||
|
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
-asm_rot_block_left(block a)
|
||
|
+asm_mergelo(block l, block r)
|
||
|
{
|
||
|
- block zero = {0, 0};
|
||
|
- block mask = {2, 0};
|
||
|
- return __builtin_shuffle(a, zero, mask);
|
||
|
+ block ret;
|
||
|
+ __asm__ volatile ("xxmrgld %x0, %x1, %x2\n\t"
|
||
|
+ : "=wa" (ret)
|
||
|
+ : "wa" (l), "wa" (r));
|
||
|
+ return ret;
|
||
|
}
|
||
|
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
-asm_rot_block_right(block a)
|
||
|
+asm_mergehi(block l, block r)
|
||
|
{
|
||
|
- block zero = {0, 0};
|
||
|
- block mask = {1, 2};
|
||
|
- return __builtin_shuffle(a, zero, mask);
|
||
|
+ block ret;
|
||
|
+ __asm__ volatile ("xxmrghd %x0, %x1, %x2\n\t"
|
||
|
+ : "=wa" (ret)
|
||
|
+ : "wa" (l), "wa" (r));
|
||
|
+ return ret;
|
||
|
}
|
||
|
|
||
|
-/* vsl is a slightly strange function in the way the shift is passed... */
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
-asm_ashl_128(block a, vector16x_u8 shift)
|
||
|
+asm_rot_block_left(block a)
|
||
|
{
|
||
|
block r;
|
||
|
- __asm__("vsl %0, %1, %2"
|
||
|
- : "=v" (r)
|
||
|
- : "v" (a), "v" (shift));
|
||
|
+ block zero = { 0, 0 };
|
||
|
+ __asm__ volatile ("xxmrgld %x0, %x1, %x2"
|
||
|
+ : "=wa" (r)
|
||
|
+ : "wa" (a), "wa" (zero));
|
||
|
return r;
|
||
|
}
|
||
|
|
||
|
-#define ALIGNED_LOAD(in_ptr) \
|
||
|
- (vec_aligned_ld (0, (const unsigned char *)(in_ptr)))
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+asm_rot_block_right(block a)
|
||
|
+{
|
||
|
+ block r;
|
||
|
+ block zero = { 0, 0 };
|
||
|
+ __asm__ volatile ("xxsldwi %x0, %x2, %x1, 2"
|
||
|
+ : "=wa" (r)
|
||
|
+ : "wa" (a), "wa" (zero));
|
||
|
+ return r;
|
||
|
+}
|
||
|
|
||
|
+/* vsl is a slightly strange function in the way the shift is passed... */
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
-vec_aligned_ld(unsigned long offset, const unsigned char *ptr)
|
||
|
+asm_ashl_128(block a, vector16x_u8 shift)
|
||
|
{
|
||
|
-#ifndef WORDS_BIGENDIAN
|
||
|
- block vec;
|
||
|
- __asm__ ("lvx %0,%1,%2\n\t"
|
||
|
- : "=v" (vec)
|
||
|
- : "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
- : "memory", "r0");
|
||
|
- return vec;
|
||
|
-#else
|
||
|
- return vec_vsx_ld (offset, ptr);
|
||
|
-#endif
|
||
|
+ block r;
|
||
|
+ __asm__ volatile ("vsl %0, %1, %2"
|
||
|
+ : "=v" (r)
|
||
|
+ : "v" (a), "v" (shift));
|
||
|
+ return r;
|
||
|
}
|
||
|
|
||
|
#define STORE_TABLE(gcm_table, slot, vec) \
|
||
|
- vec_aligned_st (((block)vec), slot * 16, (unsigned char *)(gcm_table));
|
||
|
-
|
||
|
+ vec_store_he (((block)vec), slot * 16, (unsigned char *)(gcm_table));
|
||
|
|
||
|
static ASM_FUNC_ATTR_INLINE void
|
||
|
-vec_aligned_st(block vec, unsigned long offset, unsigned char *ptr)
|
||
|
+vec_store_he(block vec, unsigned long offset, unsigned char *ptr)
|
||
|
{
|
||
|
#ifndef WORDS_BIGENDIAN
|
||
|
- __asm__ ("stvx %0,%1,%2\n\t"
|
||
|
- :
|
||
|
- : "v" (vec), "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
- : "memory", "r0");
|
||
|
+ /* GCC vec_vsx_st is generating two instructions on little-endian. Use
|
||
|
+ * stxvd2x directly instead. */
|
||
|
+#if __GNUC__ >= 4
|
||
|
+ if (__builtin_constant_p (offset) && offset == 0)
|
||
|
+ __asm__ volatile ("stxvd2x %x0, 0, %1\n\t"
|
||
|
+ :
|
||
|
+ : "wa" (vec), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
+ else
|
||
|
+#endif
|
||
|
+ __asm__ volatile ("stxvd2x %x0, %1, %2\n\t"
|
||
|
+ :
|
||
|
+ : "wa" (vec), "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
#else
|
||
|
vec_vsx_st ((vector16x_u8)vec, offset, ptr);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
#define VEC_LOAD_BE(in_ptr, bswap_const) \
|
||
|
- (vec_load_be (0, (const unsigned char *)(in_ptr), bswap_const))
|
||
|
+ vec_be_swap(vec_load_he (0, (const unsigned char *)(in_ptr)), bswap_const)
|
||
|
|
||
|
static ASM_FUNC_ATTR_INLINE block
|
||
|
-vec_load_be(unsigned long offset, const unsigned char *ptr,
|
||
|
- vector unsigned char be_bswap_const)
|
||
|
+vec_load_he(unsigned long offset, const unsigned char *ptr)
|
||
|
{
|
||
|
#ifndef WORDS_BIGENDIAN
|
||
|
block vec;
|
||
|
/* GCC vec_vsx_ld is generating two instructions on little-endian. Use
|
||
|
- * lxvw4x directly instead. */
|
||
|
- __asm__ ("lxvw4x %x0,%1,%2\n\t"
|
||
|
- : "=wa" (vec)
|
||
|
- : "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
- : "memory", "r0");
|
||
|
- __asm__ ("vperm %0,%1,%1,%2\n\t"
|
||
|
- : "=v" (vec)
|
||
|
- : "v" (vec), "v" (be_bswap_const));
|
||
|
+ * lxvd2x directly instead. */
|
||
|
+#if __GNUC__ >= 4
|
||
|
+ if (__builtin_constant_p (offset) && offset == 0)
|
||
|
+ __asm__ volatile ("lxvd2x %x0, 0, %1\n\t"
|
||
|
+ : "=wa" (vec)
|
||
|
+ : "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
+ else
|
||
|
+#endif
|
||
|
+ __asm__ volatile ("lxvd2x %x0, %1, %2\n\t"
|
||
|
+ : "=wa" (vec)
|
||
|
+ : "r" (offset), "r" ((uintptr_t)ptr)
|
||
|
+ : "memory", "r0");
|
||
|
return vec;
|
||
|
#else
|
||
|
- (void)be_bswap_const;
|
||
|
return vec_vsx_ld (offset, ptr);
|
||
|
#endif
|
||
|
}
|
||
|
|
||
|
+static ASM_FUNC_ATTR_INLINE block
|
||
|
+vec_be_swap(block vec, vector16x_u8 be_bswap_const)
|
||
|
+{
|
||
|
+#ifndef WORDS_BIGENDIAN
|
||
|
+ __asm__ volatile ("vperm %0, %1, %1, %2\n\t"
|
||
|
+ : "=v" (vec)
|
||
|
+ : "v" (vec), "v" (be_bswap_const));
|
||
|
+#else
|
||
|
+ (void)be_bswap_const;
|
||
|
+#endif
|
||
|
+ return vec;
|
||
|
+}
|
||
|
+
|
||
|
+
|
||
|
/* Power ghash based on papers:
|
||
|
"The Galois/Counter Mode of Operation (GCM)"; David A. McGrew, John Viega
|
||
|
"Intel® Carry-Less Multiplication Instruction and its Usage for Computing
|
||
|
@@ -216,15 +261,16 @@ vec_load_be(unsigned long offset, const unsigned char *ptr,
|
||
|
void ASM_FUNC_ATTR
|
||
|
_gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
|
||
|
{
|
||
|
- vector16x_u8 bswap_const =
|
||
|
- { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
|
||
|
- vector16x_u8 c2 =
|
||
|
+ static const vector16x_u8 bswap_const =
|
||
|
+ { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
|
||
|
+ static const vector16x_u8 c2 =
|
||
|
{ 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0b11000010 };
|
||
|
+ static const vector16x_u8 one =
|
||
|
+ { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||
|
block T0, T1, T2;
|
||
|
block C2, H, H1, H1l, H1h, H2, H2l, H2h;
|
||
|
block H3l, H3, H3h, H4l, H4, H4h, T3, T4;
|
||
|
vector16x_s8 most_sig_of_H, t7, carry;
|
||
|
- vector16x_u8 one = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 };
|
||
|
|
||
|
H = VEC_LOAD_BE(gcm_key, bswap_const);
|
||
|
most_sig_of_H = vec_splat((vector16x_s8)H, 15);
|
||
|
@@ -255,7 +301,7 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
|
||
|
/* reduce 1 */
|
||
|
T0 = asm_vpmsumd (H2l, C2);
|
||
|
|
||
|
- H2l ^= asm_rot_block_left (H2);;
|
||
|
+ H2l ^= asm_rot_block_left (H2);
|
||
|
H2h ^= asm_rot_block_right (H2);
|
||
|
H2l = asm_swap_u64 (H2l);
|
||
|
H2l ^= T0;
|
||
|
@@ -321,45 +367,30 @@ _gcry_ghash_setup_ppc_vpmsum (uint64_t *gcm_table, void *gcm_key)
|
||
|
STORE_TABLE (gcm_table, 12, H4h);
|
||
|
}
|
||
|
|
||
|
-ASM_FUNC_ATTR_INLINE
|
||
|
-block
|
||
|
-vec_perm2(block l, block r, vector16x_u8 perm) {
|
||
|
- block ret;
|
||
|
- __asm__ ("vperm %0,%1,%2,%3\n\t"
|
||
|
- : "=v" (ret)
|
||
|
- : "v" (l), "v" (r), "v" (perm));
|
||
|
- return ret;
|
||
|
-}
|
||
|
-
|
||
|
void ASM_FUNC_ATTR
|
||
|
-_gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table,
|
||
|
- const byte *const buf, const size_t nblocks)
|
||
|
+_gcry_ghash_ppc_vpmsum (byte *result, const void *const gcm_table,
|
||
|
+ const byte *buf, const size_t nblocks)
|
||
|
{
|
||
|
- /* This const is strange, it is reversing the bytes, and also reversing
|
||
|
- the u32s that get switched by lxvw4 and it also addresses bytes big-endian,
|
||
|
- and is here due to lack of proper peep-hole optimization. */
|
||
|
- vector16x_u8 bswap_const =
|
||
|
- { 12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3 };
|
||
|
- vector16x_u8 bswap_8_const =
|
||
|
- { 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0 };
|
||
|
+ static const vector16x_u8 bswap_const =
|
||
|
+ { ~7, ~6, ~5, ~4, ~3, ~2, ~1, ~0, ~15, ~14, ~13, ~12, ~11, ~10, ~9, ~8 };
|
||
|
block c2, H0l, H0m, H0h, H4l, H4m, H4h, H2m, H3l, H3m, H3h, Hl;
|
||
|
block Hm, Hh, in, in0, in1, in2, in3, Hm_right, Hl_rotate, cur;
|
||
|
- size_t blocks_remaining = nblocks, off = 0;
|
||
|
+ size_t blocks_remaining = nblocks;
|
||
|
size_t not_multiple_of_four;
|
||
|
block t0;
|
||
|
|
||
|
- cur = vec_load_be (0, result, bswap_const);
|
||
|
+ cur = vec_be_swap (vec_load_he (0, result), bswap_const);
|
||
|
|
||
|
- c2 = vec_aligned_ld (0, gcm_table);
|
||
|
- H0l = vec_aligned_ld (16, gcm_table);
|
||
|
- H0m = vec_aligned_ld (32, gcm_table);
|
||
|
- H0h = vec_aligned_ld (48, gcm_table);
|
||
|
+ c2 = vec_load_he (0, gcm_table);
|
||
|
+ H0l = vec_load_he (16, gcm_table);
|
||
|
+ H0m = vec_load_he (32, gcm_table);
|
||
|
+ H0h = vec_load_he (48, gcm_table);
|
||
|
|
||
|
for (not_multiple_of_four = nblocks % 4; not_multiple_of_four;
|
||
|
not_multiple_of_four--)
|
||
|
{
|
||
|
- in = vec_load_be (off, buf, bswap_const);
|
||
|
- off += 16;
|
||
|
+ in = vec_be_swap (vec_load_he (0, buf), bswap_const);
|
||
|
+ buf += 16;
|
||
|
blocks_remaining--;
|
||
|
cur ^= in;
|
||
|
|
||
|
@@ -385,62 +416,64 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table,
|
||
|
|
||
|
if (blocks_remaining > 0)
|
||
|
{
|
||
|
- vector16x_u8 hiperm =
|
||
|
- {
|
||
|
- 0x17, 0x16, 0x15, 0x14, 0x13, 0x12, 0x11, 0x10,
|
||
|
- 0x7, 0x6, 0x5, 0x4, 0x3, 0x2, 0x1, 0x0
|
||
|
- };
|
||
|
- vector16x_u8 loperm =
|
||
|
- {
|
||
|
- 0x1f, 0x1e, 0x1d, 0x1c, 0x1b, 0x1a, 0x19, 0x18,
|
||
|
- 0xf, 0xe, 0xd, 0xc, 0xb, 0xa, 0x9, 0x8
|
||
|
- };
|
||
|
block Xl, Xm, Xh, Xl1, Xm1, Xh1, Xm2, Xl3, Xm3, Xh3, Xl_rotate;
|
||
|
block H21l, H21h, merge_l, merge_h;
|
||
|
-
|
||
|
- H2m = vec_aligned_ld (48 + 32, gcm_table);
|
||
|
- H3l = vec_aligned_ld (48 * 2 + 16, gcm_table);
|
||
|
- H3m = vec_aligned_ld (48 * 2 + 32, gcm_table);
|
||
|
- H3h = vec_aligned_ld (48 * 2 + 48, gcm_table);
|
||
|
- H4l = vec_aligned_ld (48 * 3 + 16, gcm_table);
|
||
|
- H4m = vec_aligned_ld (48 * 3 + 32, gcm_table);
|
||
|
- H4h = vec_aligned_ld (48 * 3 + 48, gcm_table);
|
||
|
-
|
||
|
- in0 = vec_load_be (off, buf, bswap_const);
|
||
|
- in1 = vec_load_be (off + 16, buf, bswap_const);
|
||
|
- in2 = vec_load_be (off + 32, buf, bswap_const);
|
||
|
- in3 = vec_load_be (off + 48, buf, bswap_const);
|
||
|
- blocks_remaining -= 4;
|
||
|
- off += 64;
|
||
|
-
|
||
|
- Xh = in0 ^ cur;
|
||
|
+ block t1, t2;
|
||
|
+
|
||
|
+ H2m = vec_load_he (48 + 32, gcm_table);
|
||
|
+ H3l = vec_load_he (48 * 2 + 16, gcm_table);
|
||
|
+ H3m = vec_load_he (48 * 2 + 32, gcm_table);
|
||
|
+ H3h = vec_load_he (48 * 2 + 48, gcm_table);
|
||
|
+ H4l = vec_load_he (48 * 3 + 16, gcm_table);
|
||
|
+ H4m = vec_load_he (48 * 3 + 32, gcm_table);
|
||
|
+ H4h = vec_load_he (48 * 3 + 48, gcm_table);
|
||
|
+
|
||
|
+ in0 = vec_load_he (0, buf);
|
||
|
+ in1 = vec_load_he (16, buf);
|
||
|
+ in2 = vec_load_he (32, buf);
|
||
|
+ in3 = vec_load_he (48, buf);
|
||
|
+ in0 = vec_be_swap(in0, bswap_const);
|
||
|
+ in1 = vec_be_swap(in1, bswap_const);
|
||
|
+ in2 = vec_be_swap(in2, bswap_const);
|
||
|
+ in3 = vec_be_swap(in3, bswap_const);
|
||
|
+
|
||
|
+ Xh = asm_xor (in0, cur);
|
||
|
|
||
|
Xl1 = asm_vpmsumd (in1, H3l);
|
||
|
Xm1 = asm_vpmsumd (in1, H3m);
|
||
|
Xh1 = asm_vpmsumd (in1, H3h);
|
||
|
|
||
|
- H21l = vec_perm2 (H2m, H0m, hiperm);
|
||
|
- H21h = vec_perm2 (H2m, H0m, loperm);
|
||
|
- merge_l = vec_perm2 (in2, in3, loperm);
|
||
|
- merge_h = vec_perm2 (in2, in3, hiperm);
|
||
|
+ H21l = asm_mergehi (H2m, H0m);
|
||
|
+ H21h = asm_mergelo (H2m, H0m);
|
||
|
+ merge_l = asm_mergelo (in2, in3);
|
||
|
+ merge_h = asm_mergehi (in2, in3);
|
||
|
|
||
|
Xm2 = asm_vpmsumd (in2, H2m);
|
||
|
Xl3 = asm_vpmsumd (merge_l, H21l);
|
||
|
Xm3 = asm_vpmsumd (in3, H0m);
|
||
|
Xh3 = asm_vpmsumd (merge_h, H21h);
|
||
|
|
||
|
- Xm2 ^= Xm1;
|
||
|
- Xl3 ^= Xl1;
|
||
|
- Xm3 ^= Xm2;
|
||
|
- Xh3 ^= Xh1;
|
||
|
+ Xm2 = asm_xor (Xm2, Xm1);
|
||
|
+ Xl3 = asm_xor (Xl3, Xl1);
|
||
|
+ Xm3 = asm_xor (Xm3, Xm2);
|
||
|
+ Xh3 = asm_xor (Xh3, Xh1);
|
||
|
|
||
|
/* Gerald Estrin's scheme for parallel multiplication of polynomials */
|
||
|
- for (;blocks_remaining > 0; blocks_remaining -= 4, off += 64)
|
||
|
+ while (1)
|
||
|
{
|
||
|
- in0 = vec_load_be (off, buf, bswap_const);
|
||
|
- in1 = vec_load_be (off + 16, buf, bswap_const);
|
||
|
- in2 = vec_load_be (off + 32, buf, bswap_const);
|
||
|
- in3 = vec_load_be (off + 48, buf, bswap_const);
|
||
|
+ buf += 64;
|
||
|
+ blocks_remaining -= 4;
|
||
|
+ if (!blocks_remaining)
|
||
|
+ break;
|
||
|
+
|
||
|
+ in0 = vec_load_he (0, buf);
|
||
|
+ in1 = vec_load_he (16, buf);
|
||
|
+ in2 = vec_load_he (32, buf);
|
||
|
+ in3 = vec_load_he (48, buf);
|
||
|
+ in1 = vec_be_swap(in1, bswap_const);
|
||
|
+ in2 = vec_be_swap(in2, bswap_const);
|
||
|
+ in3 = vec_be_swap(in3, bswap_const);
|
||
|
+ in0 = vec_be_swap(in0, bswap_const);
|
||
|
|
||
|
Xl = asm_vpmsumd (Xh, H4l);
|
||
|
Xm = asm_vpmsumd (Xh, H4m);
|
||
|
@@ -449,62 +482,63 @@ _gcry_ghash_ppc_vpmsum (const byte *result, const void *const gcm_table,
|
||
|
Xm1 = asm_vpmsumd (in1, H3m);
|
||
|
Xh1 = asm_vpmsumd (in1, H3h);
|
||
|
|
||
|
- Xl ^= Xl3;
|
||
|
- Xm ^= Xm3;
|
||
|
- Xh ^= Xh3;
|
||
|
- merge_l = vec_perm2 (in2, in3, loperm);
|
||
|
- merge_h = vec_perm2 (in2, in3, hiperm);
|
||
|
+ Xl = asm_xor (Xl, Xl3);
|
||
|
+ Xm = asm_xor (Xm, Xm3);
|
||
|
+ Xh = asm_xor (Xh, Xh3);
|
||
|
+ merge_l = asm_mergelo (in2, in3);
|
||
|
+ merge_h = asm_mergehi (in2, in3);
|
||
|
|
||
|
t0 = asm_vpmsumd (Xl, c2);
|
||
|
Xl3 = asm_vpmsumd (merge_l, H21l);
|
||
|
Xh3 = asm_vpmsumd (merge_h, H21h);
|
||
|
|
||
|
- Xl ^= asm_rot_block_left (Xm);
|
||
|
- Xh ^= asm_rot_block_right (Xm);
|
||
|
+ t1 = asm_rot_block_left (Xm);
|
||
|
+ t2 = asm_rot_block_right (Xm);
|
||
|
+ Xl = asm_xor(Xl, t1);
|
||
|
+ Xh = asm_xor(Xh, t2);
|
||
|
|
||
|
Xl = asm_swap_u64 (Xl);
|
||
|
- Xl ^= t0;
|
||
|
+ Xl = asm_xor(Xl, t0);
|
||
|
|
||
|
Xl_rotate = asm_swap_u64 (Xl);
|
||
|
Xm2 = asm_vpmsumd (in2, H2m);
|
||
|
Xm3 = asm_vpmsumd (in3, H0m);
|
||
|
Xl = asm_vpmsumd (Xl, c2);
|
||
|
|
||
|
- Xl3 ^= Xl1;
|
||
|
- Xh3 ^= Xh1;
|
||
|
- Xh ^= in0;
|
||
|
- Xm2 ^= Xm1;
|
||
|
- Xh ^= Xl_rotate;
|
||
|
- Xm3 ^= Xm2;
|
||
|
- Xh ^= Xl;
|
||
|
+ Xl3 = asm_xor (Xl3, Xl1);
|
||
|
+ Xh3 = asm_xor (Xh3, Xh1);
|
||
|
+ Xh = asm_xor (Xh, in0);
|
||
|
+ Xm2 = asm_xor (Xm2, Xm1);
|
||
|
+ Xh = asm_xor (Xh, Xl_rotate);
|
||
|
+ Xm3 = asm_xor (Xm3, Xm2);
|
||
|
+ Xh = asm_xor (Xh, Xl);
|
||
|
}
|
||
|
|
||
|
Xl = asm_vpmsumd (Xh, H4l);
|
||
|
Xm = asm_vpmsumd (Xh, H4m);
|
||
|
Xh = asm_vpmsumd (Xh, H4h);
|
||
|
|
||
|
- Xl ^= Xl3;
|
||
|
- Xm ^= Xm3;
|
||
|
+ Xl = asm_xor (Xl, Xl3);
|
||
|
+ Xm = asm_xor (Xm, Xm3);
|
||
|
|
||
|
t0 = asm_vpmsumd (Xl, c2);
|
||
|
|
||
|
- Xh ^= Xh3;
|
||
|
- Xl ^= asm_rot_block_left (Xm);
|
||
|
- Xh ^= asm_rot_block_right (Xm);
|
||
|
+ Xh = asm_xor (Xh, Xh3);
|
||
|
+ t1 = asm_rot_block_left (Xm);
|
||
|
+ t2 = asm_rot_block_right (Xm);
|
||
|
+ Xl = asm_xor (Xl, t1);
|
||
|
+ Xh = asm_xor (Xh, t2);
|
||
|
|
||
|
Xl = asm_swap_u64 (Xl);
|
||
|
- Xl ^= t0;
|
||
|
+ Xl = asm_xor (Xl, t0);
|
||
|
|
||
|
Xl_rotate = asm_swap_u64 (Xl);
|
||
|
Xl = asm_vpmsumd (Xl, c2);
|
||
|
- Xl_rotate ^= Xh;
|
||
|
- Xl ^= Xl_rotate;
|
||
|
-
|
||
|
- cur = Xl;
|
||
|
+ Xh = asm_xor (Xh, Xl_rotate);
|
||
|
+ cur = asm_xor (Xh, Xl);
|
||
|
}
|
||
|
|
||
|
- cur = (block)vec_perm ((vector16x_u8)cur, (vector16x_u8)cur, bswap_8_const);
|
||
|
- STORE_TABLE (result, 0, cur);
|
||
|
+ vec_store_he (vec_be_swap (cur, bswap_const), 0, result);
|
||
|
}
|
||
|
|
||
|
#endif /* GCM_USE_PPC_VPMSUM */
|
||
|
|
||
|
diff --git a/cipher/Makefile.am b/cipher/Makefile.am
|
||
|
index ab5d2a38..7a777ef2 100644
|
||
|
--- a/cipher/Makefile.am
|
||
|
+++ b/cipher/Makefile.am
|
||
|
@@ -42,8 +42,7 @@ libcipher_la_LIBADD = $(GCRYPT_MODULES)
|
||
|
libcipher_la_SOURCES = \
|
||
|
cipher.c cipher-internal.h \
|
||
|
cipher-cbc.c cipher-cfb.c cipher-ofb.c cipher-ctr.c cipher-aeswrap.c \
|
||
|
-cipher-ccm.c cipher-cmac.c cipher-gcm.c cipher-gcm-intel-pclmul.c \
|
||
|
- cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
|
||
|
+cipher-ccm.c cipher-cmac.c cipher-gcm.c \
|
||
|
cipher-poly1305.c cipher-ocb.c cipher-xts.c \
|
||
|
cipher-selftest.c cipher-selftest.h \
|
||
|
pubkey.c pubkey-internal.h pubkey-util.c \
|
||
|
@@ -66,7 +65,8 @@ blowfish.c blowfish-amd64.S blowfish-arm.S \
|
||
|
cast5.c cast5-amd64.S cast5-arm.S \
|
||
|
chacha20.c chacha20-sse2-amd64.S chacha20-ssse3-amd64.S chacha20-avx2-amd64.S \
|
||
|
chacha20-armv7-neon.S \
|
||
|
-cipher-gcm-ppc.c \
|
||
|
+cipher-gcm-ppc.c cipher-gcm-intel-pclmul.c \
|
||
|
+ cipher-gcm-armv8-aarch32-ce.S cipher-gcm-armv8-aarch64-ce.S \
|
||
|
crc.c \
|
||
|
crc-intel-pclmul.c crc-ppc.c \
|
||
|
des.c des-amd64.S \
|
||
|
diff --git a/configure.ac b/configure.ac
|
||
|
index fd447906..9bcb1318 100644
|
||
|
--- a/configure.ac
|
||
|
+++ b/configure.ac
|
||
|
@@ -2754,14 +2754,18 @@ esac
|
||
|
|
||
|
# Arch specific GCM implementations
|
||
|
case "${host}" in
|
||
|
- powerpc64le-*-*)
|
||
|
- GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ i?86-*-* | x86_64-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-intel-pclmul.lo"
|
||
|
;;
|
||
|
- powerpc64-*-*)
|
||
|
- GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ arm*-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv7-neon.lo"
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch32-ce.lo"
|
||
|
+ ;;
|
||
|
+ aarch64-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-armv8-aarch64-ce.lo"
|
||
|
;;
|
||
|
- powerpc-*-*)
|
||
|
- GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
+ powerpc64le-*-* | powerpc64-*-* | powerpc-*-*)
|
||
|
+ GCRYPT_DIGESTS="$GCRYPT_DIGESTS cipher-gcm-ppc.lo"
|
||
|
;;
|
||
|
esac
|
||
|
|