596 lines
20 KiB
Diff
596 lines
20 KiB
Diff
|
Co-authored-by: Stefan Liebler <stli at linux.ibm.com>
|
||
|
---
|
||
|
mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++
|
||
|
mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++
|
||
|
mpn/s390_64/z13/mul_1.c | 31 +++
|
||
|
3 files changed, 564 insertions(+)
|
||
|
create mode 100644 mpn/s390_64/z13/addmul_1.c
|
||
|
create mode 100644 mpn/s390_64/z13/common-vec.h
|
||
|
create mode 100644 mpn/s390_64/z13/mul_1.c
|
||
|
|
||
|
diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c
|
||
|
new file mode 100644
|
||
|
index 000000000..022e5edcc
|
||
|
--- /dev/null
|
||
|
+++ b/mpn/s390_64/z13/addmul_1.c
|
||
|
@@ -0,0 +1,358 @@
|
||
|
+/* Addmul_1 / mul_1 for IBM z13 and later
|
||
|
+ Contributed by Marius Hillenbrand
|
||
|
+
|
||
|
+Copyright 2021 Free Software Foundation, Inc.
|
||
|
+
|
||
|
+This file is part of the GNU MP Library.
|
||
|
+
|
||
|
+The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
+it under the terms of either:
|
||
|
+
|
||
|
+ * the GNU Lesser General Public License as published by the Free
|
||
|
+ Software Foundation; either version 3 of the License, or (at your
|
||
|
+ option) any later version.
|
||
|
+
|
||
|
+or
|
||
|
+
|
||
|
+ * the GNU General Public License as published by the Free Software
|
||
|
+ Foundation; either version 2 of the License, or (at your option) any
|
||
|
+ later version.
|
||
|
+
|
||
|
+or both in parallel, as here.
|
||
|
+
|
||
|
+The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
|
+for more details.
|
||
|
+
|
||
|
+You should have received copies of the GNU General Public License and the
|
||
|
+GNU Lesser General Public License along with the GNU MP Library. If not,
|
||
|
+see https://www.gnu.org/licenses/. */
|
||
|
+
|
||
|
+#include "gmp-impl.h"
|
||
|
+#include "s390_64/z13/common-vec.h"
|
||
|
+
|
||
|
+#undef FUNCNAME
|
||
|
+
|
||
|
+#ifdef DO_INLINE
|
||
|
+# ifdef OPERATION_addmul_1
|
||
|
+# define ADD
|
||
|
+# define FUNCNAME inline_addmul_1
|
||
|
+# elif defined(OPERATION_mul_1)
|
||
|
+# define FUNCNAME inline_mul_1
|
||
|
+# endif
|
||
|
+
|
||
|
+#else
|
||
|
+# ifdef OPERATION_addmul_1
|
||
|
+# define ADD
|
||
|
+# define FUNCNAME mpn_addmul_1
|
||
|
+# elif defined(OPERATION_mul_1)
|
||
|
+# define FUNCNAME mpn_mul_1
|
||
|
+# endif
|
||
|
+#endif
|
||
|
+
|
||
|
+/* FUNCNAME (rp, s1p, n, s2limb): multiply the n-limb operand {s1p, n} by
+   s2limb.  For mul_1, the low n limbs of the product are stored to {rp, n};
+   for addmul_1 (ADD defined), the product is added to the previous contents
+   of {rp, n}.  The limb carried out of the most significant position is
+   returned.
+
+   Requires n >= 1 and MPN_SAME_OR_INCR_P (rp, s1p, n).  With DO_INLINE
+   defined, the function is emitted as a static, always-inline helper. */
+#ifdef DO_INLINE
+static inline mp_limb_t
+FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
+    __attribute__ ((always_inline));
+
+static inline
+#endif
+mp_limb_t
+FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
+{
+  ASSERT (n >= 1);
+  ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n));
+
+  /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
+     VRs (using each VR as a single 128-bit accumulator).
+     The inner loop is unrolled to four limbs, with two blocks of two
+     multiplications each.  Since the MLGR operation operates on even/odd GPR
+     pairs, pin the products appropriately. */
+
+  /* products as GPR pairs */
+  register mp_limb_t p0_high asm("r0");
+  register mp_limb_t p0_low asm("r1");
+
+  register mp_limb_t p1_high asm("r8");
+  register mp_limb_t p1_low asm("r9");
+
+  register mp_limb_t p2_high asm("r6");
+  register mp_limb_t p2_low asm("r7");
+
+  register mp_limb_t p3_high asm("r10");
+  register mp_limb_t p3_low asm("r11");
+
+  /* carry flag for 128-bit add in VR for first carry chain */
+  vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
+  mp_limb_t carry_limb = 0;
+
+#ifdef ADD
+  /* 2nd carry flag for 2nd carry chain with addmul */
+  vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
+  vec_t sum0;
+  vec_t rp0_addend, rp1_addend;
+  rp0_addend.dw = vec_splat_u64 (0);
+  rp1_addend.dw = vec_splat_u64 (0);
+#endif
+  vec_t sum1;
+
+  vec_t carry_prod = { .dw = vec_splat_u64 (0) };
+
+  /* The scalar multiplications compete with pointer and index increments for
+   * issue ports. Thus, increment the loop index in the middle of the loop so
+   * that the operations for the next iteration's multiplications can be
+   * loaded in time (looks horrible, yet helps performance) and make sure we
+   * use addressing with base reg + index reg + immediate displacement
+   * (so that only the single index needs incrementing, instead of multiple
+   * pointers). */
+#undef LOOP_ADVANCE
+#undef IDX_OFFSET
+
+  /* Both macros are fully parenthesized so that they can be used inside
+   * larger expressions without operator-precedence surprises. */
+#define LOOP_ADVANCE (4 * sizeof (mp_limb_t))
+#define IDX_OFFSET (LOOP_ADVANCE)
+  register ssize_t idx = 0 - IDX_OFFSET;
+
+  /*
+   * branch-on-count implicitly hints to the branch prediction as taken, while
+   * compare-and-branch hints as not taken. currently, using branch-on-count
+   * has a performance advantage, but it is not clear that it is generally the
+   * better choice (e.g., branch-on-count requires decrementing the separate
+   * counter). so, allow switching the loop condition to enable either
+   * category of branch instructions:
+   * - idx is less than an upper bound, for compare-and-branch
+   * - iteration counter greater than zero, for branch-on-count
+   */
+#define BRCTG
+#ifdef BRCTG
+  ssize_t iterations = (size_t)n / 4;
+#else
+  ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
+#endif
+
+  /* products will be transferred into VRs before adding up.
+   * see main loop below for comments on accumulation scheme. */
+  vec_t product0, product1, product2;
+
+  product0.dw = vec_splat_u64 (0);
+
+  /* Handle the leading n % 4 limbs here, so that the main loop only has to
+   * process complete groups of four limbs.  Each case leaves idx pointing
+   * (minus IDX_OFFSET) at the first limb that is left for the main loop. */
+  switch ((size_t)n % 4)
+    {
+    case 0:
+      break;
+
+    case 1:
+      idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+      p3_low = s1p[0];
+      s390_umul_ppmm (p3_high, p3_low, s2limb);
+
+#ifdef ADD
+      rp0_addend.dw[1] = rp[0];
+      product0.dw[1] = p3_low;
+
+      /* 64-bit + 64-bit in a 128-bit add: doubleword 0 of the sum is the
+       * carry; copy it into both doublewords of carry_vec1. */
+      sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
+      carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
+
+      rp[0] = sum0.dw[1];
+#else
+      rp[0] = p3_low;
+#endif
+
+      carry_limb = p3_high;
+      break;
+
+    case 2:
+      p0_low = s1p[0];
+      p3_low = s1p[1];
+      idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+      s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb);
+
+      /* the low half of the second product overlaps the high half of the
+       * first one */
+      carry_prod.dw[0] = p3_low;
+
+      product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+      carry_limb = p3_high;
+
+#ifdef ADD
+      rp0_addend = vec_load_elements_reversed (rp, 0);
+      sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
+      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
+
+      sum1.sw = vec_add_u128 (sum0.sw, product0.sw);
+      carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw);
+#else
+      sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw);
+      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw);
+#endif
+
+      vec_store_elements_reversed (rp, 0, sum1);
+
+      break;
+
+    case 3:
+      idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+      /* first limb on its own, as in case 1 */
+      p0_low = s1p[0];
+      s390_umul_ppmm (p0_high, p0_low, s2limb);
+
+#ifdef ADD
+      rp0_addend.dw[1] = rp[0];
+      product0.dw[1] = p0_low;
+
+      sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
+      carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
+
+      rp[0] = sum0.dw[1];
+#else
+      rp[0] = p0_low;
+#endif
+      carry_limb = p0_high;
+
+      /* remaining two limbs as a pair, with the carry limb from above */
+      p1_low = s1p[1];
+      p3_low = s1p[2];
+
+      s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb);
+
+      carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb);
+      product1.dw = vec_load_2di_as_pair (p1_high, p1_low);
+      carry_limb = p3_high;
+
+#ifdef ADD
+      rp0_addend = vec_load_elements_reversed (rp, 8);
+      sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
+      carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
+
+      sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw);
+      carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw);
+#else
+      sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
+      carry_vec0.sw
+          = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
+#endif
+      vec_store_elements_reversed (rp, 8, sum1);
+      break;
+    }
+
+#ifdef BRCTG
+  for (; iterations > 0; iterations--)
+    {
+#else
+  while (idx < idx_bound)
+    {
+#endif
+      vec_t overlap_addend0;
+      vec_t overlap_addend1;
+
+      /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
+       * result in a GPR pair. One of the factors is taken from the GPR pair
+       * and overwritten.
+       * To reuse factors, it turned out cheaper to load limbs multiple times
+       * than copying GPR contents. Enforce that and the use of addressing by
+       * base + index gpr + immediate displacement via inline asm.
+       */
+      ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET);
+      ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET);
+      ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET);
+      ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET);
+
+      /*
+       * accumulate products as follows (for addmul):
+       *                       | rp[i+3] | rp[i+2] | rp[i+1] | rp[i]   |
+       *                                           | p0_high | p0_low  |
+       *                                 | p1_high | p1_low  | carry-limb in
+       *                       | p2_high | p2_low  |
+       * c-limb out <- p3_high | p3_low  |
+       *                       |  < 128-bit VR >   |  < 128-bit VR >   |
+       *
+       *                          < rp1_addend >      < rp0_addend >
+       * carry-chain 0 <-                +        <-         + <- carry_vec0[127]
+       *                           < product2 >        < product0 >
+       * carry-chain 1 <-                +        <-         + <- carry_vec1[127]
+       *                        < overlap_addend1 > < overlap_addend0 >
+       *
+       * note that a 128-bit add with carry in + out is built from two insns
+       * - vec_adde_u128 (vacq) provides sum
+       * - vec_addec_u128 (vacccq) provides the new carry bit
+       */
+
+      s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb);
+
+      /*
+       * "barrier" to enforce scheduling loads for all limbs and first round
+       * of MLGR before anything else.
+       */
+      asm volatile("");
+
+      product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+#ifdef ADD
+      rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET);
+      rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET);
+#endif
+      /* increment loop index to unblock dependent loads of limbs for the next
+       * iteration (see above at #define LOOP_ADVANCE) */
+      idx += LOOP_ADVANCE;
+
+      s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb);
+
+      overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb);
+      asm volatile("");
+
+#ifdef ADD
+      sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
+      sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);
+
+      carry_vec0.sw
+          = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
+      carry_vec1.sw
+          = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);
+#else
+      sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
+      carry_vec0.sw
+          = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
+#endif
+
+      asm volatile("");
+      product2.dw = vec_load_2di_as_pair (p2_high, p2_low);
+      overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high);
+
+      vec_t sum4;
+
+#ifdef ADD
+      vec_t sum3;
+      sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
+      sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);
+
+      carry_vec0.sw
+          = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
+      carry_vec1.sw
+          = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);
+#else
+      sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
+      carry_vec0.sw
+          = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
+#endif
+      /* idx has already been advanced above, hence the - LOOP_ADVANCE */
+      vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE,
+                                       sum1);
+      vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
+                                       sum4);
+
+      carry_limb = p3_high;
+    }
+
+  /* fold the pending carry bits of the carry chain(s) into the carry limb */
+#ifdef ADD
+  carry_vec0.dw += carry_vec1.dw;
+  carry_limb += carry_vec0.dw[1];
+#else
+  carry_limb += carry_vec0.dw[1];
+#endif
+
+  return carry_limb;
+}
|
||
|
+
|
||
|
+#undef OPERATION_addmul_1
|
||
|
+#undef OPERATION_mul_1
|
||
|
+#undef FUNCNAME
|
||
|
+#undef ADD
|
||
|
diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h
|
||
|
new file mode 100644
|
||
|
index 000000000..a59e6eefe
|
||
|
--- /dev/null
|
||
|
+++ b/mpn/s390_64/z13/common-vec.h
|
||
|
@@ -0,0 +1,175 @@
|
||
|
+/* Common vector helpers and macros for IBM z13 and later
|
||
|
+
|
||
|
+Copyright 2021 Free Software Foundation, Inc.
|
||
|
+
|
||
|
+This file is part of the GNU MP Library.
|
||
|
+
|
||
|
+The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
+it under the terms of either:
|
||
|
+
|
||
|
+ * the GNU Lesser General Public License as published by the Free
|
||
|
+ Software Foundation; either version 3 of the License, or (at your
|
||
|
+ option) any later version.
|
||
|
+
|
||
|
+or
|
||
|
+
|
||
|
+ * the GNU General Public License as published by the Free Software
|
||
|
+ Foundation; either version 2 of the License, or (at your option) any
|
||
|
+ later version.
|
||
|
+
|
||
|
+or both in parallel, as here.
|
||
|
+
|
||
|
+The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
|
+for more details.
|
||
|
+
|
||
|
+You should have received copies of the GNU General Public License and the
|
||
|
+GNU Lesser General Public License along with the GNU MP Library. If not,
|
||
|
+see https://www.gnu.org/licenses/. */
|
||
|
+
|
||
|
+#ifndef __S390_64_Z13_COMMON_VEC_H
|
||
|
+#define __S390_64_Z13_COMMON_VEC_H
|
||
|
+
|
||
|
+#include <unistd.h>
|
||
|
+#include <vecintrin.h>
|
||
|
+
|
||
|
+/*
+ * The Z vector intrinsics expect different vector element types depending on
+ * the operation (e.g., v16qi for the 128-bit adds and v2di for vec_permi,
+ * which permutes doublewords).  To use one VR interchangeably with both kinds
+ * of intrinsics, overlay the two vector types in a union.
+ */
+#define VLEN_BYTES 16
+typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
+typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));
+
+typedef union vec
+{
+  v2di dw;  /* view as two 64-bit doublewords */
+  v16qi sw; /* view as sixteen bytes, passed to the 128-bit add intrinsics */
+} vec_t;
|
||
|
+
|
||
|
+/*
+ * Combine two GPRs into one VR with a single VLVGP instruction.  The callers
+ * in this directory pass the more significant doubleword as `a' and the less
+ * significant one as `b'.
+ */
+static inline v2di
+vec_load_2di_as_pair (unsigned long a, unsigned long b)
+{
+  v2di pair;
+
+  __asm__("vlvgp\t%0,%1,%2" : "=v"(pair) : "r"(a), "r"(b));
+
+  return pair;
+}
|
||
|
+
|
||
|
+/*
+ * 64x64 mult where caller needs to care about proper register allocation:
+ * multiply xl with m1, treating both as unsigned, and place the result in
+ * xh:xl.
+ * mlgr operates on register pairs, so xh must be an even gpr followed by xl.
+ *
+ * The macro expands to one statement without a trailing semicolon, so it has
+ * to be used as `s390_umul_ppmm (h, l, m);' and is safe in if/else bodies.
+ */
+#define s390_umul_ppmm(xh, xl, m1)                                            \
+  do                                                                          \
+    {                                                                         \
+      asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1));            \
+    }                                                                         \
+  while (0)
|
||
|
+
|
||
|
+/*
+ * two 64x64 multiplications, scheduled so that they will dispatch and issue to
+ * different sides: each mlgr is dispatched alone in an instruction group and
+ * subsequent groups will issue on different execution sides.
+ * there is a variant where both products use the same multiplicand and one
+ * that uses two different multiplicands. constraints from s390_umul_ppmm apply
+ * here.
+ *
+ * Both macros expand to one statement without a trailing semicolon; the
+ * caller supplies the semicolon.
+ */
+#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX)                         \
+  do                                                                          \
+    {                                                                         \
+      asm("mlgr\t%[x0h],%[mx]\n\t"                                            \
+          "mlgr\t%[x1h],%[mx]"                                                \
+          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H),              \
+            [x1l] "=r"(X1L)                                                   \
+          : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX));                        \
+    }                                                                         \
+  while (0)
+
+#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1)          \
+  do                                                                          \
+    {                                                                         \
+      asm("mlgr\t%[x0h],%[mx0]\n\t"                                           \
+          "mlgr\t%[x1h],%[mx1]"                                               \
+          : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H),              \
+            [x1l] "=r"(X1L)                                                   \
+          : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1));      \
+    }                                                                         \
+  while (0)
|
||
|
+
|
||
|
+/*
+ * Load a 64-bit value into DST with an explicit LG from BASE + OFFSET, where
+ * OFFSET has to be a constant displacement.  Expands to one statement without
+ * a trailing semicolon; the caller supplies the semicolon.
+ */
+#define ASM_LOADGPR_BASE(DST, BASE, OFFSET)                                   \
+  asm volatile("lg\t%[r],%[off](%[b])"                                        \
+               : [r] "=r"(DST)                                                \
+               : [b] "a"(BASE), [off] "L"(OFFSET)                             \
+               : "memory")
|
||
|
+
|
||
|
+/*
+ * Load a 64-bit value into DST with an explicit LG that uses base register +
+ * index register + constant displacement addressing (BASE + INDEX + OFFSET).
+ * Expands to one statement without a trailing semicolon; the caller supplies
+ * the semicolon.
+ */
+#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET)                                 \
+  asm volatile("lg\t%[r],%[off](%[b],%[x])"                                   \
+               : [r] "=r"(DST)                                                \
+               : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET)             \
+               : "memory")
|
||
|
+
|
||
|
+/*
+ * Load a vector register from the 16 bytes at base + index + offset (index
+ * and offset are byte counts) and swap the two 64-bit doubleword elements.
+ */
+static inline vec_t
+vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index,
+                                ssize_t const offset)
+{
+  char *addr = (char *)base + index + offset;
+  vec_t loaded;
+
+  loaded.sw = *(v16qi *)addr;
+  loaded.dw = vec_permi (loaded.dw, loaded.dw, 2);
+
+  return loaded;
+}
|
||
|
+
|
||
|
+/*
+ * Same as vec_load_elements_reversed_idx, but without an index: load from
+ * base + offset (offset in bytes).
+ */
+static inline vec_t
+vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset)
+{
+  ssize_t const no_index = 0;
+
+  return vec_load_elements_reversed_idx (base, no_index, offset);
+}
|
||
|
+
|
||
|
+/*
+ * Swap the two 64-bit doubleword elements of vec and store the resulting 16
+ * bytes to memory at base + index + offset (index and offset are byte
+ * counts).
+ */
+static inline void
+vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index,
+                                 ssize_t const offset, vec_t vec)
+{
+  char *addr = (char *)base + index + offset;
+  vec_t swapped;
+
+  swapped.dw = vec_permi (vec.dw, vec.dw, 2);
+  *(v16qi *)addr = swapped.sw;
+}
|
||
|
+
|
||
|
+/*
+ * Same as vec_store_elements_reversed_idx, but without an index: store to
+ * base + offset (offset in bytes).
+ */
+static inline void
+vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec)
+{
+  ssize_t const no_index = 0;
+
+  vec_store_elements_reversed_idx (base, no_index, offset, vec);
+}
|
||
|
+
|
||
|
+/* Emit a VZERO instruction with the vector register operand VEC as its
+   output. */
+#define ASM_VZERO(VEC)                                                        \
+  do                                                                          \
+    {                                                                         \
+      asm("vzero\t%0" : "=v"(VEC));                                           \
+    }                                                                         \
+  while (0)
|
||
|
+
|
||
|
+#endif
|
||
|
diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c
|
||
|
new file mode 100644
|
||
|
index 000000000..7584dc8c7
|
||
|
--- /dev/null
|
||
|
+++ b/mpn/s390_64/z13/mul_1.c
|
||
|
@@ -0,0 +1,31 @@
|
||
|
+/* mul_1 for IBM z13 or later
|
||
|
+
|
||
|
+Copyright 2021 Free Software Foundation, Inc.
|
||
|
+
|
||
|
+This file is part of the GNU MP Library.
|
||
|
+
|
||
|
+The GNU MP Library is free software; you can redistribute it and/or modify
|
||
|
+it under the terms of either:
|
||
|
+
|
||
|
+ * the GNU Lesser General Public License as published by the Free
|
||
|
+ Software Foundation; either version 3 of the License, or (at your
|
||
|
+ option) any later version.
|
||
|
+
|
||
|
+or
|
||
|
+
|
||
|
+ * the GNU General Public License as published by the Free Software
|
||
|
+ Foundation; either version 2 of the License, or (at your option) any
|
||
|
+ later version.
|
||
|
+
|
||
|
+or both in parallel, as here.
|
||
|
+
|
||
|
+The GNU MP Library is distributed in the hope that it will be useful, but
|
||
|
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
|
||
|
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
|
||
|
+for more details.
|
||
|
+
|
||
|
+You should have received copies of the GNU General Public License and the
|
||
|
+GNU Lesser General Public License along with the GNU MP Library. If not,
|
||
|
+see https://www.gnu.org/licenses/. */
|
||
|
+
|
||
|
+#include "s390_64/z13/addmul_1.c"
|
||
|
--
|
||
|
2.40.1
|
||
|
|