Co-authored-by: Stefan Liebler --- mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++ mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++ mpn/s390_64/z13/mul_1.c | 31 +++ 3 files changed, 564 insertions(+) create mode 100644 mpn/s390_64/z13/addmul_1.c create mode 100644 mpn/s390_64/z13/common-vec.h create mode 100644 mpn/s390_64/z13/mul_1.c diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c new file mode 100644 index 000000000..022e5edcc --- /dev/null +++ b/mpn/s390_64/z13/addmul_1.c @@ -0,0 +1,358 @@ +/* Addmul_1 / mul_1 for IBM z13 and later + Contributed by Marius Hillenbrand + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "gmp-impl.h" +#include "s390_64/z13/common-vec.h" + +#undef FUNCNAME + +#ifdef DO_INLINE +# ifdef OPERATION_addmul_1 +# define ADD +# define FUNCNAME inline_addmul_1 +# elif defined(OPERATION_mul_1) +# define FUNCNAME inline_mul_1 +# endif + +#else +# ifdef OPERATION_addmul_1 +# define ADD +# define FUNCNAME mpn_addmul_1 +# elif defined(OPERATION_mul_1) +# define FUNCNAME mpn_mul_1 +# endif +#endif + +#ifdef DO_INLINE +static inline mp_limb_t +FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) + __attribute__ ((always_inline)); + +static inline +#endif +mp_limb_t +FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) +{ + ASSERT (n >= 1); + ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); + + /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in + VRs (using each VR as a single 128-bit accumulator). + The inner loop is unrolled to four limbs, with two blocks of four + multiplications each. Since the MLGR operation operates on even/odd GPR + pairs, pin the products appropriately. 
*/ + + /* products as GPR pairs */ + register mp_limb_t p0_high asm("r0"); + register mp_limb_t p0_low asm("r1"); + + register mp_limb_t p1_high asm("r8"); + register mp_limb_t p1_low asm("r9"); + + register mp_limb_t p2_high asm("r6"); + register mp_limb_t p2_low asm("r7"); + + register mp_limb_t p3_high asm("r10"); + register mp_limb_t p3_low asm("r11"); + + /* carry flag for 128-bit add in VR for first carry chain */ + vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; + mp_limb_t carry_limb = 0; + +#ifdef ADD + /* 2nd carry flag for 2nd carry chain with addmul */ + vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; + vec_t sum0; + vec_t rp0_addend, rp1_addend; + rp0_addend.dw = vec_splat_u64 (0); + rp1_addend.dw = vec_splat_u64 (0); +#endif + vec_t sum1; + + vec_t carry_prod = { .dw = vec_splat_u64 (0) }; + + /* The scalar multiplications compete with pointer and index increments for + * issue ports. Thus, increment the loop index in the middle of the loop so + * that the operations for the next iteration's multiplications can be + * loaded in time (looks horrible, yet helps performance) and make sure we + * use addressing with base reg + index reg + immediate displacement + * (so that only the single index needs incrementing, instead of multiple + * pointers). */ +#undef LOOP_ADVANCE +#undef IDX_OFFSET + +#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) +#define IDX_OFFSET (LOOP_ADVANCE) + register ssize_t idx = 0 - IDX_OFFSET; + + /* + * branch-on-count implicitly hint to the branch prediction as taken, while + * compare-and-branch hints as not taken. currently, using branch-on-count + * has a performance advantage, but it is not clear that it is generally the + * better choice (e.g., branch-on-count requires decrementing the separate + * counter). so, allow switching the loop condition to enable either + * category of branch instructions: + * - idx is less than an upper bound, for compare-and-branch + * - iteration counter greater than zero, for branch-on-count + */ +#define BRCTG +#ifdef BRCTG + ssize_t iterations = (size_t)n / 4; +#else + ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; +#endif + + /* products will be transferred into VRs before adding up. + * see main loop below for comments on accumulation scheme. 
*/ + vec_t product0, product1, product2; + + product0.dw = vec_splat_u64 (0); + + switch ((size_t)n % 4) + { + case 0: + break; + + case 1: + idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; + + p3_low = s1p[0]; + s390_umul_ppmm (p3_high, p3_low, s2limb); + +#ifdef ADD + rp0_addend.dw[1] = rp[0]; + product0.dw[1] = p3_low; + + sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); + carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); + + rp[0] = sum0.dw[1]; +#else + rp[0] = p3_low; +#endif + + carry_limb = p3_high; + break; + + case 2: + p0_low = s1p[0]; + p3_low = s1p[1]; + idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; + + s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); + + carry_prod.dw[0] = p3_low; + + product0.dw = vec_load_2di_as_pair (p0_high, p0_low); + + carry_limb = p3_high; + +#ifdef ADD + rp0_addend = vec_load_elements_reversed (rp, 0); + sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); + + sum1.sw = vec_add_u128 (sum0.sw, product0.sw); + carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); +#else + sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); +#endif + + vec_store_elements_reversed (rp, 0, sum1); + + break; + + case 3: + idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; + + p0_low = s1p[0]; + s390_umul_ppmm (p0_high, p0_low, s2limb); + +#ifdef ADD + rp0_addend.dw[1] = rp[0]; + product0.dw[1] = p0_low; + + sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); + carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); + + rp[0] = sum0.dw[1]; +#else + rp[0] = p0_low; +#endif + carry_limb = p0_high; + + p1_low = s1p[1]; + p3_low = s1p[2]; + + s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); + + carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); + product1.dw = vec_load_2di_as_pair (p1_high, p1_low); + carry_limb = p3_high; + +#ifdef ADD + rp0_addend = vec_load_elements_reversed (rp, 8); + sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); + carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); + + sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); + carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); +#else + sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); +#endif + vec_store_elements_reversed (rp, 8, sum1); + break; + } + +#ifdef BRCTG + for (; iterations > 0; iterations--) + { +#else + while (idx < idx_bound) + { +#endif + vec_t overlap_addend0; + vec_t overlap_addend1; + + /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the + * result in a GPR pair. One of the factors is taken from the GPR pair + * and overwritten. + * To reuse factors, it turned out cheaper to load limbs multiple times + * than copying GPR contents. Enforce that and the use of addressing by + * base + index gpr + immediate displacement via inline asm. 
+ */ + ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); + ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); + ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); + ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); + + /* + * accumulate products as follows (for addmul): + * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | + * p0_high | p0_low | + * p1_high | p1_low | carry-limb in + * p2_high | p2_low | + * c-limb out <- p3_high | p3_low | + * | < 128-bit VR > < 128-bit VR > + * + * < rp1_addend > < rp0_addend > + * carry-chain 0 <- + <- + <- carry_vec0[127] + * < product1 > < product0 > + * carry-chain 1 <- + <- + <- carry_vec1[127] + * < overlap_addend1 > < overlap_addend0 > + * + * note that a 128-bit add with carry in + out is built from two insns + * - vec_adde_u128 (vacq) provides sum + * - vec_addec_u128 (vacccq) provides the new carry bit + */ + + s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); + + /* + * "barrier" to enforce scheduling loads for all limbs and first round + * of MLGR before anything else. + */ + asm volatile(""); + + product0.dw = vec_load_2di_as_pair (p0_high, p0_low); + +#ifdef ADD + rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); + rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); +#endif + /* increment loop index to unblock dependant loads of limbs for the next + * iteration (see above at #define LOOP_ADVANCE) */ + idx += LOOP_ADVANCE; + + s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); + + overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); + asm volatile(""); + +#ifdef ADD + sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); + sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); + + carry_vec0.sw + = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); + carry_vec1.sw + = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); +#else + sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); +#endif + + asm volatile(""); + product2.dw = vec_load_2di_as_pair (p2_high, p2_low); + overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); + + vec_t sum4; + +#ifdef ADD + vec_t sum3; + sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); + sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); + + carry_vec0.sw + = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); + carry_vec1.sw + = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); +#else + sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); + carry_vec0.sw + = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); +#endif + vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, + sum1); + vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, + sum4); + + carry_limb = p3_high; + } + +#ifdef ADD + carry_vec0.dw += carry_vec1.dw; + carry_limb += carry_vec0.dw[1]; +#else + carry_limb += carry_vec0.dw[1]; +#endif + + return carry_limb; +} + +#undef OPERATION_addmul_1 +#undef OPERATION_mul_1 +#undef FUNCNAME +#undef ADD diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h new file mode 100644 index 000000000..a59e6eefe --- /dev/null +++ b/mpn/s390_64/z13/common-vec.h @@ -0,0 +1,175 @@ +/* Common vector helpers and macros for IBM z13 and later + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. 
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#ifndef __S390_64_Z13_COMMON_VEC_H
+#define __S390_64_Z13_COMMON_VEC_H
+
+#include <unistd.h>
+#include <vecintrin.h>
+
+/*
+ * Vector intrinsics use vector element types that kind-of make sense for the
+ * specific operation (e.g., vec_permi permutes doublewords). To use VRs
+ * interchangeably with different intrinsics, typedef the two variants and wrap
+ * them in a union.
+ */
+#define VLEN_BYTES 16
+typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
+typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));
+
+/*
+ * The Z vector intrinsics use vectors with different element types (e.g.,
+ * v16qi for the 128-bit adds and v2di for vec_permi).
+ */
+union vec
+{
+  v2di dw;
+  v16qi sw;
+};
+
+typedef union vec vec_t;
+
+/*
+ * single-instruction combine of two GPRs into a VR
+ */
+static inline v2di
+vec_load_2di_as_pair (unsigned long a, unsigned long b)
+{
+  v2di res;
+  __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b));
+  return res;
+}
+
+/*
+ * 64x64 mult where caller needs to care about proper register allocation:
+ * multiply xl with m1, treating both as unsigned, and place the result in
+ * xh:xl.
+ * mlgr operates on register pairs, so xh must be an even gpr followed by xl
+ */
+#define s390_umul_ppmm(xh, xl, m1)                                           \
+  do                                                                         \
+    {                                                                        \
+      asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1));           \
+    }                                                                        \
+  while (0);
+
+/*
+ * two 64x64 multiplications, scheduled so that they will dispatch and issue to
+ * different sides: each mlgr is dispatched alone in an instruction group and
+ * subsequent groups will issue on different execution sides.
+ * there is a variant where both products use the same multiplicand and one
+ * that uses two different multiplicands. constraints from s390_umul_ppmm apply
+ * here.
+ */ +#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \ + do \ + { \ + asm("mlgr\t%[x0h],%[mx]\n\t" \ + "mlgr\t%[x1h],%[mx]" \ + : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ + [x1l] "=r"(X1L) \ + : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \ + } \ + while (0); + +#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \ + do \ + { \ + asm("mlgr\t%[x0h],%[mx0]\n\t" \ + "mlgr\t%[x1h],%[mx1]" \ + : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ + [x1l] "=r"(X1L) \ + : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \ + } \ + while (0); + +#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \ + asm volatile("lg\t%[r],%[off](%[b])" \ + : [r] "=r"(DST) \ + : [b] "a"(BASE), [off] "L"(OFFSET) \ + : "memory"); + +#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \ + asm volatile("lg\t%[r],%[off](%[b],%[x])" \ + : [r] "=r"(DST) \ + : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \ + : "memory"); + +/* + * Load a vector register from memory and swap the two 64-bit doubleword + * elements. + */ +static inline vec_t +vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index, + ssize_t const offset) +{ + vec_t res; + char *ptr = (char *)base; + + res.sw = *(v16qi *)(ptr + index + offset); + res.dw = vec_permi (res.dw, res.dw, 2); + + return res; +} + +static inline vec_t +vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset) +{ + return vec_load_elements_reversed_idx (base, 0, offset); +} + +/* + * Store a vector register to memory and swap the two 64-bit doubleword + * elements. + */ +static inline void +vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, + ssize_t const offset, vec_t vec) +{ + char *ptr = (char *)base; + + vec.dw = vec_permi (vec.dw, vec.dw, 2); + *(v16qi *)(ptr + index + offset) = vec.sw; +} + +static inline void +vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) +{ + vec_store_elements_reversed_idx (base, 0, offset, vec); +} + +#define ASM_VZERO(VEC) \ + do \ + { \ + asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ + } \ + while (0) + +#endif diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c new file mode 100644 index 000000000..7584dc8c7 --- /dev/null +++ b/mpn/s390_64/z13/mul_1.c @@ -0,0 +1,31 @@ +/* mul_1 for IBM z13 or later + +Copyright 2021 Free Software Foundation, Inc. + +This file is part of the GNU MP Library. + +The GNU MP Library is free software; you can redistribute it and/or modify +it under the terms of either: + + * the GNU Lesser General Public License as published by the Free + Software Foundation; either version 3 of the License, or (at your + option) any later version. + +or + + * the GNU General Public License as published by the Free Software + Foundation; either version 2 of the License, or (at your option) any + later version. + +or both in parallel, as here. + +The GNU MP Library is distributed in the hope that it will be useful, but +WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY +or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License +for more details. + +You should have received copies of the GNU General Public License and the +GNU Lesser General Public License along with the GNU MP Library. If not, +see https://www.gnu.org/licenses/. */ + +#include "s390_64/z13/addmul_1.c" -- 2.40.1
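For readers following the carry handling in addmul_1.c above: mpn_addmul_1 computes rp[i] += s1p[i] * s2limb over n limbs and returns the final carry-out limb (mpn_mul_1 is the same without the rp[] addend). Below is a minimal scalar sketch of those semantics, not part of the patch; it assumes a compiler with unsigned __int128 (e.g. GCC on s390x) and uses a hypothetical helper name ref_addmul_1. The vectorized code splits the single carry variable below into the two 128-bit carry chains described in the main-loop comment.

#include "gmp-impl.h"

/* Illustrative only: plain scalar version of what the unrolled loop computes. */
static mp_limb_t
ref_addmul_1 (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
{
  mp_limb_t carry = 0;
  mp_size_t i;
  for (i = 0; i < n; i++)
    {
      /* 64x64->128 product, as MLGR produces in a GPR pair */
      unsigned __int128 t = (unsigned __int128) s1p[i] * s2limb;
      t += rp[i];   /* addend limb (omitted for mul_1) */
      t += carry;   /* carry limb from the previous iteration */
      rp[i] = (mp_limb_t) t;
      carry = (mp_limb_t) (t >> 64);
    }
  return carry;
}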
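The s390_umul_ppmm / s390_double_umul_ppmm helpers in common-vec.h rely on the caller pinning operands so that MLGR sees an even/odd GPR pair: the high half of the product lands in the even register and the low multiplicand/product lives in the odd one (hence the r0/r1, r8/r9, r6/r7, r10/r11 pinning in addmul_1.c). A hedged usage sketch, with a hypothetical wrapper name umul_example, assuming z13 and the header above:

#include "gmp-impl.h"
#include "s390_64/z13/common-vec.h"

/* Illustrative only: one 64x64->128 multiplication through MLGR. */
static mp_limb_t
umul_example (mp_limb_t a, mp_limb_t b, mp_limb_t *lo)
{
  register mp_limb_t hi asm ("r0");  /* even register: receives the high 64 bits */
  register mp_limb_t l asm ("r1");   /* odd register: multiplicand in, low 64 bits out */
  l = a;
  s390_umul_ppmm (hi, l, b);         /* r0:r1 = a * b */
  *lo = l;
  return hi;
}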
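The main-loop comment in addmul_1.c notes that a 128-bit add with carry-in and carry-out is built from two instructions (vec_adde_u128 / vec_addec_u128, i.e. vacq / vacccq), with vec_addc_u128 supplying the initial carry. As a sketch of that chaining in isolation, assuming <vecintrin.h> and a hypothetical add_256 helper that adds two 256-bit values held as low/high 128-bit vector halves:

#include <vecintrin.h>

typedef unsigned char v16qi __attribute__ ((vector_size (16)));

/* Illustrative only: 256-bit add chained through 128-bit VR adds. */
static void
add_256 (v16qi a_lo, v16qi a_hi, v16qi b_lo, v16qi b_hi,
         v16qi *r_lo, v16qi *r_hi)
{
  v16qi carry = vec_addc_u128 (a_lo, b_lo);   /* carry out of the low 128 bits (vaccq) */
  *r_lo = vec_add_u128 (a_lo, b_lo);          /* low 128-bit sum (vaq) */
  *r_hi = vec_adde_u128 (a_hi, b_hi, carry);  /* high 128 bits plus carry in (vacq) */
  /* a longer chain would continue with vec_addec_u128 (vacccq) to propagate the
     carry out of this add, exactly as carry_vec0/carry_vec1 do in the loop above */
}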