From ffc3e74bc906c185fb184510c06190dee7900a6b Mon Sep 17 00:00:00 2001 From: eabdullin Date: Tue, 7 Nov 2023 11:25:56 +0000 Subject: [PATCH] import UBI gmp-6.2.0-13.el9 --- SOURCES/cve-2021-43618.patch | 25 ++ SOURCES/ibm_z13_simd_part1.patch | 595 +++++++++++++++++++++++++++++++ SOURCES/ibm_z13_simd_part2.patch | 535 +++++++++++++++++++++++++++ SOURCES/ibm_z13_simd_part3.patch | 138 +++++++ SOURCES/ibm_z13_simd_part4.patch | 151 ++++++++ SPECS/gmp.spec | 22 +- 6 files changed, 1464 insertions(+), 2 deletions(-) create mode 100644 SOURCES/cve-2021-43618.patch create mode 100644 SOURCES/ibm_z13_simd_part1.patch create mode 100644 SOURCES/ibm_z13_simd_part2.patch create mode 100644 SOURCES/ibm_z13_simd_part3.patch create mode 100644 SOURCES/ibm_z13_simd_part4.patch diff --git a/SOURCES/cve-2021-43618.patch b/SOURCES/cve-2021-43618.patch new file mode 100644 index 0000000..f741972 --- /dev/null +++ b/SOURCES/cve-2021-43618.patch @@ -0,0 +1,25 @@ + +# HG changeset patch +# User Marco Bodrato +# Date 1634836009 -7200 +# Node ID 561a9c25298e17bb01896801ff353546c6923dbd +# Parent e1fd9db13b475209a864577237ea4b9105b3e96e +mpz/inp_raw.c: Avoid bit size overflows + +diff -r e1fd9db13b47 -r 561a9c25298e mpz/inp_raw.c +--- a/mpz/inp_raw.c Tue Dec 22 23:49:51 2020 +0100 ++++ b/mpz/inp_raw.c Thu Oct 21 19:06:49 2021 +0200 +@@ -88,8 +88,11 @@ + + abs_csize = ABS (csize); + ++ if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8)) ++ return 0; /* Bit size overflows */ ++ + /* round up to a multiple of limbs */ +- abs_xsize = BITS_TO_LIMBS (abs_csize*8); ++ abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8); + + if (abs_xsize != 0) + { + diff --git a/SOURCES/ibm_z13_simd_part1.patch b/SOURCES/ibm_z13_simd_part1.patch new file mode 100644 index 0000000..73f6c83 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part1.patch @@ -0,0 +1,595 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/common-vec.h | 175 
+++++++++++++++++ + mpn/s390_64/z13/mul_1.c | 31 +++ + 3 files changed, 564 insertions(+) + create mode 100644 mpn/s390_64/z13/addmul_1.c + create mode 100644 mpn/s390_64/z13/common-vec.h + create mode 100644 mpn/s390_64/z13/mul_1.c + +diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c +new file mode 100644 +index 000000000..022e5edcc +--- /dev/null ++++ b/mpn/s390_64/z13/addmul_1.c +@@ -0,0 +1,358 @@ ++/* Addmul_1 / mul_1 for IBM z13 and later ++ Contributed by Marius Hillenbrand ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "gmp-impl.h" ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME inline_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME inline_mul_1 ++# endif ++ ++#else ++# ifdef OPERATION_addmul_1 ++# define ADD ++# define FUNCNAME mpn_addmul_1 ++# elif defined(OPERATION_mul_1) ++# define FUNCNAME mpn_mul_1 ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb) ++{ ++ ASSERT (n >= 1); ++ ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n)); ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ /* products as GPR pairs */ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ /* carry flag for 128-bit add in VR for first carry chain */ ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++ mp_limb_t carry_limb = 0; ++ ++#ifdef ADD ++ /* 2nd carry flag for 2nd carry chain with addmul */ ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ vec_t sum0; ++ vec_t rp0_addend, rp1_addend; ++ rp0_addend.dw = vec_splat_u64 (0); ++ rp1_addend.dw = vec_splat_u64 (0); ++#endif ++ vec_t sum1; ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#undef IDX_OFFSET ++ ++#define LOOP_ADVANCE 4 * sizeof (mp_limb_t) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ register ssize_t idx = 0 - IDX_OFFSET; ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally the ++ * better choice (e.g., branch-on-count requires decrementing the separate ++ * counter). 
so, allow switching the loop condition to enable either ++ * category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#define BRCTG ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* products will be transferred into VRs before adding up. ++ * see main loop below for comments on accumulation scheme. */ ++ vec_t product0, product1, product2; ++ ++ product0.dw = vec_splat_u64 (0); ++ ++ switch ((size_t)n % 4) ++ { ++ case 0: ++ break; ++ ++ case 1: ++ idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p3_low = s1p[0]; ++ s390_umul_ppmm (p3_high, p3_low, s2limb); ++ ++#ifdef ADD ++ rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p3_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p3_low; ++#endif ++ ++ carry_limb = p3_high; ++ break; ++ ++ case 2: ++ p0_low = s1p[0]; ++ p3_low = s1p[1]; ++ idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw[0] = p3_low; ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 0); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_add_u128 (sum0.sw, product0.sw); ++ carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw); ++#else ++ sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw); ++#endif ++ ++ vec_store_elements_reversed (rp, 0, sum1); ++ ++ break; ++ ++ case 3: ++ idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET; ++ ++ p0_low = s1p[0]; ++ s390_umul_ppmm (p0_high, p0_low, s2limb); ++ ++#ifdef ADD ++ 
rp0_addend.dw[1] = rp[0]; ++ product0.dw[1] = p0_low; ++ ++ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw); ++ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0); ++ ++ rp[0] = sum0.dw[1]; ++#else ++ rp[0] = p0_low; ++#endif ++ carry_limb = p0_high; ++ ++ p1_low = s1p[1]; ++ p3_low = s1p[2]; ++ ++ s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb); ++ ++ carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb); ++ product1.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ carry_limb = p3_high; ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed (rp, 8); ++ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw); ++ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw); ++ ++ sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++ carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed (rp, 8, sum1); ++ break; ++ } ++ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ vec_t overlap_addend0; ++ vec_t overlap_addend1; ++ ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. 
++ */ ++ ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET); ++ ++ /* ++ * accumulate products as follows (for addmul): ++ * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] | ++ * p0_high | p0_low | ++ * p1_high | p1_low | carry-limb in ++ * p2_high | p2_low | ++ * c-limb out <- p3_high | p3_low | ++ * | < 128-bit VR > < 128-bit VR > ++ * ++ * < rp1_addend > < rp0_addend > ++ * carry-chain 0 <- + <- + <- carry_vec0[127] ++ * < product1 > < product0 > ++ * carry-chain 1 <- + <- + <- carry_vec1[127] ++ * < overlap_addend1 > < overlap_addend0 > ++ * ++ * note that a 128-bit add with carry in + out is built from two insns ++ * - vec_adde_u128 (vacq) provides sum ++ * - vec_addec_u128 (vacccq) provides the new carry bit ++ */ ++ ++ s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb); ++ ++ /* ++ * "barrier" to enforce scheduling loads for all limbs and first round ++ * of MLGR before anything else. 
++ */ ++ asm volatile(""); ++ ++ product0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++#ifdef ADD ++ rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET); ++ rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET); ++#endif ++ /* increment loop index to unblock dependant loads of limbs for the next ++ * iteration (see above at #define LOOP_ADVANCE) */ ++ idx += LOOP_ADVANCE; ++ ++ s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb); ++ ++ overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb); ++ asm volatile(""); ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw); ++#else ++ sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw); ++#endif ++ ++ asm volatile(""); ++ product2.dw = vec_load_2di_as_pair (p2_high, p2_low); ++ overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high); ++ ++ vec_t sum4; ++ ++#ifdef ADD ++ vec_t sum3; ++ sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++ ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw); ++ carry_vec1.sw ++ = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw); ++#else ++ sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++ carry_vec0.sw ++ = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw); ++#endif ++ vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum4); ++ ++ carry_limb = p3_high; ++ } ++ ++#ifdef ADD ++ 
carry_vec0.dw += carry_vec1.dw; ++ carry_limb += carry_vec0.dw[1]; ++#else ++ carry_limb += carry_vec0.dw[1]; ++#endif ++ ++ return carry_limb; ++} ++ ++#undef OPERATION_addmul_1 ++#undef OPERATION_mul_1 ++#undef FUNCNAME ++#undef ADD +diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h +new file mode 100644 +index 000000000..a59e6eefe +--- /dev/null ++++ b/mpn/s390_64/z13/common-vec.h +@@ -0,0 +1,175 @@ ++/* Common vector helpers and macros for IBM z13 and later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#ifndef __S390_64_Z13_COMMON_VEC_H ++#define __S390_64_Z13_COMMON_VEC_H ++ ++#include ++#include ++ ++/* ++ * Vector intrinsics use vector element types that kind-of make sense for the ++ * specific operation (e.g., vec_permi permutes doublewords). To use VRs ++ * interchangeably with different intrinsics, typedef the two variants and wrap ++ * them in a union. 
++ */ ++#define VLEN_BYTES 16 ++typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES))); ++typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES))); ++ ++/* ++ * The Z vector intrinsics use vectors with different element types (e.g., ++ * v16qi for the 128-bit adds and v2di for vec_permi). ++ */ ++union vec ++{ ++ v2di dw; ++ v16qi sw; ++}; ++ ++typedef union vec vec_t; ++ ++/* ++ * single-instruction combine of two GPRs into a VR ++ */ ++static inline v2di ++vec_load_2di_as_pair (unsigned long a, unsigned long b) ++{ ++ v2di res; ++ __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b)); ++ return res; ++} ++ ++/* ++ * 64x64 mult where caller needs to care about proper register allocation: ++ * multiply xl with m1, treating both as unsigned, and place the result in ++ * xh:xl. ++ * mlgr operates on register pairs, so xh must be an even gpr followed by xl ++ */ ++#define s390_umul_ppmm(xh, xl, m1) \ ++ do \ ++ { \ ++ asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \ ++ } \ ++ while (0); ++ ++/* ++ * two 64x64 multiplications, scheduled so that they will dispatch and issue to ++ * different sides: each mlgr is dispatched alone in an instruction group and ++ * subsequent groups will issue on different execution sides. ++ * there is a variant where both products use the same multiplicand and one ++ * that uses two different multiplicands. constraints from s390_umul_ppmm apply ++ * here. 
++ */ ++#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx]\n\t" \ ++ "mlgr\t%[x1h],%[mx]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \ ++ } \ ++ while (0); ++ ++#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \ ++ do \ ++ { \ ++ asm("mlgr\t%[x0h],%[mx0]\n\t" \ ++ "mlgr\t%[x1h],%[mx1]" \ ++ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \ ++ [x1l] "=r"(X1L) \ ++ : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \ ++ } \ ++ while (0); ++ ++#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \ ++ asm volatile("lg\t%[r],%[off](%[b],%[x])" \ ++ : [r] "=r"(DST) \ ++ : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \ ++ : "memory"); ++ ++/* ++ * Load a vector register from memory and swap the two 64-bit doubleword ++ * elements. ++ */ ++static inline vec_t ++vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index, ++ ssize_t const offset) ++{ ++ vec_t res; ++ char *ptr = (char *)base; ++ ++ res.sw = *(v16qi *)(ptr + index + offset); ++ res.dw = vec_permi (res.dw, res.dw, 2); ++ ++ return res; ++} ++ ++static inline vec_t ++vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset) ++{ ++ return vec_load_elements_reversed_idx (base, 0, offset); ++} ++ ++/* ++ * Store a vector register to memory and swap the two 64-bit doubleword ++ * elements. 
++ */ ++static inline void ++vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index, ++ ssize_t const offset, vec_t vec) ++{ ++ char *ptr = (char *)base; ++ ++ vec.dw = vec_permi (vec.dw, vec.dw, 2); ++ *(v16qi *)(ptr + index + offset) = vec.sw; ++} ++ ++static inline void ++vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec) ++{ ++ vec_store_elements_reversed_idx (base, 0, offset, vec); ++} ++ ++#define ASM_VZERO(VEC) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \ ++ } \ ++ while (0) ++ ++#endif +diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c +new file mode 100644 +index 000000000..7584dc8c7 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_1.c +@@ -0,0 +1,31 @@ ++/* mul_1 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "s390_64/z13/addmul_1.c" +-- +2.40.1 + diff --git a/SOURCES/ibm_z13_simd_part2.patch b/SOURCES/ibm_z13_simd_part2.patch new file mode 100644 index 0000000..3d216d9 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part2.patch @@ -0,0 +1,535 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/aormul_2.c | 476 +++++++++++++++++++++++++++++++++++ + mpn/s390_64/z13/gmp-mparam.h | 37 +++ + 2 files changed, 513 insertions(+) + create mode 100644 mpn/s390_64/z13/aormul_2.c + create mode 100644 mpn/s390_64/z13/gmp-mparam.h + +diff --git a/mpn/s390_64/z13/aormul_2.c b/mpn/s390_64/z13/aormul_2.c +new file mode 100644 +index 000000000..9a69fc38e +--- /dev/null ++++ b/mpn/s390_64/z13/aormul_2.c +@@ -0,0 +1,476 @@ ++/* Addmul_2 / mul_2 for IBM z13 or later ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. 
*/ ++ ++#include "gmp-impl.h" ++ ++#include "s390_64/z13/common-vec.h" ++ ++#undef FUNCNAME ++ ++#ifdef DO_INLINE ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME inline_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME inline_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#else ++# ifdef OPERATION_addmul_2 ++# define ADD ++# define FUNCNAME mpn_addmul_2 ++# elif defined(OPERATION_mul_2) ++# define FUNCNAME mpn_mul_2 ++# else ++# error Missing define for operation to perform ++# endif ++#endif ++ ++#ifdef DO_INLINE ++static inline mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp) ++ __attribute__ ((always_inline)); ++ ++static inline ++#endif ++mp_limb_t ++FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, ++ const mp_limb_t *vp) ++{ ++ ++ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in ++ VRs (using each VR as a single 128-bit accumulator). ++ The inner loop is unrolled to four limbs, with two blocks of four ++ multiplications each. Since the MLGR operation operates on even/odd GPR ++ pairs, pin the products appropriately. 
*/ ++ ++ register mp_limb_t p0_high asm("r0"); ++ register mp_limb_t p0_low asm("r1"); ++ ++ register mp_limb_t p1_high asm("r8"); ++ register mp_limb_t p1_low asm("r9"); ++ ++ register mp_limb_t p2_high asm("r6"); ++ register mp_limb_t p2_low asm("r7"); ++ ++ register mp_limb_t p3_high asm("r10"); ++ register mp_limb_t p3_low asm("r11"); ++ ++ vec_t carry_prod = { .dw = vec_splat_u64 (0) }; ++ vec_t zero = { .dw = vec_splat_u64 (0) }; ++ ++ /* two carry-bits for the 128-bit VR adds - stored in VRs */ ++#ifdef ADD ++ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) }; ++#endif ++ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) }; ++ ++ vec_t tmp; ++ ++ vec_t sum0, sum1; ++ ++ /* products transferred into VRs for accumulating there */ ++ vec_t pv0, pv3; ++ vec_t pv1_low, pv1_high, pv2_low, pv2_high; ++ vec_t low, middle, high; ++#ifdef ADD ++ vec_t rp0, rp1; ++#endif ++ ++ register mp_limb_t v0 asm("r12"); ++ register mp_limb_t v1 asm("r5"); ++ v0 = vp[0]; ++ v1 = vp[1]; ++ ++ /* The scalar multiplications compete with pointer and index increments for ++ * issue ports. Thus, increment the loop index in the middle of the loop so ++ * that the operations for the next iteration's multiplications can be ++ * loaded in time (looks horrible, yet helps performance) and make sure we ++ * use addressing with base reg + index reg + immediate displacement ++ * (so that only the single index needs incrementing, instead of multiple ++ * pointers). */ ++#undef LOOP_ADVANCE ++#define LOOP_ADVANCE (4 * sizeof (mp_limb_t)) ++#define IDX_OFFSET (LOOP_ADVANCE) ++ ++ register ssize_t idx = 0 - IDX_OFFSET; ++#ifdef BRCTG ++ ssize_t iterations = (size_t)n / 4; ++#else ++ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET; ++#endif ++ ++ /* ++ * To minimize latency in the carry chain, accumulate in VRs with 128-bit ++ * adds with carry in and out. As a downside, these require two insns for ++ * each add - one to calculate the sum, one to deliver the carry out. 
++ * To reduce the overall number of insns to execute, combine adding up ++ * product limbs such that there cannot be a carry out and one (for mul) or ++ * two (for addmul) adds with carry chains. ++ * ++ * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two ++ * limbs into each 128-bit product without causing carry out. ++ * ++ * For each block of 2 limbs * 2 limbs ++ * ++ * | u[i] * v[0] (p2) | ++ * | u[i] * v[1] (p0) | ++ * | u[i+1] * v[0](p1) | ++ * | u[i+1] * v[1](p3) | ++ * < 128 bits > < 128 bits > ++ * ++ * we can begin accumulating with "simple" carry-oblivious 128-bit adds: ++ * - p0 + low limb of p1 ++ * + high limb of p2 ++ * and combine resulting low limb with p2's low limb ++ * - p3 + high limb of p1 ++ * + high limb of sum above ++ * ... which will result in two 128-bit limbs to be fed into the carry ++ * chain(s). ++ * Overall, that scheme saves instructions and improves performance, despite ++ * slightly increasing latency between multiplications and carry chain (yet ++ * not in the carry chain). 
++ */ ++ ++#define LOAD_LOW_LIMB(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we ++ * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new ++ * carry) */ ++#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw \ ++ = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ carry_vec##CARRYIDX.sw \ ++ = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* variant without carry-in for prologue */ ++#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \ ++ do \ ++ { \ ++ sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \ ++ } \ ++ while (0) ++ ++#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \ ++ ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2) ++ ++ /* prologue for 4x-unrolled main loop */ ++ switch ((size_t)n % 4) ++ { ++ case 1: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ ++/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is ++ * cheaper. 
*/ ++# define NEW_CARRY(VEC, LIMB) \ ++ do \ ++ { \ ++ asm("vzero\t%[vec]\n\t" \ ++ "vlvgg\t%[vec],%[limb],1" \ ++ : [vec] "=v"(VEC) \ ++ : [limb] "r"(LIMB)); \ ++ } \ ++ while (0) ++ ++ NEW_CARRY (tmp, p0_high); ++ ++ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ idx += sizeof (mp_limb_t); ++ break; ++ ++ case 2: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 8); ++ ASM_LOADGPR_BASE (p2_low, up, 0); ++ ASM_LOADGPR_BASE (p3_low, up, 8); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++#ifdef ADD ++ rp0 = vec_load_elements_reversed (rp, 0); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_OUT (1, sum0, low); ++ vec_store_elements_reversed (rp, 0, sum1); ++ carry_prod = high; ++ ++ idx += 2 * sizeof (mp_limb_t); ++ break; ++ ++ case 3: ++ ASM_LOADGPR_BASE (p0_low, up, 0); ++ ASM_LOADGPR_BASE (p1_low, up, 0); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1); ++ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low); ++ NEW_CARRY (tmp, p0_high); ++ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw); 
++ ++#ifdef ADD ++ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp); ++#else ++ rp[0] = p0_low; ++#endif ++ ++ ASM_LOADGPR_BASE (p0_low, up, 8); ++ ASM_LOADGPR_BASE (p1_low, up, 16); ++ ASM_LOADGPR_BASE (p2_low, up, 8); ++ ASM_LOADGPR_BASE (p3_low, up, 16); ++ ++ asm("" ++ : "=r"(p0_low), "=r"(p2_low) ++ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low)); ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, 3); ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ vec_t rp0 = vec_load_elements_reversed (rp, 8); ++ ADD_UP_CARRY_OUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed (rp, 8, sum1); ++ ++ carry_prod = high; ++ ++ idx += 3 * sizeof (mp_limb_t); ++ break; ++ } ++ ++ /* ++ * branch-on-count implicitly hint to the branch prediction as taken, while ++ * compare-and-branch hints as not taken. currently, using branch-on-count ++ * has a performance advantage, but it is not clear that it is generally ++ * the better choice (e.g., branch-on-count requires decrementing the ++ * separate counter). 
so, allow switching the loop condition to enable ++ * either category of branch instructions: ++ * - idx is less than an upper bound, for compare-and-branch ++ * - iteration counter greater than zero, for branch-on-count ++ */ ++#ifdef BRCTG ++ for (; iterations > 0; iterations--) ++ { ++#else ++ while (idx < idx_bound) ++ { ++#endif ++ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the ++ * result in a GPR pair. One of the factors is taken from the GPR pair ++ * and overwritten. ++ * To reuse factors, it turned out cheaper to load limbs multiple times ++ * than copying GPR contents. Enforce that and the use of addressing by ++ * base + index gpr + immediate displacement via inline asm. ++ */ ++ ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET); ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ ++ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ ++ LOAD_LOW_LIMB (pv1_low, p1_low); ++ LOAD_LOW_LIMB (pv1_high, p1_high); ++ ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw); ++ LOAD_LOW_LIMB (pv2_high, p2_high); ++ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ ++ LOAD_LOW_LIMB (pv2_low, p2_low); ++ ++ ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET); ++ ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET); ++ ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET); ++ ++ idx += LOOP_ADVANCE; ++ ++ /* ++ * "barrier" to enforce scheduling the index increment before the second ++ * block of multiplications. not required for clang. 
++ */ ++#ifndef __clang__ ++ asm("" ++ : "=r"(idx), "=r"(p0_high), "=r"(p2_high) ++ : "0"(idx), "1"(p0_high), "2"(p2_high)); ++#endif ++ ++ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0); ++ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1); ++ ++ /* ++ * "barrier" to enforce scheduling all MLGRs first, before any adding ++ * up. note that clang produces better code without. ++ */ ++#ifndef __clang__ ++ asm("" ++ : "=v"(pv0.sw), "=v"(pv3.sw) ++ : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high)); ++#endif ++ ++ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw); ++ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw); ++ ++ low.dw = vec_permi (middle.dw, pv2_low.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle.dw = vec_permi (zero.dw, middle.dw, 0); ++ high.sw = vec_add_u128 (middle.sw, pv3.sw); ++ ++#ifdef ADD ++ rp0 = vec_load_elements_reversed_idx (rp, idx, ++ 0 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP_CARRY_INOUT (0, rp0, carry_prod); ++#else ++ sum0 = carry_prod; ++#endif ++ ADD_UP_CARRY_INOUT (1, sum0, low); ++ ++ vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE, ++ sum1); ++ ++ carry_prod = high; ++ ++ vec_t pv0_2, pv3_2; ++ vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2; ++ vec_t low_2, middle_2, high_2; ++ vec_t sum2, sum3; ++ ++ pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low); ++ LOAD_LOW_LIMB (pv1_low_2, p1_low); ++ LOAD_LOW_LIMB (pv1_high_2, p1_high); ++ ++ pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw); ++ LOAD_LOW_LIMB (pv2_high_2, p2_high); ++ pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low); ++ pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw); ++ middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw); ++ ++ LOAD_LOW_LIMB (pv2_low_2, p2_low); ++ low_2.dw ++ = vec_permi (middle_2.dw, pv2_low_2.dw, ++ 3); /* least-significant doubleword from both vectors */ ++ middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0); ++ high_2.sw = vec_add_u128 (middle_2.sw, 
pv3_2.sw); ++ ++ /* ++ * another "barrier" to influence scheduling. (also helps in clang) ++ */ ++ asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw)); ++ ++#ifdef ADD ++ rp1 = vec_load_elements_reversed_idx (rp, idx, ++ 16 + IDX_OFFSET - LOOP_ADVANCE); ++ ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod); ++#else ++ sum2 = carry_prod; ++#endif ++ ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2); ++ ++ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE, ++ sum3); ++ ++ carry_prod = high_2; ++ } ++ ++#ifdef ADD ++ sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw); ++#else ++ sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw); ++#endif ++ ++ *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1]; ++ ++ return (mp_limb_t)sum0.dw[0]; ++} +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +new file mode 100644 +index 000000000..a17503fd0 +--- /dev/null ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -0,0 +1,37 @@ ++/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. ++ ++Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. ++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. 
++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#define GMP_LIMB_BITS 64 ++#define GMP_LIMB_BYTES 8 ++ ++#define HAVE_NATIVE_mpn_addmul_2 1 ++#define HAVE_NATIVE_mpn_mul_2 1 ++ ++#include "mpn/s390_64/gmp-mparam.h" +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part3.patch b/SOURCES/ibm_z13_simd_part3.patch new file mode 100644 index 0000000..8301e57 --- /dev/null +++ b/SOURCES/ibm_z13_simd_part3.patch @@ -0,0 +1,138 @@ +Co-authored-by: Stefan Liebler +--- + mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++ + 1 file changed, 124 insertions(+) + create mode 100644 mpn/s390_64/z13/mul_basecase.c + +diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c +new file mode 100644 +index 000000000..f1b7160b3 +--- /dev/null ++++ b/mpn/s390_64/z13/mul_basecase.c +@@ -0,0 +1,124 @@ ++/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two ++ natural numbers of length m and n. ++ ++ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY ++ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES. ++ ++Copyright 2021 Free Software Foundation, Inc. ++ ++This file is part of the GNU MP Library. ++ ++The GNU MP Library is free software; you can redistribute it and/or modify ++it under the terms of either: ++ ++ * the GNU Lesser General Public License as published by the Free ++ Software Foundation; either version 3 of the License, or (at your ++ option) any later version. ++ ++or ++ ++ * the GNU General Public License as published by the Free Software ++ Foundation; either version 2 of the License, or (at your option) any ++ later version. ++ ++or both in parallel, as here. 
++ ++The GNU MP Library is distributed in the hope that it will be useful, but ++WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY ++or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License ++for more details. ++ ++You should have received copies of the GNU General Public License and the ++GNU Lesser General Public License along with the GNU MP Library. If not, ++see https://www.gnu.org/licenses/. */ ++ ++#include <stdlib.h> ++ ++#include "gmp-impl.h" ++ ++/* Note: we explicitly inline all mul and addmul routines here to reduce the ++ * number of branches in prologues of unrolled functions. That comes at the ++ cost of duplicating common loop bodies in object code. */ ++#define DO_INLINE ++ ++/* ++ * tweak loop conditions in addmul subroutines to enable use of ++ * branch-relative-on-count (BRCTG) instructions, which currently results in ++ * better performance. ++ */ ++#define BRCTG ++ ++#include "s390_64/z13/common-vec.h" ++ ++#define OPERATION_mul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_mul_1 ++ ++#define OPERATION_addmul_1 ++#include "s390_64/z13/addmul_1.c" ++#undef OPERATION_addmul_1 ++ ++#define OPERATION_mul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_mul_2 ++ ++#define OPERATION_addmul_2 ++#include "s390_64/z13/aormul_2.c" ++#undef OPERATION_addmul_2 ++ ++void ++mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp, ++ mp_size_t vn) ++{ ++ ASSERT (un >= vn); ++ ASSERT (vn >= 1); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un)); ++ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn)); ++ ++ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch ++ * for un%4 and inline specific variants.
*/ ++ ++#define BRANCH_FOR_MOD(N) \ ++ do \ ++ { \ ++ if (vn >= 2) \ ++ { \ ++ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ else \ ++ { \ ++ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \ ++ return; \ ++ } \ ++ \ ++ while (vn >= 2) \ ++ { \ ++ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \ ++ rp += 2, vp += 2, vn -= 2; \ ++ } \ ++ \ ++ while (vn >= 1) \ ++ { \ ++ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \ ++ rp += 1, vp += 1, vn -= 1; \ ++ } \ ++ } \ ++ while (0); ++ ++ switch (((size_t)un) % 4) ++ { ++ case 0: ++ BRANCH_FOR_MOD (0); ++ break; ++ case 1: ++ BRANCH_FOR_MOD (1); ++ break; ++ case 2: ++ BRANCH_FOR_MOD (2); ++ break; ++ case 3: ++ BRANCH_FOR_MOD (3); ++ break; ++ } ++} +-- +2.40.1 diff --git a/SOURCES/ibm_z13_simd_part4.patch b/SOURCES/ibm_z13_simd_part4.patch new file mode 100644 index 0000000..c87c17c --- /dev/null +++ b/SOURCES/ibm_z13_simd_part4.patch @@ -0,0 +1,151 @@ +From: Marius Hillenbrand + +--- + mpn/s390_64/z13/gmp-mparam.h | 129 ++++++++++++++++++++++++++++++++++- + 1 file changed, 127 insertions(+), 2 deletions(-) + +diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h +index a17503fd0..50e7f39d1 100644 +--- a/mpn/s390_64/z13/gmp-mparam.h ++++ b/mpn/s390_64/z13/gmp-mparam.h +@@ -1,6 +1,6 @@ + /* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file. + +-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc. ++Copyright 2021 Free Software Foundation, Inc. + + This file is part of the GNU MP Library. + +@@ -34,4 +34,129 @@ see https://www.gnu.org/licenses/. 
*/ + #define HAVE_NATIVE_mpn_addmul_2 1 + #define HAVE_NATIVE_mpn_mul_2 1 + +-#include "mpn/s390_64/gmp-mparam.h" ++/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */ ++ ++#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_1P_METHOD 2 ++#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define MOD_1N_TO_MOD_1_1_THRESHOLD 17 ++#define MOD_1U_TO_MOD_1_1_THRESHOLD 15 ++#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */ ++#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */ ++#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5 ++#define USE_PREINV_DIVREM_1 1 ++#define DIV_QR_1N_PI1_METHOD 3 ++#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */ ++#define DIV_QR_2_PI2_THRESHOLD 996 ++#define DIVEXACT_1_THRESHOLD 4 ++#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */ ++ ++#define DIV_1_VS_MUL_1_PERCENT 404 ++ ++#define MUL_TOOM22_THRESHOLD 23 ++#define MUL_TOOM33_THRESHOLD 94 ++#define MUL_TOOM44_THRESHOLD 166 ++#define MUL_TOOM6H_THRESHOLD 286 ++#define MUL_TOOM8H_THRESHOLD 626 ++ ++#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113 ++#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138 ++#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143 ++#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145 ++#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130 ++ ++#define SQR_BASECASE_THRESHOLD 0 /* always (native) */ ++#define SQR_TOOM2_THRESHOLD 12 ++#define SQR_TOOM3_THRESHOLD 84 ++#define SQR_TOOM4_THRESHOLD 234 ++#define SQR_TOOM6_THRESHOLD 318 ++#define SQR_TOOM8_THRESHOLD 478 ++ ++#define MULMID_TOOM42_THRESHOLD 42 ++ ++#define MULMOD_BNM1_THRESHOLD 13 ++#define SQRMOD_BNM1_THRESHOLD 7 ++ ++#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */ ++#define MUL_FFT_TABLE3 \ ++ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \ ++ { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \ ++ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 
8}, \ ++ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \ ++ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \ ++ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \ ++ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \ ++ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \ ++ { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \ ++ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \ ++ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \ ++ {2097152,22}, {4194304,23}, {8388608,24} } ++#define MUL_FFT_TABLE3_SIZE 47 ++#define MUL_FFT_THRESHOLD 2752 ++ ++#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */ ++#define SQR_FFT_TABLE3 \ ++ { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \ ++ { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \ ++ { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \ ++ { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \ ++ { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \ ++ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \ ++ { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \ ++ { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \ ++ { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \ ++ { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \ ++ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \ ++ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \ ++ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} } ++#define SQR_FFT_TABLE3_SIZE 52 ++#define SQR_FFT_THRESHOLD 1856 ++ ++#define MULLO_BASECASE_THRESHOLD 0 /* always */ ++#define MULLO_DC_THRESHOLD 25 ++#define MULLO_MUL_N_THRESHOLD 5397 ++#define SQRLO_BASECASE_THRESHOLD 0 /* always */ ++#define SQRLO_DC_THRESHOLD 396 ++#define SQRLO_SQR_THRESHOLD 3704 ++ ++#define DC_DIV_QR_THRESHOLD 15 ++#define DC_DIVAPPR_Q_THRESHOLD 50 ++#define DC_BDIV_QR_THRESHOLD 66 ++#define DC_BDIV_Q_THRESHOLD 202 ++ ++#define INV_MULMOD_BNM1_THRESHOLD 46 ++#define INV_NEWTON_THRESHOLD 29 ++#define INV_APPR_THRESHOLD 13 ++ ++#define BINV_NEWTON_THRESHOLD 312 ++#define REDC_1_TO_REDC_2_THRESHOLD 79 ++#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */ ++ ++#define MU_DIV_QR_THRESHOLD 979 ++#define MU_DIVAPPR_Q_THRESHOLD 979 ++#define 
MUPI_DIV_QR_THRESHOLD 13 ++#define MU_BDIV_QR_THRESHOLD 942 ++#define MU_BDIV_Q_THRESHOLD 1367 ++ ++#define POWM_SEC_TABLE 3,19,215,1730 ++ ++#define GET_STR_DC_THRESHOLD 10 ++#define GET_STR_PRECOMPUTE_THRESHOLD 15 ++#define SET_STR_DC_THRESHOLD 882 ++#define SET_STR_PRECOMPUTE_THRESHOLD 2520 ++ ++#define FAC_DSC_THRESHOLD 228 ++#define FAC_ODD_THRESHOLD 24 ++ ++#define MATRIX22_STRASSEN_THRESHOLD 19 ++#define HGCD2_DIV1_METHOD 1 ++#define HGCD_THRESHOLD 61 ++#define HGCD_APPR_THRESHOLD 51 ++#define HGCD_REDUCE_THRESHOLD 1962 ++#define GCD_DC_THRESHOLD 217 ++#define GCDEXT_DC_THRESHOLD 263 ++#define JACOBI_BASE_METHOD 4 ++ +-- +2.40.1 diff --git a/SPECS/gmp.spec b/SPECS/gmp.spec index 482ce6d..069f8d7 100644 --- a/SPECS/gmp.spec +++ b/SPECS/gmp.spec @@ -6,7 +6,7 @@ Summary: A GNU arbitrary precision library Name: gmp Version: 6.2.0 -Release: 10%{?dist} +Release: 13%{?dist} Epoch: 1 URL: http://gmplib.org/ Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2 @@ -15,6 +15,11 @@ Source2: gmp.h Source3: gmp-mparam.h Patch2: gmp-6.0.0-debuginfo.patch Patch3: gmp-intel-cet.patch +Patch4: cve-2021-43618.patch +Patch5: ibm_z13_simd_part1.patch +Patch6: ibm_z13_simd_part2.patch +Patch7: ibm_z13_simd_part3.patch +Patch8: ibm_z13_simd_part4.patch License: LGPLv3+ or GPLv2+ BuildRequires: autoconf automake libtool BuildRequires: gcc @@ -74,7 +79,7 @@ in applications. 
# switch the defaults to new cpus on s390x %ifarch s390x -( cd mpn/s390_64; ln -s z10 s390x ) +( cd mpn/s390_64; ln -s z13 s390x ) %endif %build @@ -182,6 +187,19 @@ export LD_LIBRARY_PATH=`pwd`/.libs %{_libdir}/libgmpxx.a %changelog +* Thu Aug 03 2023 Jakub Martisko - 1:6.2.0-13 +- Fix: previous commit removed one function from the library and thus broke the ABI +- function gmpn_preinv_divrem_1 should now not be removed +Related: rhbz#2044216 + +* Tue Jul 18 2023 Jakub Martisko - 1:6.2.0-12 +- Add SIMD optimization patches for s390x (provided by IBM) +Resolves: rhbz#2044216 + +* Tue Jun 06 2023 Jakub Martisko - 1:6.2.0-11 +- Fix: Integer overflow and resultant buffer overflow via crafted input +Resolves: CVE-2021-43618 + * Fri Aug 27 2021 Jakub Martisko - 1:6.2.0-10 - Add the support for intel CET Resolves: rhbz#1977890