Co-authored-by: Stefan Liebler
---
 mpn/s390_64/z13/mul_basecase.c | 125 +++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 mpn/s390_64/z13/mul_basecase.c

diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c
new file mode 100644
index 000000000..f1b7160b3
--- /dev/null
+++ b/mpn/s390_64/z13/mul_basecase.c
@@ -0,0 +1,125 @@
+/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two
+   natural numbers of length m and n.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include <stdlib.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Note: we explicitly inline all mul and addmul routines here to reduce the
+ * number of branches in prologues of unrolled functions. That comes at the
+ * cost of duplicating common loop bodies in object code. */
+#define DO_INLINE
+
+/*
+ * tweak loop conditions in addmul subroutines to enable use of
+ * branch-relative-on-count (BRCTG) instructions, which currently results in
+ * better performance.
+ */
+#define BRCTG
+
+#include "s390_64/z13/common-vec.h"
+
+#define OPERATION_mul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_mul_1
+
+#define OPERATION_addmul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_addmul_1
+
+#define OPERATION_mul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_mul_2
+
+#define OPERATION_addmul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_addmul_2
+
+void
+mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,
+                  mp_size_t vn)
+{
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));
+
+  /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch
+   * for un%4 and inline specific variants.
+   */
+#define BRANCH_FOR_MOD(N)                                                    \
+  do                                                                         \
+    {                                                                        \
+      if (vn >= 2)                                                           \
+        {                                                                    \
+          rp[un + 1] = inline_mul_2 (rp, up, un, vp);                        \
+          rp += 2, vp += 2, vn -= 2;                                         \
+        }                                                                    \
+      else                                                                   \
+        {                                                                    \
+          rp[un] = inline_mul_1 (rp, up, un, vp[0]);                         \
+          return;                                                            \
+        }                                                                    \
+                                                                             \
+      while (vn >= 2)                                                        \
+        {                                                                    \
+          rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp);                 \
+          rp += 2, vp += 2, vn -= 2;                                         \
+        }                                                                    \
+                                                                             \
+      while (vn >= 1)                                                        \
+        {                                                                    \
+          rp[un] = inline_addmul_1 (rp, up, un, vp[0]);                      \
+          rp += 1, vp += 1, vn -= 1;                                         \
+        }                                                                    \
+    }                                                                        \
+  while (0);
+
+  switch (((size_t)un) % 4)
+    {
+    case 0:
+      BRANCH_FOR_MOD (0);
+      break;
+    case 1:
+      BRANCH_FOR_MOD (1);
+      break;
+    case 2:
+      BRANCH_FOR_MOD (2);
+      break;
+    case 3:
+      BRANCH_FOR_MOD (3);
+      break;
+    }
+}
-- 
2.40.1
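
For reference, the schoolbook structure that BRANCH_FOR_MOD implements can
be written portably with GMP's public mpn primitives. The sketch below is
illustration only, not part of the patch; it consumes one limb of vp per
step, where the patch's inline_mul_2/inline_addmul_2 variants consume two
at a time, and mul_basecase_ref is an invented name:

/* Schoolbook m x n limb multiplication; requires un >= vn >= 1 and
   non-overlapping operands, as asserted in mpn_mul_basecase.  */
#include <gmp.h>

static void
mul_basecase_ref (mp_ptr rp, mp_srcptr up, mp_size_t un,
                  mp_srcptr vp, mp_size_t vn)
{
  mp_size_t i;

  /* First row: rp[0 .. un] = up[] * vp[0]; mpn_mul_1 returns the
     high limb.  */
  rp[un] = mpn_mul_1 (rp, up, un, vp[0]);

  /* Remaining rows: accumulate up[] * vp[i], shifted up i limbs;
     mpn_addmul_1 returns the carry-out limb.  */
  for (i = 1; i < vn; i++)
    rp[un + i] = mpn_addmul_1 (rp + i, up, un, vp[i]);
}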
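
The OPERATION_* define/include/undef blocks instantiate addmul_1.c and
aormul_2.c twice each, once per operation. A hypothetical, self-contained
illustration of that template-by-inclusion pattern follows; the file name
scale_template.c and all identifiers in it are invented for the example:

/* scale_template.c -- include, do not compile directly.  Compiles to a
   different function depending on the OPERATION_* macro in effect.  */
#ifdef OPERATION_scale_add
#define FUNC scale_add
#define STORE(dst, v) ((dst) += (v))   /* accumulate, like addmul_1 */
#else
#define FUNC scale_store
#define STORE(dst, v) ((dst) = (v))    /* overwrite, like mul_1 */
#endif

static void
FUNC (long *rp, const long *up, int n, long k)
{
  int i;
  for (i = 0; i < n; i++)
    STORE (rp[i], up[i] * k);
}

#undef FUNC
#undef STORE

/* An includer mirrors the blocks in the patch:

   #define OPERATION_scale_add
   #include "scale_template.c"
   #undef OPERATION_scale_add

   #include "scale_template.c"   (yields scale_store)
*/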
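
The four case arms of the switch look identical, but they are not
redundant: inside each arm the compiler knows the value of un % 4, so
after inlining it can fold away the remainder-handling branches in the
prologues of the 4x-unrolled (add)mul bodies. A hypothetical stand-alone
example of the same effect, with invented names:

/* sum_unrolled peels n % 4 elements before its 4x-unrolled loop;
   branching on n % 4 in the caller lets the compiler delete that
   peeling switch in each inlined copy.  */
static inline long
sum_unrolled (const long *p, long n)
{
  long s = 0;

  switch (n % 4)   /* prologue: handle the n % 4 leftover elements */
    {
    case 3: s += *p++; /* fall through */
    case 2: s += *p++; /* fall through */
    case 1: s += *p++; /* fall through */
    case 0: break;
    }
  for (n -= n % 4; n != 0; n -= 4, p += 4)
    s += p[0] + p[1] + p[2] + p[3];
  return s;
}

long
sum (const long *p, long n)
{
  /* Four textually identical arms; in each, n % 4 is a known constant
     after inlining, so each copy of sum_unrolled loses its prologue
     branches.  */
  switch (n % 4)
    {
    case 0: return sum_unrolled (p, n);
    case 1: return sum_unrolled (p, n);
    case 2: return sum_unrolled (p, n);
    case 3: return sum_unrolled (p, n);
    }
  return 0;  /* not reached for n >= 0 */
}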