Compare commits

...

No commits in common. "c8" and "c9s" have entirely different histories.
c8 ... c9s

16 changed files with 5105 additions and 2050 deletions

11
.gitignore vendored
View File

@ -1 +1,10 @@
SOURCES/gmp-6.1.2.tar.bz2
/gmp-5.0.5.tar.xz
/gmp-5.1.0.tar.bz2
/gmp-5.1.1.tar.bz2
/gmp-5.1.2.tar.bz2
/gmp-5.1.3.tar.bz2
/gmp-6.0.0a.tar.bz2
/gmp-6.1.0.tar.bz2
/gmp-6.1.1.tar.bz2
/gmp-6.1.2.tar.bz2
/gmp-6.2.0.tar.bz2

View File

@ -1 +1 @@
366ded6a44cd108ba6b3f5b9a252eab3f3a95cdf SOURCES/gmp-6.1.2.tar.bz2
5e9341d3807bc7505376f9ed9f5c1c6c57050aa6 gmp-6.2.0.tar.bz2

File diff suppressed because it is too large Load Diff

22
STAGE1-gmp Normal file
View File

@ -0,0 +1,22 @@
# STAGE1 build recipe for gmp: unpack the SRPM, configure out of tree,
# build, fix up libtool .la files, and install into the target ROOTFS.
# Relies on helpers/variables provided by the surrounding build
# framework: srpm, mcd, fix_la, $BUILDDIR, $SRC, $TCONFIGARGS, $J,
# $ROOTFS -- none of them are defined here.
L=$1
# Fetch/unpack the source RPM for this component (framework helper).
srpm $L
# Create and cd into a per-component build directory (out-of-tree build).
mcd $BUILDDIR/t-$L
# The glob resolves the versioned source dir (e.g. gmp-6.2.0);
# it must remain unquoted so the shell expands it.
$SRC/${L}-*/configure $TCONFIGARGS
make $J
# The contents of libdir (which in this case is
# /usr/lib${SUFFIX}), will get passed to -L in one of the
# (re)link stages while installing, thus getting libgcc_so.*
# from the host.
#
# ?? I don't understand why --libdir=/usr/lib64 is being
# passed in $TCONFIGARGS instead of
# --libdir=${ROOTFS}/usr/lib64, but assuming that's the way
# it's supposed to be, then fix libdir here right before
# installing the binaries. ??--aldyh
# Rewrite libdir in every generated libtool archive so relinking
# resolves libraries inside $ROOTFS instead of on the host.
for i in *.la; do
sed -i~ -e "s;libdir='/usr/lib;libdir='$ROOTFS/usr/lib;" $i
done
make $J install DESTDIR=${ROOTFS}
# Post-install cleanup of .la files (framework helper).
fix_la $L

25
cve-2021-43618.patch Normal file
View File

@ -0,0 +1,25 @@
# HG changeset patch
# User Marco Bodrato <bodrato@mail.dm.unipi.it>
# Date 1634836009 -7200
# Node ID 561a9c25298e17bb01896801ff353546c6923dbd
# Parent e1fd9db13b475209a864577237ea4b9105b3e96e
mpz/inp_raw.c: Avoid bit size overflows
diff -r e1fd9db13b47 -r 561a9c25298e mpz/inp_raw.c
--- a/mpz/inp_raw.c Tue Dec 22 23:49:51 2020 +0100
+++ b/mpz/inp_raw.c Thu Oct 21 19:06:49 2021 +0200
@@ -88,8 +88,11 @@
abs_csize = ABS (csize);
+ if (UNLIKELY (abs_csize > ~(mp_bitcnt_t) 0 / 8))
+ return 0; /* Bit size overflows */
+
/* round up to a multiple of limbs */
- abs_xsize = BITS_TO_LIMBS (abs_csize*8);
+ abs_xsize = BITS_TO_LIMBS ((mp_bitcnt_t) abs_csize * 8);
if (abs_xsize != 0)
{

6
gating.yaml Normal file
View File

@ -0,0 +1,6 @@
--- !Policy
product_versions:
- rhel-9
decision_context: osci_compose_gate
rules:
- !PassingTestCaseRule {test_case_name: baseos-ci.brew-build.tier1.functional}

3518
gmp-intel-cet.patch Normal file

File diff suppressed because it is too large Load Diff

View File

@ -5,8 +5,8 @@
Summary: A GNU arbitrary precision library
Name: gmp
Version: 6.1.2
Release: 10%{?dist}
Version: 6.2.0
Release: 13%{?dist}
Epoch: 1
URL: http://gmplib.org/
Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2
@ -14,15 +14,25 @@ Source0: ftp://ftp.gmplib.org/pub/gmp-%{version}/gmp-%{version}.tar.bz2
Source2: gmp.h
Source3: gmp-mparam.h
Patch2: gmp-6.0.0-debuginfo.patch
Patch3: gmp-fcf-protection.patch
Patch3: gmp-intel-cet.patch
Patch4: cve-2021-43618.patch
Patch5: ibm_z13_simd_part1.patch
Patch6: ibm_z13_simd_part2.patch
Patch7: ibm_z13_simd_part3.patch
Patch8: ibm_z13_simd_part4.patch
License: LGPLv3+ or GPLv2+
Group: System Environment/Libraries
BuildRoot: %{_tmppath}/%{name}-%{version}-%{release}-root-%(%{__id_u} -n)
BuildRequires: autoconf automake libtool
BuildRequires: gcc
BuildRequires: gcc-c++
BuildRequires: git
#autoreconf on arm needs:
BuildRequires: perl-Carp
# Generate the .hmac checksum unless --without fips is used
%bcond_without fips
%if %{with fips}
BuildRequires: fipscheck
%endif
BuildRequires: make
%description
The gmp package contains GNU MP, a library for arbitrary precision
@ -38,7 +48,6 @@ library.
%package c++
Summary: C++ bindings for the GNU MP arbitrary precision library
Group: System Environment/Libraries
Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release}
%description c++
@ -46,11 +55,8 @@ Bindings for using the GNU MP arbitrary precision library in C++ applications.
%package devel
Summary: Development tools for the GNU MP arbitrary precision library
Group: Development/Libraries
Requires: %{name}%{?_isa} = %{epoch}:%{version}-%{release}
Requires: %{name}-c++%{?_isa} = %{epoch}:%{version}-%{release}
Requires(post): /sbin/install-info
Requires(preun): /sbin/install-info
%description devel
The libraries, header files and documentation for using the GNU MP
@ -62,7 +68,6 @@ install the gmp package.
%package static
Summary: Development tools for the GNU MP arbitrary precision library
Group: Development/Libraries
Requires: %{name}-devel = %{epoch}:%{version}-%{release}
%description static
@ -74,7 +79,7 @@ in applications.
# switch the defaults to new cpus on s390x
%ifarch s390x
( cd mpn/s390_64; ln -s z10 s390x )
( cd mpn/s390_64; ln -s z13 s390x )
%endif
%build
@ -89,10 +94,6 @@ fi
export CXXFLAGS=$(echo %{optflags} | sed -e "s/-mtune=[^ ]*//g" | sed -e "s/-march=[^ ]*/-march=i686/g")
%endif
export CCAS="$CCAS -Wa,--generate-missing-build-notes=yes"
export CFLAGS="$(echo %{optflags}) -fplugin=annobin"
export CXXFLAGS="$(echo %{optflags}) -fplugin=annobin"
%configure --enable-cxx --enable-fat
sed -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \
@ -100,23 +101,26 @@ sed -e 's|^hardcode_libdir_flag_spec=.*|hardcode_libdir_flag_spec=""|g' \
-e 's|-lstdc++ -lm|-lstdc++|' \
-i libtool
export LD_LIBRARY_PATH=`pwd`/.libs
make %{?_smp_mflags}
make check
%make_build
# Add generation of HMAC checksums of the final stripped binaries
# bz#1117188
%if %{with fips}
%define __spec_install_post \
%{?__debug_package:%{__debug_install_post}} \
%{__arch_install_post} \
%{__os_install_post} \
mkdir -p $RPM_BUILD_ROOT%{_libdir}/fipscheck \
fipshmac -d $RPM_BUILD_ROOT%{_libdir}/fipscheck $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.3.2 \
ln -s libgmp.so.10.3.2.hmac $RPM_BUILD_ROOT%{_libdir}/fipscheck/libgmp.so.10.hmac \
%{nil}
%{?__debug_package:%{__debug_install_post}} \
%{__arch_install_post} \
%{__os_install_post} \
fipshmac -d $RPM_BUILD_ROOT%{_libdir} $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.* \
file=`basename $RPM_BUILD_ROOT%{_libdir}/libgmp.so.10.*.hmac` && \
mkdir -p $RPM_BUILD_ROOT%{_libdir}/fipscheck && \
mv $RPM_BUILD_ROOT%{_libdir}/$file $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file && \
ln -s $file $RPM_BUILD_ROOT%{_libdir}/fipscheck/libgmp.so.10.hmac && \
cp $RPM_BUILD_ROOT%{_libdir}/fipscheck/$file $RPM_BUILD_ROOT%{_libdir}/.$file && \
ln -s .$file $RPM_BUILD_ROOT%{_libdir}/.libgmp.so.10.hmac
%{nil}
%endif
%install
export LD_LIBRARY_PATH=`pwd`/.libs
make install DESTDIR=$RPM_BUILD_ROOT
%make_install
install -m 644 gmp-mparam.h ${RPM_BUILD_ROOT}%{_includedir}
rm -f $RPM_BUILD_ROOT%{_libdir}/lib{gmp,mp,gmpxx}.la
rm -f $RPM_BUILD_ROOT%{_infodir}/dir
@ -150,68 +154,104 @@ install -m644 %{SOURCE3} %{buildroot}/%{_includedir}/gmp-mparam.h
%check
%ifnarch ppc
export LD_LIBRARY_PATH=`pwd`/.libs
make %{?_smp_mflags} check
%make_build check
%endif
%post -p /sbin/ldconfig
%ldconfig_scriptlets
%postun -p /sbin/ldconfig
%post c++ -p /sbin/ldconfig
%postun c++ -p /sbin/ldconfig
%post devel
if [ -f %{_infodir}/gmp.info.gz ]; then
/sbin/install-info %{_infodir}/gmp.info.gz %{_infodir}/dir || :
fi
exit 0
%preun devel
if [ $1 = 0 ]; then
if [ -f %{_infodir}/gmp.info.gz ]; then
/sbin/install-info --delete %{_infodir}/gmp.info.gz %{_infodir}/dir || :
fi
fi
exit 0
%ldconfig_scriptlets c++
%files
%defattr(-,root,root,-)
%{!?_licensedir:%global license %%doc}
%license COPYING COPYING.LESSERv3 COPYINGv2 COPYINGv3
%doc NEWS README
%{_libdir}/libgmp.so.*
%{_libdir}/fipscheck/libgmp.so.10.3.2.hmac
%{_libdir}/fipscheck/libgmp.so.10.hmac
%if %{with fips}
%{_libdir}/.libgmp.so.*.hmac
%{_libdir}/fipscheck/libgmp.so.*.hmac
%endif
%files c++
%{_libdir}/libgmpxx.so.*
%files devel
%defattr(-,root,root,-)
%{_libdir}/libgmp.so
%{_libdir}/libgmpxx.so
%{_libdir}/pkgconfig/gmp.pc
%{_libdir}/pkgconfig/gmpxx.pc
%{_includedir}/*.h
%{_infodir}/gmp.info*
%files static
%defattr(-,root,root,-)
%{_libdir}/libgmp.a
%{_libdir}/libgmpxx.a
%changelog
* Fri Jun 14 2019 Jakub Martisko <jamartis@redhat.com> - 1:6.1.2-10
- Add gating.yaml
Related: #1681026
* Tue Aug 03 2023 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-13
- Fix: previous commit removed one function from the library and thus broke the ABI
- function gmpn_preinv_divrem_1 should now not be removed
Related: rhbz#2044216
* Tue Jun 11 2019 Jakub Martisko <jamartis@redhat.com> - 1:6.1.2-9
- Add support for intel CET and -fcf-protection
- Add missing compiler/linker flags
Related: #1630567
* Tue Jul 18 2023 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-12
- Add SIMD optimization patches for s390x (provided by the IBM)
Resolves: rhbz#2044216
* Thu Jul 26 2018 David Kaspar [Dee'Kej] <dkaspar@redhat.com> - 1:6.1.2-8
- Missing fipschecks added into build process (bug #1553679)
- --enable-fat option added to %%configure (bug #1493218)
* Tue Jun 06 2023 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-11
- Fix: Integer overflow and resultant buffer overflow via crafted input
Resolves: CVE-2021-43618
* Fri Aug 27 2021 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-10
- Add the support for intel CET
Resolves: rhbz#1977890
* Wed Aug 18 2021 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-9
- Move the .hmac files to the fipscheck subfolder
- Make symlinks from their original location (Fedora contains the .hmac files there) pointing to their new location
Resolves: rhbz#1980758
* Mon Aug 09 2021 Mohan Boddu <mboddu@redhat.com> - 1:6.2.0-8
- Rebuilt for IMA sigs, glibc 2.34, aarch64 flags
Related: rhbz#1991688
* Thu Apr 15 2021 Mohan Boddu <mboddu@redhat.com> - 1:6.2.0-7
- Rebuilt for RHEL 9 BETA on Apr 15th 2021. Related: rhbz#1947937
* Tue Jan 26 2021 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.2.0-6
- Rebuilt for https://fedoraproject.org/wiki/Fedora_34_Mass_Rebuild
* Tue Sep 15 2020 Kalev Lember <klember@redhat.com> - 1:6.2.0-5
- Move gmpxx.pc to -devel subpackage as well
* Fri Aug 07 2020 Peter Robinson <pbrobinson@fedoraproject.org> - 1:6.2.0-4
- The pkgcfg file should be in devel
* Tue Jul 28 2020 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-3
- Use make macros
* Mon Jul 27 2020 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.2.0-2
- Rebuilt for https://fedoraproject.org/wiki/Fedora_33_Mass_Rebuild
* Mon Feb 17 2020 Jakub Martisko <jamartis@redhat.com> - 1:6.2.0-1
- Rebase to 6.2.0
* Tue Jan 28 2020 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.1.2-13
- Rebuilt for https://fedoraproject.org/wiki/Fedora_32_Mass_Rebuild
* Tue Dec 03 2019 Jakub Martisko <jamartis@redhat.com> - 1:6.1.2-12
- Reenable the fat binaries build option
Resolves: #1779060
* Thu Jul 25 2019 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.1.2-11
- Rebuilt for https://fedoraproject.org/wiki/Fedora_31_Mass_Rebuild
* Fri Feb 15 2019 Anderson Toshiyuki Sasaki <ansasaki@redhat.com> - 1:6.1.2-10
- Create HMAC checksum for FIPS integrity self tests
* Thu Jan 31 2019 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.1.2-9
- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild
* Fri Jul 13 2018 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.1.2-8
- Rebuilt for https://fedoraproject.org/wiki/Fedora_29_Mass_Rebuild
* Wed Feb 07 2018 Fedora Release Engineering <releng@fedoraproject.org> - 1:6.1.2-7
- Rebuilt for https://fedoraproject.org/wiki/Fedora_28_Mass_Rebuild

595
ibm_z13_simd_part1.patch Normal file
View File

@ -0,0 +1,595 @@
Co-authored-by: Stefan Liebler <stli at linux.ibm.com>
---
mpn/s390_64/z13/addmul_1.c | 358 +++++++++++++++++++++++++++++++++++
mpn/s390_64/z13/common-vec.h | 175 +++++++++++++++++
mpn/s390_64/z13/mul_1.c | 31 +++
3 files changed, 564 insertions(+)
create mode 100644 mpn/s390_64/z13/addmul_1.c
create mode 100644 mpn/s390_64/z13/common-vec.h
create mode 100644 mpn/s390_64/z13/mul_1.c
diff --git a/mpn/s390_64/z13/addmul_1.c b/mpn/s390_64/z13/addmul_1.c
new file mode 100644
index 000000000..022e5edcc
--- /dev/null
+++ b/mpn/s390_64/z13/addmul_1.c
@@ -0,0 +1,358 @@
+/* Addmul_1 / mul_1 for IBM z13 and later
+ Contributed by Marius Hillenbrand
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+#include "s390_64/z13/common-vec.h"
+
+#undef FUNCNAME
+
+#ifdef DO_INLINE
+# ifdef OPERATION_addmul_1
+# define ADD
+# define FUNCNAME inline_addmul_1
+# elif defined(OPERATION_mul_1)
+# define FUNCNAME inline_mul_1
+# endif
+
+#else
+# ifdef OPERATION_addmul_1
+# define ADD
+# define FUNCNAME mpn_addmul_1
+# elif defined(OPERATION_mul_1)
+# define FUNCNAME mpn_mul_1
+# endif
+#endif
+
+#ifdef DO_INLINE
+static inline mp_limb_t
+FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
+ __attribute__ ((always_inline));
+
+static inline
+#endif
+mp_limb_t
+FUNCNAME (mp_ptr rp, mp_srcptr s1p, mp_size_t n, mp_limb_t s2limb)
+{
+ ASSERT (n >= 1);
+ ASSERT (MPN_SAME_OR_INCR_P(rp, s1p, n));
+
+ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
+ VRs (using each VR as a single 128-bit accumulator).
+ The inner loop is unrolled to four limbs, with two blocks of four
+ multiplications each. Since the MLGR operation operates on even/odd GPR
+ pairs, pin the products appropriately. */
+
+ /* products as GPR pairs */
+ register mp_limb_t p0_high asm("r0");
+ register mp_limb_t p0_low asm("r1");
+
+ register mp_limb_t p1_high asm("r8");
+ register mp_limb_t p1_low asm("r9");
+
+ register mp_limb_t p2_high asm("r6");
+ register mp_limb_t p2_low asm("r7");
+
+ register mp_limb_t p3_high asm("r10");
+ register mp_limb_t p3_low asm("r11");
+
+ /* carry flag for 128-bit add in VR for first carry chain */
+ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
+ mp_limb_t carry_limb = 0;
+
+#ifdef ADD
+ /* 2nd carry flag for 2nd carry chain with addmul */
+ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
+ vec_t sum0;
+ vec_t rp0_addend, rp1_addend;
+ rp0_addend.dw = vec_splat_u64 (0);
+ rp1_addend.dw = vec_splat_u64 (0);
+#endif
+ vec_t sum1;
+
+ vec_t carry_prod = { .dw = vec_splat_u64 (0) };
+
+ /* The scalar multiplications compete with pointer and index increments for
+ * issue ports. Thus, increment the loop index in the middle of the loop so
+ * that the operations for the next iteration's multiplications can be
+ * loaded in time (looks horrible, yet helps performance) and make sure we
+ * use addressing with base reg + index reg + immediate displacement
+ * (so that only the single index needs incrementing, instead of multiple
+ * pointers). */
+#undef LOOP_ADVANCE
+#undef IDX_OFFSET
+
+#define LOOP_ADVANCE 4 * sizeof (mp_limb_t)
+#define IDX_OFFSET (LOOP_ADVANCE)
+ register ssize_t idx = 0 - IDX_OFFSET;
+
+ /*
+ * branch-on-count implicitly hint to the branch prediction as taken, while
+ * compare-and-branch hints as not taken. currently, using branch-on-count
+ * has a performance advantage, but it is not clear that it is generally the
+ * better choice (e.g., branch-on-count requires decrementing the separate
+ * counter). so, allow switching the loop condition to enable either
+ * category of branch instructions:
+ * - idx is less than an upper bound, for compare-and-branch
+ * - iteration counter greater than zero, for branch-on-count
+ */
+#define BRCTG
+#ifdef BRCTG
+ ssize_t iterations = (size_t)n / 4;
+#else
+ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
+#endif
+
+ /* products will be transferred into VRs before adding up.
+ * see main loop below for comments on accumulation scheme. */
+ vec_t product0, product1, product2;
+
+ product0.dw = vec_splat_u64 (0);
+
+ switch ((size_t)n % 4)
+ {
+ case 0:
+ break;
+
+ case 1:
+ idx = 1 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+ p3_low = s1p[0];
+ s390_umul_ppmm (p3_high, p3_low, s2limb);
+
+#ifdef ADD
+ rp0_addend.dw[1] = rp[0];
+ product0.dw[1] = p3_low;
+
+ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
+ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
+
+ rp[0] = sum0.dw[1];
+#else
+ rp[0] = p3_low;
+#endif
+
+ carry_limb = p3_high;
+ break;
+
+ case 2:
+ p0_low = s1p[0];
+ p3_low = s1p[1];
+ idx = 2 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+ s390_double_umul_ppmm (p0_high, p0_low, p3_high, p3_low, s2limb);
+
+ carry_prod.dw[0] = p3_low;
+
+ product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+ carry_limb = p3_high;
+
+#ifdef ADD
+ rp0_addend = vec_load_elements_reversed (rp, 0);
+ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
+ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
+
+ sum1.sw = vec_add_u128 (sum0.sw, product0.sw);
+ carry_vec1.sw = vec_addc_u128 (sum0.sw, product0.sw);
+#else
+ sum1.sw = vec_add_u128 (carry_prod.sw, product0.sw);
+ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, product0.sw);
+#endif
+
+ vec_store_elements_reversed (rp, 0, sum1);
+
+ break;
+
+ case 3:
+ idx = 3 * sizeof (mp_limb_t) - IDX_OFFSET;
+
+ p0_low = s1p[0];
+ s390_umul_ppmm (p0_high, p0_low, s2limb);
+
+#ifdef ADD
+ rp0_addend.dw[1] = rp[0];
+ product0.dw[1] = p0_low;
+
+ sum0.sw = vec_add_u128 (product0.sw, rp0_addend.sw);
+ carry_vec1.dw = vec_permi (sum0.dw, sum0.dw, 0);
+
+ rp[0] = sum0.dw[1];
+#else
+ rp[0] = p0_low;
+#endif
+ carry_limb = p0_high;
+
+ p1_low = s1p[1];
+ p3_low = s1p[2];
+
+ s390_double_umul_ppmm (p1_high, p1_low, p3_high, p3_low, s2limb);
+
+ carry_prod.dw = vec_load_2di_as_pair (p3_low, carry_limb);
+ product1.dw = vec_load_2di_as_pair (p1_high, p1_low);
+ carry_limb = p3_high;
+
+#ifdef ADD
+ rp0_addend = vec_load_elements_reversed (rp, 8);
+ sum0.sw = vec_add_u128 (carry_prod.sw, rp0_addend.sw);
+ carry_vec0.sw = vec_addc_u128 (carry_prod.sw, rp0_addend.sw);
+
+ sum1.sw = vec_adde_u128 (sum0.sw, product1.sw, carry_vec1.sw);
+ carry_vec1.sw = vec_addec_u128 (sum0.sw, product1.sw, carry_vec1.sw);
+#else
+ sum1.sw = vec_adde_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
+ carry_vec0.sw
+ = vec_addec_u128 (carry_prod.sw, product1.sw, carry_vec0.sw);
+#endif
+ vec_store_elements_reversed (rp, 8, sum1);
+ break;
+ }
+
+#ifdef BRCTG
+ for (; iterations > 0; iterations--)
+ {
+#else
+ while (idx < idx_bound)
+ {
+#endif
+ vec_t overlap_addend0;
+ vec_t overlap_addend1;
+
+ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
+ * result in a GPR pair. One of the factors is taken from the GPR pair
+ * and overwritten.
+ * To reuse factors, it turned out cheaper to load limbs multiple times
+ * than copying GPR contents. Enforce that and the use of addressing by
+ * base + index gpr + immediate displacement via inline asm.
+ */
+ ASM_LOADGPR (p0_low, s1p, idx, 0 + IDX_OFFSET);
+ ASM_LOADGPR (p1_low, s1p, idx, 8 + IDX_OFFSET);
+ ASM_LOADGPR (p2_low, s1p, idx, 16 + IDX_OFFSET);
+ ASM_LOADGPR (p3_low, s1p, idx, 24 + IDX_OFFSET);
+
+ /*
+ * accumulate products as follows (for addmul):
+ * | rp[i+3] | rp[i+2] | rp[i+1] | rp[i] |
+ * p0_high | p0_low |
+ * p1_high | p1_low | carry-limb in
+ * p2_high | p2_low |
+ * c-limb out <- p3_high | p3_low |
+ * | < 128-bit VR > < 128-bit VR >
+ *
+ * < rp1_addend > < rp0_addend >
+ * carry-chain 0 <- + <- + <- carry_vec0[127]
+ * < product1 > < product0 >
+ * carry-chain 1 <- + <- + <- carry_vec1[127]
+ * < overlap_addend1 > < overlap_addend0 >
+ *
+ * note that a 128-bit add with carry in + out is built from two insns
+ * - vec_adde_u128 (vacq) provides sum
+ * - vec_addec_u128 (vacccq) provides the new carry bit
+ */
+
+ s390_double_umul_ppmm (p0_high, p0_low, p1_high, p1_low, s2limb);
+
+ /*
+ * "barrier" to enforce scheduling loads for all limbs and first round
+ * of MLGR before anything else.
+ */
+ asm volatile("");
+
+ product0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+#ifdef ADD
+ rp0_addend = vec_load_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET);
+ rp1_addend = vec_load_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET);
+#endif
+ /* increment loop index to unblock dependent loads of limbs for the next
+ * iteration (see above at #define LOOP_ADVANCE) */
+ idx += LOOP_ADVANCE;
+
+ s390_double_umul_ppmm (p2_high, p2_low, p3_high, p3_low, s2limb);
+
+ overlap_addend0.dw = vec_load_2di_as_pair (p1_low, carry_limb);
+ asm volatile("");
+
+#ifdef ADD
+ sum0.sw = vec_adde_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
+ sum1.sw = vec_adde_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);
+
+ carry_vec0.sw
+ = vec_addec_u128 (product0.sw, rp0_addend.sw, carry_vec0.sw);
+ carry_vec1.sw
+ = vec_addec_u128 (sum0.sw, overlap_addend0.sw, carry_vec1.sw);
+#else
+ sum1.sw = vec_adde_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
+ carry_vec0.sw
+ = vec_addec_u128 (product0.sw, overlap_addend0.sw, carry_vec0.sw);
+#endif
+
+ asm volatile("");
+ product2.dw = vec_load_2di_as_pair (p2_high, p2_low);
+ overlap_addend1.dw = vec_load_2di_as_pair (p3_low, p1_high);
+
+ vec_t sum4;
+
+#ifdef ADD
+ vec_t sum3;
+ sum3.sw = vec_adde_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
+ sum4.sw = vec_adde_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);
+
+ carry_vec0.sw
+ = vec_addec_u128 (product2.sw, rp1_addend.sw, carry_vec0.sw);
+ carry_vec1.sw
+ = vec_addec_u128 (sum3.sw, overlap_addend1.sw, carry_vec1.sw);
+#else
+ sum4.sw = vec_adde_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
+ carry_vec0.sw
+ = vec_addec_u128 (product2.sw, overlap_addend1.sw, carry_vec0.sw);
+#endif
+ vec_store_elements_reversed_idx (rp, idx, IDX_OFFSET - LOOP_ADVANCE,
+ sum1);
+ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
+ sum4);
+
+ carry_limb = p3_high;
+ }
+
+#ifdef ADD
+ carry_vec0.dw += carry_vec1.dw;
+ carry_limb += carry_vec0.dw[1];
+#else
+ carry_limb += carry_vec0.dw[1];
+#endif
+
+ return carry_limb;
+}
+
+#undef OPERATION_addmul_1
+#undef OPERATION_mul_1
+#undef FUNCNAME
+#undef ADD
diff --git a/mpn/s390_64/z13/common-vec.h b/mpn/s390_64/z13/common-vec.h
new file mode 100644
index 000000000..a59e6eefe
--- /dev/null
+++ b/mpn/s390_64/z13/common-vec.h
@@ -0,0 +1,175 @@
+/* Common vector helpers and macros for IBM z13 and later
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#ifndef __S390_64_Z13_COMMON_VEC_H
+#define __S390_64_Z13_COMMON_VEC_H
+
+#include <unistd.h>
+#include <vecintrin.h>
+
+/*
+ * Vector intrinsics use vector element types that kind-of make sense for the
+ * specific operation (e.g., vec_permi permutes doublewords). To use VRs
+ * interchangeably with different intrinsics, typedef the two variants and wrap
+ * them in a union.
+ */
+#define VLEN_BYTES 16
+typedef unsigned long long v2di __attribute__ ((vector_size (VLEN_BYTES)));
+typedef unsigned char v16qi __attribute__ ((vector_size (VLEN_BYTES)));
+
+/*
+ * The Z vector intrinsics use vectors with different element types (e.g.,
+ * v16qi for the 128-bit adds and v2di for vec_permi).
+ */
+union vec
+{
+ v2di dw;
+ v16qi sw;
+};
+
+typedef union vec vec_t;
+
+/*
+ * single-instruction combine of two GPRs into a VR
+ */
+static inline v2di
+vec_load_2di_as_pair (unsigned long a, unsigned long b)
+{
+ v2di res;
+ __asm__("vlvgp\t%0,%1,%2" : "=v"(res) : "r"(a), "r"(b));
+ return res;
+}
+
+/*
+ * 64x64 mult where caller needs to care about proper register allocation:
+ * multiply xl with m1, treating both as unsigned, and place the result in
+ * xh:xl.
+ * mlgr operates on register pairs, so xh must be an even gpr followed by xl
+ */
+#define s390_umul_ppmm(xh, xl, m1) \
+ do \
+ { \
+ asm("mlgr\t%0,%3" : "=r"(xh), "=r"(xl) : "%1"(xl), "r"(m1)); \
+ } \
+ while (0);
+
+/*
+ * two 64x64 multiplications, scheduled so that they will dispatch and issue to
+ * different sides: each mlgr is dispatched alone in an instruction group and
+ * subsequent groups will issue on different execution sides.
+ * there is a variant where both products use the same multiplicand and one
+ * that uses two different multiplicands. constraints from s390_umul_ppmm apply
+ * here.
+ */
+#define s390_double_umul_ppmm(X0H, X0L, X1H, X1L, MX) \
+ do \
+ { \
+ asm("mlgr\t%[x0h],%[mx]\n\t" \
+ "mlgr\t%[x1h],%[mx]" \
+ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
+ [x1l] "=r"(X1L) \
+ : "[x0l]"(X0L), "[x1l]"(X1L), [mx] "r"(MX)); \
+ } \
+ while (0);
+
+#define s390_double_umul_ppmm_distinct(X0H, X0L, X1H, X1L, MX0, MX1) \
+ do \
+ { \
+ asm("mlgr\t%[x0h],%[mx0]\n\t" \
+ "mlgr\t%[x1h],%[mx1]" \
+ : [x0h] "=&r"(X0H), [x0l] "=&r"(X0L), [x1h] "=r"(X1H), \
+ [x1l] "=r"(X1L) \
+ : "[x0l]"(X0L), "[x1l]"(X1L), [mx0] "r"(MX0), [mx1] "r"(MX1)); \
+ } \
+ while (0);
+
+#define ASM_LOADGPR_BASE(DST, BASE, OFFSET) \
+ asm volatile("lg\t%[r],%[off](%[b])" \
+ : [r] "=r"(DST) \
+ : [b] "a"(BASE), [off] "L"(OFFSET) \
+ : "memory");
+
+#define ASM_LOADGPR(DST, BASE, INDEX, OFFSET) \
+ asm volatile("lg\t%[r],%[off](%[b],%[x])" \
+ : [r] "=r"(DST) \
+ : [b] "a"(BASE), [x] "a"(INDEX), [off] "L"(OFFSET) \
+ : "memory");
+
+/*
+ * Load a vector register from memory and swap the two 64-bit doubleword
+ * elements.
+ */
+static inline vec_t
+vec_load_elements_reversed_idx (mp_limb_t const *base, ssize_t const index,
+ ssize_t const offset)
+{
+ vec_t res;
+ char *ptr = (char *)base;
+
+ res.sw = *(v16qi *)(ptr + index + offset);
+ res.dw = vec_permi (res.dw, res.dw, 2);
+
+ return res;
+}
+
+static inline vec_t
+vec_load_elements_reversed (mp_limb_t const *base, ssize_t const offset)
+{
+ return vec_load_elements_reversed_idx (base, 0, offset);
+}
+
+/*
+ * Store a vector register to memory and swap the two 64-bit doubleword
+ * elements.
+ */
+static inline void
+vec_store_elements_reversed_idx (mp_limb_t *base, ssize_t const index,
+ ssize_t const offset, vec_t vec)
+{
+ char *ptr = (char *)base;
+
+ vec.dw = vec_permi (vec.dw, vec.dw, 2);
+ *(v16qi *)(ptr + index + offset) = vec.sw;
+}
+
+static inline void
+vec_store_elements_reversed (mp_limb_t *base, ssize_t const offset, vec_t vec)
+{
+ vec_store_elements_reversed_idx (base, 0, offset, vec);
+}
+
+#define ASM_VZERO(VEC) \
+ do \
+ { \
+ asm("vzero\t%[vec]" : [vec] "=v"(VEC)); \
+ } \
+ while (0)
+
+#endif
diff --git a/mpn/s390_64/z13/mul_1.c b/mpn/s390_64/z13/mul_1.c
new file mode 100644
index 000000000..7584dc8c7
--- /dev/null
+++ b/mpn/s390_64/z13/mul_1.c
@@ -0,0 +1,31 @@
+/* mul_1 for IBM z13 or later
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "s390_64/z13/addmul_1.c"
--
2.40.1

535
ibm_z13_simd_part2.patch Normal file
View File

@ -0,0 +1,535 @@
Co-authored-by: Stefan Liebler <stli at linux.ibm.com>
---
mpn/s390_64/z13/aormul_2.c | 476 +++++++++++++++++++++++++++++++++++
mpn/s390_64/z13/gmp-mparam.h | 37 +++
2 files changed, 513 insertions(+)
create mode 100644 mpn/s390_64/z13/aormul_2.c
create mode 100644 mpn/s390_64/z13/gmp-mparam.h
diff --git a/mpn/s390_64/z13/aormul_2.c b/mpn/s390_64/z13/aormul_2.c
new file mode 100644
index 000000000..9a69fc38e
--- /dev/null
+++ b/mpn/s390_64/z13/aormul_2.c
@@ -0,0 +1,476 @@
+/* Addmul_2 / mul_2 for IBM z13 or later
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include "gmp-impl.h"
+
+#include "s390_64/z13/common-vec.h"
+
+#undef FUNCNAME
+
+#ifdef DO_INLINE
+# ifdef OPERATION_addmul_2
+# define ADD
+# define FUNCNAME inline_addmul_2
+# elif defined(OPERATION_mul_2)
+# define FUNCNAME inline_mul_2
+# else
+# error Missing define for operation to perform
+# endif
+#else
+# ifdef OPERATION_addmul_2
+# define ADD
+# define FUNCNAME mpn_addmul_2
+# elif defined(OPERATION_mul_2)
+# define FUNCNAME mpn_mul_2
+# else
+# error Missing define for operation to perform
+# endif
+#endif
+
+#ifdef DO_INLINE
+static inline mp_limb_t
+FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n, const mp_limb_t *vp)
+ __attribute__ ((always_inline));
+
+static inline
+#endif
+mp_limb_t
+FUNCNAME (mp_limb_t *rp, const mp_limb_t *up, mp_size_t n,
+ const mp_limb_t *vp)
+{
+
+ /* Combine 64x64 multiplication into GPR pairs (MLGR) with 128-bit adds in
+ VRs (using each VR as a single 128-bit accumulator).
+ The inner loop is unrolled to four limbs, with two blocks of four
+ multiplications each. Since the MLGR operation operates on even/odd GPR
+ pairs, pin the products appropriately. */
+
+ register mp_limb_t p0_high asm("r0");
+ register mp_limb_t p0_low asm("r1");
+
+ register mp_limb_t p1_high asm("r8");
+ register mp_limb_t p1_low asm("r9");
+
+ register mp_limb_t p2_high asm("r6");
+ register mp_limb_t p2_low asm("r7");
+
+ register mp_limb_t p3_high asm("r10");
+ register mp_limb_t p3_low asm("r11");
+
+ vec_t carry_prod = { .dw = vec_splat_u64 (0) };
+ vec_t zero = { .dw = vec_splat_u64 (0) };
+
+ /* two carry-bits for the 128-bit VR adds - stored in VRs */
+#ifdef ADD
+ vec_t carry_vec0 = { .dw = vec_splat_u64 (0) };
+#endif
+ vec_t carry_vec1 = { .dw = vec_splat_u64 (0) };
+
+ vec_t tmp;
+
+ vec_t sum0, sum1;
+
+ /* products transferred into VRs for accumulating there */
+ vec_t pv0, pv3;
+ vec_t pv1_low, pv1_high, pv2_low, pv2_high;
+ vec_t low, middle, high;
+#ifdef ADD
+ vec_t rp0, rp1;
+#endif
+
+ register mp_limb_t v0 asm("r12");
+ register mp_limb_t v1 asm("r5");
+ v0 = vp[0];
+ v1 = vp[1];
+
+ /* The scalar multiplications compete with pointer and index increments for
+ * issue ports. Thus, increment the loop index in the middle of the loop so
+ * that the operations for the next iteration's multiplications can be
+ * loaded in time (looks horrible, yet helps performance) and make sure we
+ * use addressing with base reg + index reg + immediate displacement
+ * (so that only the single index needs incrementing, instead of multiple
+ * pointers). */
+#undef LOOP_ADVANCE
+#define LOOP_ADVANCE (4 * sizeof (mp_limb_t))
+#define IDX_OFFSET (LOOP_ADVANCE)
+
+ register ssize_t idx = 0 - IDX_OFFSET;
+#ifdef BRCTG
+ ssize_t iterations = (size_t)n / 4;
+#else
+ ssize_t const idx_bound = n * sizeof (mp_limb_t) - IDX_OFFSET;
+#endif
+
+ /*
+ * To minimize latency in the carry chain, accumulate in VRs with 128-bit
+ * adds with carry in and out. As a downside, these require two insns for
+ * each add - one to calculate the sum, one to deliver the carry out.
+ * To reduce the overall number of insns to execute, combine adding up
+ * product limbs such that there cannot be a carry out and one (for mul) or
+ * two (for addmul) adds with carry chains.
+ *
+ * Since (2^64-1) * (2^64-1) = (2^128-1) - 2 * (2^64-1), we can add two
+ * limbs into each 128-bit product without causing carry out.
+ *
+ * For each block of 2 limbs * 2 limbs
+ *
+ * | u[i] * v[0] (p2) |
+ * | u[i] * v[1] (p0) |
+ * | u[i+1] * v[0](p1) |
+ * | u[i+1] * v[1](p3) |
+ * < 128 bits > < 128 bits >
+ *
+ * we can begin accumulating with "simple" carry-oblivious 128-bit adds:
+ * - p0 + low limb of p1
+ * + high limb of p2
+ * and combine resulting low limb with p2's low limb
+ * - p3 + high limb of p1
+ * + high limb of sum above
+ * ... which will result in two 128-bit limbs to be fed into the carry
+ * chain(s).
+ * Overall, that scheme saves instructions and improves performance, despite
+ * slightly increasing latency between multiplications and carry chain (yet
+ * not in the carry chain).
+ */
+
+#define LOAD_LOW_LIMB(VEC, LIMB) \
+ do \
+ { \
+ asm("vzero\t%[vec]\n\t" \
+ "vlvgg\t%[vec],%[limb],1" \
+ : [vec] "=v"(VEC) \
+ : [limb] "r"(LIMB)); \
+ } \
+ while (0)
+
+ /* for the 128-bit adds in the carry chain, to calculate a + b + carry-in we
+ * need paired vec_adde_u128 (delivers sum) and vec_addec_u128 (delivers new
+ * carry) */
+#define ADD_UP2_CARRY_INOUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
+ do \
+ { \
+ sum##SUMIDX.sw \
+ = vec_adde_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
+ carry_vec##CARRYIDX.sw \
+ = vec_addec_u128 (ADDEND1.sw, ADDEND2.sw, carry_vec##CARRYIDX.sw); \
+ } \
+ while (0)
+
+#define ADD_UP_CARRY_INOUT(SUMIDX, ADDEND1, ADDEND2) \
+ ADD_UP2_CARRY_INOUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)
+
+ /* variant without carry-in for prologue */
+#define ADD_UP2_CARRY_OUT(SUMIDX, CARRYIDX, ADDEND1, ADDEND2) \
+ do \
+ { \
+ sum##SUMIDX.sw = vec_add_u128 (ADDEND1.sw, ADDEND2.sw); \
+ carry_vec##CARRYIDX.sw = vec_addc_u128 (ADDEND1.sw, ADDEND2.sw); \
+ } \
+ while (0)
+
+#define ADD_UP_CARRY_OUT(SUMIDX, ADDEND1, ADDEND2) \
+ ADD_UP2_CARRY_OUT (SUMIDX, SUMIDX, ADDEND1, ADDEND2)
+
+ /* prologue for 4x-unrolled main loop */
+ switch ((size_t)n % 4)
+ {
+ case 1:
+ ASM_LOADGPR_BASE (p0_low, up, 0);
+ ASM_LOADGPR_BASE (p1_low, up, 0);
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
+ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);
+
+/* gcc tries to be too clever and vlr from a reg that is already zero. vzero is
+ * cheaper. */
+# define NEW_CARRY(VEC, LIMB) \
+ do \
+ { \
+ asm("vzero\t%[vec]\n\t" \
+ "vlvgg\t%[vec],%[limb],1" \
+ : [vec] "=v"(VEC) \
+ : [limb] "r"(LIMB)); \
+ } \
+ while (0)
+
+ NEW_CARRY (tmp, p0_high);
+
+ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);
+#ifdef ADD
+ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
+#else
+ rp[0] = p0_low;
+#endif
+ idx += sizeof (mp_limb_t);
+ break;
+
+ case 2:
+ ASM_LOADGPR_BASE (p0_low, up, 0);
+ ASM_LOADGPR_BASE (p1_low, up, 8);
+ ASM_LOADGPR_BASE (p2_low, up, 0);
+ ASM_LOADGPR_BASE (p3_low, up, 8);
+
+ asm(""
+ : "=r"(p0_low), "=r"(p2_low)
+ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
+ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);
+
+ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+ LOAD_LOW_LIMB (pv1_low, p1_low);
+ LOAD_LOW_LIMB (pv1_high, p1_high);
+ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
+ LOAD_LOW_LIMB (pv2_high, p2_high);
+ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);
+ LOAD_LOW_LIMB (pv2_low, p2_low);
+ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
+ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);
+ low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
+ middle.dw = vec_permi (zero.dw, middle.dw, 0);
+ high.sw = vec_add_u128 (middle.sw, pv3.sw);
+#ifdef ADD
+ rp0 = vec_load_elements_reversed (rp, 0);
+ ADD_UP_CARRY_OUT (0, rp0, carry_prod);
+#else
+ sum0 = carry_prod;
+#endif
+ ADD_UP_CARRY_OUT (1, sum0, low);
+ vec_store_elements_reversed (rp, 0, sum1);
+ carry_prod = high;
+
+ idx += 2 * sizeof (mp_limb_t);
+ break;
+
+ case 3:
+ ASM_LOADGPR_BASE (p0_low, up, 0);
+ ASM_LOADGPR_BASE (p1_low, up, 0);
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v0, v1);
+ carry_prod.dw = vec_load_2di_as_pair (p1_high, p1_low);
+ NEW_CARRY (tmp, p0_high);
+ carry_prod.sw = vec_add_u128 (carry_prod.sw, tmp.sw);
+
+#ifdef ADD
+ carry_vec1.dw[1] = __builtin_add_overflow (rp[0], p0_low, rp);
+#else
+ rp[0] = p0_low;
+#endif
+
+ ASM_LOADGPR_BASE (p0_low, up, 8);
+ ASM_LOADGPR_BASE (p1_low, up, 16);
+ ASM_LOADGPR_BASE (p2_low, up, 8);
+ ASM_LOADGPR_BASE (p3_low, up, 16);
+
+ asm(""
+ : "=r"(p0_low), "=r"(p2_low)
+ : "r"(p3_low), "0"(p0_low), "r"(p1_low), "1"(p2_low));
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
+ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);
+
+ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+ LOAD_LOW_LIMB (pv1_low, p1_low);
+ LOAD_LOW_LIMB (pv1_high, p1_high);
+
+ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
+ LOAD_LOW_LIMB (pv2_high, p2_high);
+ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);
+
+ LOAD_LOW_LIMB (pv2_low, p2_low);
+
+ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
+ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);
+
+ low.dw = vec_permi (middle.dw, pv2_low.dw, 3);
+ middle.dw = vec_permi (zero.dw, middle.dw, 0);
+ high.sw = vec_add_u128 (middle.sw, pv3.sw);
+
+#ifdef ADD
+ vec_t rp0 = vec_load_elements_reversed (rp, 8);
+ ADD_UP_CARRY_OUT (0, rp0, carry_prod);
+#else
+ sum0 = carry_prod;
+#endif
+ ADD_UP_CARRY_INOUT (1, sum0, low);
+
+ vec_store_elements_reversed (rp, 8, sum1);
+
+ carry_prod = high;
+
+ idx += 3 * sizeof (mp_limb_t);
+ break;
+ }
+
+ /*
+ * branch-on-count implicitly hint to the branch prediction as taken, while
+ * compare-and-branch hints as not taken. currently, using branch-on-count
+ * has a performance advantage, but it is not clear that it is generally
+ * the better choice (e.g., branch-on-count requires decrementing the
+ * separate counter). so, allow switching the loop condition to enable
+ * either category of branch instructions:
+ * - idx is less than an upper bound, for compare-and-branch
+ * - iteration counter greater than zero, for branch-on-count
+ */
+#ifdef BRCTG
+ for (; iterations > 0; iterations--)
+ {
+#else
+ while (idx < idx_bound)
+ {
+#endif
+ /* The 64x64->128 MLGR multiplies two factors in GPRs and stores the
+ * result in a GPR pair. One of the factors is taken from the GPR pair
+ * and overwritten.
+ * To reuse factors, it turned out cheaper to load limbs multiple times
+ * than copying GPR contents. Enforce that and the use of addressing by
+ * base + index gpr + immediate displacement via inline asm.
+ */
+ ASM_LOADGPR (p0_low, up, idx, 0 + IDX_OFFSET);
+ ASM_LOADGPR (p1_low, up, idx, 8 + IDX_OFFSET);
+ ASM_LOADGPR (p2_low, up, idx, 0 + IDX_OFFSET);
+ ASM_LOADGPR (p3_low, up, idx, 8 + IDX_OFFSET);
+
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
+
+ pv0.dw = vec_load_2di_as_pair (p0_high, p0_low);
+
+ LOAD_LOW_LIMB (pv1_low, p1_low);
+ LOAD_LOW_LIMB (pv1_high, p1_high);
+
+ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);
+
+ pv0.sw = vec_add_u128 (pv0.sw, pv1_low.sw);
+ LOAD_LOW_LIMB (pv2_high, p2_high);
+ pv3.dw = vec_load_2di_as_pair (p3_high, p3_low);
+
+ LOAD_LOW_LIMB (pv2_low, p2_low);
+
+ ASM_LOADGPR (p0_low, up, idx, 16 + IDX_OFFSET);
+ ASM_LOADGPR (p1_low, up, idx, 24 + IDX_OFFSET);
+ ASM_LOADGPR (p2_low, up, idx, 16 + IDX_OFFSET);
+ ASM_LOADGPR (p3_low, up, idx, 24 + IDX_OFFSET);
+
+ idx += LOOP_ADVANCE;
+
+ /*
+ * "barrier" to enforce scheduling the index increment before the second
+ * block of multiplications. not required for clang.
+ */
+#ifndef __clang__
+ asm(""
+ : "=r"(idx), "=r"(p0_high), "=r"(p2_high)
+ : "0"(idx), "1"(p0_high), "2"(p2_high));
+#endif
+
+ s390_double_umul_ppmm_distinct (p0_high, p0_low, p1_high, p1_low, v1, v0);
+ s390_double_umul_ppmm_distinct (p2_high, p2_low, p3_high, p3_low, v0, v1);
+
+ /*
+ * "barrier" to enforce scheduling all MLGRs first, before any adding
+ * up. note that clang produces better code without.
+ */
+#ifndef __clang__
+ asm(""
+ : "=v"(pv0.sw), "=v"(pv3.sw)
+ : "1"(pv3.sw), "0"(pv0.sw), "r"(p0_high), "r"(p2_high));
+#endif
+
+ pv3.sw = vec_add_u128 (pv3.sw, pv1_high.sw);
+ middle.sw = vec_add_u128 (pv0.sw, pv2_high.sw);
+
+ low.dw = vec_permi (middle.dw, pv2_low.dw,
+ 3); /* least-significant doubleword from both vectors */
+ middle.dw = vec_permi (zero.dw, middle.dw, 0);
+ high.sw = vec_add_u128 (middle.sw, pv3.sw);
+
+#ifdef ADD
+ rp0 = vec_load_elements_reversed_idx (rp, idx,
+ 0 + IDX_OFFSET - LOOP_ADVANCE);
+ ADD_UP_CARRY_INOUT (0, rp0, carry_prod);
+#else
+ sum0 = carry_prod;
+#endif
+ ADD_UP_CARRY_INOUT (1, sum0, low);
+
+ vec_store_elements_reversed_idx (rp, idx, 0 + IDX_OFFSET - LOOP_ADVANCE,
+ sum1);
+
+ carry_prod = high;
+
+ vec_t pv0_2, pv3_2;
+ vec_t pv1_low_2, pv1_high_2, pv2_low_2, pv2_high_2;
+ vec_t low_2, middle_2, high_2;
+ vec_t sum2, sum3;
+
+ pv0_2.dw = vec_load_2di_as_pair (p0_high, p0_low);
+ LOAD_LOW_LIMB (pv1_low_2, p1_low);
+ LOAD_LOW_LIMB (pv1_high_2, p1_high);
+
+ pv0_2.sw = vec_add_u128 (pv0_2.sw, pv1_low_2.sw);
+ LOAD_LOW_LIMB (pv2_high_2, p2_high);
+ pv3_2.dw = vec_load_2di_as_pair (p3_high, p3_low);
+ pv3_2.sw = vec_add_u128 (pv3_2.sw, pv1_high_2.sw);
+ middle_2.sw = vec_add_u128 (pv0_2.sw, pv2_high_2.sw);
+
+ LOAD_LOW_LIMB (pv2_low_2, p2_low);
+ low_2.dw
+ = vec_permi (middle_2.dw, pv2_low_2.dw,
+ 3); /* least-significant doubleword from both vectors */
+ middle_2.dw = vec_permi (zero.dw, middle_2.dw, 0);
+ high_2.sw = vec_add_u128 (middle_2.sw, pv3_2.sw);
+
+ /*
+ * another "barrier" to influence scheduling. (also helps in clang)
+ */
+ asm("" : : "v"(pv0_2.sw), "r"(p2_high), "r"(p3_high), "v"(pv3_2.sw));
+
+#ifdef ADD
+ rp1 = vec_load_elements_reversed_idx (rp, idx,
+ 16 + IDX_OFFSET - LOOP_ADVANCE);
+ ADD_UP2_CARRY_INOUT (2, 0, rp1, carry_prod);
+#else
+ sum2 = carry_prod;
+#endif
+ ADD_UP2_CARRY_INOUT (3, 1, sum2, low_2);
+
+ vec_store_elements_reversed_idx (rp, idx, 16 + IDX_OFFSET - LOOP_ADVANCE,
+ sum3);
+
+ carry_prod = high_2;
+ }
+
+#ifdef ADD
+ sum0.sw = vec_adde_u128 (carry_prod.sw, carry_vec0.sw, carry_vec1.sw);
+#else
+ sum0.sw = vec_add_u128 (carry_prod.sw, carry_vec1.sw);
+#endif
+
+ *(mp_ptr) (((char *)rp) + idx + 0 + IDX_OFFSET) = (mp_limb_t)sum0.dw[1];
+
+ return (mp_limb_t)sum0.dw[0];
+}
diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h
new file mode 100644
index 000000000..a17503fd0
--- /dev/null
+++ b/mpn/s390_64/z13/gmp-mparam.h
@@ -0,0 +1,37 @@
+/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file.
+
+Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#define GMP_LIMB_BITS 64
+#define GMP_LIMB_BYTES 8
+
+#define HAVE_NATIVE_mpn_addmul_2 1
+#define HAVE_NATIVE_mpn_mul_2 1
+
+#include "mpn/s390_64/gmp-mparam.h"
--
2.40.1

138
ibm_z13_simd_part3.patch Normal file
View File

@ -0,0 +1,138 @@
Co-authored-by: Stefan Liebler <stli at linux.ibm.com>
---
mpn/s390_64/z13/mul_basecase.c | 124 +++++++++++++++++++++++++++++++++
1 file changed, 124 insertions(+)
create mode 100644 mpn/s390_64/z13/mul_basecase.c
diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c
new file mode 100644
index 000000000..f1b7160b3
--- /dev/null
+++ b/mpn/s390_64/z13/mul_basecase.c
@@ -0,0 +1,124 @@
+/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two
+ natural numbers of length m and n.
+
+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY
+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+ * the GNU Lesser General Public License as published by the Free
+ Software Foundation; either version 3 of the License, or (at your
+ option) any later version.
+
+or
+
+ * the GNU General Public License as published by the Free Software
+ Foundation; either version 2 of the License, or (at your option) any
+ later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library. If not,
+see https://www.gnu.org/licenses/. */
+
+#include <stdlib.h>
+
+#include "gmp-impl.h"
+
+/* Note: we explicitly inline all mul and addmul routines here to reduce the
+ * number of branches in prologues of unrolled functions. That comes at the
+ cost of duplicating common loop bodies in object code. */
+#define DO_INLINE
+
+/*
+ * tweak loop conditions in addmul subroutines to enable use of
+ * branch-relative-on-count (BRCTG) instructions, which currently results in
+ * better performance.
+ */
+#define BRCTG
+
+#include "s390_64/z13/common-vec.h"
+
+#define OPERATION_mul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_mul_1
+
+#define OPERATION_addmul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_addmul_1
+
+#define OPERATION_mul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_mul_2
+
+#define OPERATION_addmul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_addmul_2
+
+void
+mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,
+ mp_size_t vn)
+{
+ ASSERT (un >= vn);
+ ASSERT (vn >= 1);
+ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));
+ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));
+
+ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch
+ * for un%4 and inline specific variants. */
+
+#define BRANCH_FOR_MOD(N) \
+ do \
+ { \
+ if (vn >= 2) \
+ { \
+ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \
+ rp += 2, vp += 2, vn -= 2; \
+ } \
+ else \
+ { \
+ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \
+ return; \
+ } \
+ \
+ while (vn >= 2) \
+ { \
+ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \
+ rp += 2, vp += 2, vn -= 2; \
+ } \
+ \
+ while (vn >= 1) \
+ { \
+ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \
+ rp += 1, vp += 1, vn -= 1; \
+ } \
+ } \
+ while (0);
+
+ switch (((size_t)un) % 4)
+ {
+ case 0:
+ BRANCH_FOR_MOD (0);
+ break;
+ case 1:
+ BRANCH_FOR_MOD (1);
+ break;
+ case 2:
+ BRANCH_FOR_MOD (2);
+ break;
+ case 3:
+ BRANCH_FOR_MOD (3);
+ break;
+ }
+}
--
2.40.1

151
ibm_z13_simd_part4.patch Normal file
View File

@ -0,0 +1,151 @@
From: Marius Hillenbrand <mhillen at linux.ibm.com>
---
mpn/s390_64/z13/gmp-mparam.h | 129 ++++++++++++++++++++++++++++++++++-
1 file changed, 127 insertions(+), 2 deletions(-)
diff --git a/mpn/s390_64/z13/gmp-mparam.h b/mpn/s390_64/z13/gmp-mparam.h
index a17503fd0..50e7f39d1 100644
--- a/mpn/s390_64/z13/gmp-mparam.h
+++ b/mpn/s390_64/z13/gmp-mparam.h
@@ -1,6 +1,6 @@
/* S/390-64 for IBM z13 gmp-mparam.h -- Compiler/machine parameter header file.
-Copyright 1991, 1993, 1994, 2000-2011 Free Software Foundation, Inc.
+Copyright 2021 Free Software Foundation, Inc.
This file is part of the GNU MP Library.
@@ -34,4 +34,129 @@ see https://www.gnu.org/licenses/. */
#define HAVE_NATIVE_mpn_addmul_2 1
#define HAVE_NATIVE_mpn_mul_2 1
-#include "mpn/s390_64/gmp-mparam.h"
+/* Generated by tuneup.c, 2021-07-30, gcc 10.2 */
+
+#define DIVREM_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIVREM_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_1P_METHOD 2
+#define MOD_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define MOD_1N_TO_MOD_1_1_THRESHOLD 17
+#define MOD_1U_TO_MOD_1_1_THRESHOLD 15
+#define MOD_1_1_TO_MOD_1_2_THRESHOLD 0 /* never mpn_mod_1_1p */
+#define MOD_1_2_TO_MOD_1_4_THRESHOLD 0 /* never mpn_mod_1s_2p */
+#define PREINV_MOD_1_TO_MOD_1_THRESHOLD 5
+#define USE_PREINV_DIVREM_1 1
+#define DIV_QR_1N_PI1_METHOD 3
+#define DIV_QR_1_NORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_1_UNNORM_THRESHOLD MP_SIZE_T_MAX /* never */
+#define DIV_QR_2_PI2_THRESHOLD 996
+#define DIVEXACT_1_THRESHOLD 4
+#define BMOD_1_TO_MOD_1_THRESHOLD 0 /* always */
+
+#define DIV_1_VS_MUL_1_PERCENT 404
+
+#define MUL_TOOM22_THRESHOLD 23
+#define MUL_TOOM33_THRESHOLD 94
+#define MUL_TOOM44_THRESHOLD 166
+#define MUL_TOOM6H_THRESHOLD 286
+#define MUL_TOOM8H_THRESHOLD 626
+
+#define MUL_TOOM32_TO_TOOM43_THRESHOLD 113
+#define MUL_TOOM32_TO_TOOM53_THRESHOLD 138
+#define MUL_TOOM42_TO_TOOM53_THRESHOLD 143
+#define MUL_TOOM42_TO_TOOM63_THRESHOLD 145
+#define MUL_TOOM43_TO_TOOM54_THRESHOLD 130
+
+#define SQR_BASECASE_THRESHOLD 0 /* always (native) */
+#define SQR_TOOM2_THRESHOLD 12
+#define SQR_TOOM3_THRESHOLD 84
+#define SQR_TOOM4_THRESHOLD 234
+#define SQR_TOOM6_THRESHOLD 318
+#define SQR_TOOM8_THRESHOLD 478
+
+#define MULMID_TOOM42_THRESHOLD 42
+
+#define MULMOD_BNM1_THRESHOLD 13
+#define SQRMOD_BNM1_THRESHOLD 7
+
+#define MUL_FFT_MODF_THRESHOLD 332 /* k = 5 */
+#define MUL_FFT_TABLE3 \
+ { { 332, 5}, { 19, 6}, { 10, 5}, { 21, 6}, \
+ { 21, 7}, { 21, 8}, { 11, 7}, { 24, 8}, \
+ { 13, 7}, { 27, 8}, { 15, 7}, { 31, 8}, \
+ { 17, 7}, { 35, 8}, { 19, 7}, { 39, 8}, \
+ { 21, 9}, { 11, 8}, { 27, 9}, { 15, 8}, \
+ { 35, 9}, { 19, 8}, { 41, 9}, { 23, 8}, \
+ { 47, 9}, { 27,10}, { 15, 9}, { 39,10}, \
+ { 23, 9}, { 47,11}, { 15,10}, { 31, 9}, \
+ { 67,10}, { 47,11}, { 2048,12}, { 4096,13}, \
+ { 8192,14}, { 16384,15}, { 32768,16}, { 65536,17}, \
+ { 131072,18}, { 262144,19}, { 524288,20}, {1048576,21}, \
+ {2097152,22}, {4194304,23}, {8388608,24} }
+#define MUL_FFT_TABLE3_SIZE 47
+#define MUL_FFT_THRESHOLD 2752
+
+#define SQR_FFT_MODF_THRESHOLD 240 /* k = 5 */
+#define SQR_FFT_TABLE3 \
+ { { 240, 5}, { 8, 4}, { 17, 5}, { 13, 6}, \
+ { 7, 5}, { 15, 6}, { 8, 5}, { 17, 6}, \
+ { 9, 5}, { 19, 6}, { 15, 7}, { 8, 6}, \
+ { 17, 7}, { 9, 6}, { 19, 7}, { 10, 6}, \
+ { 21, 7}, { 17, 8}, { 9, 7}, { 20, 8}, \
+ { 11, 7}, { 23, 8}, { 13, 9}, { 7, 8}, \
+ { 21, 9}, { 11, 8}, { 23, 9}, { 15, 8}, \
+ { 31, 9}, { 19, 8}, { 39, 9}, { 23,10}, \
+ { 15, 9}, { 39,10}, { 23,11}, { 15,10}, \
+ { 31, 9}, { 63,10}, { 47,11}, { 2048,12}, \
+ { 4096,13}, { 8192,14}, { 16384,15}, { 32768,16}, \
+ { 65536,17}, { 131072,18}, { 262144,19}, { 524288,20}, \
+ {1048576,21}, {2097152,22}, {4194304,23}, {8388608,24} }
+#define SQR_FFT_TABLE3_SIZE 52
+#define SQR_FFT_THRESHOLD 1856
+
+#define MULLO_BASECASE_THRESHOLD 0 /* always */
+#define MULLO_DC_THRESHOLD 25
+#define MULLO_MUL_N_THRESHOLD 5397
+#define SQRLO_BASECASE_THRESHOLD 0 /* always */
+#define SQRLO_DC_THRESHOLD 396
+#define SQRLO_SQR_THRESHOLD 3704
+
+#define DC_DIV_QR_THRESHOLD 15
+#define DC_DIVAPPR_Q_THRESHOLD 50
+#define DC_BDIV_QR_THRESHOLD 66
+#define DC_BDIV_Q_THRESHOLD 202
+
+#define INV_MULMOD_BNM1_THRESHOLD 46
+#define INV_NEWTON_THRESHOLD 29
+#define INV_APPR_THRESHOLD 13
+
+#define BINV_NEWTON_THRESHOLD 312
+#define REDC_1_TO_REDC_2_THRESHOLD 79
+#define REDC_2_TO_REDC_N_THRESHOLD 0 /* always */
+
+#define MU_DIV_QR_THRESHOLD 979
+#define MU_DIVAPPR_Q_THRESHOLD 979
+#define MUPI_DIV_QR_THRESHOLD 13
+#define MU_BDIV_QR_THRESHOLD 942
+#define MU_BDIV_Q_THRESHOLD 1367
+
+#define POWM_SEC_TABLE 3,19,215,1730
+
+#define GET_STR_DC_THRESHOLD 10
+#define GET_STR_PRECOMPUTE_THRESHOLD 15
+#define SET_STR_DC_THRESHOLD 882
+#define SET_STR_PRECOMPUTE_THRESHOLD 2520
+
+#define FAC_DSC_THRESHOLD 228
+#define FAC_ODD_THRESHOLD 24
+
+#define MATRIX22_STRASSEN_THRESHOLD 19
+#define HGCD2_DIV1_METHOD 1
+#define HGCD_THRESHOLD 61
+#define HGCD_APPR_THRESHOLD 51
+#define HGCD_REDUCE_THRESHOLD 1962
+#define GCD_DC_THRESHOLD 217
+#define GCDEXT_DC_THRESHOLD 263
+#define JACOBI_BASE_METHOD 4
+
--
2.40.1

1
sources Normal file
View File

@ -0,0 +1 @@
SHA512 (gmp-6.2.0.tar.bz2) = ff22ed47fff176ed56301ecab0213316150a3abb370fed031635804f829c878296d7c65597b1f687f394479eef04fae6eba771162f7d363dc4c94c7334fc1fc0