gmp/SOURCES/ibm_z13_simd_part3.patch

Co-authored-by: Stefan Liebler <stli at linux.ibm.com>
---
 mpn/s390_64/z13/mul_basecase.c | 125 +++++++++++++++++++++++++++++++++
 1 file changed, 125 insertions(+)
 create mode 100644 mpn/s390_64/z13/mul_basecase.c

diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c
new file mode 100644
index 000000000..f1b7160b3
--- /dev/null
+++ b/mpn/s390_64/z13/mul_basecase.c
@@ -0,0 +1,125 @@
+/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two
+   natural numbers of length un and vn limbs.
+
+   THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE.  IT IS ONLY
+   SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.
+
+Copyright 2021 Free Software Foundation, Inc.
+
+This file is part of the GNU MP Library.
+
+The GNU MP Library is free software; you can redistribute it and/or modify
+it under the terms of either:
+
+  * the GNU Lesser General Public License as published by the Free
+    Software Foundation; either version 3 of the License, or (at your
+    option) any later version.
+
+or
+
+  * the GNU General Public License as published by the Free Software
+    Foundation; either version 2 of the License, or (at your option) any
+    later version.
+
+or both in parallel, as here.
+
+The GNU MP Library is distributed in the hope that it will be useful, but
+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
+or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
+for more details.
+
+You should have received copies of the GNU General Public License and the
+GNU Lesser General Public License along with the GNU MP Library.  If not,
+see https://www.gnu.org/licenses/.  */
+
+#include <stdlib.h>
+
+#include "gmp.h"
+#include "gmp-impl.h"
+
+/* Note: we explicitly inline all mul and addmul routines here to reduce the
+ * number of branches in prologues of unrolled functions. That comes at the
+ * cost of duplicating common loop bodies in object code. */
+#define DO_INLINE
+
+/*
+ * Tweak the loop conditions in the addmul subroutines to enable the use of
+ * branch-relative-on-count (BRCTG) instructions, which currently results in
+ * better performance.
+ */
+#define BRCTG
+
+#include "s390_64/z13/common-vec.h"
+
+#define OPERATION_mul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_mul_1
+
+#define OPERATION_addmul_1
+#include "s390_64/z13/addmul_1.c"
+#undef OPERATION_addmul_1
+
+#define OPERATION_mul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_mul_2
+
+#define OPERATION_addmul_2
+#include "s390_64/z13/aormul_2.c"
+#undef OPERATION_addmul_2
+
+/* Store the un+vn limb product of {up,un} and {vp,vn} at rp.  Requires
+   un >= vn >= 1 and rp overlapping neither input (see the ASSERTs).  */
+void
+mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,
+                  mp_size_t vn)
+{
+  ASSERT (un >= vn);
+  ASSERT (vn >= 1);
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));
+  ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));
+
+  /* The (add)mul_1/2 bodies are unrolled four times.  Branching on un % 4
+     once, out here, lets every case below inline its own variants; N is
+     unused and merely labels the residue at each expansion site.  */
+
+#define BRANCH_FOR_MOD(N)                                                     \
+  do                                                                          \
+    {                                                                         \
+      /* Single-limb vp: one mul_1 pass is the whole job.  */                 \
+      if (vn < 2)                                                             \
+        {                                                                     \
+          rp[un] = inline_mul_1 (rp, up, un, vp[0]);                          \
+          return;                                                             \
+        }                                                                     \
+                                                                              \
+      /* Leading pass over two vp limbs uses mul_2.  */                       \
+      rp[un + 1] = inline_mul_2 (rp, up, un, vp);                             \
+      rp += 2, vp += 2, vn -= 2;                                              \
+                                                                              \
+      /* Remaining vp limbs, two per addmul_2 pass ...  */                    \
+      for (; vn >= 2; rp += 2, vp += 2, vn -= 2)                              \
+        rp[un + 1] = inline_addmul_2 (rp, up, un, vp);                        \
+                                                                              \
+      /* ... then at most one leftover limb via addmul_1.  */                 \
+      for (; vn >= 1; rp += 1, vp += 1, vn -= 1)                              \
+        rp[un] = inline_addmul_1 (rp, up, un, vp[0]);                         \
+    }                                                                         \
+  while (0)
+
+  switch ((size_t) un % 4)
+    {
+    case 0:
+      BRANCH_FOR_MOD (0);
+      break;
+    case 1:
+      BRANCH_FOR_MOD (1);
+      break;
+    case 2:
+      BRANCH_FOR_MOD (2);
+      break;
+    case 3:
+      BRANCH_FOR_MOD (3);
+      break;
+    }
+#undef BRANCH_FOR_MOD
+}
-- 
2.40.1
import CS gmp-6.1.2-12.el8 2024-03-27 19:47:39 +00:00			`Co-authored-by: Stefan Liebler <stli at linux.ibm.com>`
			`---`
			`mpn/s390_64/z13/mul_basecase.c \| 124 +++++++++++++++++++++++++++++++++`
			`1 file changed, 124 insertions(+)`
			`create mode 100644 mpn/s390_64/z13/mul_basecase.c`

			`diff --git a/mpn/s390_64/z13/mul_basecase.c b/mpn/s390_64/z13/mul_basecase.c`
			`new file mode 100644`
			`index 000000000..f1b7160b3`
			`--- /dev/null`
			`+++ b/mpn/s390_64/z13/mul_basecase.c`
			`@@ -0,0 +1,125 @@`
			`+/* mpn_mul_basecase for IBM z13 and later -- Internal routine to multiply two`
			`+ natural numbers of length m and n.`
			`+`
			`+ THIS IS AN INTERNAL FUNCTION WITH A MUTABLE INTERFACE. IT IS ONLY`
			`+ SAFE TO REACH THIS FUNCTION THROUGH DOCUMENTED INTERFACES.`
			`+`
			`+Copyright 2021 Free Software Foundation, Inc.`
			`+`
			`+This file is part of the GNU MP Library.`
			`+`
			`+The GNU MP Library is free software; you can redistribute it and/or modify`
			`+it under the terms of either:`
			`+`
			`+ * the GNU Lesser General Public License as published by the Free`
			`+ Software Foundation; either version 3 of the License, or (at your`
			`+ option) any later version.`
			`+`
			`+or`
			`+`
			`+ * the GNU General Public License as published by the Free Software`
			`+ Foundation; either version 2 of the License, or (at your option) any`
			`+ later version.`
			`+`
			`+or both in parallel, as here.`
			`+`
			`+The GNU MP Library is distributed in the hope that it will be useful, but`
			`+WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY`
			`+or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License`
			`+for more details.`
			`+`
			`+You should have received copies of the GNU General Public License and the`
			`+GNU Lesser General Public License along with the GNU MP Library. If not,`
			`+see https://www.gnu.org/licenses/. */`
			`+`
			`+#include <stdlib.h>`
			`+`
			`+#include "gmp.h"`
			`+#include "gmp-impl.h"`
			`+`
			`+/* Note: we explicitly inline all mul and addmul routines here to reduce the`
			`+ * number of branches in prologues of unrolled functions. That comes at the`
			`+ cost of duplicating common loop bodies in object code. */`
			`+#define DO_INLINE`
			`+`
			`+/*`
			`+ * tweak loop conditions in addmul subroutines to enable use of`
			`+ * branch-relative-on-count (BRCTG) instructions, which currently results in`
			`+ * better performance.`
			`+ */`
			`+#define BRCTG`
			`+`
			`+#include "s390_64/z13/common-vec.h"`
			`+`
			`+#define OPERATION_mul_1`
			`+#include "s390_64/z13/addmul_1.c"`
			`+#undef OPERATION_mul_1`
			`+`
			`+#define OPERATION_addmul_1`
			`+#include "s390_64/z13/addmul_1.c"`
			`+#undef OPERATION_addmul_1`
			`+`
			`+#define OPERATION_mul_2`
			`+#include "s390_64/z13/aormul_2.c"`
			`+#undef OPERATION_mul_2`
			`+`
			`+#define OPERATION_addmul_2`
			`+#include "s390_64/z13/aormul_2.c"`
			`+#undef OPERATION_addmul_2`
			`+`
			`+void`
			`+mpn_mul_basecase (mp_ptr rp, mp_srcptr up, mp_size_t un, mp_srcptr vp,`
			`+ mp_size_t vn)`
			`+{`
			`+ ASSERT (un >= vn);`
			`+ ASSERT (vn >= 1);`
			`+ ASSERT (!MPN_OVERLAP_P (rp, un + vn, up, un));`
			`+ ASSERT (!MPN_OVERLAP_P (rp, un + vn, vp, vn));`
			`+`
			`+ /* The implementations of (add)mul_1/2 are 4x-unrolled. Pull out the branch`
			`+ * for un%4 and inline specific variants. */`
			`+`
			`+#define BRANCH_FOR_MOD(N) \`
			`+ do \`
			`+ { \`
			`+ if (vn >= 2) \`
			`+ { \`
			`+ rp[un + 1] = inline_mul_2 (rp, up, un, vp); \`
			`+ rp += 2, vp += 2, vn -= 2; \`
			`+ } \`
			`+ else \`
			`+ { \`
			`+ rp[un] = inline_mul_1 (rp, up, un, vp[0]); \`
			`+ return; \`
			`+ } \`
			`+ \`
			`+ while (vn >= 2) \`
			`+ { \`
			`+ rp[un + 2 - 1] = inline_addmul_2 (rp, up, un, vp); \`
			`+ rp += 2, vp += 2, vn -= 2; \`
			`+ } \`
			`+ \`
			`+ while (vn >= 1) \`
			`+ { \`
			`+ rp[un] = inline_addmul_1 (rp, up, un, vp[0]); \`
			`+ rp += 1, vp += 1, vn -= 1; \`
			`+ } \`
			`+ } \`
			`+ while (0);`
			`+`
			`+ switch (((size_t)un) % 4)`
			`+ {`
			`+ case 0:`
			`+ BRANCH_FOR_MOD (0);`
			`+ break;`
			`+ case 1:`
			`+ BRANCH_FOR_MOD (1);`
			`+ break;`
			`+ case 2:`
			`+ BRANCH_FOR_MOD (2);`
			`+ break;`
			`+ case 3:`
			`+ BRANCH_FOR_MOD (3);`
			`+ break;`
			`+ }`
			`+}`
			`--`
			`2.40.1`