Import patches from upstream to fix gcc 9 compatibility.

Susi Lehtola 2019-02-17 21:36:32 +01:00
parent 2529d97e84
commit 2a8a1574d7
7 changed files with 3297 additions and 1 deletion
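
The patches below all fix the same class of inline-assembly bug: operands that the assembly modifies were declared in the input ("r") constraint list, where GCC is entitled to assume their registers come out of the statement unchanged. GCC 9 (especially with -O2 -ftree-vectorize) started exploiting that assumption and miscompiled the affected kernels. The fix is to move each modified operand to the output list with a "+r" (read-write) constraint, and to list every register the template writes in the clobber list. A minimal sketch of the pattern, using a hypothetical loop rather than an actual OpenBLAS kernel:

/* Hypothetical AVX scaling loop; assumes n > 0 and divisible by 4.
 * The asm decrements n and advances x, so both are declared "+r";
 * were they plain inputs, GCC could reuse the stale registers after
 * the statement. */
static void dscal4_sketch(long n, double *x, const double *alpha)
{
    __asm__ __volatile__
    (
        "vbroadcastsd (%2), %%ymm0    \n\t"  // alpha in all four lanes
        "1:                           \n\t"
        "vmulpd  (%1), %%ymm0, %%ymm1 \n\t"  // x[i..i+3] *= alpha
        "vmovupd %%ymm1, (%1)         \n\t"
        "addq $32, %1                 \n\t"  // x advances ...
        "subq $4 , %0                 \n\t"  // ... and n counts down,
        "jnz 1b                       \n\t"  // so both operands change
        :
        "+r" (n),       // 0  modified: input and output
        "+r" (x)        // 1  modified: input and output
        :
        "r" (alpha)     // 2  genuinely read-only
        : "cc", "%xmm0", "%xmm1", "memory"
    );
}

In 2010.patch only the colon separating the (empty) output list from the input list moves: i and n were already operands 0 and 1, so no %N inside the templates needs renumbering. The later patches deal with operands that sat in the middle of the input list, where the promotion does force a renumbering.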

2010.patch (new file)

@@ -0,0 +1,499 @@
From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 15:33:48 +0100
Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64
s/dGEMV_T and s/dGEMV_N kernels
Arguments 0 and 1 need to be tagged as both input and output
---
kernel/x86_64/dgemv_n_4.c | 10 +++++-----
kernel/x86_64/dgemv_t_4.c | 18 +++++++++---------
kernel/x86_64/sgemv_n_4.c | 14 +++++++-------
kernel/x86_64/sgemv_t_4.c | 18 +++++++++---------
4 files changed, 30 insertions(+), 30 deletions(-)
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
index 6d2530e81..6d33641e9 100644
--- a/kernel/x86_64/dgemv_n_4.c
+++ b/kernel/x86_64/dgemv_n_4.c
@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"jnz 1b \n\t"
:
+ "+r" (i), // 0
+ "+r" (n) // 1
:
- "r" (i), // 0
- "r" (n), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
index a7478e3a8..ed672a757 100644
--- a/kernel/x86_64/dgemv_t_4.c
+++ b/kernel/x86_64/dgemv_t_4.c
@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movsd %%xmm11,8(%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movsd %%xmm10, (%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
index 65305ac59..63697970f 100644
--- a/kernel/x86_64/sgemv_n_4.c
+++ b/kernel/x86_64/sgemv_n_4.c
@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (ap[0]), // 4
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
"3: \n\t"
:
+ "+r" (i), // 0
+ "+r" (n1) // 1
:
- "r" (i), // 0
- "r" (n1), // 1
"r" (x), // 2
"r" (y), // 3
"r" (ap), // 4
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
"jnz 1b \n\t"
:
+ "+r" (i), // 0
+ "+r" (n) // 1
:
- "r" (i), // 0
- "r" (n), // 1
"r" (src), // 2
"r" (dest) // 3
: "cc",
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
index 065e5b385..86ecaf516 100644
--- a/kernel/x86_64/sgemv_t_4.c
+++ b/kernel/x86_64/sgemv_t_4.c
@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
"movss %%xmm11,4(%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap0), // 3
"r" (ap1), // 4
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
"movss %%xmm10, (%2) \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (y), // 2
"r" (ap), // 3
"r" (x) // 4
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
"jnz 1b \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (&da), // 2
"r" (src), // 3
"r" (dest) // 4
From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 15:51:43 +0100
Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly
Argument 0 is modified as it doubles as a counter
---
kernel/x86_64/dscal.c | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
index ef9a0a6ba..d0d7801fd 100644
--- a/kernel/x86_64/dscal.c
+++ b/kernel/x86_64/dscal.c
@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
"jnz 1b \n\t"
:
+ "+r" (n) // 0
:
- "r" (n), // 0
"r" (x), // 1
"r" (x1), // 2
"r" (alpha), // 3
From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 16:00:18 +0100
Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV
microkernels
Arguments 0 and 1 are both input and output
---
kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++---
kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++---
kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++---
8 files changed, 24 insertions(+), 24 deletions(-)
diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
index d7166fe4b..ae287b6d8 100644
--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c
index d83d20f8e..4778f644a 100644
--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c
+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c
@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
index 1344c75f7..065182286 100644
--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c
+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c
index 1ef6fbafd..d84e703bd 100644
--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c
+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c
@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
index 8c01ab806..4a4f4d68d 100644
--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c
index a32e59b44..e6a09ccf8 100644
--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c
+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c
@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
index b8e6ee732..c56ff3b15 100644
--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c
+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c
index e8650650c..c4919a39a 100644
--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c
+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c
@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
"vzeroupper \n\t"
:
- :
- "r" (i), // 0
- "r" (n), // 1
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
"r" (x), // 2
"r" (y), // 3
"r" (a0), // 4
From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Tue, 12 Feb 2019 16:14:02 +0100
Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly
Argument 0 is modified so should be input and output
---
kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++--
kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++--
kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++----
8 files changed, 18 insertions(+), 18 deletions(-)
diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
index d84470cc4..bfa07b6d0 100644
--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c
index 866782ee6..6241879d5 100644
--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c
+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c
@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
index 38479f77a..a161dcd8b 100644
--- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c
+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c
index b4e6ab369..b205b1019 100644
--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c
+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c
@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
index 9002228f3..602c3edf2 100644
--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c
index 69db008b6..fdfe4349a 100644
--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c
+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c
@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
index c0fe5d640..6bb9c02f6 100644
--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c
+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
"movss %%xmm3 , 12(%9) \n\t" // save temp2
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c
index 093ca8073..0c78212e7 100644
--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c
+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c
@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
"vzeroupper \n\t"
:
- :
- "r" (from), // 0
+ "+r" (from) // 0
+ :
"r" (to), // 1
"r" (x), // 2
"r" (y), // 3

2018.patch (new file)

@@ -0,0 +1,27 @@
From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
Date: Thu, 14 Feb 2019 16:19:41 +0000
Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for
xmm0,xmm1,xmm2,xmm3
This fixes a crash in dblat2 when OpenBLAS is compiled using
-march=znver1 -ftree-vectorize -O2
See also:
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
---
kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 +
1 file changed, 1 insertion(+)
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
index 584a6c6b5..da0fa2fff 100644
--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"r" (ap[3]), // 7
"r" (alpha) // 8
: "cc",
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",

2019.patch (new file)

@@ -0,0 +1,274 @@
From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Thu, 14 Feb 2019 22:43:18 +0100
Subject: [PATCH 1/2] Save and restore input argument 8 (lda4)
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
---
kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++--
1 file changed, 5 insertions(+), 2 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
index 2c90f8aa9..e89a16785 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*****************************************************************************/
-
#define HAVE_KERNEL_4x8 1
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
+ "movq %8, %%xmm10 \n\t" //save lda
+
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"4: \n\t"
"vzeroupper \n\t"
+ "movq %%xmm10, %8 \n\t" //restore lda
:
"+r" (i), // 0
@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
+ "%xmm10",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);
@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
}
-
#define HAVE_KERNEL_4x4 1
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
+
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 10:10:04 +0100
Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint
list
---
kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------
1 file changed, 61 insertions(+), 65 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
index e89a16785..93e1e26e8 100644
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
- "movq %8, %%xmm10 \n\t" //save lda
-
"testq $0x04, %1 \n\t"
"jz 2f \n\t"
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
- "addq $4 , %8 \n\t"
+ "addq $4 , %2 \n\t"
"addq $4 , %0 \n\t"
"subq $4 , %1 \n\t"
@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"testq $0x08, %1 \n\t"
"jz 3f \n\t"
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"
"addq $8 , %0 \n\t"
"subq $8 , %1 \n\t"
@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
-
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
-
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
+
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
"addq $16, %0 \n\t"
- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
- "addq $16, %8 \n\t"
- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
+ "addq $16, %2 \n\t"
+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
"subq $16, %1 \n\t"
- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
"jnz 1b \n\t"
"4: \n\t"
"vzeroupper \n\t"
- "movq %%xmm10, %8 \n\t" //restore lda
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"%xmm4", "%xmm5",
"%xmm6", "%xmm7",
"%xmm8", "%xmm9",
- "%xmm10",
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
"memory"
);

2021.patch (new file)

@@ -0,0 +1,255 @@
From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 15:08:16 +0100
Subject: [PATCH] Fix wrong constraints in inline assembly
for #2009
---
kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
1 file changed, 49 insertions(+), 49 deletions(-)
diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
index fcab8e2c7..9ab78fc8e 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" cmpq $0, %0 \n\t"
" je 4f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" .p2align 4 \n\t"
"1: \n\t"
- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" jz 22f \n\t"
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
- " vmovups (%9), %%ymm0 \n\t"
+ " vmovups (%3), %%ymm0 \n\t"
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
- " vmovups 32(%9), %%ymm4 \n\t"
+ " vmovups 32(%3), %%ymm4 \n\t"
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
"5: \n\t" // i = 0
- " addq $64, %9 \n\t" // b=b+8
+ " addq $64, %3 \n\t" // b=b+8
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups %%ymm8 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups %%ymm8 , (%2) \n\t" // write a
" vmovups %%ymm8 , (%4) \n\t" // write c
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm9 , (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm9 , (%2) \n\t" // write a
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
- " vmovups (%9), %%ymm0 \n\t"
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm10, (%8) \n\t" // write a
+ " vmovups (%3), %%ymm0 \n\t"
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm10, (%2) \n\t" // write a
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm11, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm11, (%2) \n\t" // write a
" vmovups %%ymm11, (%5) \n\t" // write c
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm12, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm12, (%2) \n\t" // write a
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm13, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm13, (%2) \n\t" // write a
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
- " addq $64, %9 \n\t" // b=b+8
- " addq $32, %8 \n\t" // a=a+8
+ " addq $64, %3 \n\t" // b=b+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
- " vmovups 32(%9), %%ymm1 \n\t"
- " vmovups %%ymm14, (%8) \n\t" // write a
+ " vmovups 32(%3), %%ymm1 \n\t"
+ " vmovups %%ymm14, (%2) \n\t" // write a
" vmovups %%ymm14, (%6) \n\t" // write c
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
- " addq $32, %8 \n\t" // a=a+8
+ " addq $32, %2 \n\t" // a=a+8
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
- " vmovups %%ymm15, (%8) \n\t" // write a
+ " vmovups %%ymm15, (%2) \n\t" // write a
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
" vzeroupper \n\t"
:
+ "+r" (n1), // 0
+ "+a" (i), // 1
+ "+r" (as), // 2
+ "+r" (bs) // 3
:
- "r" (n1), // 0
- "a" (i), // 1
- "r" (a), // 2
- "r" (b), // 3
"r" (c), // 4
"r" (c3), // 5
"r" (c6), // 6
"r" (ldc), // 7
- "r" (as), // 8
- "r" (bs) // 9
+ "r" (a), // 8
+ "r" (b) // 9
: "cc",
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7",

2023.patch (new file)

@@ -0,0 +1,874 @@
From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:24:11 +0100
Subject: [PATCH 1/4] Fix inline assembly constraints
rework indices to allow marking argument lda4 as input and output. For #2009
---
kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------
1 file changed, 27 insertions(+), 27 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
index 11a3e943b..d21232bfa 100644
--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
- "movss (%2), %%xmm12 \n\t" // x0
- "movss 4(%2), %%xmm13 \n\t" // x1
- "movss 8(%2), %%xmm14 \n\t" // x2
- "movss 12(%2), %%xmm15 \n\t" // x3
+ "movss (%3), %%xmm12 \n\t" // x0
+ "movss 4(%3), %%xmm13 \n\t" // x1
+ "movss 8(%3), %%xmm14 \n\t" // x2
+ "movss 12(%3), %%xmm15 \n\t" // x3
"shufps $0, %%xmm12, %%xmm12\n\t"
"shufps $0, %%xmm13, %%xmm13\n\t"
"shufps $0, %%xmm14, %%xmm14\n\t"
"shufps $0, %%xmm15, %%xmm15\n\t"
- "movss 16(%2), %%xmm0 \n\t" // x4
- "movss 20(%2), %%xmm1 \n\t" // x5
- "movss 24(%2), %%xmm2 \n\t" // x6
- "movss 28(%2), %%xmm3 \n\t" // x7
+ "movss 16(%3), %%xmm0 \n\t" // x4
+ "movss 20(%3), %%xmm1 \n\t" // x5
+ "movss 24(%3), %%xmm2 \n\t" // x6
+ "movss 28(%3), %%xmm3 \n\t" // x7
"shufps $0, %%xmm0 , %%xmm0 \n\t"
"shufps $0, %%xmm1 , %%xmm1 \n\t"
"shufps $0, %%xmm2 , %%xmm2 \n\t"
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"1: \n\t"
"xorps %%xmm4 , %%xmm4 \n\t"
"xorps %%xmm5 , %%xmm5 \n\t"
- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
".p2align 1 \n\t"
- "movups (%4,%0,4), %%xmm8 \n\t"
- "movups (%5,%0,4), %%xmm9 \n\t"
- "movups (%6,%0,4), %%xmm10 \n\t"
- "movups (%7,%0,4), %%xmm11 \n\t"
+ "movups (%5,%0,4), %%xmm8 \n\t"
+ "movups (%6,%0,4), %%xmm9 \n\t"
+ "movups (%7,%0,4), %%xmm10 \n\t"
+ "movups (%8,%0,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm12, %%xmm8 \n\t"
"mulps %%xmm13, %%xmm9 \n\t"
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
- "movups (%4,%8,4), %%xmm8 \n\t"
- "movups (%5,%8,4), %%xmm9 \n\t"
- "movups (%6,%8,4), %%xmm10 \n\t"
- "movups (%7,%8,4), %%xmm11 \n\t"
+ "movups (%5,%2,4), %%xmm8 \n\t"
+ "movups (%6,%2,4), %%xmm9 \n\t"
+ "movups (%7,%2,4), %%xmm10 \n\t"
+ "movups (%8,%2,4), %%xmm11 \n\t"
".p2align 1 \n\t"
"mulps %%xmm0 , %%xmm8 \n\t"
"mulps %%xmm1 , %%xmm9 \n\t"
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"addps %%xmm10, %%xmm4 \n\t"
"addps %%xmm11, %%xmm5 \n\t"
- "addq $4 , %8 \n\t"
+ "addq $4 , %2 \n\t"
"addps %%xmm5 , %%xmm4 \n\t"
"addq $4 , %0 \n\t"
"mulps %%xmm6 , %%xmm4 \n\t"
"subq $4 , %1 \n\t"
"addps %%xmm4 , %%xmm7 \n\t"
- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
"jnz 1b \n\t"
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:36:39 +0100
Subject: [PATCH 2/4] Fix inline assembly constraints
rework indices to allow marking argument lda as input and output.
---
kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++-------------
1 file changed, 65 insertions(+), 65 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c
index b35daa35b..3fc46542b 100644
--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c
+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c
@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
"vzeroupper \n\t"
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
- "addq $4, %8 \n\t"
+ "addq $4, %2 \n\t"
"addq $4, %0 \n\t"
"subq $4, %1 \n\t"
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
- "addq $8, %8 \n\t"
+ "addq $8, %2 \n\t"
"addq $8, %0 \n\t"
"subq $8, %1 \n\t"
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
+ "prefetcht0 192(%6,%0,4) \n\t"
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%6,%0,4) \n\t"
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%4,%8,4) \n\t"
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
- "prefetcht0 192(%5,%8,4) \n\t"
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
+ "prefetcht0 192(%5,%2,4) \n\t"
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
+ "prefetcht0 192(%6,%2,4) \n\t"
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
- "prefetcht0 192(%6,%8,4) \n\t"
- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
- "prefetcht0 192(%7,%8,4) \n\t"
- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
+ "prefetcht0 192(%7,%2,4) \n\t"
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
+ "prefetcht0 192(%8,%2,4) \n\t"
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
- "addq $16, %8 \n\t"
+ "addq $16, %2 \n\t"
"addq $16, %0 \n\t"
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:46:17 +0100
Subject: [PATCH 3/4] Fix inline assembly constraints
---
kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++-----------
1 file changed, 97 insertions(+), 97 deletions(-)
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
index 31001c7f3..bbf06c84b 100644
--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
__asm__ __volatile__
(
- "vbroadcastss (%2), %%xmm12 \n\t" // x0
- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4
- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5
- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6
- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7
+ "vbroadcastss (%3), %%xmm12 \n\t" // x0
+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1
+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2
+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3
+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4
+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5
+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6
+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
"addq $4 , %0 \n\t"
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
- "addq $4 , %8 \n\t"
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
+ "addq $4 , %2 \n\t"
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
"subq $4 , %1 \n\t"
- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
"2: \n\t"
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
"addq $8 , %0 \n\t"
- "addq $8 , %8 \n\t"
+ "addq $8 , %2 \n\t"
"subq $8 , %1 \n\t"
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
- "prefetcht0 192(%4,%0,4) \n\t"
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%5,%0,4) \n\t"
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
"prefetcht0 192(%6,%0,4) \n\t"
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
"prefetcht0 192(%7,%0,4) \n\t"
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
+ "prefetcht0 192(%8,%0,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
".align 2 \n\t"
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
-
- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
-
- "prefetcht0 192(%4,%8,4) \n\t"
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
- "prefetcht0 192(%5,%8,4) \n\t"
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
- "prefetcht0 192(%6,%8,4) \n\t"
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
- "prefetcht0 192(%7,%8,4) \n\t"
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
+
+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
+
+ "prefetcht0 192(%5,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
+ "prefetcht0 192(%6,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
+ "prefetcht0 192(%7,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
+ "prefetcht0 192(%8,%2,4) \n\t"
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
"addq $16, %0 \n\t"
- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
- "addq $16, %8 \n\t"
- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
+ "addq $16, %2 \n\t"
+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
"subq $16, %1 \n\t"
"jnz 1b \n\t"
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
:
"+r" (i), // 0
- "+r" (n) // 1
+ "+r" (n), // 1
+ "+r" (lda4) // 2
:
- "r" (x), // 2
- "r" (y), // 3
- "r" (ap[0]), // 4
- "r" (ap[1]), // 5
- "r" (ap[2]), // 6
- "r" (ap[3]), // 7
- "r" (lda4), // 8
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
"r" (alpha) // 9
: "cc",
"%xmm0", "%xmm1",
From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Sat, 16 Feb 2019 18:51:09 +0100
Subject: [PATCH 4/4] Fix inline assembly constraints
---
dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++
1 file changed, 247 insertions(+)
create mode 100644 dgemv_n_microk_piledriver-4.c
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
new file mode 100644
index 000000000..466931b82
--- /dev/null
+++ b/dgemv_n_microk_piledriver-4.c
@@ -0,0 +1,247 @@
+/***************************************************************************
+Copyright (c) 2014, The OpenBLAS Project
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+1. Redistributions of source code must retain the above copyright
+notice, this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright
+notice, this list of conditions and the following disclaimer in
+the documentation and/or other materials provided with the
+distribution.
+3. Neither the name of the OpenBLAS project nor the names of
+its contributors may be used to endorse or promote products
+derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*****************************************************************************/
+
+
+
+#define HAVE_KERNEL_4x8 1
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
+
+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz 2f \n\t"
+
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %2 \n\t"
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ "2: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je 3f \n\t"
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
+ "addq $8 , %0 \n\t"
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "addq $8 , %2 \n\t"
+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
+ "subq $8 , %1 \n\t"
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
+
+ "jnz 1b \n\t"
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ "+r" (i), // 0
+ "+r" (n), // 1
+ "+r" (lda4) // 2
+ :
+ "r" (x), // 3
+ "r" (y), // 4
+ "r" (ap[0]), // 5
+ "r" (ap[1]), // 6
+ "r" (ap[2]), // 7
+ "r" (ap[3]), // 8
+ "r" (alpha) // 9
+ : "cc",
+ "%xmm0", "%xmm1",
+ "%xmm2", "%xmm3",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
+
+#define HAVE_KERNEL_4x4 1
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
+
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
+{
+
+ BLASLONG register i = 0;
+
+ __asm__ __volatile__
+ (
+ "vzeroupper \n\t"
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
+
+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
+
+ "testq $0x04, %1 \n\t"
+ "jz 2f \n\t"
+
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
+
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
+
+ "addq $4 , %0 \n\t"
+ "subq $4 , %1 \n\t"
+
+ "2: \n\t"
+
+ "cmpq $0, %1 \n\t"
+ "je 3f \n\t"
+
+
+ ".align 16 \n\t"
+ "1: \n\t"
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
+
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
+
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
+
+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
+
+ "addq $8 , %0 \n\t"
+ "subq $8 , %1 \n\t"
+ "jnz 1b \n\t"
+
+ "3: \n\t"
+ "vzeroupper \n\t"
+
+ :
+ "+r" (i), // 0
+ "+r" (n) // 1
+ :
+ "r" (x), // 2
+ "r" (y), // 3
+ "r" (ap[0]), // 4
+ "r" (ap[1]), // 5
+ "r" (ap[2]), // 6
+ "r" (ap[3]), // 7
+ "r" (alpha) // 8
+ : "cc",
+ "%xmm4", "%xmm5",
+ "%xmm6", "%xmm7",
+ "%xmm8", "%xmm9",
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
+ "memory"
+ );
+
+}
+
+
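[Editorial note on the new file: the HAVE_KERNEL_4x8 / HAVE_KERNEL_4x4 defines at its top tell the generic dgemv_n driver not to compile its portable fallback for these kernels. As orientation for what the FMA sequences compute: in AT&T operand order, vfmadd231pd a, b, c performs c += a * b, so each kernel accumulates y[i] += alpha * sum_j ap[j][i] * x[j]. A rough scalar equivalent of the 4x4 kernel under the same guard — a paraphrase for illustration, not the verbatim OpenBLAS fallback; FLOAT and BLASLONG come from the OpenBLAS common headers, as in the file above:]

#ifndef HAVE_KERNEL_4x4

/* scalar equivalent: for four columns at a time,
   y[i] += alpha * (ap[0][i]*x[0] + ... + ap[3][i]*x[3]);
   the asm kernel computes the same sums with vfmadd231pd,
   four doubles per instruction */
static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
{
    BLASLONG i;
    for (i = 0; i < n; i++)
        y[i] += alpha[0] * ( ap[0][i] * x[0] + ap[1][i] * x[1]
                           + ap[2][i] * x[2] + ap[3][i] * x[3] );
}

#endif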
1349
2024.patch Normal file
File diff suppressed because it is too large
openblas.spec
View File
@@ -15,7 +15,7 @@
 Name: openblas
 Version: 0.3.5
-Release: 2%{?dist}
+Release: 3%{?dist}
 Summary: An optimized BLAS library based on GotoBLAS2
 License: BSD
 URL: https://github.com/xianyi/OpenBLAS/
@@ -29,6 +29,14 @@ Patch2: openblas-0.2.15-constructor.patch
 # Supply the proper flags to the test makefile
 Patch3: openblas-0.3.2-tests.patch
+# Fix assembly code
+Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch
+Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch
+Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch
+Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch
+Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch
+Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch
 BuildRequires: gcc
 BuildRequires: gcc-gfortran
 BuildRequires: perl-devel
@@ -239,6 +247,13 @@ cd OpenBLAS-%{version}
 %endif
 %patch3 -p1 -b .tests
+%patch10 -p0
+%patch11 -p0
+%patch12 -p0
+%patch13 -p0
+%patch14 -p0
+%patch15 -p0
 # Fix source permissions
 find -name \*.f -exec chmod 644 {} \;
@@ -674,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig
 %endif
 %changelog
+* Sun Feb 17 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.5-3
+- Patch assembly kernels to satisfy gcc 9 demands.
 * Fri Feb 01 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.3.5-2
 - Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild
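[Editorial note on the spec conventions above: each PatchN: tag declares a patch source, and the matching %patchN call in %prep applies it; -pX is the number of leading path components patch(1) strips from the file names recorded in the diff. A schematic of the pairing, with a hypothetical patch name and number — not part of this commit:]

# declare the patch source; the file lives in %{_sourcedir}
Patch16: example-fix.patch

%prep
%setup -q
# apply it: -p1 strips one leading component, so a/foo/bar.c -> foo/bar.c;
# -p0 applies the recorded paths as-is
%patch16 -p1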