Import patches from upstream to fix gcc 9 compatibility.
This commit is contained in:
parent
2529d97e84
commit
2a8a1574d7
499
2010.patch
Normal file
499
2010.patch
Normal file
@@ -0,0 +1,499 @@
|
||||
From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Tue, 12 Feb 2019 15:33:48 +0100
|
||||
Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64
|
||||
s/dGEMV_T and s/dGEMV_N kernels
|
||||
|
||||
Arguments 0 and 1 need to be tagged as both input and output
|
||||
---
|
||||
kernel/x86_64/dgemv_n_4.c | 10 +++++-----
|
||||
kernel/x86_64/dgemv_t_4.c | 18 +++++++++---------
|
||||
kernel/x86_64/sgemv_n_4.c | 14 +++++++-------
|
||||
kernel/x86_64/sgemv_t_4.c | 18 +++++++++---------
|
||||
4 files changed, 30 insertions(+), 30 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
|
||||
index 6d2530e81..6d33641e9 100644
|
||||
--- a/kernel/x86_64/dgemv_n_4.c
|
||||
+++ b/kernel/x86_64/dgemv_n_4.c
|
||||
@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
:
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
|
||||
index a7478e3a8..ed672a757 100644
|
||||
--- a/kernel/x86_64/dgemv_t_4.c
|
||||
+++ b/kernel/x86_64/dgemv_t_4.c
|
||||
@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"movsd %%xmm11,8(%2) \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"movsd %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
|
||||
index 65305ac59..63697970f 100644
|
||||
--- a/kernel/x86_64/sgemv_n_4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_4.c
|
||||
@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap[0]), // 4
|
||||
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
||||
|
||||
"3: \n\t"
|
||||
:
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n1) // 1
|
||||
:
|
||||
- "r" (i), // 0
|
||||
- "r" (n1), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (ap), // 4
|
||||
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
:
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
"r" (src), // 2
|
||||
"r" (dest) // 3
|
||||
: "cc",
|
||||
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
|
||||
index 065e5b385..86ecaf516 100644
|
||||
--- a/kernel/x86_64/sgemv_t_4.c
|
||||
+++ b/kernel/x86_64/sgemv_t_4.c
|
||||
@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
||||
"movss %%xmm11,4(%2) \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (y), // 2
|
||||
"r" (ap0), // 3
|
||||
"r" (ap1), // 4
|
||||
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
||||
"movss %%xmm10, (%2) \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (y), // 2
|
||||
"r" (ap), // 3
|
||||
"r" (x) // 4
|
||||
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (&da), // 2
|
||||
"r" (src), // 3
|
||||
"r" (dest) // 4
|
||||
|
||||
From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Tue, 12 Feb 2019 15:51:43 +0100
|
||||
Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly
|
||||
|
||||
Argument 0 is modified as it doubles as a counter
|
||||
---
|
||||
kernel/x86_64/dscal.c | 2 +-
|
||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
||||
|
||||
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
|
||||
index ef9a0a6ba..d0d7801fd 100644
|
||||
--- a/kernel/x86_64/dscal.c
|
||||
+++ b/kernel/x86_64/dscal.c
|
||||
@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
+ "+r" (n) // 0
|
||||
:
|
||||
- "r" (n), // 0
|
||||
"r" (x), // 1
|
||||
"r" (x1), // 2
|
||||
"r" (alpha), // 3
|
||||
|
||||
From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Tue, 12 Feb 2019 16:00:18 +0100
|
||||
Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV
|
||||
microkernels
|
||||
|
||||
Arguments 0 and 1 are both input and output
|
||||
---
|
||||
kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++---
|
||||
kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++---
|
||||
kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++---
|
||||
kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++---
|
||||
kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++---
|
||||
kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++---
|
||||
kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++---
|
||||
kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++---
|
||||
8 files changed, 24 insertions(+), 24 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
||||
index d7166fe4b..ae287b6d8 100644
|
||||
--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
||||
+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
||||
@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
||||
index d83d20f8e..4778f644a 100644
|
||||
--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
||||
+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
||||
@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
||||
index 1344c75f7..065182286 100644
|
||||
--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
||||
+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
||||
@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
||||
index 1ef6fbafd..d84e703bd 100644
|
||||
--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
||||
+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
||||
@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
||||
index 8c01ab806..4a4f4d68d 100644
|
||||
--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
||||
+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
||||
@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
||||
index a32e59b44..e6a09ccf8 100644
|
||||
--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
||||
+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
||||
@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
||||
index b8e6ee732..c56ff3b15 100644
|
||||
--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
||||
+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
||||
@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
||||
index e8650650c..c4919a39a 100644
|
||||
--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
||||
+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
||||
@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (i), // 0
|
||||
- "r" (n), // 1
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
"r" (a0), // 4
|
||||
|
||||
From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Tue, 12 Feb 2019 16:14:02 +0100
|
||||
Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly
|
||||
|
||||
Argument 0 is modified so should be input and output
|
||||
---
|
||||
kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++--
|
||||
kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++--
|
||||
kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++--
|
||||
kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++--
|
||||
kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++--
|
||||
kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++--
|
||||
kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++--
|
||||
kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++----
|
||||
8 files changed, 18 insertions(+), 18 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
||||
index d84470cc4..bfa07b6d0 100644
|
||||
--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
||||
+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
||||
@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
||||
index 866782ee6..6241879d5 100644
|
||||
--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
||||
+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
||||
@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
||||
index 38479f77a..a161dcd8b 100644
|
||||
--- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
||||
+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
||||
@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
||||
index b4e6ab369..b205b1019 100644
|
||||
--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
||||
+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
||||
@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
||||
index 9002228f3..602c3edf2 100644
|
||||
--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
||||
+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
||||
@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
||||
index 69db008b6..fdfe4349a 100644
|
||||
--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
||||
+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
||||
@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
||||
index c0fe5d640..6bb9c02f6 100644
|
||||
--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
||||
+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
||||
@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
|
||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
||||
index 093ca8073..0c78212e7 100644
|
||||
--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
||||
+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
||||
@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
||||
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
||||
"vzeroupper \n\t"
|
||||
|
||||
:
|
||||
- :
|
||||
- "r" (from), // 0
|
||||
+ "+r" (from) // 0
|
||||
+ :
|
||||
"r" (to), // 1
|
||||
"r" (x), // 2
|
||||
"r" (y), // 3
|
27
2018.patch
Normal file
27
2018.patch
Normal file
@@ -0,0 +1,27 @@
|
||||
From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001
|
||||
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
|
||||
Date: Thu, 14 Feb 2019 16:19:41 +0000
|
||||
Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for
|
||||
xmm0,xmm1,xmm2,xmm3
|
||||
|
||||
This fixes a crash in dblat2 when OpenBLAS is compiled using
|
||||
-march=znver1 -ftree-vectorize -O2
|
||||
|
||||
See also:
|
||||
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
|
||||
---
|
||||
kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 +
|
||||
1 file changed, 1 insertion(+)
|
||||
|
||||
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
||||
index 584a6c6b5..da0fa2fff 100644
|
||||
--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
||||
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
||||
@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
"r" (ap[3]), // 7
|
||||
"r" (alpha) // 8
|
||||
: "cc",
|
||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
274
2019.patch
Normal file
274
2019.patch
Normal file
@@ -0,0 +1,274 @@
|
||||
From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Thu, 14 Feb 2019 22:43:18 +0100
|
||||
Subject: [PATCH 1/2] Save and restore input argument 8 (lda4)
|
||||
|
||||
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
|
||||
---
|
||||
kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++--
|
||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
index 2c90f8aa9..e89a16785 100644
|
||||
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
*****************************************************************************/
|
||||
|
||||
|
||||
-
|
||||
#define HAVE_KERNEL_4x8 1
|
||||
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
+ "movq %8, %%xmm10 \n\t" //save lda
|
||||
+
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"4: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
+ "movq %%xmm10, %8 \n\t" //restore lda
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
+ "%xmm10",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
||||
@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
}
|
||||
|
||||
|
||||
-
|
||||
#define HAVE_KERNEL_4x4 1
|
||||
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
|
||||
@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
||||
|
||||
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
|
||||
|
||||
+
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
|
||||
From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Fri, 15 Feb 2019 10:10:04 +0100
|
||||
Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint
|
||||
list
|
||||
|
||||
---
|
||||
kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------
|
||||
1 file changed, 61 insertions(+), 65 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
index e89a16785..93e1e26e8 100644
|
||||
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
||||
@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
- "movq %8, %%xmm10 \n\t" //save lda
|
||||
-
|
||||
"testq $0x04, %1 \n\t"
|
||||
"jz 2f \n\t"
|
||||
|
||||
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
|
||||
|
||||
- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
|
||||
- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
- "addq $4 , %8 \n\t"
|
||||
+ "addq $4 , %2 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
|
||||
@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"testq $0x08, %1 \n\t"
|
||||
"jz 3f \n\t"
|
||||
|
||||
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
|
||||
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
|
||||
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
|
||||
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
- "addq $8 , %8 \n\t"
|
||||
+ "addq $8 , %2 \n\t"
|
||||
"addq $8 , %0 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
-
|
||||
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
-
|
||||
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
||||
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
|
||||
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
|
||||
+
|
||||
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
||||
+
|
||||
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
|
||||
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
|
||||
- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
|
||||
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
|
||||
+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
||||
|
||||
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
|
||||
- "addq $16, %8 \n\t"
|
||||
- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
|
||||
+ "addq $16, %2 \n\t"
|
||||
+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
|
||||
"subq $16, %1 \n\t"
|
||||
- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
|
||||
+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
"4: \n\t"
|
||||
"vzeroupper \n\t"
|
||||
- "movq %%xmm10, %8 \n\t" //restore lda
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
- "+r" (n) // 1
|
||||
+ "+r" (n), // 1
|
||||
+ "+r" (lda4) // 2
|
||||
:
|
||||
- "r" (x), // 2
|
||||
- "r" (y), // 3
|
||||
- "r" (ap[0]), // 4
|
||||
- "r" (ap[1]), // 5
|
||||
- "r" (ap[2]), // 6
|
||||
- "r" (ap[3]), // 7
|
||||
- "r" (lda4), // 8
|
||||
+ "r" (x), // 3
|
||||
+ "r" (y), // 4
|
||||
+ "r" (ap[0]), // 5
|
||||
+ "r" (ap[1]), // 6
|
||||
+ "r" (ap[2]), // 7
|
||||
+ "r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"%xmm4", "%xmm5",
|
||||
"%xmm6", "%xmm7",
|
||||
"%xmm8", "%xmm9",
|
||||
- "%xmm10",
|
||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
"memory"
|
||||
);
|
255
2021.patch
Normal file
255
2021.patch
Normal file
@@ -0,0 +1,255 @@
|
||||
From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Fri, 15 Feb 2019 15:08:16 +0100
|
||||
Subject: [PATCH] Fix wrong constraints in inline assembly
|
||||
|
||||
for #2009
|
||||
---
|
||||
kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
|
||||
1 file changed, 49 insertions(+), 49 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
||||
index fcab8e2c7..9ab78fc8e 100644
|
||||
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
||||
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
||||
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" cmpq $0, %0 \n\t"
|
||||
" je 4f \n\t"
|
||||
|
||||
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
|
||||
|
||||
" addq $8, %1 \n\t"
|
||||
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" .p2align 4 \n\t"
|
||||
"1: \n\t"
|
||||
|
||||
- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a
|
||||
+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
|
||||
|
||||
- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
|
||||
+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
|
||||
- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
|
||||
+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
|
||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
|
||||
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
" jz 22f \n\t"
|
||||
|
||||
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
||||
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
||||
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
|
||||
|
||||
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
|
||||
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
||||
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
|
||||
|
||||
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
|
||||
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
||||
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
|
||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
|
||||
|
||||
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
|
||||
|
||||
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
|
||||
- " vmovups (%9), %%ymm0 \n\t"
|
||||
+ " vmovups (%3), %%ymm0 \n\t"
|
||||
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
|
||||
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
|
||||
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
|
||||
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
|
||||
|
||||
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
|
||||
- " vmovups 32(%9), %%ymm4 \n\t"
|
||||
+ " vmovups 32(%3), %%ymm4 \n\t"
|
||||
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
|
||||
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
|
||||
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
|
||||
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
|
||||
"5: \n\t" // i = 0
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
|
||||
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
|
||||
- " vmovups (%9), %%ymm0 \n\t"
|
||||
- " vmovups %%ymm8 , (%8) \n\t" // write a
|
||||
+ " vmovups (%3), %%ymm0 \n\t"
|
||||
+ " vmovups %%ymm8 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm8 , (%4) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
|
||||
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
|
||||
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
|
||||
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
|
||||
- " vmovups (%9), %%ymm0 \n\t"
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm9 , (%8) \n\t" // write a
|
||||
+ " vmovups (%3), %%ymm0 \n\t"
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm9 , (%2) \n\t" // write a
|
||||
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
|
||||
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
|
||||
- " vmovups (%9), %%ymm0 \n\t"
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm10, (%8) \n\t" // write a
|
||||
+ " vmovups (%3), %%ymm0 \n\t"
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm10, (%2) \n\t" // write a
|
||||
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
|
||||
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
||||
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
|
||||
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm11, (%8) \n\t" // write a
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm11, (%2) \n\t" // write a
|
||||
" vmovups %%ymm11, (%5) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
|
||||
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm12, (%8) \n\t" // write a
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm12, (%2) \n\t" // write a
|
||||
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
|
||||
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
||||
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm13, (%8) \n\t" // write a
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm13, (%2) \n\t" // write a
|
||||
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
|
||||
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
||||
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
|
||||
- " addq $64, %9 \n\t" // b=b+8
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $64, %3 \n\t" // b=b+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
|
||||
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
|
||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
||||
- " vmovups %%ymm14, (%8) \n\t" // write a
|
||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
||||
+ " vmovups %%ymm14, (%2) \n\t" // write a
|
||||
" vmovups %%ymm14, (%6) \n\t" // write c
|
||||
|
||||
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
|
||||
|
||||
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
|
||||
|
||||
- " addq $32, %8 \n\t" // a=a+8
|
||||
+ " addq $32, %2 \n\t" // a=a+8
|
||||
|
||||
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
|
||||
- " vmovups %%ymm15, (%8) \n\t" // write a
|
||||
+ " vmovups %%ymm15, (%2) \n\t" // write a
|
||||
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
|
||||
|
||||
" vzeroupper \n\t"
|
||||
|
||||
:
|
||||
+ "+r" (n1), // 0
|
||||
+ "+a" (i), // 1
|
||||
+ "+r" (as), // 2
|
||||
+ "+r" (bs) // 3
|
||||
:
|
||||
- "r" (n1), // 0
|
||||
- "a" (i), // 1
|
||||
- "r" (a), // 2
|
||||
- "r" (b), // 3
|
||||
"r" (c), // 4
|
||||
"r" (c3), // 5
|
||||
"r" (c6), // 6
|
||||
"r" (ldc), // 7
|
||||
- "r" (as), // 8
|
||||
- "r" (bs) // 9
|
||||
+ "r" (a), // 8
|
||||
+ "r" (b) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
874
2023.patch
Normal file
874
2023.patch
Normal file
@ -0,0 +1,874 @@
|
||||
From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Sat, 16 Feb 2019 18:24:11 +0100
|
||||
Subject: [PATCH 1/4] Fix inline assembly constraints
|
||||
|
||||
rework indices to allow marking argument lda4 as input and output. For #2009
|
||||
---
|
||||
kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------
|
||||
1 file changed, 27 insertions(+), 27 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
||||
index 11a3e943b..d21232bfa 100644
|
||||
--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
||||
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
- "movss (%2), %%xmm12 \n\t" // x0
|
||||
- "movss 4(%2), %%xmm13 \n\t" // x1
|
||||
- "movss 8(%2), %%xmm14 \n\t" // x2
|
||||
- "movss 12(%2), %%xmm15 \n\t" // x3
|
||||
+ "movss (%3), %%xmm12 \n\t" // x0
|
||||
+ "movss 4(%3), %%xmm13 \n\t" // x1
|
||||
+ "movss 8(%3), %%xmm14 \n\t" // x2
|
||||
+ "movss 12(%3), %%xmm15 \n\t" // x3
|
||||
"shufps $0, %%xmm12, %%xmm12\n\t"
|
||||
"shufps $0, %%xmm13, %%xmm13\n\t"
|
||||
"shufps $0, %%xmm14, %%xmm14\n\t"
|
||||
"shufps $0, %%xmm15, %%xmm15\n\t"
|
||||
|
||||
- "movss 16(%2), %%xmm0 \n\t" // x4
|
||||
- "movss 20(%2), %%xmm1 \n\t" // x5
|
||||
- "movss 24(%2), %%xmm2 \n\t" // x6
|
||||
- "movss 28(%2), %%xmm3 \n\t" // x7
|
||||
+ "movss 16(%3), %%xmm0 \n\t" // x4
|
||||
+ "movss 20(%3), %%xmm1 \n\t" // x5
|
||||
+ "movss 24(%3), %%xmm2 \n\t" // x6
|
||||
+ "movss 28(%3), %%xmm3 \n\t" // x7
|
||||
"shufps $0, %%xmm0 , %%xmm0 \n\t"
|
||||
"shufps $0, %%xmm1 , %%xmm1 \n\t"
|
||||
"shufps $0, %%xmm2 , %%xmm2 \n\t"
|
||||
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"1: \n\t"
|
||||
"xorps %%xmm4 , %%xmm4 \n\t"
|
||||
"xorps %%xmm5 , %%xmm5 \n\t"
|
||||
- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
".p2align 1 \n\t"
|
||||
- "movups (%4,%0,4), %%xmm8 \n\t"
|
||||
- "movups (%5,%0,4), %%xmm9 \n\t"
|
||||
- "movups (%6,%0,4), %%xmm10 \n\t"
|
||||
- "movups (%7,%0,4), %%xmm11 \n\t"
|
||||
+ "movups (%5,%0,4), %%xmm8 \n\t"
|
||||
+ "movups (%6,%0,4), %%xmm9 \n\t"
|
||||
+ "movups (%7,%0,4), %%xmm10 \n\t"
|
||||
+ "movups (%8,%0,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm12, %%xmm8 \n\t"
|
||||
"mulps %%xmm13, %%xmm9 \n\t"
|
||||
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
- "movups (%4,%8,4), %%xmm8 \n\t"
|
||||
- "movups (%5,%8,4), %%xmm9 \n\t"
|
||||
- "movups (%6,%8,4), %%xmm10 \n\t"
|
||||
- "movups (%7,%8,4), %%xmm11 \n\t"
|
||||
+ "movups (%5,%2,4), %%xmm8 \n\t"
|
||||
+ "movups (%6,%2,4), %%xmm9 \n\t"
|
||||
+ "movups (%7,%2,4), %%xmm10 \n\t"
|
||||
+ "movups (%8,%2,4), %%xmm11 \n\t"
|
||||
".p2align 1 \n\t"
|
||||
"mulps %%xmm0 , %%xmm8 \n\t"
|
||||
"mulps %%xmm1 , %%xmm9 \n\t"
|
||||
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"addps %%xmm10, %%xmm4 \n\t"
|
||||
"addps %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
- "addq $4 , %8 \n\t"
|
||||
+ "addq $4 , %2 \n\t"
|
||||
"addps %%xmm5 , %%xmm4 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
"mulps %%xmm6 , %%xmm4 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
"addps %%xmm4 , %%xmm7 \n\t"
|
||||
|
||||
- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
|
||||
+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"jnz 1b \n\t"
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
- "+r" (n) // 1
|
||||
+ "+r" (n), // 1
|
||||
+ "+r" (lda4) // 2
|
||||
:
|
||||
- "r" (x), // 2
|
||||
- "r" (y), // 3
|
||||
- "r" (ap[0]), // 4
|
||||
- "r" (ap[1]), // 5
|
||||
- "r" (ap[2]), // 6
|
||||
- "r" (ap[3]), // 7
|
||||
- "r" (lda4), // 8
|
||||
+ "r" (x), // 3
|
||||
+ "r" (y), // 4
|
||||
+ "r" (ap[0]), // 5
|
||||
+ "r" (ap[1]), // 6
|
||||
+ "r" (ap[2]), // 7
|
||||
+ "r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Sat, 16 Feb 2019 18:36:39 +0100
|
||||
Subject: [PATCH 2/4] Fix inline assembly constraints
|
||||
|
||||
rework indices to allow marking argument lda as input and output.
|
||||
---
|
||||
kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++-------------
|
||||
1 file changed, 65 insertions(+), 65 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
||||
index b35daa35b..3fc46542b 100644
|
||||
--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
||||
@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
__asm__ __volatile__
|
||||
(
|
||||
"vzeroupper \n\t"
|
||||
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
|
||||
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
||||
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
||||
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
||||
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
||||
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
||||
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
||||
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
||||
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
|
||||
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
||||
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
||||
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
||||
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
||||
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
||||
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
||||
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
||||
|
||||
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
|
||||
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
||||
|
||||
- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
|
||||
+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
|
||||
+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
|
||||
+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
|
||||
|
||||
- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
|
||||
- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
|
||||
- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
|
||||
- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
|
||||
+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
|
||||
+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
|
||||
+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
|
||||
+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
|
||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
||||
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
|
||||
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
|
||||
|
||||
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
- "addq $4, %8 \n\t"
|
||||
+ "addq $4, %2 \n\t"
|
||||
"addq $4, %0 \n\t"
|
||||
"subq $4, %1 \n\t"
|
||||
|
||||
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
||||
|
||||
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
||||
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
|
||||
|
||||
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
||||
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
- "addq $8, %8 \n\t"
|
||||
+ "addq $8, %2 \n\t"
|
||||
"addq $8, %0 \n\t"
|
||||
"subq $8, %1 \n\t"
|
||||
|
||||
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
- "prefetcht0 192(%4,%0,4) \n\t"
|
||||
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
||||
+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
|
||||
+ "prefetcht0 192(%6,%0,4) \n\t"
|
||||
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
||||
+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
- "prefetcht0 192(%6,%0,4) \n\t"
|
||||
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
|
||||
+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
||||
+ "prefetcht0 192(%8,%0,4) \n\t"
|
||||
+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
|
||||
+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
- "prefetcht0 192(%4,%8,4) \n\t"
|
||||
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
||||
- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
|
||||
- "prefetcht0 192(%5,%8,4) \n\t"
|
||||
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
||||
- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
|
||||
+ "prefetcht0 192(%5,%2,4) \n\t"
|
||||
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
||||
+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
|
||||
+ "prefetcht0 192(%6,%2,4) \n\t"
|
||||
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
||||
+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
||||
|
||||
- "prefetcht0 192(%6,%8,4) \n\t"
|
||||
- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
|
||||
- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
||||
- "prefetcht0 192(%7,%8,4) \n\t"
|
||||
- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
|
||||
- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
||||
+ "prefetcht0 192(%7,%2,4) \n\t"
|
||||
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
|
||||
+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
||||
+ "prefetcht0 192(%8,%2,4) \n\t"
|
||||
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
|
||||
+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
||||
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
|
||||
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
|
||||
|
||||
- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
||||
+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
||||
|
||||
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
|
||||
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
|
||||
+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
|
||||
+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
|
||||
|
||||
- "addq $16, %8 \n\t"
|
||||
+ "addq $16, %2 \n\t"
|
||||
"addq $16, %0 \n\t"
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
- "+r" (n) // 1
|
||||
+ "+r" (n), // 1
|
||||
+ "+r" (lda4) // 2
|
||||
:
|
||||
- "r" (x), // 2
|
||||
- "r" (y), // 3
|
||||
- "r" (ap[0]), // 4
|
||||
- "r" (ap[1]), // 5
|
||||
- "r" (ap[2]), // 6
|
||||
- "r" (ap[3]), // 7
|
||||
- "r" (lda4), // 8
|
||||
+ "r" (x), // 3
|
||||
+ "r" (y), // 4
|
||||
+ "r" (ap[0]), // 5
|
||||
+ "r" (ap[1]), // 6
|
||||
+ "r" (ap[2]), // 7
|
||||
+ "r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Sat, 16 Feb 2019 18:46:17 +0100
|
||||
Subject: [PATCH 3/4] Fix inline assembly constraints
|
||||
|
||||
---
|
||||
kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++-----------
|
||||
1 file changed, 97 insertions(+), 97 deletions(-)
|
||||
|
||||
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
||||
index 31001c7f3..bbf06c84b 100644
|
||||
--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
||||
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
||||
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
__asm__ __volatile__
|
||||
(
|
||||
- "vbroadcastss (%2), %%xmm12 \n\t" // x0
|
||||
- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
|
||||
- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
|
||||
- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
|
||||
- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4
|
||||
- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5
|
||||
- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6
|
||||
- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7
|
||||
+ "vbroadcastss (%3), %%xmm12 \n\t" // x0
|
||||
+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1
|
||||
+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2
|
||||
+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3
|
||||
+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4
|
||||
+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5
|
||||
+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6
|
||||
+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7
|
||||
|
||||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
||||
|
||||
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
"addq $4 , %0 \n\t"
|
||||
|
||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
- "addq $4 , %8 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
+ "addq $4 , %2 \n\t"
|
||||
|
||||
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
|
||||
- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
||||
"subq $4 , %1 \n\t"
|
||||
- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"2: \n\t"
|
||||
|
||||
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
||||
|
||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
-
|
||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
+
|
||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
|
||||
- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
|
||||
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"addq $8 , %0 \n\t"
|
||||
- "addq $8 , %8 \n\t"
|
||||
+ "addq $8 , %2 \n\t"
|
||||
"subq $8 , %1 \n\t"
|
||||
|
||||
|
||||
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
|
||||
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
|
||||
|
||||
- "prefetcht0 192(%4,%0,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%5,%0,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%6,%0,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
||||
"prefetcht0 192(%7,%0,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
||||
+ "prefetcht0 192(%8,%0,4) \n\t"
|
||||
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
||||
".align 2 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
-
|
||||
- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
-
|
||||
- "prefetcht0 192(%4,%8,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
||||
- "prefetcht0 192(%5,%8,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
||||
- "prefetcht0 192(%6,%8,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
||||
- "prefetcht0 192(%7,%8,4) \n\t"
|
||||
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
||||
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
||||
+
|
||||
+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
|
||||
+
|
||||
+ "prefetcht0 192(%5,%2,4) \n\t"
|
||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
||||
+ "prefetcht0 192(%6,%2,4) \n\t"
|
||||
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
||||
+ "prefetcht0 192(%7,%2,4) \n\t"
|
||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
||||
+ "prefetcht0 192(%8,%2,4) \n\t"
|
||||
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
||||
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
||||
|
||||
- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
|
||||
- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
|
||||
- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
|
||||
+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
|
||||
+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
|
||||
|
||||
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
||||
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
||||
+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
||||
+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
||||
|
||||
"addq $16, %0 \n\t"
|
||||
- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
|
||||
- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
|
||||
- "addq $16, %8 \n\t"
|
||||
- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
|
||||
- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
|
||||
+ "addq $16, %2 \n\t"
|
||||
+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
|
||||
+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
|
||||
|
||||
"subq $16, %1 \n\t"
|
||||
"jnz 1b \n\t"
|
||||
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
||||
|
||||
:
|
||||
"+r" (i), // 0
|
||||
- "+r" (n) // 1
|
||||
+ "+r" (n), // 1
|
||||
+ "+r" (lda4) // 2
|
||||
:
|
||||
- "r" (x), // 2
|
||||
- "r" (y), // 3
|
||||
- "r" (ap[0]), // 4
|
||||
- "r" (ap[1]), // 5
|
||||
- "r" (ap[2]), // 6
|
||||
- "r" (ap[3]), // 7
|
||||
- "r" (lda4), // 8
|
||||
+ "r" (x), // 3
|
||||
+ "r" (y), // 4
|
||||
+ "r" (ap[0]), // 5
|
||||
+ "r" (ap[1]), // 6
|
||||
+ "r" (ap[2]), // 7
|
||||
+ "r" (ap[3]), // 8
|
||||
"r" (alpha) // 9
|
||||
: "cc",
|
||||
"%xmm0", "%xmm1",
|
||||
|
||||
From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001
|
||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
||||
Date: Sat, 16 Feb 2019 18:51:09 +0100
|
||||
Subject: [PATCH 4/4] Fix inline assembly constraints
|
||||
|
||||
---
|
||||
dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++
|
||||
1 file changed, 247 insertions(+)
|
||||
create mode 100644 dgemv_n_microk_piledriver-4.c
|
||||
|
||||
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
|
||||
new file mode 100644
|
||||
index 000000000..466931b82
|
||||
--- /dev/null
|
||||
+++ b/dgemv_n_microk_piledriver-4.c
|
||||
@@ -0,0 +1,247 @@
|
||||
+/***************************************************************************
|
||||
+Copyright (c) 2014, The OpenBLAS Project
|
||||
+All rights reserved.
|
||||
+Redistribution and use in source and binary forms, with or without
|
||||
+modification, are permitted provided that the following conditions are
|
||||
+met:
|
||||
+1. Redistributions of source code must retain the above copyright
|
||||
+notice, this list of conditions and the following disclaimer.
|
||||
+2. Redistributions in binary form must reproduce the above copyright
|
||||
+notice, this list of conditions and the following disclaimer in
|
||||
+the documentation and/or other materials provided with the
|
||||
+distribution.
|
||||
+3. Neither the name of the OpenBLAS project nor the names of
|
||||
+its contributors may be used to endorse or promote products
|
||||
+derived from this software without specific prior written permission.
|
||||
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
||||
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
||||
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
||||
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
||||
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
||||
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
||||
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
||||
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
||||
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
||||
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||
+*****************************************************************************/
|
||||
+
|
||||
+
|
||||
+
|
||||
+#define HAVE_KERNEL_4x8 1
|
||||
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
||||
+ // For each row r: y[r] += alpha * sum over k=0..3 of ( ap[k][r]*x[k] + ap[k][lda4+r]*x[4+k] ), 8-byte elements, FMA inline assembly.
|
||||
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
||||
+{
|
||||
+ // n is consumed 4 rows at a time (once, when bit 2 is set) and then 8 at a time; presumably callers pass a multiple of 4 -- TODO confirm.
|
||||
+ BLASLONG register i = 0;
|
||||
+ // Operands: %0=i, %1=n, %2=lda4 (all three read-write), %3=x, %4=y, %5..%8=ap[0..3], %9=alpha.
|
||||
+ __asm__ __volatile__
|
||||
+ (
|
||||
+ "vzeroupper \n\t"
|
||||
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
||||
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
||||
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
||||
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
||||
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
||||
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
||||
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
||||
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
||||
+
|
||||
+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
||||
+ // If bit 2 of n is set, handle one group of 4 rows first so the main loop can step by 8.
|
||||
+ "testq $0x04, %1 \n\t"
|
||||
+ "jz 2f \n\t"
|
||||
+
|
||||
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
+ // Columns 0..3: ap[k] indexed by the row counter %0.
|
||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
+ // Columns 4..7: same base pointers, indexed by %2 (starts at lda4 and advances in step with %0).
|
||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
+ // ymm5 = (ymm4 + ymm5) * alpha + y
|
||||
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
+
|
||||
+
|
||||
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
||||
+
|
||||
+ "addq $4 , %2 \n\t"
|
||||
+ "addq $4 , %0 \n\t"
|
||||
+ "subq $4 , %1 \n\t"
|
||||
+
|
||||
+ "2: \n\t"
|
||||
+ // Skip the main loop when no rows remain.
|
||||
+ "cmpq $0, %1 \n\t"
|
||||
+ "je 3f \n\t"
|
||||
+
|
||||
+
|
||||
+ ".align 16 \n\t"
|
||||
+ "1: \n\t"
|
||||
+ // Main loop: 8 rows per pass; ymm4 accumulates rows 0..3, ymm5 rows 4..7.
|
||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
+
|
||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
+ // %0 is advanced by 8 inside this group, so the y stores further down use -64/-32 displacements.
|
||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
||||
+ "addq $8 , %0 \n\t"
|
||||
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
||||
+ // y (ymm8/ymm9) += alpha (ymm6) * accumulated sums (ymm4/ymm5)
|
||||
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
+ // Both result stores must go through the y pointer (%4); %3 is x, which is input-only.
|
||||
+ "addq $8 , %2 \n\t"
|
||||
+ "vmovupd %%ymm8,-64(%4,%0,8) \n\t" // 4 * y
|
||||
+ "subq $8 , %1 \n\t"
|
||||
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
||||
+ // vmovupd does not modify flags, so jnz still tests the result of the subq above.
|
||||
+ "jnz 1b \n\t"
|
||||
+
|
||||
+ "3: \n\t"
|
||||
+ "vzeroupper \n\t"
|
||||
+ // i, n and lda4 are modified by the asm, hence "+r"; everything else is input-only.
|
||||
+ :
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n), // 1
|
||||
+ "+r" (lda4) // 2
|
||||
+ :
|
||||
+ "r" (x), // 3
|
||||
+ "r" (y), // 4
|
||||
+ "r" (ap[0]), // 5
|
||||
+ "r" (ap[1]), // 6
|
||||
+ "r" (ap[2]), // 7
|
||||
+ "r" (ap[3]), // 8
|
||||
+ "r" (alpha) // 9
|
||||
+ : "cc",
|
||||
+ "%xmm0", "%xmm1",
|
||||
+ "%xmm2", "%xmm3",
|
||||
+ "%xmm4", "%xmm5",
|
||||
+ "%xmm6", "%xmm7",
|
||||
+ "%xmm8", "%xmm9",
|
||||
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
+ "memory"
|
||||
+ );
|
||||
+
|
||||
+}
|
||||
+
|
||||
+
|
||||
+
|
||||
+#define HAVE_KERNEL_4x4 1
|
||||
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
||||
+ // For each row r: y[r] += alpha * ( ap[0][r]*x[0] + ap[1][r]*x[1] + ap[2][r]*x[2] + ap[3][r]*x[3] ), 8-byte elements, FMA inline assembly.
|
||||
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
||||
+{
|
||||
+ // n is consumed 4 rows at a time (once, when bit 2 is set) and then 8 at a time; presumably callers pass a multiple of 4 -- TODO confirm.
|
||||
+ BLASLONG register i = 0;
|
||||
+ // Operands: %0=i, %1=n (both read-write), %2=x, %3=y, %4..%7=ap[0..3], %8=alpha.
|
||||
+ __asm__ __volatile__
|
||||
+ (
|
||||
+ "vzeroupper \n\t"
|
||||
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
||||
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
||||
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
||||
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
||||
+
|
||||
+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
||||
+ // If bit 2 of n is set, handle one group of 4 rows first so the main loop can step by 8.
|
||||
+ "testq $0x04, %1 \n\t"
|
||||
+ "jz 2f \n\t"
|
||||
+
|
||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
||||
+ // Two accumulators (ymm4, ymm5) split the four column products.
|
||||
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
+ // ymm5 = (ymm4 + ymm5) * alpha + y
|
||||
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
||||
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
||||
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
||||
+
|
||||
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
||||
+
|
||||
+ "addq $4 , %0 \n\t"
|
||||
+ "subq $4 , %1 \n\t"
|
||||
+
|
||||
+ "2: \n\t"
|
||||
+ // Skip the main loop when no rows remain.
|
||||
+ "cmpq $0, %1 \n\t"
|
||||
+ "je 3f \n\t"
|
||||
+
|
||||
+ // Main loop: 8 rows per pass; ymm4 accumulates rows 0..3, ymm5 rows 4..7.
|
||||
+ ".align 16 \n\t"
|
||||
+ "1: \n\t"
|
||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
||||
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
||||
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
||||
+
|
||||
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
||||
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
||||
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
||||
+ // y (ymm8/ymm9) += alpha (ymm6) * accumulated sums (ymm4/ymm5)
|
||||
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
||||
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
||||
+
|
||||
+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
|
||||
+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
|
||||
+ // The subq sets the flags that jnz tests.
|
||||
+ "addq $8 , %0 \n\t"
|
||||
+ "subq $8 , %1 \n\t"
|
||||
+ "jnz 1b \n\t"
|
||||
+
|
||||
+ "3: \n\t"
|
||||
+ "vzeroupper \n\t"
|
||||
+ // i and n are modified by the asm, hence "+r"; everything else is input-only.
|
||||
+ :
|
||||
+ "+r" (i), // 0
|
||||
+ "+r" (n) // 1
|
||||
+ :
|
||||
+ "r" (x), // 2
|
||||
+ "r" (y), // 3
|
||||
+ "r" (ap[0]), // 4
|
||||
+ "r" (ap[1]), // 5
|
||||
+ "r" (ap[2]), // 6
|
||||
+ "r" (ap[3]), // 7
|
||||
+ "r" (alpha) // 8
|
||||
+ : "cc",
|
||||
+ "%xmm4", "%xmm5",
|
||||
+ "%xmm6", "%xmm7",
|
||||
+ "%xmm8", "%xmm9",
|
||||
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
||||
+ "memory"
|
||||
+ );
|
||||
+
|
||||
+}
|
||||
+
|
||||
+
|
1349
2024.patch
Normal file
1349
2024.patch
Normal file
File diff suppressed because it is too large
Load Diff
@ -15,7 +15,7 @@
|
||||
|
||||
Name: openblas
|
||||
Version: 0.3.5
|
||||
Release: 2%{?dist}
|
||||
Release: 3%{?dist}
|
||||
Summary: An optimized BLAS library based on GotoBLAS2
|
||||
License: BSD
|
||||
URL: https://github.com/xianyi/OpenBLAS/
|
||||
@ -29,6 +29,14 @@ Patch2: openblas-0.2.15-constructor.patch
|
||||
# Supply the proper flags to the test makefile
|
||||
Patch3: openblas-0.3.2-tests.patch
|
||||
|
||||
# Fix assembly code
|
||||
Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch
|
||||
Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch
|
||||
Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch
|
||||
Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch
|
||||
Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch
|
||||
Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch
|
||||
|
||||
BuildRequires: gcc
|
||||
BuildRequires: gcc-gfortran
|
||||
BuildRequires: perl-devel
|
||||
@ -239,6 +247,13 @@ cd OpenBLAS-%{version}
|
||||
%endif
|
||||
%patch3 -p1 -b .tests
|
||||
|
||||
%patch10 -p0
|
||||
%patch11 -p0
|
||||
%patch12 -p0
|
||||
%patch13 -p0
|
||||
%patch14 -p0
|
||||
%patch15 -p0
|
||||
|
||||
# Fix source permissions
|
||||
find -name \*.f -exec chmod 644 {} \;
|
||||
|
||||
@ -674,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig
|
||||
%endif
|
||||
|
||||
%changelog
|
||||
* Sun Feb 17 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.5-3
|
||||
- Patch assembly kernels to satisfy gcc 9 demands.
|
||||
|
||||
* Fri Feb 01 2019 Fedora Release Engineering <releng@fedoraproject.org> - 0.3.5-2
|
||||
- Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user