Update to 0.3.6.
This commit is contained in:
parent
4e591d8725
commit
64c2df1d85
1
.gitignore
vendored
1
.gitignore
vendored
@ -15,3 +15,4 @@
|
|||||||
/v0.3.0.tar.gz
|
/v0.3.0.tar.gz
|
||||||
/v0.3.1.tar.gz
|
/v0.3.1.tar.gz
|
||||||
/openblas-0.3.2.tar.gz
|
/openblas-0.3.2.tar.gz
|
||||||
|
/openblas-0.3.6.tar.gz
|
||||||
|
3283
1965.patch
3283
1965.patch
File diff suppressed because it is too large
Load Diff
960
1966.patch
960
1966.patch
@ -1,960 +0,0 @@
|
|||||||
From 63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:27:38 +0100
|
|
||||||
Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------
|
|
||||||
1 file changed, 16 insertions(+), 16 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
index 3abffc4cf..f526fd611 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:30:03 +0100
|
|
||||||
Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++-------------
|
|
||||||
1 file changed, 15 insertions(+), 15 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
index 0a4eb683c..8623dcd10 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"0", "1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", // "0", "1",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", //"%0", "%1",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
- :
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ :
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"0", "1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:32:48 +0100
|
|
||||||
Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++-----------
|
|
||||||
1 file changed, 16 insertions(+), 16 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
index 8346e1748..fbeb857e2 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"0", "1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
+ :
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"0", "1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"0", "1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:35:18 +0100
|
|
||||||
Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------
|
|
||||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
index de53b0bc4..71d3a9846 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:36:37 +0100
|
|
||||||
Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------
|
|
||||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
index e732a2718..90790cfdc 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
+ :
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:37:49 +0100
|
|
||||||
Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------
|
|
||||||
1 file changed, 6 insertions(+), 6 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
index 8d855072b..0f187ba88 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
+ :
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:40:28 +0100
|
|
||||||
Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++--------
|
|
||||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
index 03882d6b6..1ce59d2c7 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:42:34 +0100
|
|
||||||
Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++-------------
|
|
||||||
1 file changed, 16 insertions(+), 16 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
index d9253c1ed..534370959 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Wed, 16 Jan 2019 23:44:42 +0100
|
|
||||||
Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++-----------
|
|
||||||
1 file changed, 16 insertions(+), 16 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
index 97b07add6..4b489d9f3 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
+ :
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x), // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
- : "cc", //"%0", "%1",
|
|
||||||
+ : "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9", "%xmm10", "%xmm11",
|
|
||||||
|
|
||||||
From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:38:20 +0100
|
|
||||||
Subject: [PATCH 10/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
index f526fd611..31451aa6c 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c
|
|
||||||
@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:39:23 +0100
|
|
||||||
Subject: [PATCH 11/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
index 8623dcd10..a04a4c4ab 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_haswell-2.c
|
|
||||||
@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", // "0", "1",
|
|
||||||
@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", //"%0", "%1",
|
|
||||||
@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:40:25 +0100
|
|
||||||
Subject: [PATCH 12/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
index fbeb857e2..e8073d485 100644
|
|
||||||
--- a/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
+++ b/kernel/x86_64/cscal_microk_steamroller-2.c
|
|
||||||
@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:41:27 +0100
|
|
||||||
Subject: [PATCH 13/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
index 71d3a9846..096662781 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c
|
|
||||||
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
|
|
||||||
From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:42:12 +0100
|
|
||||||
Subject: [PATCH 14/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_sandy-2.c | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
index 0f187ba88..9982b8e58 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_sandy-2.c
|
|
||||||
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
|
|
||||||
From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:43:14 +0100
|
|
||||||
Subject: [PATCH 15/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal_microk_haswell-2.c | 4 ++--
|
|
||||||
1 file changed, 2 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
index 90790cfdc..77ed59a4e 100644
|
|
||||||
--- a/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/dscal_microk_haswell-2.c
|
|
||||||
@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n1), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha), // 2
|
|
||||||
"r" (n2) // 3
|
|
||||||
|
|
||||||
From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:46:05 +0100
|
|
||||||
Subject: [PATCH 16/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++--------
|
|
||||||
1 file changed, 8 insertions(+), 8 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
index 1ce59d2c7..5e733ffda 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c
|
|
||||||
@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
+ :
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", //"%0", "%1",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
+ :
|
|
||||||
+ "+r" (n), // 0
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (n), // 0
|
|
||||||
- "r" (x), // 1
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc", //"%0", "%1",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
|
|
||||||
From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:47:12 +0100
|
|
||||||
Subject: [PATCH 17/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
index 534370959..8c8f5b75c 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_haswell-2.c
|
|
||||||
@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
|
|
||||||
From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:48:40 +0100
|
|
||||||
Subject: [PATCH 18/18] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++----
|
|
||||||
1 file changed, 4 insertions(+), 4 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
index 4b489d9f3..c9267ee0c 100644
|
|
||||||
--- a/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
+++ b/kernel/x86_64/zscal_microk_steamroller-2.c
|
|
||||||
@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
||||||
@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (n), // 0
|
|
||||||
- "+r" (x), // 1
|
|
||||||
+ "+r" (x) // 1
|
|
||||||
:
|
|
||||||
"r" (alpha) // 2
|
|
||||||
: "cc",
|
|
99
1967.patch
99
1967.patch
@ -1,99 +0,0 @@
|
|||||||
From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 00:04:44 +0100
|
|
||||||
Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dger_microk_sandy-2.c | 6 +++---
|
|
||||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
index 2bf966a5f..944d4c6f1 100644
|
|
||||||
--- a/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (alpha) // 4
|
|
||||||
|
|
||||||
From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 00:05:47 +0100
|
|
||||||
Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/sger_microk_sandy-2.c | 6 +++---
|
|
||||||
1 file changed, 3 insertions(+), 3 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
index 79180b991..d38fdd551 100644
|
|
||||||
--- a/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (alpha) // 4
|
|
||||||
|
|
||||||
From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:34:12 +0100
|
|
||||||
Subject: [PATCH 3/4] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/sger_microk_sandy-2.c | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
index d38fdd551..14f13475b 100644
|
|
||||||
--- a/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/sger_microk_sandy-2.c
|
|
||||||
@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n), // 1
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
:
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
|
|
||||||
From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 17 Jan 2019 09:35:56 +0100
|
|
||||||
Subject: [PATCH 4/4] Remove stray comma
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/dger_microk_sandy-2.c | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
index 944d4c6f1..e8494500f 100644
|
|
||||||
--- a/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dger_microk_sandy-2.c
|
|
||||||
@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n), // 1
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
:
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
499
2010.patch
499
2010.patch
@ -1,499 +0,0 @@
|
|||||||
From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Tue, 12 Feb 2019 15:33:48 +0100
|
|
||||||
Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64
|
|
||||||
s/dGEMV_T and s/dGEMV_N kernels
|
|
||||||
|
|
||||||
Arguments 0 and 1 need to be tagged as both input and output
|
|
||||||
---
|
|
||||||
kernel/x86_64/dgemv_n_4.c | 10 +++++-----
|
|
||||||
kernel/x86_64/dgemv_t_4.c | 18 +++++++++---------
|
|
||||||
kernel/x86_64/sgemv_n_4.c | 14 +++++++-------
|
|
||||||
kernel/x86_64/sgemv_t_4.c | 18 +++++++++---------
|
|
||||||
4 files changed, 30 insertions(+), 30 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c
|
|
||||||
index 6d2530e81..6d33641e9 100644
|
|
||||||
--- a/kernel/x86_64/dgemv_n_4.c
|
|
||||||
+++ b/kernel/x86_64/dgemv_n_4.c
|
|
||||||
@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (ap[0]), // 4
|
|
||||||
@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
:
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (ap), // 4
|
|
||||||
diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c
|
|
||||||
index a7478e3a8..ed672a757 100644
|
|
||||||
--- a/kernel/x86_64/dgemv_t_4.c
|
|
||||||
+++ b/kernel/x86_64/dgemv_t_4.c
|
|
||||||
@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|
||||||
"movsd %%xmm11,8(%2) \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (y), // 2
|
|
||||||
"r" (ap0), // 3
|
|
||||||
"r" (ap1), // 4
|
|
||||||
@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|
||||||
"movsd %%xmm10, (%2) \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (y), // 2
|
|
||||||
"r" (ap), // 3
|
|
||||||
"r" (x) // 4
|
|
||||||
@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (&da), // 2
|
|
||||||
"r" (src), // 3
|
|
||||||
"r" (dest) // 4
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c
|
|
||||||
index 65305ac59..63697970f 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_4.c
|
|
||||||
@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (ap[0]), // 4
|
|
||||||
@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a
|
|
||||||
|
|
||||||
"3: \n\t"
|
|
||||||
:
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n1) // 1
|
|
||||||
:
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n1), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (ap), // 4
|
|
||||||
@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest)
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
:
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
"r" (src), // 2
|
|
||||||
"r" (dest) // 3
|
|
||||||
: "cc",
|
|
||||||
diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c
|
|
||||||
index 065e5b385..86ecaf516 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_t_4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_t_4.c
|
|
||||||
@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT
|
|
||||||
"movss %%xmm11,4(%2) \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (y), // 2
|
|
||||||
"r" (ap0), // 3
|
|
||||||
"r" (ap1), // 4
|
|
||||||
@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y)
|
|
||||||
"movss %%xmm10, (%2) \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (y), // 2
|
|
||||||
"r" (ap), // 3
|
|
||||||
"r" (x) // 4
|
|
||||||
@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (&da), // 2
|
|
||||||
"r" (src), // 3
|
|
||||||
"r" (dest) // 4
|
|
||||||
|
|
||||||
From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Tue, 12 Feb 2019 15:51:43 +0100
|
|
||||||
Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly
|
|
||||||
|
|
||||||
Argument 0 is modified as it doubles as a counter
|
|
||||||
---
|
|
||||||
kernel/x86_64/dscal.c | 2 +-
|
|
||||||
1 file changed, 1 insertion(+), 1 deletion(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c
|
|
||||||
index ef9a0a6ba..d0d7801fd 100644
|
|
||||||
--- a/kernel/x86_64/dscal.c
|
|
||||||
+++ b/kernel/x86_64/dscal.c
|
|
||||||
@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
+ "+r" (n) // 0
|
|
||||||
:
|
|
||||||
- "r" (n), // 0
|
|
||||||
"r" (x), // 1
|
|
||||||
"r" (x1), // 2
|
|
||||||
"r" (alpha), // 3
|
|
||||||
|
|
||||||
From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Tue, 12 Feb 2019 16:00:18 +0100
|
|
||||||
Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV
|
|
||||||
microkernels
|
|
||||||
|
|
||||||
Arguments 0 and 1 are both input and output
|
|
||||||
---
|
|
||||||
kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++---
|
|
||||||
kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++---
|
|
||||||
kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++---
|
|
||||||
kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++---
|
|
||||||
kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++---
|
|
||||||
kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++---
|
|
||||||
kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++---
|
|
||||||
kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++---
|
|
||||||
8 files changed, 24 insertions(+), 24 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
|
||||||
index d7166fe4b..ae287b6d8 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c
|
|
||||||
@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
|
||||||
index d83d20f8e..4778f644a 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c
|
|
||||||
@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
|
||||||
index 1344c75f7..065182286 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c
|
|
||||||
@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
|
||||||
index 1ef6fbafd..d84e703bd 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c
|
|
||||||
@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
|
||||||
index 8c01ab806..4a4f4d68d 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c
|
|
||||||
@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
|
||||||
index a32e59b44..e6a09ccf8 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c
|
|
||||||
@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
|
||||||
index b8e6ee732..c56ff3b15 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c
|
|
||||||
@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
|
||||||
index e8650650c..c4919a39a 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c
|
|
||||||
@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (i), // 0
|
|
||||||
- "r" (n), // 1
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
"r" (a0), // 4
|
|
||||||
|
|
||||||
From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Tue, 12 Feb 2019 16:14:02 +0100
|
|
||||||
Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly
|
|
||||||
|
|
||||||
Argument 0 is modified so should be input and output
|
|
||||||
---
|
|
||||||
kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++--
|
|
||||||
kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++--
|
|
||||||
kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++--
|
|
||||||
kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++--
|
|
||||||
kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++--
|
|
||||||
kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++--
|
|
||||||
kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++--
|
|
||||||
kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++----
|
|
||||||
8 files changed, 18 insertions(+), 18 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
|
||||||
index d84470cc4..bfa07b6d0 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c
|
|
||||||
@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vmovsd %%xmm3 ,24(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
|
||||||
index 866782ee6..6241879d5 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c
|
|
||||||
@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
|
||||||
index 38479f77a..a161dcd8b 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c
|
|
||||||
@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"movsd %%xmm3 , 24(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
|
||||||
index b4e6ab369..b205b1019 100644
|
|
||||||
--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c
|
|
||||||
@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
|
||||||
index 9002228f3..602c3edf2 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c
|
|
||||||
@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vmovss %%xmm3 ,12(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
|
||||||
index 69db008b6..fdfe4349a 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c
|
|
||||||
@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
|
||||||
index c0fe5d640..6bb9c02f6 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c
|
|
||||||
@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F
|
|
||||||
"movss %%xmm3 , 12(%9) \n\t" // save temp2
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
|
||||||
index 093ca8073..0c78212e7 100644
|
|
||||||
--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
|
||||||
+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c
|
|
||||||
@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
||||||
@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
|
||||||
- "r" (from), // 0
|
|
||||||
+ "+r" (from) // 0
|
|
||||||
+ :
|
|
||||||
"r" (to), // 1
|
|
||||||
"r" (x), // 2
|
|
||||||
"r" (y), // 3
|
|
27
2018.patch
27
2018.patch
@ -1,27 +0,0 @@
|
|||||||
From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Bart Oldeman <bart.oldeman@calculquebec.ca>
|
|
||||||
Date: Thu, 14 Feb 2019 16:19:41 +0000
|
|
||||||
Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for
|
|
||||||
xmm0,xmm1,xmm2,xmm3
|
|
||||||
|
|
||||||
This fixes a crash in dblat2 when OpenBLAS is compiled using
|
|
||||||
-march=znver1 -ftree-vectorize -O2
|
|
||||||
|
|
||||||
See also:
|
|
||||||
https://github.com/easybuilders/easybuild-easyconfigs/issues/7180
|
|
||||||
---
|
|
||||||
kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 +
|
|
||||||
1 file changed, 1 insertion(+)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
|
||||||
index 584a6c6b5..da0fa2fff 100644
|
|
||||||
--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
|
||||||
+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c
|
|
||||||
@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|
||||||
"r" (ap[3]), // 7
|
|
||||||
"r" (alpha) // 8
|
|
||||||
: "cc",
|
|
||||||
+ "%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5",
|
|
||||||
"%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9",
|
|
274
2019.patch
274
2019.patch
@ -1,274 +0,0 @@
|
|||||||
From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Thu, 14 Feb 2019 22:43:18 +0100
|
|
||||||
Subject: [PATCH 1/2] Save and restore input argument 8 (lda4)
|
|
||||||
|
|
||||||
Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009)
|
|
||||||
---
|
|
||||||
kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++--
|
|
||||||
1 file changed, 5 insertions(+), 2 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
index 2c90f8aa9..e89a16785 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
*****************************************************************************/
|
|
||||||
|
|
||||||
|
|
||||||
-
|
|
||||||
#define HAVE_KERNEL_4x8 1
|
|
||||||
static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
|
|
||||||
@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
|
||||||
|
|
||||||
+ "movq %8, %%xmm10 \n\t" //save lda
|
|
||||||
+
|
|
||||||
"testq $0x04, %1 \n\t"
|
|
||||||
"jz 2f \n\t"
|
|
||||||
|
|
||||||
@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"4: \n\t"
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
+ "movq %%xmm10, %8 \n\t" //restore lda
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"%xmm4", "%xmm5",
|
|
||||||
"%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9",
|
|
||||||
+ "%xmm10",
|
|
||||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
"memory"
|
|
||||||
);
|
|
||||||
@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
}
|
|
||||||
|
|
||||||
|
|
||||||
-
|
|
||||||
#define HAVE_KERNEL_4x4 1
|
|
||||||
static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
|
|
||||||
@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT
|
|
||||||
|
|
||||||
"vbroadcastss (%8), %%ymm6 \n\t" // alpha
|
|
||||||
|
|
||||||
+
|
|
||||||
"testq $0x04, %1 \n\t"
|
|
||||||
"jz 2f \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Fri, 15 Feb 2019 10:10:04 +0100
|
|
||||||
Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint
|
|
||||||
list
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------
|
|
||||||
1 file changed, 61 insertions(+), 65 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
index e89a16785..93e1e26e8 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c
|
|
||||||
@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
__asm__ __volatile__
|
|
||||||
(
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
|
|
||||||
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
|
||||||
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
|
||||||
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
|
||||||
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
|
||||||
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
|
||||||
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
|
||||||
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
|
||||||
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
|
|
||||||
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
|
||||||
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
|
||||||
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
|
||||||
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
|
||||||
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
|
||||||
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
|
||||||
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
|
||||||
|
|
||||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
|
||||||
|
|
||||||
- "movq %8, %%xmm10 \n\t" //save lda
|
|
||||||
-
|
|
||||||
"testq $0x04, %1 \n\t"
|
|
||||||
"jz 2f \n\t"
|
|
||||||
|
|
||||||
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
"vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t"
|
|
||||||
"vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
"vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t"
|
|
||||||
"vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t"
|
|
||||||
"vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
- "addq $4 , %8 \n\t"
|
|
||||||
+ "addq $4 , %2 \n\t"
|
|
||||||
"addq $4 , %0 \n\t"
|
|
||||||
"subq $4 , %1 \n\t"
|
|
||||||
|
|
||||||
@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"testq $0x08, %1 \n\t"
|
|
||||||
"jz 3f \n\t"
|
|
||||||
|
|
||||||
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
|
||||||
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
|
||||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
"vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
"vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
|
||||||
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
|
||||||
|
|
||||||
- "addq $8 , %8 \n\t"
|
|
||||||
+ "addq $8 , %2 \n\t"
|
|
||||||
"addq $8 , %0 \n\t"
|
|
||||||
"subq $8 , %1 \n\t"
|
|
||||||
|
|
||||||
@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
"vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y
|
|
||||||
- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y
|
|
||||||
-
|
|
||||||
- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y
|
|
||||||
+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y
|
|
||||||
+
|
|
||||||
+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
"addq $16, %0 \n\t"
|
|
||||||
- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
"vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
"vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
|
|
||||||
- "addq $16, %8 \n\t"
|
|
||||||
- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y
|
|
||||||
+ "addq $16, %2 \n\t"
|
|
||||||
+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y
|
|
||||||
"subq $16, %1 \n\t"
|
|
||||||
- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y
|
|
||||||
+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y
|
|
||||||
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
"4: \n\t"
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
- "movq %%xmm10, %8 \n\t" //restore lda
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
:
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (lda4), // 8
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
"r" (alpha) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1",
|
|
||||||
@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"%xmm4", "%xmm5",
|
|
||||||
"%xmm6", "%xmm7",
|
|
||||||
"%xmm8", "%xmm9",
|
|
||||||
- "%xmm10",
|
|
||||||
"%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
"memory"
|
|
||||||
);
|
|
255
2021.patch
255
2021.patch
@ -1,255 +0,0 @@
|
|||||||
From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Fri, 15 Feb 2019 15:08:16 +0100
|
|
||||||
Subject: [PATCH] Fix wrong constraints in inline assembly
|
|
||||||
|
|
||||||
for #2009
|
|
||||||
---
|
|
||||||
kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
|
|
||||||
1 file changed, 49 insertions(+), 49 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
|
||||||
index fcab8e2c7..9ab78fc8e 100644
|
|
||||||
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
|
||||||
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
|
|
||||||
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" cmpq $0, %0 \n\t"
|
|
||||||
" je 4f \n\t"
|
|
||||||
|
|
||||||
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
|
||||||
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
|
||||||
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
|
||||||
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
|
||||||
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
|
||||||
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
|
||||||
|
|
||||||
|
|
||||||
" addq $8, %1 \n\t"
|
|
||||||
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" .p2align 4 \n\t"
|
|
||||||
"1: \n\t"
|
|
||||||
|
|
||||||
- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a
|
|
||||||
+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a
|
|
||||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
|
||||||
|
|
||||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
|
|
||||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"
|
|
||||||
|
|
||||||
- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
|
|
||||||
+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
|
|
||||||
" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
|
|
||||||
" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"
|
|
||||||
|
|
||||||
" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
|
|
||||||
- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
|
|
||||||
+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
|
|
||||||
" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
|
|
||||||
" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
|
|
||||||
" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
|
|
||||||
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
|
|
||||||
" jz 22f \n\t"
|
|
||||||
|
|
||||||
- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a
|
|
||||||
+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a
|
|
||||||
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"
|
|
||||||
|
|
||||||
" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
|
|
||||||
- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
|
|
||||||
+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"
|
|
||||||
|
|
||||||
" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
|
|
||||||
- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
|
|
||||||
+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
|
|
||||||
" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
|
|
||||||
|
|
||||||
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7
|
|
||||||
|
|
||||||
" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
|
|
||||||
- " vmovups (%9), %%ymm0 \n\t"
|
|
||||||
+ " vmovups (%3), %%ymm0 \n\t"
|
|
||||||
" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
|
|
||||||
" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
|
|
||||||
" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
|
|
||||||
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"
|
|
||||||
|
|
||||||
" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
|
|
||||||
- " vmovups 32(%9), %%ymm4 \n\t"
|
|
||||||
+ " vmovups 32(%3), %%ymm4 \n\t"
|
|
||||||
" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
|
|
||||||
" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
|
|
||||||
" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
|
|
||||||
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
|
|
||||||
"5: \n\t" // i = 0
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
|
|
||||||
" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
|
|
||||||
- " vmovups (%9), %%ymm0 \n\t"
|
|
||||||
- " vmovups %%ymm8 , (%8) \n\t" // write a
|
|
||||||
+ " vmovups (%3), %%ymm0 \n\t"
|
|
||||||
+ " vmovups %%ymm8 , (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm8 , (%4) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
|
|
||||||
" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
|
|
||||||
" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
|
|
||||||
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
|
||||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
|
|
||||||
- " vmovups (%9), %%ymm0 \n\t"
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm9 , (%8) \n\t" // write a
|
|
||||||
+ " vmovups (%3), %%ymm0 \n\t"
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm9 , (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
|
|
||||||
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
|
||||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
|
|
||||||
- " vmovups (%9), %%ymm0 \n\t"
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm10, (%8) \n\t" // write a
|
|
||||||
+ " vmovups (%3), %%ymm0 \n\t"
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm10, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm10, (%4,%7,2) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
|
|
||||||
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm11, (%8) \n\t" // write a
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm11, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm11, (%5) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
|
|
||||||
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
|
|
||||||
" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm12, (%8) \n\t" // write a
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm12, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm12, (%5,%7,1) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
|
|
||||||
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
|
|
||||||
" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm13, (%8) \n\t" // write a
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm13, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm13, (%5,%7,2) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
|
|
||||||
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
|
|
||||||
" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
- " addq $64, %9 \n\t" // b=b+8
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $64, %3 \n\t" // b=b+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
|
|
||||||
" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
|
|
||||||
- " vmovups 32(%9), %%ymm1 \n\t"
|
|
||||||
- " vmovups %%ymm14, (%8) \n\t" // write a
|
|
||||||
+ " vmovups 32(%3), %%ymm1 \n\t"
|
|
||||||
+ " vmovups %%ymm14, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm14, (%6) \n\t" // write c
|
|
||||||
|
|
||||||
" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"
|
|
||||||
|
|
||||||
" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"
|
|
||||||
|
|
||||||
- " addq $32, %8 \n\t" // a=a+8
|
|
||||||
+ " addq $32, %2 \n\t" // a=a+8
|
|
||||||
|
|
||||||
" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
|
|
||||||
- " vmovups %%ymm15, (%8) \n\t" // write a
|
|
||||||
+ " vmovups %%ymm15, (%2) \n\t" // write a
|
|
||||||
" vmovups %%ymm15, (%6,%7,1) \n\t" // write c
|
|
||||||
|
|
||||||
" vzeroupper \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
+ "+r" (n1), // 0
|
|
||||||
+ "+a" (i), // 1
|
|
||||||
+ "+r" (as), // 2
|
|
||||||
+ "+r" (bs) // 3
|
|
||||||
:
|
|
||||||
- "r" (n1), // 0
|
|
||||||
- "a" (i), // 1
|
|
||||||
- "r" (a), // 2
|
|
||||||
- "r" (b), // 3
|
|
||||||
"r" (c), // 4
|
|
||||||
"r" (c3), // 5
|
|
||||||
"r" (c6), // 6
|
|
||||||
"r" (ldc), // 7
|
|
||||||
- "r" (as), // 8
|
|
||||||
- "r" (bs) // 9
|
|
||||||
+ "r" (a), // 8
|
|
||||||
+ "r" (b) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1", "%xmm2", "%xmm3",
|
|
||||||
"%xmm4", "%xmm5", "%xmm6", "%xmm7",
|
|
874
2023.patch
874
2023.patch
@ -1,874 +0,0 @@
|
|||||||
From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Sat, 16 Feb 2019 18:24:11 +0100
|
|
||||||
Subject: [PATCH 1/4] Fix inline assembly constraints
|
|
||||||
|
|
||||||
rework indices to allow marking argument lda4 as input and output. For #2009
|
|
||||||
---
|
|
||||||
kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------
|
|
||||||
1 file changed, 27 insertions(+), 27 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
|
||||||
index 11a3e943b..d21232bfa 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c
|
|
||||||
@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
__asm__ __volatile__
|
|
||||||
(
|
|
||||||
- "movss (%2), %%xmm12 \n\t" // x0
|
|
||||||
- "movss 4(%2), %%xmm13 \n\t" // x1
|
|
||||||
- "movss 8(%2), %%xmm14 \n\t" // x2
|
|
||||||
- "movss 12(%2), %%xmm15 \n\t" // x3
|
|
||||||
+ "movss (%3), %%xmm12 \n\t" // x0
|
|
||||||
+ "movss 4(%3), %%xmm13 \n\t" // x1
|
|
||||||
+ "movss 8(%3), %%xmm14 \n\t" // x2
|
|
||||||
+ "movss 12(%3), %%xmm15 \n\t" // x3
|
|
||||||
"shufps $0, %%xmm12, %%xmm12\n\t"
|
|
||||||
"shufps $0, %%xmm13, %%xmm13\n\t"
|
|
||||||
"shufps $0, %%xmm14, %%xmm14\n\t"
|
|
||||||
"shufps $0, %%xmm15, %%xmm15\n\t"
|
|
||||||
|
|
||||||
- "movss 16(%2), %%xmm0 \n\t" // x4
|
|
||||||
- "movss 20(%2), %%xmm1 \n\t" // x5
|
|
||||||
- "movss 24(%2), %%xmm2 \n\t" // x6
|
|
||||||
- "movss 28(%2), %%xmm3 \n\t" // x7
|
|
||||||
+ "movss 16(%3), %%xmm0 \n\t" // x4
|
|
||||||
+ "movss 20(%3), %%xmm1 \n\t" // x5
|
|
||||||
+ "movss 24(%3), %%xmm2 \n\t" // x6
|
|
||||||
+ "movss 28(%3), %%xmm3 \n\t" // x7
|
|
||||||
"shufps $0, %%xmm0 , %%xmm0 \n\t"
|
|
||||||
"shufps $0, %%xmm1 , %%xmm1 \n\t"
|
|
||||||
"shufps $0, %%xmm2 , %%xmm2 \n\t"
|
|
||||||
@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"1: \n\t"
|
|
||||||
"xorps %%xmm4 , %%xmm4 \n\t"
|
|
||||||
"xorps %%xmm5 , %%xmm5 \n\t"
|
|
||||||
- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
|
|
||||||
".p2align 1 \n\t"
|
|
||||||
- "movups (%4,%0,4), %%xmm8 \n\t"
|
|
||||||
- "movups (%5,%0,4), %%xmm9 \n\t"
|
|
||||||
- "movups (%6,%0,4), %%xmm10 \n\t"
|
|
||||||
- "movups (%7,%0,4), %%xmm11 \n\t"
|
|
||||||
+ "movups (%5,%0,4), %%xmm8 \n\t"
|
|
||||||
+ "movups (%6,%0,4), %%xmm9 \n\t"
|
|
||||||
+ "movups (%7,%0,4), %%xmm10 \n\t"
|
|
||||||
+ "movups (%8,%0,4), %%xmm11 \n\t"
|
|
||||||
".p2align 1 \n\t"
|
|
||||||
"mulps %%xmm12, %%xmm8 \n\t"
|
|
||||||
"mulps %%xmm13, %%xmm9 \n\t"
|
|
||||||
@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"addps %%xmm10, %%xmm4 \n\t"
|
|
||||||
"addps %%xmm11, %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "movups (%4,%8,4), %%xmm8 \n\t"
|
|
||||||
- "movups (%5,%8,4), %%xmm9 \n\t"
|
|
||||||
- "movups (%6,%8,4), %%xmm10 \n\t"
|
|
||||||
- "movups (%7,%8,4), %%xmm11 \n\t"
|
|
||||||
+ "movups (%5,%2,4), %%xmm8 \n\t"
|
|
||||||
+ "movups (%6,%2,4), %%xmm9 \n\t"
|
|
||||||
+ "movups (%7,%2,4), %%xmm10 \n\t"
|
|
||||||
+ "movups (%8,%2,4), %%xmm11 \n\t"
|
|
||||||
".p2align 1 \n\t"
|
|
||||||
"mulps %%xmm0 , %%xmm8 \n\t"
|
|
||||||
"mulps %%xmm1 , %%xmm9 \n\t"
|
|
||||||
@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"addps %%xmm10, %%xmm4 \n\t"
|
|
||||||
"addps %%xmm11, %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "addq $4 , %8 \n\t"
|
|
||||||
+ "addq $4 , %2 \n\t"
|
|
||||||
"addps %%xmm5 , %%xmm4 \n\t"
|
|
||||||
"addq $4 , %0 \n\t"
|
|
||||||
"mulps %%xmm6 , %%xmm4 \n\t"
|
|
||||||
"subq $4 , %1 \n\t"
|
|
||||||
"addps %%xmm4 , %%xmm7 \n\t"
|
|
||||||
|
|
||||||
- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
:
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (lda4), // 8
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
"r" (alpha) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1",
|
|
||||||
|
|
||||||
From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Sat, 16 Feb 2019 18:36:39 +0100
|
|
||||||
Subject: [PATCH 2/4] Fix inline assembly constraints
|
|
||||||
|
|
||||||
rework indices to allow marking argument lda as input and output.
|
|
||||||
---
|
|
||||||
kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++-------------
|
|
||||||
1 file changed, 65 insertions(+), 65 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
|
||||||
index b35daa35b..3fc46542b 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c
|
|
||||||
@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
__asm__ __volatile__
|
|
||||||
(
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
- "vbroadcastss (%2), %%ymm12 \n\t" // x0
|
|
||||||
- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1
|
|
||||||
- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2
|
|
||||||
- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3
|
|
||||||
- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4
|
|
||||||
- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5
|
|
||||||
- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6
|
|
||||||
- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7
|
|
||||||
+ "vbroadcastss (%3), %%ymm12 \n\t" // x0
|
|
||||||
+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1
|
|
||||||
+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2
|
|
||||||
+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3
|
|
||||||
+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4
|
|
||||||
+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5
|
|
||||||
+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6
|
|
||||||
+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7
|
|
||||||
|
|
||||||
"vbroadcastss (%9), %%ymm6 \n\t" // alpha
|
|
||||||
|
|
||||||
@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t"
|
|
||||||
"vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t"
|
|
||||||
- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y
|
|
||||||
|
|
||||||
- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t"
|
|
||||||
- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t"
|
|
||||||
- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t"
|
|
||||||
- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t"
|
|
||||||
+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t"
|
|
||||||
+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t"
|
|
||||||
+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t"
|
|
||||||
+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t"
|
|
||||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
|
||||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
|
||||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
|
||||||
"vaddps %%xmm5, %%xmm11, %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t"
|
|
||||||
- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t"
|
|
||||||
- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t"
|
|
||||||
- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t"
|
|
||||||
+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t"
|
|
||||||
+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t"
|
|
||||||
+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t"
|
|
||||||
+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t"
|
|
||||||
"vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t"
|
|
||||||
"vaddps %%xmm5, %%xmm10, %%xmm5 \n\t"
|
|
||||||
"vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t"
|
|
||||||
@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t"
|
|
||||||
"vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
- "addq $4, %8 \n\t"
|
|
||||||
+ "addq $4, %2 \n\t"
|
|
||||||
"addq $4, %0 \n\t"
|
|
||||||
"subq $4, %1 \n\t"
|
|
||||||
|
|
||||||
@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
|
||||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y
|
|
||||||
+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y
|
|
||||||
|
|
||||||
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
|
||||||
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
|
||||||
- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
|
||||||
- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
|
||||||
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
|
||||||
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
|
||||||
+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
|
||||||
+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
|
||||||
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
|
||||||
- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
|
||||||
- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
|
||||||
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
|
||||||
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
|
||||||
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
|
||||||
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm10, %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t"
|
|
||||||
@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y
|
|
||||||
+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y
|
|
||||||
|
|
||||||
- "addq $8, %8 \n\t"
|
|
||||||
+ "addq $8, %2 \n\t"
|
|
||||||
"addq $8, %0 \n\t"
|
|
||||||
"subq $8, %1 \n\t"
|
|
||||||
|
|
||||||
@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t"
|
|
||||||
"vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "prefetcht0 192(%4,%0,4) \n\t"
|
|
||||||
- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t"
|
|
||||||
- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t"
|
|
||||||
"prefetcht0 192(%5,%0,4) \n\t"
|
|
||||||
- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t"
|
|
||||||
- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t"
|
|
||||||
+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t"
|
|
||||||
+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t"
|
|
||||||
+ "prefetcht0 192(%6,%0,4) \n\t"
|
|
||||||
+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t"
|
|
||||||
+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "prefetcht0 192(%6,%0,4) \n\t"
|
|
||||||
- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t"
|
|
||||||
- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t"
|
|
||||||
"prefetcht0 192(%7,%0,4) \n\t"
|
|
||||||
- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t"
|
|
||||||
- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t"
|
|
||||||
+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t"
|
|
||||||
+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t"
|
|
||||||
+ "prefetcht0 192(%8,%0,4) \n\t"
|
|
||||||
+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t"
|
|
||||||
+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "prefetcht0 192(%4,%8,4) \n\t"
|
|
||||||
- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t"
|
|
||||||
- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t"
|
|
||||||
- "prefetcht0 192(%5,%8,4) \n\t"
|
|
||||||
- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t"
|
|
||||||
- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t"
|
|
||||||
+ "prefetcht0 192(%5,%2,4) \n\t"
|
|
||||||
+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t"
|
|
||||||
+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t"
|
|
||||||
+ "prefetcht0 192(%6,%2,4) \n\t"
|
|
||||||
+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t"
|
|
||||||
+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm11, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "prefetcht0 192(%6,%8,4) \n\t"
|
|
||||||
- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t"
|
|
||||||
- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t"
|
|
||||||
- "prefetcht0 192(%7,%8,4) \n\t"
|
|
||||||
- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t"
|
|
||||||
- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t"
|
|
||||||
+ "prefetcht0 192(%7,%2,4) \n\t"
|
|
||||||
+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t"
|
|
||||||
+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t"
|
|
||||||
+ "prefetcht0 192(%8,%2,4) \n\t"
|
|
||||||
+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t"
|
|
||||||
+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t"
|
|
||||||
"vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t"
|
|
||||||
"vaddps %%ymm4, %%ymm10, %%ymm4 \n\t"
|
|
||||||
@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t"
|
|
||||||
"vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
|
||||||
- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
|
||||||
+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y
|
|
||||||
+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y
|
|
||||||
|
|
||||||
- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y
|
|
||||||
- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y
|
|
||||||
+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y
|
|
||||||
+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y
|
|
||||||
|
|
||||||
- "addq $16, %8 \n\t"
|
|
||||||
+ "addq $16, %2 \n\t"
|
|
||||||
"addq $16, %0 \n\t"
|
|
||||||
"subq $16, %1 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
:
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (lda4), // 8
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
"r" (alpha) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1",
|
|
||||||
|
|
||||||
From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Sat, 16 Feb 2019 18:46:17 +0100
|
|
||||||
Subject: [PATCH 3/4] Fix inline assembly constraints
|
|
||||||
|
|
||||||
---
|
|
||||||
kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++-----------
|
|
||||||
1 file changed, 97 insertions(+), 97 deletions(-)
|
|
||||||
|
|
||||||
diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
|
||||||
index 31001c7f3..bbf06c84b 100644
|
|
||||||
--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
|
||||||
+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c
|
|
||||||
@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
__asm__ __volatile__
|
|
||||||
(
|
|
||||||
- "vbroadcastss (%2), %%xmm12 \n\t" // x0
|
|
||||||
- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1
|
|
||||||
- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2
|
|
||||||
- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3
|
|
||||||
- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4
|
|
||||||
- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5
|
|
||||||
- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6
|
|
||||||
- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7
|
|
||||||
+ "vbroadcastss (%3), %%xmm12 \n\t" // x0
|
|
||||||
+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1
|
|
||||||
+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2
|
|
||||||
+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3
|
|
||||||
+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4
|
|
||||||
+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5
|
|
||||||
+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6
|
|
||||||
+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7
|
|
||||||
|
|
||||||
"vbroadcastss (%9), %%xmm8 \n\t" // alpha
|
|
||||||
|
|
||||||
@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
"addq $4 , %0 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
- "addq $4 , %8 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
+ "addq $4 , %2 \n\t"
|
|
||||||
|
|
||||||
"vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
|
||||||
+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t"
|
|
||||||
"subq $4 , %1 \n\t"
|
|
||||||
- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
"2: \n\t"
|
|
||||||
|
|
||||||
@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t"
|
|
||||||
"vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
|
||||||
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
|
||||||
- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y
|
|
||||||
- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
|
||||||
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
|
||||||
+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
"addq $8 , %0 \n\t"
|
|
||||||
- "addq $8 , %8 \n\t"
|
|
||||||
+ "addq $8 , %2 \n\t"
|
|
||||||
"subq $8 , %1 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
"vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t"
|
|
||||||
"vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t"
|
|
||||||
|
|
||||||
- "prefetcht0 192(%4,%0,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t"
|
|
||||||
"prefetcht0 192(%5,%0,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t"
|
|
||||||
"prefetcht0 192(%6,%0,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t"
|
|
||||||
"prefetcht0 192(%7,%0,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t"
|
|
||||||
+ "prefetcht0 192(%8,%0,4) \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t"
|
|
||||||
".align 2 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t"
|
|
||||||
-
|
|
||||||
- "prefetcht0 192(%4,%8,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t"
|
|
||||||
- "prefetcht0 192(%5,%8,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
- "prefetcht0 192(%6,%8,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t"
|
|
||||||
- "prefetcht0 192(%7,%8,4) \n\t"
|
|
||||||
- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t"
|
|
||||||
- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t"
|
|
||||||
+
|
|
||||||
+ "prefetcht0 192(%5,%2,4) \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t"
|
|
||||||
+ "prefetcht0 192(%6,%2,4) \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t"
|
|
||||||
+ "prefetcht0 192(%7,%2,4) \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t"
|
|
||||||
+ "prefetcht0 192(%8,%2,4) \n\t"
|
|
||||||
+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t"
|
|
||||||
+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t"
|
|
||||||
- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t"
|
|
||||||
- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t"
|
|
||||||
+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t"
|
|
||||||
+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t"
|
|
||||||
|
|
||||||
- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
|
||||||
- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
|
||||||
- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
|
||||||
- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
|
||||||
+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t"
|
|
||||||
+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t"
|
|
||||||
+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t"
|
|
||||||
+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t"
|
|
||||||
|
|
||||||
"addq $16, %0 \n\t"
|
|
||||||
- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y
|
|
||||||
- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y
|
|
||||||
- "addq $16, %8 \n\t"
|
|
||||||
- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y
|
|
||||||
- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y
|
|
||||||
+ "addq $16, %2 \n\t"
|
|
||||||
+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y
|
|
||||||
+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y
|
|
||||||
|
|
||||||
"subq $16, %1 \n\t"
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
:
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (lda4), // 8
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
"r" (alpha) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1",
|
|
||||||
|
|
||||||
From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
|
||||||
Date: Sat, 16 Feb 2019 18:51:09 +0100
|
|
||||||
Subject: [PATCH 4/4] Fix inline assembly constraints
|
|
||||||
|
|
||||||
---
|
|
||||||
dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++
|
|
||||||
1 file changed, 247 insertions(+)
|
|
||||||
create mode 100644 dgemv_n_microk_piledriver-4.c
|
|
||||||
|
|
||||||
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
|
|
||||||
new file mode 100644
|
|
||||||
index 000000000..466931b82
|
|
||||||
--- /dev/null
|
|
||||||
+++ b/dgemv_n_microk_piledriver-4.c
|
|
||||||
@@ -0,0 +1,247 @@
|
|
||||||
+/***************************************************************************
|
|
||||||
+Copyright (c) 2014, The OpenBLAS Project
|
|
||||||
+All rights reserved.
|
|
||||||
+Redistribution and use in source and binary forms, with or without
|
|
||||||
+modification, are permitted provided that the following conditions are
|
|
||||||
+met:
|
|
||||||
+1. Redistributions of source code must retain the above copyright
|
|
||||||
+notice, this list of conditions and the following disclaimer.
|
|
||||||
+2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
+notice, this list of conditions and the following disclaimer in
|
|
||||||
+the documentation and/or other materials provided with the
|
|
||||||
+distribution.
|
|
||||||
+3. Neither the name of the OpenBLAS project nor the names of
|
|
||||||
+its contributors may be used to endorse or promote products
|
|
||||||
+derived from this software without specific prior written permission.
|
|
||||||
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
||||||
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
||||||
+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
+*****************************************************************************/
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+#define HAVE_KERNEL_4x8 1
|
|
||||||
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
+
|
|
||||||
+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
|
||||||
+{
|
|
||||||
+
|
|
||||||
+ BLASLONG register i = 0;
|
|
||||||
+
|
|
||||||
+ __asm__ __volatile__
|
|
||||||
+ (
|
|
||||||
+ "vzeroupper \n\t"
|
|
||||||
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
|
||||||
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
|
||||||
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
|
||||||
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
|
||||||
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
|
||||||
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
|
||||||
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
|
||||||
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
|
||||||
+
|
|
||||||
+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
|
||||||
+
|
|
||||||
+ "testq $0x04, %1 \n\t"
|
|
||||||
+ "jz 2f \n\t"
|
|
||||||
+
|
|
||||||
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "addq $4 , %2 \n\t"
|
|
||||||
+ "addq $4 , %0 \n\t"
|
|
||||||
+ "subq $4 , %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "2: \n\t"
|
|
||||||
+
|
|
||||||
+ "cmpq $0, %1 \n\t"
|
|
||||||
+ "je 3f \n\t"
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ ".align 16 \n\t"
|
|
||||||
+ "1: \n\t"
|
|
||||||
+
|
|
||||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "addq $8 , %0 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
+
|
|
||||||
+ "addq $8 , %2 \n\t"
|
|
||||||
+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
|
||||||
+ "subq $8 , %1 \n\t"
|
|
||||||
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "jnz 1b \n\t"
|
|
||||||
+
|
|
||||||
+ "3: \n\t"
|
|
||||||
+ "vzeroupper \n\t"
|
|
||||||
+
|
|
||||||
+ :
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
+ :
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
+ "r" (alpha) // 9
|
|
||||||
+ : "cc",
|
|
||||||
+ "%xmm0", "%xmm1",
|
|
||||||
+ "%xmm2", "%xmm3",
|
|
||||||
+ "%xmm4", "%xmm5",
|
|
||||||
+ "%xmm6", "%xmm7",
|
|
||||||
+ "%xmm8", "%xmm9",
|
|
||||||
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
+ "memory"
|
|
||||||
+ );
|
|
||||||
+
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+#define HAVE_KERNEL_4x4 1
|
|
||||||
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
+
|
|
||||||
+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
+{
|
|
||||||
+
|
|
||||||
+ BLASLONG register i = 0;
|
|
||||||
+
|
|
||||||
+ __asm__ __volatile__
|
|
||||||
+ (
|
|
||||||
+ "vzeroupper \n\t"
|
|
||||||
+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
|
||||||
+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
|
||||||
+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
|
||||||
+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
|
||||||
+
|
|
||||||
+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
|
||||||
+
|
|
||||||
+ "testq $0x04, %1 \n\t"
|
|
||||||
+ "jz 2f \n\t"
|
|
||||||
+
|
|
||||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "addq $4 , %0 \n\t"
|
|
||||||
+ "subq $4 , %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "2: \n\t"
|
|
||||||
+
|
|
||||||
+ "cmpq $0, %1 \n\t"
|
|
||||||
+ "je 3f \n\t"
|
|
||||||
+
|
|
||||||
+
|
|
||||||
+ ".align 16 \n\t"
|
|
||||||
+ "1: \n\t"
|
|
||||||
+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
+
|
|
||||||
+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
|
|
||||||
+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "addq $8 , %0 \n\t"
|
|
||||||
+ "subq $8 , %1 \n\t"
|
|
||||||
+ "jnz 1b \n\t"
|
|
||||||
+
|
|
||||||
+ "3: \n\t"
|
|
||||||
+ "vzeroupper \n\t"
|
|
||||||
+
|
|
||||||
+ :
|
|
||||||
+ "+r" (i), // 0
|
|
||||||
+ "+r" (n) // 1
|
|
||||||
+ :
|
|
||||||
+ "r" (x), // 2
|
|
||||||
+ "r" (y), // 3
|
|
||||||
+ "r" (ap[0]), // 4
|
|
||||||
+ "r" (ap[1]), // 5
|
|
||||||
+ "r" (ap[2]), // 6
|
|
||||||
+ "r" (ap[3]), // 7
|
|
||||||
+ "r" (alpha) // 8
|
|
||||||
+ : "cc",
|
|
||||||
+ "%xmm4", "%xmm5",
|
|
||||||
+ "%xmm6", "%xmm7",
|
|
||||||
+ "%xmm8", "%xmm9",
|
|
||||||
+ "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
+ "memory"
|
|
||||||
+ );
|
|
||||||
+
|
|
||||||
+}
|
|
||||||
+
|
|
||||||
+
|
|
1349
2024.patch
1349
2024.patch
File diff suppressed because it is too large
Load Diff
412
2028.patch
412
2028.patch
@ -1,412 +0,0 @@
|
|||||||
From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001
|
|
||||||
From: Andrew <16061801+brada4@users.noreply.github.com>
|
|
||||||
Date: Sun, 24 Feb 2019 20:41:02 +0200
|
|
||||||
Subject: [PATCH 2/2] move fix to right place
|
|
||||||
|
|
||||||
---
|
|
||||||
dgemv_n_microk_piledriver-4.c | 247 --------------------
|
|
||||||
kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++----
|
|
||||||
2 files changed, 49 insertions(+), 296 deletions(-)
|
|
||||||
delete mode 100644 dgemv_n_microk_piledriver-4.c
|
|
||||||
|
|
||||||
diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c
|
|
||||||
deleted file mode 100644
|
|
||||||
index 466931b82..000000000
|
|
||||||
--- a/dgemv_n_microk_piledriver-4.c
|
|
||||||
+++ /dev/null
|
|
||||||
@@ -1,247 +0,0 @@
|
|
||||||
-/***************************************************************************
|
|
||||||
-Copyright (c) 2014, The OpenBLAS Project
|
|
||||||
-All rights reserved.
|
|
||||||
-Redistribution and use in source and binary forms, with or without
|
|
||||||
-modification, are permitted provided that the following conditions are
|
|
||||||
-met:
|
|
||||||
-1. Redistributions of source code must retain the above copyright
|
|
||||||
-notice, this list of conditions and the following disclaimer.
|
|
||||||
-2. Redistributions in binary form must reproduce the above copyright
|
|
||||||
-notice, this list of conditions and the following disclaimer in
|
|
||||||
-the documentation and/or other materials provided with the
|
|
||||||
-distribution.
|
|
||||||
-3. Neither the name of the OpenBLAS project nor the names of
|
|
||||||
-its contributors may be used to endorse or promote products
|
|
||||||
-derived from this software without specific prior written permission.
|
|
||||||
-THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
|
||||||
-AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
||||||
-IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
||||||
-ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE
|
|
||||||
-LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
||||||
-DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
|
||||||
-SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
|
||||||
-CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
|
||||||
-OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE
|
|
||||||
-USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
|
||||||
-*****************************************************************************/
|
|
||||||
-
|
|
||||||
-
|
|
||||||
-
|
|
||||||
-#define HAVE_KERNEL_4x8 1
|
|
||||||
-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
-
|
|
||||||
-static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
|
|
||||||
-{
|
|
||||||
-
|
|
||||||
- BLASLONG register i = 0;
|
|
||||||
-
|
|
||||||
- __asm__ __volatile__
|
|
||||||
- (
|
|
||||||
- "vzeroupper \n\t"
|
|
||||||
- "vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
|
||||||
- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
|
||||||
- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
|
||||||
- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
|
||||||
- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
|
||||||
- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
|
||||||
- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
|
||||||
- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
|
||||||
-
|
|
||||||
- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
|
||||||
-
|
|
||||||
- "testq $0x04, %1 \n\t"
|
|
||||||
- "jz 2f \n\t"
|
|
||||||
-
|
|
||||||
- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
-
|
|
||||||
- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "addq $4 , %2 \n\t"
|
|
||||||
- "addq $4 , %0 \n\t"
|
|
||||||
- "subq $4 , %1 \n\t"
|
|
||||||
-
|
|
||||||
- "2: \n\t"
|
|
||||||
-
|
|
||||||
- "cmpq $0, %1 \n\t"
|
|
||||||
- "je 3f \n\t"
|
|
||||||
-
|
|
||||||
-
|
|
||||||
- ".align 16 \n\t"
|
|
||||||
- "1: \n\t"
|
|
||||||
-
|
|
||||||
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
- "addq $8 , %0 \n\t"
|
|
||||||
- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
-
|
|
||||||
- "addq $8 , %2 \n\t"
|
|
||||||
- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
|
||||||
- "subq $8 , %1 \n\t"
|
|
||||||
- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "jnz 1b \n\t"
|
|
||||||
-
|
|
||||||
- "3: \n\t"
|
|
||||||
- "vzeroupper \n\t"
|
|
||||||
-
|
|
||||||
- :
|
|
||||||
- "+r" (i), // 0
|
|
||||||
- "+r" (n), // 1
|
|
||||||
- "+r" (lda4) // 2
|
|
||||||
- :
|
|
||||||
- "r" (x), // 3
|
|
||||||
- "r" (y), // 4
|
|
||||||
- "r" (ap[0]), // 5
|
|
||||||
- "r" (ap[1]), // 6
|
|
||||||
- "r" (ap[2]), // 7
|
|
||||||
- "r" (ap[3]), // 8
|
|
||||||
- "r" (alpha) // 9
|
|
||||||
- : "cc",
|
|
||||||
- "%xmm0", "%xmm1",
|
|
||||||
- "%xmm2", "%xmm3",
|
|
||||||
- "%xmm4", "%xmm5",
|
|
||||||
- "%xmm6", "%xmm7",
|
|
||||||
- "%xmm8", "%xmm9",
|
|
||||||
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
- "memory"
|
|
||||||
- );
|
|
||||||
-
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-
|
|
||||||
-
|
|
||||||
-#define HAVE_KERNEL_4x4 1
|
|
||||||
-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
|
|
||||||
-
|
|
||||||
-static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha)
|
|
||||||
-{
|
|
||||||
-
|
|
||||||
- BLASLONG register i = 0;
|
|
||||||
-
|
|
||||||
- __asm__ __volatile__
|
|
||||||
- (
|
|
||||||
- "vzeroupper \n\t"
|
|
||||||
- "vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
|
||||||
- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
|
||||||
- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
|
||||||
- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
|
||||||
-
|
|
||||||
- "vbroadcastsd (%8), %%ymm6 \n\t" // alpha
|
|
||||||
-
|
|
||||||
- "testq $0x04, %1 \n\t"
|
|
||||||
- "jz 2f \n\t"
|
|
||||||
-
|
|
||||||
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "addq $4 , %0 \n\t"
|
|
||||||
- "subq $4 , %1 \n\t"
|
|
||||||
-
|
|
||||||
- "2: \n\t"
|
|
||||||
-
|
|
||||||
- "cmpq $0, %1 \n\t"
|
|
||||||
- "je 3f \n\t"
|
|
||||||
-
|
|
||||||
-
|
|
||||||
- ".align 16 \n\t"
|
|
||||||
- "1: \n\t"
|
|
||||||
- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
-
|
|
||||||
- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y
|
|
||||||
- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "addq $8 , %0 \n\t"
|
|
||||||
- "subq $8 , %1 \n\t"
|
|
||||||
- "jnz 1b \n\t"
|
|
||||||
-
|
|
||||||
- "3: \n\t"
|
|
||||||
- "vzeroupper \n\t"
|
|
||||||
-
|
|
||||||
- :
|
|
||||||
- "+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
- :
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (alpha) // 8
|
|
||||||
- : "cc",
|
|
||||||
- "%xmm4", "%xmm5",
|
|
||||||
- "%xmm6", "%xmm7",
|
|
||||||
- "%xmm8", "%xmm9",
|
|
||||||
- "%xmm12", "%xmm13", "%xmm14", "%xmm15",
|
|
||||||
- "memory"
|
|
||||||
- );
|
|
||||||
-
|
|
||||||
-}
|
|
||||||
-
|
|
||||||
-
|
|
||||||
diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
|
|
||||||
index 530780bab..466931b82 100644
|
|
||||||
--- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c
|
|
||||||
+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c
|
|
||||||
@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
__asm__ __volatile__
|
|
||||||
(
|
|
||||||
"vzeroupper \n\t"
|
|
||||||
- "vbroadcastsd (%2), %%ymm12 \n\t" // x0
|
|
||||||
- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1
|
|
||||||
- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2
|
|
||||||
- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3
|
|
||||||
- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4
|
|
||||||
- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5
|
|
||||||
- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6
|
|
||||||
- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7
|
|
||||||
+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0
|
|
||||||
+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1
|
|
||||||
+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2
|
|
||||||
+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3
|
|
||||||
+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4
|
|
||||||
+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5
|
|
||||||
+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6
|
|
||||||
+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7
|
|
||||||
|
|
||||||
"vbroadcastsd (%9), %%ymm6 \n\t" // alpha
|
|
||||||
|
|
||||||
"testq $0x04, %1 \n\t"
|
|
||||||
"jz 2f \n\t"
|
|
||||||
|
|
||||||
- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y
|
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
|
|
||||||
- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
"vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
"vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
"vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
|
|
||||||
- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y
|
|
||||||
+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y
|
|
||||||
|
|
||||||
- "addq $4 , %8 \n\t"
|
|
||||||
+ "addq $4 , %2 \n\t"
|
|
||||||
"addq $4 , %0 \n\t"
|
|
||||||
"subq $4 , %1 \n\t"
|
|
||||||
|
|
||||||
@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
"vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t"
|
|
||||||
"vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t"
|
|
||||||
- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
-
|
|
||||||
- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y
|
|
||||||
+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t"
|
|
||||||
+
|
|
||||||
+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t"
|
|
||||||
"addq $8 , %0 \n\t"
|
|
||||||
- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t"
|
|
||||||
+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t"
|
|
||||||
+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t"
|
|
||||||
|
|
||||||
"vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t"
|
|
||||||
"vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t"
|
|
||||||
|
|
||||||
- "addq $8 , %8 \n\t"
|
|
||||||
+ "addq $8 , %2 \n\t"
|
|
||||||
"vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y
|
|
||||||
"subq $8 , %1 \n\t"
|
|
||||||
- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y
|
|
||||||
+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y
|
|
||||||
|
|
||||||
"jnz 1b \n\t"
|
|
||||||
|
|
||||||
@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO
|
|
||||||
|
|
||||||
:
|
|
||||||
"+r" (i), // 0
|
|
||||||
- "+r" (n) // 1
|
|
||||||
+ "+r" (n), // 1
|
|
||||||
+ "+r" (lda4) // 2
|
|
||||||
:
|
|
||||||
- "r" (x), // 2
|
|
||||||
- "r" (y), // 3
|
|
||||||
- "r" (ap[0]), // 4
|
|
||||||
- "r" (ap[1]), // 5
|
|
||||||
- "r" (ap[2]), // 6
|
|
||||||
- "r" (ap[3]), // 7
|
|
||||||
- "r" (lda4), // 8
|
|
||||||
+ "r" (x), // 3
|
|
||||||
+ "r" (y), // 4
|
|
||||||
+ "r" (ap[0]), // 5
|
|
||||||
+ "r" (ap[1]), // 6
|
|
||||||
+ "r" (ap[2]), // 7
|
|
||||||
+ "r" (ap[3]), // 8
|
|
||||||
"r" (alpha) // 9
|
|
||||||
: "cc",
|
|
||||||
"%xmm0", "%xmm1",
|
|
@ -14,8 +14,8 @@
|
|||||||
# "obsoleted" features are still kept in the spec.
|
# "obsoleted" features are still kept in the spec.
|
||||||
|
|
||||||
Name: openblas
|
Name: openblas
|
||||||
Version: 0.3.5
|
Version: 0.3.6
|
||||||
Release: 5%{?dist}
|
Release: 1%{?dist}
|
||||||
Summary: An optimized BLAS library based on GotoBLAS2
|
Summary: An optimized BLAS library based on GotoBLAS2
|
||||||
License: BSD
|
License: BSD
|
||||||
URL: https://github.com/xianyi/OpenBLAS/
|
URL: https://github.com/xianyi/OpenBLAS/
|
||||||
@ -29,18 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch
|
|||||||
# Supply the proper flags to the test makefile
|
# Supply the proper flags to the test makefile
|
||||||
Patch3: openblas-0.3.2-tests.patch
|
Patch3: openblas-0.3.2-tests.patch
|
||||||
|
|
||||||
# Fix assembly code
|
|
||||||
Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch
|
|
||||||
Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch
|
|
||||||
Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch
|
|
||||||
Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch
|
|
||||||
Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch
|
|
||||||
Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch
|
|
||||||
Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch
|
|
||||||
Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch
|
|
||||||
Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch
|
|
||||||
Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch
|
|
||||||
|
|
||||||
BuildRequires: gcc
|
BuildRequires: gcc
|
||||||
BuildRequires: gcc-gfortran
|
BuildRequires: gcc-gfortran
|
||||||
BuildRequires: perl-devel
|
BuildRequires: perl-devel
|
||||||
@ -251,17 +239,6 @@ cd OpenBLAS-%{version}
|
|||||||
%endif
|
%endif
|
||||||
%patch3 -p1 -b .tests
|
%patch3 -p1 -b .tests
|
||||||
|
|
||||||
%patch10 -p1
|
|
||||||
%patch11 -p1
|
|
||||||
%patch12 -p1
|
|
||||||
%patch13 -p1
|
|
||||||
%patch14 -p1
|
|
||||||
%patch15 -p1
|
|
||||||
%patch16 -p1
|
|
||||||
%patch17 -p1
|
|
||||||
%patch18 -p1
|
|
||||||
%patch19 -p1
|
|
||||||
|
|
||||||
# Fix source permissions
|
# Fix source permissions
|
||||||
find -name \*.f -exec chmod 644 {} \;
|
find -name \*.f -exec chmod 644 {} \;
|
||||||
|
|
||||||
@ -697,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig
|
|||||||
%endif
|
%endif
|
||||||
|
|
||||||
%changelog
|
%changelog
|
||||||
|
* Tue Apr 30 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.6-1
|
||||||
|
- Update to 0.3.6.
|
||||||
|
|
||||||
* Tue Feb 26 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.5-5
|
* Tue Feb 26 2019 Susi Lehtola <jussilehtola@fedoraproject.org> - 0.3.5-5
|
||||||
- Even more assembly kernel patches.
|
- Even more assembly kernel patches.
|
||||||
|
|
||||||
|
2
sources
2
sources
@ -1 +1 @@
|
|||||||
SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0
|
SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9
|
||||||
|
Loading…
Reference in New Issue
Block a user