diff --git a/.gitignore b/.gitignore index 9b6016d..36744a3 100644 --- a/.gitignore +++ b/.gitignore @@ -15,3 +15,4 @@ /v0.3.0.tar.gz /v0.3.1.tar.gz /openblas-0.3.2.tar.gz +/openblas-0.3.6.tar.gz diff --git a/1965.patch b/1965.patch deleted file mode 100644 index 5d8b935..0000000 --- a/1965.patch +++ /dev/null @@ -1,3283 +0,0 @@ -From f0dd0584306b42289cac77fdafe6997e449d4f38 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:56:10 +0100 -Subject: [PATCH 001/111] Tag operands 0 and 1 as both input and output - -For #1964 (basically a continuation of coding problems first seen in #1292) ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index 33bda0943..cb98f208a 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 8a6bbf5a5bf4623795b2ff9aaa8d35467288d6c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:57:27 +0100 -Subject: [PATCH 002/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index 00e2e6a42..f31cf9710 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4e6f8fec31e83648c77c47398829b5191e671966 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:58:19 +0100 -Subject: [PATCH 003/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index a798fd977..931d1ad47 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,8 +97,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 663eef3b666e79c0e93f35cf79eada50040d9dd3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 19:59:59 +0100 -Subject: [PATCH 004/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 87370b032..9aeb47968 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From a671e19dd2cad6dc1e2e639f45a4faebf53b6f7f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:02:01 +0100 -Subject: [PATCH 005/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index f587aa036..e6d11f1af 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 47e2b4592eb31860a58222bedc8a3208c153aa00 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:03 +0100 -Subject: [PATCH 006/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index fe195a63b..9fee7615d 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 30a7bd8e15fb68d3fa651bbf48e1e65fc6078090 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:03:50 +0100 -Subject: [PATCH 007/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 01816917d..705c80c5c 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,8 +107,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2f5a7c1656b7975f71db2b8da90080938ccd3757 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:05:03 +0100 -Subject: [PATCH 008/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 76a3aa0eb..5a46aed8c 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,8 +98,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bb16456fe1ff372b61a7ab042418248f68ddddc6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:23:58 +0100 -Subject: [PATCH 009/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 8c520dcf1..c9a01580e 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,8 +65,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 7af8f34df4efcc0ecaaa34c380119edcd5d206de Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:24:55 +0100 -Subject: [PATCH 010/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index bbe8b9550..67431659d 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From d94e7da701dae1106854753b2d5b676255c1c0f4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:25:56 +0100 -Subject: [PATCH 011/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 943d893af..61c99904a 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6008f6531855d615ad98febe65364074b99fa5bf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:26:55 +0100 -Subject: [PATCH 012/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index 95eb953b4..e3d605b75 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 9d46f84f24dc7284fc398574b811621e5c61e2dc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:27:48 +0100 -Subject: [PATCH 013/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 85e038cef..1b827e7e2 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From ca02ac724f5b06e16a8941ef3b2582c251234679 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:28:56 +0100 -Subject: [PATCH 014/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index e40009037..2cab80067 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -142,8 +142,8 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From c18c2c9d9b0cd7e82cb98c7b212ffb29648fb9e0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:32:17 +0100 -Subject: [PATCH 015/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 9756ee46a..379fd3ca1 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,8 +67,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c23c17163f1b7a5fb7652cbc038a50c01f9440c5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:33:07 +0100 -Subject: [PATCH 016/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 365737363..c0c277c32 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,8 +78,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From de207d10c1f11ef1f38b4f766909619ab744d64a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:34:05 +0100 -Subject: [PATCH 017/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index fb5ec9bca..ea0b4eff1 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c9078eb8b4481fbc1841bcbf36ba438bf2749632 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:35:14 +0100 -Subject: [PATCH 018/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index ac950885c..f7b74add6 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -147,8 +147,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 00aff05c4049cd697b4000b5f2e726496b34dc54 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:36:08 +0100 -Subject: [PATCH 019/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 160f95604..e57eb37ea 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,8 +83,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From dc15f3b5a7689a6cea1d31e004d7a3488bf9b66d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:37:06 +0100 -Subject: [PATCH 020/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 5ce20b5de..845c78df1 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,8 +80,8 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 3f1719a98da89f0a6f1d435d3f705aa083702ac7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:08 +0100 -Subject: [PATCH 021/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3a743d64c..3b03e11a4 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,8 +61,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b13f3c3bcfffcecbcc80454c90c31bc05dd5a04d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:39:57 +0100 -Subject: [PATCH 022/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 68f68ea3a..4ffb39acf 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,8 +74,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 2bd18c7b73731d1b8bd900213fc7fa7a2356a357 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:40:50 +0100 -Subject: [PATCH 023/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 204cf8bac..87c5fe3cf 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,8 +80,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -141,8 +141,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6fcb55b22f6e8b80e7f6ffcf228c70c0929915b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:41:41 +0100 -Subject: [PATCH 024/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 0a6bef046..5a8424d66 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 922e44897831f393cbeeb1406feb7fcf6e320281 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:42:35 +0100 -Subject: [PATCH 025/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 36e61b077..5a6fc6da2 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,8 +68,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d384880da564344e92a8d60b08e3183ab02ba75b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:43:24 +0100 -Subject: [PATCH 026/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index df367b61f..89d9cfe61 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,8 +81,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cd3a35ee79b4b5fa00e5a446be2a6cceb3230874 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:44:13 +0100 -Subject: [PATCH 027/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 1a27177f5..cef41b530 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,8 +77,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9f792e759ea97e75445b1fe1eaab4f3432f4f1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:45:08 +0100 -Subject: [PATCH 028/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index ca13536f2..e77ba1424 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,8 +84,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From c931bb8172bbdcbcfe6d2de281d2f83a7f5a3515 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:46:19 +0100 -Subject: [PATCH 029/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 6b8b2566b..bedde8fb6 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,8 +82,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,8 +145,8 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0172c51829110a5450b4d6d5f454bd4aa4106269 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:48:16 +0100 -Subject: [PATCH 030/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 0e15761f7..56493f8cb 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 0cfb647a577058cebeaabadbe6ef62eebd2ce49e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:51:34 +0100 -Subject: [PATCH 031/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 30e8b1955..bd52ba01f 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,8 +113,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2b542d10368cbb8433b7274fb12b77845606d2fe Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:52:35 +0100 -Subject: [PATCH 032/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index 233af143a..d6a9ff394 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,8 +101,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -178,8 +178,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From af29c99c85d9ea5c27b6e917ebb1dcdbe1292f7b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 20:53:29 +0100 -Subject: [PATCH 033/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 728d09213..58d4c7286 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,8 +115,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -182,8 +182,8 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f78531a9ec8ee28f7790505382231b3f5094b795 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:05:31 +0100 -Subject: [PATCH 034/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index 30a9552d6..ed66cc674 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -177,8 +177,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b6f4ef5aea58e5ea1225283e406cadf9416818fc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:06:54 +0100 -Subject: [PATCH 035/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 11056a3c1..0e6ac55db 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,8 +103,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -188,8 +188,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 715b1f263d6903f1af391c5278a9aa61f1753193 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:08:09 +0100 -Subject: [PATCH 036/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 87c5b0340..416265ae2 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,8 +109,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -201,8 +201,8 @@ if ( n < 1280 ) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From e8d835ea466a1605db2157b6884a4cfe762478fc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 21:09:03 +0100 -Subject: [PATCH 037/111] Tag operands 0 and 1 as both input and output - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 325f74ae3..fe1613fd4 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,8 +97,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -174,8 +174,8 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From bbc30700e871d84c07d770f54b645ea3eee549fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:49:34 +0100 -Subject: [PATCH 038/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index 4ffb39acf..e25156939 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 300bb19b3ec0a48b7371d7c1be3ee88a29e87cf9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:04 +0100 -Subject: [PATCH 039/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index cb98f208a..faf5cdc40 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -114,9 +114,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 1878e0c95aee9777f7c082bcc98ff12b04edc75d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:52:54 +0100 -Subject: [PATCH 040/111] Update caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index f31cf9710..a011b2bfa 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From f6be89295f4e21572a743d26e677256fc29ee8cf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:53:35 +0100 -Subject: [PATCH 041/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index 931d1ad47..c760d6540 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -95,10 +95,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 4673e5317861de37b326181b0dfc8514a2b3b69d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:54:39 +0100 -Subject: [PATCH 042/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index 9aeb47968..b6eb55f9b 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -181,9 +181,9 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From ba6d2c77a98f55431d8d2d4de4b6df99814352c1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:55:38 +0100 -Subject: [PATCH 043/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index e6d11f1af..c2245c6dc 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 093a3d7d5790efd7441611ee8c8769d4f3d997c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:56:15 +0100 -Subject: [PATCH 044/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 9fee7615d..396dbeaa7 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -98,9 +98,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2224bcb4f070e607ede67f2f6e089e2e99519517 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:01 +0100 -Subject: [PATCH 045/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 705c80c5c..20ba48c00 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -105,10 +105,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 2414f1d796e23f8e9e4abba27e948f5877773640 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:57:56 +0100 -Subject: [PATCH 046/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 5a46aed8c..01754b147 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -97,9 +97,9 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae00befb3e3a9632d9545ba0af43f9afb90787b2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:58:52 +0100 -Subject: [PATCH 047/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index c9a01580e..2e2356fb6 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -64,9 +64,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 604c574542a5fac237b5134610166fab26db1285 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 22:59:30 +0100 -Subject: [PATCH 048/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index 67431659d..c77fc33ef 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 082498ee3b8470e992f33414e3097ca301f9efa7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:07 +0100 -Subject: [PATCH 049/111] Update daxpy_microk_nehalem-2.c - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index 61c99904a..b81fe6562 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -73,9 +73,9 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 293f5531e66088d7149bebd68bcd7aa564b3a263 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:00:53 +0100 -Subject: [PATCH 050/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index e3d605b75..efe93dfed 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6cee8e0fdd463139f85656292971de1e4810d775 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:01:28 +0100 -Subject: [PATCH 051/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 1b827e7e2..3b1214f36 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 6450bf14afa94cade7d28330749dfbf255697026 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:02:32 +0100 -Subject: [PATCH 052/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index 2cab80067..a5143682f 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -140,10 +140,10 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "subq $16, %1 \n\t" - "jnz 1b \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From a339b45e51c58e5b13c01c6918282fb31941acdf Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:07 +0100 -Subject: [PATCH 053/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 379fd3ca1..62bf7e7dc 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -65,10 +65,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 64fcdadf39137bdc56c56ead1e4d8f1bea32fe2a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:03:44 +0100 -Subject: [PATCH 054/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index c0c277c32..0cf4ece65 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -77,9 +77,9 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vzeroupper \n\t" - - : -- : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 504dd44e887cbd985bac3d48a2a7fdc3a03727d8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:04:20 +0100 -Subject: [PATCH 055/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index ea0b4eff1..086a0bb91 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movsd %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 56c67a929a2b215c3980a542c74a016f828e119d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:11 +0100 -Subject: [PATCH 056/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index f7b74add6..d7347ebdf 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -145,10 +145,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From b7ffbc40eca528e3aae46d004c1ad8e6fd013530 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:05:43 +0100 -Subject: [PATCH 057/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index e57eb37ea..28b1a8bd1 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -81,10 +81,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 0c9c31dbe4817ad24ecc2cc5dc553239a7c31590 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:06:20 +0100 -Subject: [PATCH 058/111] Update ddot_microk_steamroller-2.c - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 845c78df1..98cf94acf 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -78,10 +78,10 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovsd %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From d1b69022c935a37bbe3c8b09eb329a7468339ff0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:04 +0100 -Subject: [PATCH 059/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3b03e11a4..3bc450f7b 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -59,10 +59,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 369a2b4af5680dfcbd1d8290077f62a4d74336fb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:07:54 +0100 -Subject: [PATCH 060/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87c5fe3cf..87e742ac7 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -78,10 +78,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 -@@ -139,10 +139,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From dc931ad1fe709ad378d6d963fbde5bad421e5514 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:08:27 +0100 -Subject: [PATCH 061/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 5a8424d66..6ce67a7d1 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From b2d6fea1cb99f0830c33e3667d1928be4496a31f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:07 +0100 -Subject: [PATCH 062/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index 5a6fc6da2..c7f8cb1a7 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -66,10 +66,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ffc008663aef2dd318c58275fb8b68cc93de9a42 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:09:50 +0100 -Subject: [PATCH 063/111] Update sdot_microk_haswell-2.c - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 89d9cfe61..417fb3862 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -79,10 +79,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 88b0dbfbddbc5170263bd06eb0aad0abf85faa81 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:10:30 +0100 -Subject: [PATCH 064/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index cef41b530..115e7a410 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -75,10 +75,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "movss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ba9c3c4328a73821ce6067fb78b01b8817a92fa1 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:11:09 +0100 -Subject: [PATCH 065/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index e77ba1424..9d0795181 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -82,10 +82,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovss %%xmm4, (%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 266e72d24b767dbcdb97f597c899c7f495609c6f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:12:07 +0100 -Subject: [PATCH 066/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index bedde8fb6..3475f890d 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -80,10 +80,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -143,10 +143,10 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - "vmovss %%xmm4, (%4) \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 72c3a4d1bd1daf3a98413dbea081f19fc6ee897d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:06 +0100 -Subject: [PATCH 067/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index 56493f8cb..eed36ffd0 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 157e65ff74b7760a19ed38e8796aab6ad0d2a152 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:13:41 +0100 -Subject: [PATCH 068/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index bd52ba01f..9aeea975b 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -111,10 +111,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 212b0a106d83491aeac793c6d45b4e494d06d868 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:14:28 +0100 -Subject: [PATCH 069/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index d6a9ff394..cbd9b378f 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -99,10 +99,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -176,10 +176,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 2fa6d8107c40d780c988c8f23b5d61d6a0f8e8eb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:15:18 +0100 -Subject: [PATCH 070/111] Update zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 58d4c7286..5fc56aec7 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -113,10 +113,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 -@@ -180,10 +180,10 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "jnz 1b \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha), // 4 - -From 79d5dd461d13953e8cade9a1dad43ad38cf93aaa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:09 +0100 -Subject: [PATCH 071/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index ed66cc674..a80eac003 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -96,10 +96,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -175,10 +175,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From cb5cfffb1765ac8ef1e2f149aea1dc3e5fbb9623 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:16:55 +0100 -Subject: [PATCH 072/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 0e6ac55db..963d2e3bd 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -101,10 +101,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -186,10 +186,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From f4e5f931ae5c14d284749c65d1e9ed08873afaa2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:17:46 +0100 -Subject: [PATCH 073/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 416265ae2..88d4e1bbb 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -107,10 +107,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -199,10 +199,10 @@ if ( n < 1280 ) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From ae2f3e617df8894ebe1779d3bcc78170bcad8b4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:18:27 +0100 -Subject: [PATCH 074/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index fe1613fd4..2f11fe562 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -95,10 +95,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 -@@ -172,10 +172,10 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - "vmovups %%xmm4, 16(%4) \n\t" - "vzeroupper \n\t" - -- : -- : -+ : - "+r" (i), // 0 - "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (dot) // 4 - -From 379aa11f4bfc5bb352372a3f423062267e73dd77 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:21 +0100 -Subject: [PATCH 075/111] Update caxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/caxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_bulldozer-2.c b/kernel/x86_64/caxpy_microk_bulldozer-2.c -index faf5cdc40..ca2209340 100644 ---- a/kernel/x86_64/caxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/caxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 983c72ab0fc182264a635d1c5286ceebc2b2f3e2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:10:51 +0100 -Subject: [PATCH 076/111] Update caxpy_microk_haswell-2.c - ---- - kernel/x86_64/caxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_haswell-2.c b/kernel/x86_64/caxpy_microk_haswell-2.c -index a011b2bfa..b605ea34c 100644 ---- a/kernel/x86_64/caxpy_microk_haswell-2.c -+++ b/kernel/x86_64/caxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6f7f9967f945c145e6e4ceac14162e8dbc551f4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:21 +0100 -Subject: [PATCH 077/111] Update caxpy_microk_sandy-2.c - ---- - kernel/x86_64/caxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/caxpy_microk_sandy-2.c b/kernel/x86_64/caxpy_microk_sandy-2.c -index c760d6540..72d37afed 100644 ---- a/kernel/x86_64/caxpy_microk_sandy-2.c -+++ b/kernel/x86_64/caxpy_microk_sandy-2.c -@@ -97,7 +97,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From aa799573b5f91e786ef41116b9fd030161fb6a10 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:11:59 +0100 -Subject: [PATCH 078/111] Update caxpy_microk_steamroller-2.c - ---- - kernel/x86_64/caxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/caxpy_microk_steamroller-2.c b/kernel/x86_64/caxpy_microk_steamroller-2.c -index b6eb55f9b..7ca7af070 100644 ---- a/kernel/x86_64/caxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/caxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void caxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f9497bdab685ca8b9bea018c900df24b7dd2aad7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:12:37 +0100 -Subject: [PATCH 079/111] Update cdot_microk_bulldozer-2.c - ---- - kernel/x86_64/cdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_bulldozer-2.c b/kernel/x86_64/cdot_microk_bulldozer-2.c -index c2245c6dc..118655913 100644 ---- a/kernel/x86_64/cdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/cdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From de4c5a9258b3c29e1e305660c50e7b4cf8204c46 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:09 +0100 -Subject: [PATCH 080/111] Update daxpy_microk_haswell-2.c - ---- - kernel/x86_64/daxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_haswell-2.c b/kernel/x86_64/daxpy_microk_haswell-2.c -index c77fc33ef..f3682e6d7 100644 ---- a/kernel/x86_64/daxpy_microk_haswell-2.c -+++ b/kernel/x86_64/daxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 59ca748c9ec75cf57148bcf4de06dc328f227845 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:13:38 +0100 -Subject: [PATCH 081/111] Update daxpy_microk_nehalem-2.c - ---- - kernel/x86_64/daxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_nehalem-2.c b/kernel/x86_64/daxpy_microk_nehalem-2.c -index b81fe6562..8feb9f26c 100644 ---- a/kernel/x86_64/daxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/daxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 5f2ef0e70fb180022f3447826029f42c75c6fbb5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:13 +0100 -Subject: [PATCH 082/111] Update daxpy_microk_piledriver-2.c - ---- - kernel/x86_64/daxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_piledriver-2.c b/kernel/x86_64/daxpy_microk_piledriver-2.c -index efe93dfed..4b83124c7 100644 ---- a/kernel/x86_64/daxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/daxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From c5b01c8be14c3cc3b364b9067124695e2d91c63a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:14:43 +0100 -Subject: [PATCH 083/111] Update daxpy_microk_sandy-2.c - ---- - kernel/x86_64/daxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_sandy-2.c b/kernel/x86_64/daxpy_microk_sandy-2.c -index 3b1214f36..db9a45de8 100644 ---- a/kernel/x86_64/daxpy_microk_sandy-2.c -+++ b/kernel/x86_64/daxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d4f3b733dc1026c9d1bfa8bea5696353de3b47c0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:18 +0100 -Subject: [PATCH 084/111] Update daxpy_microk_steamroller-2.c - ---- - kernel/x86_64/daxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/daxpy_microk_steamroller-2.c b/kernel/x86_64/daxpy_microk_steamroller-2.c -index a5143682f..8e63fcc1d 100644 ---- a/kernel/x86_64/daxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/daxpy_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -142,7 +142,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From dcfab783f725abb0280a77f61a4083be581e89b8 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:15:57 +0100 -Subject: [PATCH 085/111] Update ddot_microk_bulldozer-2.c - ---- - kernel/x86_64/ddot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_bulldozer-2.c b/kernel/x86_64/ddot_microk_bulldozer-2.c -index 62bf7e7dc..5590c5b17 100644 ---- a/kernel/x86_64/ddot_microk_bulldozer-2.c -+++ b/kernel/x86_64/ddot_microk_bulldozer-2.c -@@ -67,7 +67,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0779654cb47dbc9984f344d5b7ffa68e39afdbc3 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:26 +0100 -Subject: [PATCH 086/111] Update ddot_microk_haswell-2.c - ---- - kernel/x86_64/ddot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_haswell-2.c b/kernel/x86_64/ddot_microk_haswell-2.c -index 0cf4ece65..dbb5487f7 100644 ---- a/kernel/x86_64/ddot_microk_haswell-2.c -+++ b/kernel/x86_64/ddot_microk_haswell-2.c -@@ -78,7 +78,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 29028652213235c1d2e7dc18d49daa86f3356574 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:16:53 +0100 -Subject: [PATCH 087/111] Update ddot_microk_nehalem-2.c - ---- - kernel/x86_64/ddot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_nehalem-2.c b/kernel/x86_64/ddot_microk_nehalem-2.c -index 086a0bb91..e5e234e22 100644 ---- a/kernel/x86_64/ddot_microk_nehalem-2.c -+++ b/kernel/x86_64/ddot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6df88c7c455c37a18a16f1cbd003b640ef6777f0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:17:43 +0100 -Subject: [PATCH 088/111] Update cdot_microk_haswell-2.c - ---- - kernel/x86_64/cdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_haswell-2.c b/kernel/x86_64/cdot_microk_haswell-2.c -index 396dbeaa7..8b9d6d104 100644 ---- a/kernel/x86_64/cdot_microk_haswell-2.c -+++ b/kernel/x86_64/cdot_microk_haswell-2.c -@@ -99,7 +99,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 81691c726eb55df75f638794fe3afff70cc3286d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:11 +0100 -Subject: [PATCH 089/111] Update cdot_microk_sandy-2.c - ---- - kernel/x86_64/cdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/cdot_microk_sandy-2.c b/kernel/x86_64/cdot_microk_sandy-2.c -index 20ba48c00..fe142c38f 100644 ---- a/kernel/x86_64/cdot_microk_sandy-2.c -+++ b/kernel/x86_64/cdot_microk_sandy-2.c -@@ -107,7 +107,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From ab8cc007364b9477e13c107a7befce7668c10ebb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:18:47 +0100 -Subject: [PATCH 090/111] Update cdot_microk_steamroller-2.c - ---- - kernel/x86_64/cdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/cdot_microk_steamroller-2.c b/kernel/x86_64/cdot_microk_steamroller-2.c -index 01754b147..7350b21c9 100644 ---- a/kernel/x86_64/cdot_microk_steamroller-2.c -+++ b/kernel/x86_64/cdot_microk_steamroller-2.c -@@ -98,7 +98,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void cdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bdcba6adda368da48e450cdc3b9c9f7b6c52e630 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:19:32 +0100 -Subject: [PATCH 091/111] Update daxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/daxpy_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/daxpy_microk_bulldozer-2.c b/kernel/x86_64/daxpy_microk_bulldozer-2.c -index 2e2356fb6..9c1305b97 100644 ---- a/kernel/x86_64/daxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/daxpy_microk_bulldozer-2.c -@@ -65,7 +65,7 @@ static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From e9fc4dfdead60ed013e016c62215170d04b5ad9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:20 +0100 -Subject: [PATCH 092/111] Update ddot_microk_piledriver-2.c - ---- - kernel/x86_64/ddot_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/ddot_microk_piledriver-2.c b/kernel/x86_64/ddot_microk_piledriver-2.c -index d7347ebdf..cc4bcd90a 100644 ---- a/kernel/x86_64/ddot_microk_piledriver-2.c -+++ b/kernel/x86_64/ddot_microk_piledriver-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -147,7 +147,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 9430424102257485eae76482f495402260e9682d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:20:48 +0100 -Subject: [PATCH 093/111] Update ddot_microk_sandy-2.c - ---- - kernel/x86_64/ddot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_sandy-2.c b/kernel/x86_64/ddot_microk_sandy-2.c -index 28b1a8bd1..84493ec27 100644 ---- a/kernel/x86_64/ddot_microk_sandy-2.c -+++ b/kernel/x86_64/ddot_microk_sandy-2.c -@@ -83,7 +83,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 129a987e4b55f13c413f4eaad58465443051dd43 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:21:26 +0100 -Subject: [PATCH 094/111] Update ddot_microk_steamroller-2.c - ---- - kernel/x86_64/ddot_microk_steamroller-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/ddot_microk_steamroller-2.c b/kernel/x86_64/ddot_microk_steamroller-2.c -index 98cf94acf..27d5244ce 100644 ---- a/kernel/x86_64/ddot_microk_steamroller-2.c -+++ b/kernel/x86_64/ddot_microk_steamroller-2.c -@@ -80,7 +80,7 @@ static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 49789c39fb2a55dacc146f079c1c5fab45d3ce2e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:17 +0100 -Subject: [PATCH 095/111] Update saxpy_microk_haswell-2.c - ---- - kernel/x86_64/saxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_haswell-2.c b/kernel/x86_64/saxpy_microk_haswell-2.c -index 3bc450f7b..7099ba4c6 100644 ---- a/kernel/x86_64/saxpy_microk_haswell-2.c -+++ b/kernel/x86_64/saxpy_microk_haswell-2.c -@@ -61,7 +61,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7f556b81fb40ca6d90529829b802b38adbc747d7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:22:46 +0100 -Subject: [PATCH 096/111] Update saxpy_microk_nehalem-2.c - ---- - kernel/x86_64/saxpy_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_nehalem-2.c b/kernel/x86_64/saxpy_microk_nehalem-2.c -index e25156939..88bbb695d 100644 ---- a/kernel/x86_64/saxpy_microk_nehalem-2.c -+++ b/kernel/x86_64/saxpy_microk_nehalem-2.c -@@ -74,7 +74,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From cb75878f98892850b29fc7a0b427500a56d244dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:16 +0100 -Subject: [PATCH 097/111] Update saxpy_microk_piledriver-2.c - ---- - kernel/x86_64/saxpy_microk_piledriver-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/saxpy_microk_piledriver-2.c b/kernel/x86_64/saxpy_microk_piledriver-2.c -index 87e742ac7..5feea7f24 100644 ---- a/kernel/x86_64/saxpy_microk_piledriver-2.c -+++ b/kernel/x86_64/saxpy_microk_piledriver-2.c -@@ -80,7 +80,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -141,7 +141,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 65719fcb41987c499c31455fe7b0290800cacdd6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:23:44 +0100 -Subject: [PATCH 098/111] Update saxpy_microk_sandy-2.c - ---- - kernel/x86_64/saxpy_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/saxpy_microk_sandy-2.c b/kernel/x86_64/saxpy_microk_sandy-2.c -index 6ce67a7d1..0d448d5f8 100644 ---- a/kernel/x86_64/saxpy_microk_sandy-2.c -+++ b/kernel/x86_64/saxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void saxpy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From b52e763084040ed624fff574fba1fe1bc58b1cc7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:16 +0100 -Subject: [PATCH 099/111] Update sdot_microk_bulldozer-2.c - ---- - kernel/x86_64/sdot_microk_bulldozer-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_bulldozer-2.c b/kernel/x86_64/sdot_microk_bulldozer-2.c -index c7f8cb1a7..8958a33dc 100644 ---- a/kernel/x86_64/sdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/sdot_microk_bulldozer-2.c -@@ -68,7 +68,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 2c021aeb9c018e4da2a7a0a5c0315d06d689a3c2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:24:42 +0100 -Subject: [PATCH 100/111] Update sdot_microk_haswell-2.c - ---- - kernel/x86_64/sdot_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_haswell-2.c b/kernel/x86_64/sdot_microk_haswell-2.c -index 417fb3862..91dc928d3 100644 ---- a/kernel/x86_64/sdot_microk_haswell-2.c -+++ b/kernel/x86_64/sdot_microk_haswell-2.c -@@ -81,7 +81,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From bb43f185cf2f4354b62b779a369b53db3607598d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:15 +0100 -Subject: [PATCH 101/111] Update sdot_microk_nehalem-2.c - ---- - kernel/x86_64/sdot_microk_nehalem-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_nehalem-2.c b/kernel/x86_64/sdot_microk_nehalem-2.c -index 115e7a410..5a715d008 100644 ---- a/kernel/x86_64/sdot_microk_nehalem-2.c -+++ b/kernel/x86_64/sdot_microk_nehalem-2.c -@@ -77,7 +77,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 3b98d1e16d48f08540952624e9aa7843d5384ceb Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:25:43 +0100 -Subject: [PATCH 102/111] Update sdot_microk_sandy-2.c - ---- - kernel/x86_64/sdot_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sdot_microk_sandy-2.c b/kernel/x86_64/sdot_microk_sandy-2.c -index 9d0795181..ae25d5a50 100644 ---- a/kernel/x86_64/sdot_microk_sandy-2.c -+++ b/kernel/x86_64/sdot_microk_sandy-2.c -@@ -84,7 +84,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 7009a0337f674911ebe6d9ce6d1bf9b21472e05e Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:26:24 +0100 -Subject: [PATCH 103/111] Update sdot_microk_steamroller-2.c - ---- - kernel/x86_64/sdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sdot_microk_steamroller-2.c b/kernel/x86_64/sdot_microk_steamroller-2.c -index 3475f890d..bf6a5f287 100644 ---- a/kernel/x86_64/sdot_microk_steamroller-2.c -+++ b/kernel/x86_64/sdot_microk_steamroller-2.c -@@ -82,7 +82,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -145,7 +145,7 @@ static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From f117a2e4aa3e100015d479dd61530019db66e53f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:27:34 +0100 -Subject: [PATCH 104/111] Update zaxpy_microk_bulldozer-2.c - ---- - kernel/x86_64/zaxpy_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_bulldozer-2.c b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -index eed36ffd0..15d367971 100644 ---- a/kernel/x86_64/zaxpy_microk_bulldozer-2.c -+++ b/kernel/x86_64/zaxpy_microk_bulldozer-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 752d4e88089ce1ff5ab27b25de382750b5e4a9c7 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:00 +0100 -Subject: [PATCH 105/111] Update zaxpy_microk_haswell-2.c - ---- - kernel/x86_64/zaxpy_microk_haswell-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/zaxpy_microk_haswell-2.c b/kernel/x86_64/zaxpy_microk_haswell-2.c -index 9aeea975b..89d23daf3 100644 ---- a/kernel/x86_64/zaxpy_microk_haswell-2.c -+++ b/kernel/x86_64/zaxpy_microk_haswell-2.c -@@ -113,7 +113,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 0f905d346e8c0bda5bbf7cb6ae7f7a6ad137aa76 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:28:40 +0100 -Subject: [PATCH 106/111] Update zaxpy_microk_sandy-2.c - ---- - kernel/x86_64/zaxpy_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_sandy-2.c b/kernel/x86_64/zaxpy_microk_sandy-2.c -index cbd9b378f..17b8b24f7 100644 ---- a/kernel/x86_64/zaxpy_microk_sandy-2.c -+++ b/kernel/x86_64/zaxpy_microk_sandy-2.c -@@ -101,7 +101,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -178,7 +178,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 39a29ef0ce2de84526cf8e71881e6117b4532f84 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:29:18 +0100 -Subject: [PATCH 107/111] Update zaxpy_microk_steamroller-2.c - ---- - kernel/x86_64/zaxpy_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zaxpy_microk_steamroller-2.c b/kernel/x86_64/zaxpy_microk_steamroller-2.c -index 5fc56aec7..907b1ae00 100644 ---- a/kernel/x86_64/zaxpy_microk_steamroller-2.c -+++ b/kernel/x86_64/zaxpy_microk_steamroller-2.c -@@ -115,7 +115,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -182,7 +182,7 @@ static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 1496c1a69f4d0c521d797b1847363c38e46958d5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:03 +0100 -Subject: [PATCH 108/111] Update zdot_microk_bulldozer-2.c - ---- - kernel/x86_64/zdot_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_bulldozer-2.c b/kernel/x86_64/zdot_microk_bulldozer-2.c -index a80eac003..db9a48cce 100644 ---- a/kernel/x86_64/zdot_microk_bulldozer-2.c -+++ b/kernel/x86_64/zdot_microk_bulldozer-2.c -@@ -98,7 +98,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -177,7 +177,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 8f09f06f2c964ece75730dadd99e569844497fe6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:30:43 +0100 -Subject: [PATCH 109/111] Update zdot_microk_haswell-2.c - ---- - kernel/x86_64/zdot_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_haswell-2.c b/kernel/x86_64/zdot_microk_haswell-2.c -index 963d2e3bd..9f2fc2c1d 100644 ---- a/kernel/x86_64/zdot_microk_haswell-2.c -+++ b/kernel/x86_64/zdot_microk_haswell-2.c -@@ -103,7 +103,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -188,7 +188,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From fca3f8610fbeb0a4a4198eb0f2fc74f91cd6e85d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:31:24 +0100 -Subject: [PATCH 110/111] Update zdot_microk_sandy-2.c - ---- - kernel/x86_64/zdot_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_sandy-2.c b/kernel/x86_64/zdot_microk_sandy-2.c -index 88d4e1bbb..33415e26e 100644 ---- a/kernel/x86_64/zdot_microk_sandy-2.c -+++ b/kernel/x86_64/zdot_microk_sandy-2.c -@@ -109,7 +109,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -201,7 +201,7 @@ if ( n < 1280 ) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From 6976222962772b395054016e99faac34986b5e59 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:32:05 +0100 -Subject: [PATCH 111/111] Update zdot_microk_steamroller-2.c - ---- - kernel/x86_64/zdot_microk_steamroller-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/zdot_microk_steamroller-2.c b/kernel/x86_64/zdot_microk_steamroller-2.c -index 2f11fe562..87138fe9a 100644 ---- a/kernel/x86_64/zdot_microk_steamroller-2.c -+++ b/kernel/x86_64/zdot_microk_steamroller-2.c -@@ -97,7 +97,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 -@@ -174,7 +174,7 @@ static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/1966.patch b/1966.patch deleted file mode 100644 index c2663cd..0000000 --- a/1966.patch +++ /dev/null @@ -1,960 +0,0 @@ -From 63cdd8f4a04f3a5ac1733e202b6b3678c34fb8dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:27:38 +0100 -Subject: [PATCH 01/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 32 ++++++++++++------------ - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index 3abffc4cf..f526fd611 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From b6136be686e415fbdb035267c5020cb08e4e49ac Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:30:03 +0100 -Subject: [PATCH 02/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 30 +++++++++++++------------- - 1 file changed, 15 insertions(+), 15 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 0a4eb683c..8623dcd10 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,9 +208,9 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", // "0", "1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -285,9 +285,9 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,12 +329,12 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -- : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From f447fb4c54870710cd6304553df59f50ff51b8f5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:32:48 +0100 -Subject: [PATCH 03/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index 8346e1748..fbeb857e2 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -117,11 +117,11 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,12 +208,12 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 +331,11 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"0", "1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From fcd7fde5702cf7270332a5dd747f83efe7be93dd Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:35:18 +0100 -Subject: [PATCH 04/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index de53b0bc4..71d3a9846 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -188,9 +188,9 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 05e961994401bfc6dc8639fa9bc159148569ca9d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:36:37 +0100 -Subject: [PATCH 05/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index e732a2718..90790cfdc 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From 7a11cc5b9f7c9669ee1f9818a1ea3f44c2f6d98d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:37:49 +0100 -Subject: [PATCH 06/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 12 ++++++------ - 1 file changed, 6 insertions(+), 6 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 8d855072b..0f187ba88 100644 ---- a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -122,9 +122,9 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 -+ "+r" (n1), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", -@@ -187,10 +187,10 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n1), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n1), // 0 -- "r" (x), // 1 - "r" (alpha), // 2 - "r" (n2) // 3 - : "cc", - -From a6c06bffe1ec60ec359b300b8cc9e18b30c72d0d Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:40:28 +0100 -Subject: [PATCH 07/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 03882d6b6..1ce59d2c7 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 5efc7ce079fd87de9ab7ca20aaaf8c5c627170fa Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:42:34 +0100 -Subject: [PATCH 08/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 32 +++++++++++++------------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index d9253c1ed..534370959 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -116,11 +116,11 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -208,11 +208,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -285,11 +285,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -330,11 +330,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 1a1471c6be597a176a4dbfe2757c134eb3780af0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Wed, 16 Jan 2019 23:44:42 +0100 -Subject: [PATCH 09/18] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 32 +++++++++++----------- - 1 file changed, 16 insertions(+), 16 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 97b07add6..4b489d9f3 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -116,12 +116,12 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x), // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -209,11 +209,11 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -286,11 +286,11 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", -@@ -331,11 +331,11 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x), // 1 -+ : - "r" (alpha) // 2 -- : "cc", //"%0", "%1", -+ : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", - "%xmm8", "%xmm9", "%xmm10", "%xmm11", - -From 90e28665183cd8da3a6129016977f57dd415c6a9 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:38:20 +0100 -Subject: [PATCH 10/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_bulldozer-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_bulldozer-2.c b/kernel/x86_64/cscal_microk_bulldozer-2.c -index f526fd611..31451aa6c 100644 ---- a/kernel/x86_64/cscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/cscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From b8dd71bddcb41d3d88af1a1eb77f845760452f5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:39:23 +0100 -Subject: [PATCH 11/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_haswell-2.c b/kernel/x86_64/cscal_microk_haswell-2.c -index 8623dcd10..a04a4c4ab 100644 ---- a/kernel/x86_64/cscal_microk_haswell-2.c -+++ b/kernel/x86_64/cscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", // "0", "1", -@@ -286,7 +286,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", //"%0", "%1", -@@ -331,7 +331,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From 8c9a6356eaba102124147856422b9a0570daeb55 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:40:25 +0100 -Subject: [PATCH 12/18] Remove stray comma - ---- - kernel/x86_64/cscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/cscal_microk_steamroller-2.c b/kernel/x86_64/cscal_microk_steamroller-2.c -index fbeb857e2..e8073d485 100644 ---- a/kernel/x86_64/cscal_microk_steamroller-2.c -+++ b/kernel/x86_64/cscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void cscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void cscal_kernel_16_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void cscal_kernel_16_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void cscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From ebe8882eb23e88d410f824d8d6a113f0fca94a3b Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:41:27 +0100 -Subject: [PATCH 13/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_bulldozer-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_bulldozer-2.c b/kernel/x86_64/dscal_microk_bulldozer-2.c -index 71d3a9846..096662781 100644 ---- a/kernel/x86_64/dscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/dscal_microk_bulldozer-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From fd3e2c862286019589530ece0a61be6d86a01e92 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:42:12 +0100 -Subject: [PATCH 14/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_sandy-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_sandy-2.c b/kernel/x86_64/dscal_microk_sandy-2.c -index 0f187ba88..9982b8e58 100644 ---- a/kernel/x86_64/dscal_microk_sandy-2.c -+++ b/kernel/x86_64/dscal_microk_sandy-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 45339034256043b4405fd6330f918cbed3660ac4 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:43:14 +0100 -Subject: [PATCH 15/18] Remove stray comma - ---- - kernel/x86_64/dscal_microk_haswell-2.c | 4 ++-- - 1 file changed, 2 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/dscal_microk_haswell-2.c b/kernel/x86_64/dscal_microk_haswell-2.c -index 90790cfdc..77ed59a4e 100644 ---- a/kernel/x86_64/dscal_microk_haswell-2.c -+++ b/kernel/x86_64/dscal_microk_haswell-2.c -@@ -123,7 +123,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 -@@ -189,7 +189,7 @@ static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n1), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha), // 2 - "r" (n2) // 3 - -From 3b0b5ce0f69a45753b126d8bd96a48de2f882a4c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:46:05 +0100 -Subject: [PATCH 16/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_bulldozer-2.c | 16 ++++++++-------- - 1 file changed, 8 insertions(+), 8 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_bulldozer-2.c b/kernel/x86_64/zscal_microk_bulldozer-2.c -index 1ce59d2c7..5e733ffda 100644 ---- a/kernel/x86_64/zscal_microk_bulldozer-2.c -+++ b/kernel/x86_64/zscal_microk_bulldozer-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -285,9 +285,9 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - "vzeroupper \n\t" - - : -- : -- "r" (n), // 0 -- "r" (x), // 1 -+ "+r" (n), // 0 -+ "+r" (x) // 1 -+ : - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", -@@ -329,10 +329,10 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - "vzeroupper \n\t" - -+ : -+ "+r" (n), // 0 -+ "+r" (x) // 1 - : -- : -- "r" (n), // 0 -- "r" (x), // 1 - "r" (alpha) // 2 - : "cc", //"%0", "%1", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - -From c17d2f61c2387b5a6cfab22d964d70afcce69b23 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:47:12 +0100 -Subject: [PATCH 17/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_haswell-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_haswell-2.c b/kernel/x86_64/zscal_microk_haswell-2.c -index 534370959..8c8f5b75c 100644 ---- a/kernel/x86_64/zscal_microk_haswell-2.c -+++ b/kernel/x86_64/zscal_microk_haswell-2.c -@@ -117,7 +117,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -209,7 +209,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -286,7 +286,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -331,7 +331,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", - -From ccb2b2175751037b5625b4ec3c60ddca26a04394 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:48:40 +0100 -Subject: [PATCH 18/18] Remove stray comma - ---- - kernel/x86_64/zscal_microk_steamroller-2.c | 8 ++++---- - 1 file changed, 4 insertions(+), 4 deletions(-) - -diff --git a/kernel/x86_64/zscal_microk_steamroller-2.c b/kernel/x86_64/zscal_microk_steamroller-2.c -index 4b489d9f3..c9267ee0c 100644 ---- a/kernel/x86_64/zscal_microk_steamroller-2.c -+++ b/kernel/x86_64/zscal_microk_steamroller-2.c -@@ -118,7 +118,7 @@ static void zscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -210,7 +210,7 @@ static void zscal_kernel_8_zero_r( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -287,7 +287,7 @@ static void zscal_kernel_8_zero_i( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", -@@ -332,7 +332,7 @@ static void zscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) - - : - "+r" (n), // 0 -- "+r" (x), // 1 -+ "+r" (x) // 1 - : - "r" (alpha) // 2 - : "cc", diff --git a/1967.patch b/1967.patch deleted file mode 100644 index c7066fa..0000000 --- a/1967.patch +++ /dev/null @@ -1,99 +0,0 @@ -From 7ff08e4b06e2c643829b566a4f2c1daba25b1029 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:04:44 +0100 -Subject: [PATCH 1/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/dger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c -index 2bf966a5f..944d4c6f1 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 003583675d31ce5ddabfede7fc0f93cfbac51e5f Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 00:05:47 +0100 -Subject: [PATCH 2/4] Tag arguments 0 and 1 as both input and output - ---- - kernel/x86_64/sger_microk_sandy-2.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index 79180b991..d38fdd551 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -105,9 +105,9 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (alpha) // 4 - -From 78aeb19e4613104c1ae8ea1c67022451dcfed7e6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:34:12 +0100 -Subject: [PATCH 3/4] Remove stray comma - ---- - kernel/x86_64/sger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/sger_microk_sandy-2.c b/kernel/x86_64/sger_microk_sandy-2.c -index d38fdd551..14f13475b 100644 ---- a/kernel/x86_64/sger_microk_sandy-2.c -+++ b/kernel/x86_64/sger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void sger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 - -From d3e7e25bfb73e16bdbf89ee07d0ab584339be2a0 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 17 Jan 2019 09:35:56 +0100 -Subject: [PATCH 4/4] Remove stray comma - ---- - kernel/x86_64/dger_microk_sandy-2.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dger_microk_sandy-2.c b/kernel/x86_64/dger_microk_sandy-2.c -index 944d4c6f1..e8494500f 100644 ---- a/kernel/x86_64/dger_microk_sandy-2.c -+++ b/kernel/x86_64/dger_microk_sandy-2.c -@@ -106,7 +106,7 @@ static void dger_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha) - - : - "+r" (i), // 0 -- "+r" (n), // 1 -+ "+r" (n) // 1 - : - "r" (x), // 2 - "r" (y), // 3 diff --git a/2010.patch b/2010.patch deleted file mode 100644 index 2393325..0000000 --- a/2010.patch +++ /dev/null @@ -1,499 +0,0 @@ -From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:33:48 +0100 -Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64 - s/dGEMV_T and s/dGEMV_N kernels - -Arguments 0 and 1 need to be tagged as both input and output ---- - kernel/x86_64/dgemv_n_4.c | 10 +++++----- - kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- - kernel/x86_64/sgemv_n_4.c | 14 +++++++------- - kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- - 4 files changed, 30 insertions(+), 30 deletions(-) - -diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c -index 6d2530e81..6d33641e9 100644 ---- a/kernel/x86_64/dgemv_n_4.c -+++ b/kernel/x86_64/dgemv_n_4.c -@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c -index a7478e3a8..ed672a757 100644 ---- a/kernel/x86_64/dgemv_t_4.c -+++ b/kernel/x86_64/dgemv_t_4.c -@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movsd %%xmm11,8(%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movsd %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 -diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c -index 65305ac59..63697970f 100644 ---- a/kernel/x86_64/sgemv_n_4.c -+++ b/kernel/x86_64/sgemv_n_4.c -@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (ap[0]), // 4 -@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a - - "3: \n\t" - : -+ "+r" (i), // 0 -+ "+r" (n1) // 1 - : -- "r" (i), // 0 -- "r" (n1), // 1 - "r" (x), // 2 - "r" (y), // 3 - "r" (ap), // 4 -@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) - "jnz 1b \n\t" - - : -+ "+r" (i), // 0 -+ "+r" (n) // 1 - : -- "r" (i), // 0 -- "r" (n), // 1 - "r" (src), // 2 - "r" (dest) // 3 - : "cc", -diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c -index 065e5b385..86ecaf516 100644 ---- a/kernel/x86_64/sgemv_t_4.c -+++ b/kernel/x86_64/sgemv_t_4.c -@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT - "movss %%xmm11,4(%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap0), // 3 - "r" (ap1), // 4 -@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) - "movss %%xmm10, (%2) \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (y), // 2 - "r" (ap), // 3 - "r" (x) // 4 -@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d - "jnz 1b \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (&da), // 2 - "r" (src), // 3 - "r" (dest) // 4 - -From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 15:51:43 +0100 -Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly - -Argument 0 is modified as it doubles as a counter ---- - kernel/x86_64/dscal.c | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c -index ef9a0a6ba..d0d7801fd 100644 ---- a/kernel/x86_64/dscal.c -+++ b/kernel/x86_64/dscal.c -@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ - "jnz 1b \n\t" - - : -+ "+r" (n) // 0 - : -- "r" (n), // 0 - "r" (x), // 1 - "r" (x1), // 2 - "r" (alpha), // 3 - -From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:00:18 +0100 -Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV - microkernels - -Arguments 0 and 1 are both input and output ---- - kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- - kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- - 8 files changed, 24 insertions(+), 24 deletions(-) - -diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -index d7166fe4b..ae287b6d8 100644 ---- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c -@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c -index d83d20f8e..4778f644a 100644 ---- a/kernel/x86_64/dsymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c -@@ -107,9 +107,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -index 1344c75f7..065182286 100644 ---- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c -@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c -index 1ef6fbafd..d84e703bd 100644 ---- a/kernel/x86_64/dsymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c -@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -index 8c01ab806..4a4f4d68d 100644 ---- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c -@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c -index a32e59b44..e6a09ccf8 100644 ---- a/kernel/x86_64/ssymv_U_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c -@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -index b8e6ee732..c56ff3b15 100644 ---- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c -@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 -diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c -index e8650650c..c4919a39a 100644 ---- a/kernel/x86_64/ssymv_U_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c -@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT - "vzeroupper \n\t" - - : -- : -- "r" (i), // 0 -- "r" (n), // 1 -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : - "r" (x), // 2 - "r" (y), // 3 - "r" (a0), // 4 - -From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Tue, 12 Feb 2019 16:14:02 +0100 -Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly - -Argument 0 is modified so should be input and output ---- - kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- - kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- - 8 files changed, 18 insertions(+), 18 deletions(-) - -diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -index d84470cc4..bfa07b6d0 100644 ---- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c -@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c -index 866782ee6..6241879d5 100644 ---- a/kernel/x86_64/dsymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c -@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -index 38479f77a..a161dcd8b 100644 ---- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c -@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "movsd %%xmm3 , 24(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c -index b4e6ab369..b205b1019 100644 ---- a/kernel/x86_64/dsymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c -@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -index 9002228f3..602c3edf2 100644 ---- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -+++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c -@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c -index 69db008b6..fdfe4349a 100644 ---- a/kernel/x86_64/ssymv_L_microk_haswell-2.c -+++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c -@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -index c0fe5d640..6bb9c02f6 100644 ---- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c -+++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c -@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, F - "movss %%xmm3 , 12(%9) \n\t" // save temp2 - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c -index 093ca8073..0c78212e7 100644 ---- a/kernel/x86_64/ssymv_L_microk_sandy-2.c -+++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c -@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 -@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL - "vzeroupper \n\t" - - : -- : -- "r" (from), // 0 -+ "+r" (from) // 0 -+ : - "r" (to), // 1 - "r" (x), // 2 - "r" (y), // 3 diff --git a/2018.patch b/2018.patch deleted file mode 100644 index 594a4c4..0000000 --- a/2018.patch +++ /dev/null @@ -1,27 +0,0 @@ -From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 -From: Bart Oldeman -Date: Thu, 14 Feb 2019 16:19:41 +0000 -Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for - xmm0,xmm1,xmm2,xmm3 - -This fixes a crash in dblat2 when OpenBLAS is compiled using --march=znver1 -ftree-vectorize -O2 - -See also: -https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 ---- - kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + - 1 file changed, 1 insertion(+) - -diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c -index 584a6c6b5..da0fa2fff 100644 ---- a/kernel/x86_64/dgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c -@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - "r" (ap[3]), // 7 - "r" (alpha) // 8 - : "cc", -+ "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", diff --git a/2019.patch b/2019.patch deleted file mode 100644 index a3aa674..0000000 --- a/2019.patch +++ /dev/null @@ -1,274 +0,0 @@ -From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Thu, 14 Feb 2019 22:43:18 +0100 -Subject: [PATCH 1/2] Save and restore input argument 8 (lda4) - -Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- - 1 file changed, 5 insertions(+), 2 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index 2c90f8aa9..e89a16785 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - *****************************************************************************/ - - -- - #define HAVE_KERNEL_4x8 1 - static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -+ "movq %8, %%xmm10 \n\t" //save lda -+ - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "4: \n\t" - "vzeroupper \n\t" -+ "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -+ "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); -@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - } - - -- - #define HAVE_KERNEL_4x4 1 - static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); - -@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT - - "vbroadcastss (%8), %%ymm6 \n\t" // alpha - -+ - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - - -From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 10:10:04 +0100 -Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint - list - ---- - kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ - 1 file changed, 61 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c -index e89a16785..93e1e26e8 100644 ---- a/kernel/x86_64/sgemv_n_microk_haswell-4.c -+++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c -@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -- "movq %8, %%xmm10 \n\t" //save lda -- - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" - - "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" - "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" - "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "testq $0x08, %1 \n\t" - "jz 3f \n\t" - -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "addq $8 , %0 \n\t" - "subq $8 , %1 \n\t" - -@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y -- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y -- -- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" -- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" -- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" -- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" -- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" -+ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y -+ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y -+ -+ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" - "addq $16, %0 \n\t" -- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $16, %8 \n\t" -- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y - "subq $16, %1 \n\t" -- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y - - "jnz 1b \n\t" - - "4: \n\t" - "vzeroupper \n\t" -- "movq %%xmm10, %8 \n\t" //restore lda - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", -@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "%xmm4", "%xmm5", - "%xmm6", "%xmm7", - "%xmm8", "%xmm9", -- "%xmm10", - "%xmm12", "%xmm13", "%xmm14", "%xmm15", - "memory" - ); diff --git a/2021.patch b/2021.patch deleted file mode 100644 index 7724f38..0000000 --- a/2021.patch +++ /dev/null @@ -1,255 +0,0 @@ -From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 15 Feb 2019 15:08:16 +0100 -Subject: [PATCH] Fix wrong constraints in inline assembly - -for #2009 ---- - kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- - 1 file changed, 49 insertions(+), 49 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -index fcab8e2c7..9ab78fc8e 100644 ---- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c -+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c -@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " cmpq $0, %0 \n\t" - " je 4f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - - - " addq $8, %1 \n\t" -@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .p2align 4 \n\t" - "1: \n\t" - -- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" - -- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 - " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" - " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" -- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 - " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" - " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" - " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t" -@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 22f \n\t" - -- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a -+ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a - - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" - - " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" -- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 -+ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" - - " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" -- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 -+ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 - " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" - " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" - -@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 - - " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" -- " vmovups (%9), %%ymm0 \n\t" -+ " vmovups (%3), %%ymm0 \n\t" - " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" - " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" - " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" -@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" - - " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" -- " vmovups 32(%9), %%ymm4 \n\t" -+ " vmovups 32(%3), %%ymm4 \n\t" - " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" - " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" - " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" -@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "5: \n\t" // i = 0 - -- " addq $64, %9 \n\t" // b=b+8 -+ " addq $64, %3 \n\t" // b=b+8 - - " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups %%ymm8 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups %%ymm8 , (%2) \n\t" // write a - " vmovups %%ymm8 , (%4) \n\t" // write c - - " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" - " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" - " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" -@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm9 , (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm9 , (%2) \n\t" // write a - " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" -@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb -- " vmovups (%9), %%ymm0 \n\t" -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm10, (%8) \n\t" // write a -+ " vmovups (%3), %%ymm0 \n\t" -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm10, (%2) \n\t" // write a - " vmovups %%ymm10, (%4,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" -@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - - " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm11, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm11, (%2) \n\t" // write a - " vmovups %%ymm11, (%5) \n\t" // write c - - " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" -@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm12, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm12, (%2) \n\t" // write a - " vmovups %%ymm12, (%5,%7,1) \n\t" // write c - - " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" -@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" - " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm13, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm13, (%2) \n\t" // write a - " vmovups %%ymm13, (%5,%7,2) \n\t" // write c - - " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" -@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" - - -- " addq $64, %9 \n\t" // b=b+8 -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $64, %3 \n\t" // b=b+8 -+ " addq $32, %2 \n\t" // a=a+8 - - - " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb -- " vmovups 32(%9), %%ymm1 \n\t" -- " vmovups %%ymm14, (%8) \n\t" // write a -+ " vmovups 32(%3), %%ymm1 \n\t" -+ " vmovups %%ymm14, (%2) \n\t" // write a - " vmovups %%ymm14, (%6) \n\t" // write c - - " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" - - " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" - -- " addq $32, %8 \n\t" // a=a+8 -+ " addq $32, %2 \n\t" // a=a+8 - - " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb -- " vmovups %%ymm15, (%8) \n\t" // write a -+ " vmovups %%ymm15, (%2) \n\t" // write a - " vmovups %%ymm15, (%6,%7,1) \n\t" // write c - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c3), // 5 - "r" (c6), // 6 - "r" (ldc), // 7 -- "r" (as), // 8 -- "r" (bs) // 9 -+ "r" (a), // 8 -+ "r" (b) // 9 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2023.patch b/2023.patch deleted file mode 100644 index 225a8a2..0000000 --- a/2023.patch +++ /dev/null @@ -1,874 +0,0 @@ -From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:24:11 +0100 -Subject: [PATCH 1/4] Fix inline assembly constraints - -rework indices to allow marking argument lda4 as input and output. For #2009 ---- - kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ - 1 file changed, 27 insertions(+), 27 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -index 11a3e943b..d21232bfa 100644 ---- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c -+++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c -@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "movss (%2), %%xmm12 \n\t" // x0 -- "movss 4(%2), %%xmm13 \n\t" // x1 -- "movss 8(%2), %%xmm14 \n\t" // x2 -- "movss 12(%2), %%xmm15 \n\t" // x3 -+ "movss (%3), %%xmm12 \n\t" // x0 -+ "movss 4(%3), %%xmm13 \n\t" // x1 -+ "movss 8(%3), %%xmm14 \n\t" // x2 -+ "movss 12(%3), %%xmm15 \n\t" // x3 - "shufps $0, %%xmm12, %%xmm12\n\t" - "shufps $0, %%xmm13, %%xmm13\n\t" - "shufps $0, %%xmm14, %%xmm14\n\t" - "shufps $0, %%xmm15, %%xmm15\n\t" - -- "movss 16(%2), %%xmm0 \n\t" // x4 -- "movss 20(%2), %%xmm1 \n\t" // x5 -- "movss 24(%2), %%xmm2 \n\t" // x6 -- "movss 28(%2), %%xmm3 \n\t" // x7 -+ "movss 16(%3), %%xmm0 \n\t" // x4 -+ "movss 20(%3), %%xmm1 \n\t" // x5 -+ "movss 24(%3), %%xmm2 \n\t" // x6 -+ "movss 28(%3), %%xmm3 \n\t" // x7 - "shufps $0, %%xmm0 , %%xmm0 \n\t" - "shufps $0, %%xmm1 , %%xmm1 \n\t" - "shufps $0, %%xmm2 , %%xmm2 \n\t" -@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "1: \n\t" - "xorps %%xmm4 , %%xmm4 \n\t" - "xorps %%xmm5 , %%xmm5 \n\t" -- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y - - ".p2align 1 \n\t" -- "movups (%4,%0,4), %%xmm8 \n\t" -- "movups (%5,%0,4), %%xmm9 \n\t" -- "movups (%6,%0,4), %%xmm10 \n\t" -- "movups (%7,%0,4), %%xmm11 \n\t" -+ "movups (%5,%0,4), %%xmm8 \n\t" -+ "movups (%6,%0,4), %%xmm9 \n\t" -+ "movups (%7,%0,4), %%xmm10 \n\t" -+ "movups (%8,%0,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm12, %%xmm8 \n\t" - "mulps %%xmm13, %%xmm9 \n\t" -@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "movups (%4,%8,4), %%xmm8 \n\t" -- "movups (%5,%8,4), %%xmm9 \n\t" -- "movups (%6,%8,4), %%xmm10 \n\t" -- "movups (%7,%8,4), %%xmm11 \n\t" -+ "movups (%5,%2,4), %%xmm8 \n\t" -+ "movups (%6,%2,4), %%xmm9 \n\t" -+ "movups (%7,%2,4), %%xmm10 \n\t" -+ "movups (%8,%2,4), %%xmm11 \n\t" - ".p2align 1 \n\t" - "mulps %%xmm0 , %%xmm8 \n\t" - "mulps %%xmm1 , %%xmm9 \n\t" -@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "addps %%xmm10, %%xmm4 \n\t" - "addps %%xmm11, %%xmm5 \n\t" - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addps %%xmm5 , %%xmm4 \n\t" - "addq $4 , %0 \n\t" - "mulps %%xmm6 , %%xmm4 \n\t" - "subq $4 , %1 \n\t" - "addps %%xmm4 , %%xmm7 \n\t" - -- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y -+ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y - - "jnz 1b \n\t" - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:36:39 +0100 -Subject: [PATCH 2/4] Fix inline assembly constraints - -rework indices to allow marking argument lda as input and output. ---- - kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- - 1 file changed, 65 insertions(+), 65 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c -index b35daa35b..3fc46542b 100644 ---- a/kernel/x86_64/sgemv_n_microk_sandy-4.c -+++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c -@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastss (%2), %%ymm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastss (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 - - "vbroadcastss (%9), %%ymm6 \n\t" // alpha - -@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" -- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y -+ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y - -- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" -- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" -- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" -- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" -+ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" -+ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" -+ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" -+ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" - -- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" -- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" -- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" -- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" -+ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" -+ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" -+ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" -+ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" - "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" - "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" - "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" -@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" - "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" - -- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y - -- "addq $4, %8 \n\t" -+ "addq $4, %2 \n\t" - "addq $4, %0 \n\t" - "subq $4, %1 \n\t" - -@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" -- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y -+ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y - -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" -@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" - "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" - -- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y - -- "addq $8, %8 \n\t" -+ "addq $8, %2 \n\t" - "addq $8, %0 \n\t" - "subq $8, %1 \n\t" - -@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" - "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" -- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" -- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" -+ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" -+ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" -+ "prefetcht0 192(%6,%0,4) \n\t" -+ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" -+ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%0,4) \n\t" -- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" -- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" -- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" -+ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" -+ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" -+ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%4,%8,4) \n\t" -- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" -- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" -- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" -+ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" -+ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" - -- "prefetcht0 192(%6,%8,4) \n\t" -- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" -- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" -- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 \n\t" -+ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" -+ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" - "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" - "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" - "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" -@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" - "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" - -- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y -+ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y -+ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y - -- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y -- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y -+ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y - -- "addq $16, %8 \n\t" -+ "addq $16, %2 \n\t" - "addq $16, %0 \n\t" - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:46:17 +0100 -Subject: [PATCH 3/4] Fix inline assembly constraints - ---- - kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- - 1 file changed, 97 insertions(+), 97 deletions(-) - -diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -index 31001c7f3..bbf06c84b 100644 ---- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -+++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c -@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - __asm__ __volatile__ - ( -- "vbroadcastss (%2), %%xmm12 \n\t" // x0 -- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 -- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 -- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 -- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 -- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 -- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 -- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 -+ "vbroadcastss (%3), %%xmm12 \n\t" // x0 -+ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 -+ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 -+ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 -+ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 -+ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 -+ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 -+ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 - - "vbroadcastss (%9), %%xmm8 \n\t" // alpha - -@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" - "addq $4 , %0 \n\t" - -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" -- "addq $4 , %8 \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" -+ "addq $4 , %2 \n\t" - - "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" -- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" -+ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" - "subq $4 , %1 \n\t" -- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y - - "2: \n\t" - -@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" - "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" - -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y - - "addq $8 , %0 \n\t" -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "subq $8 , %1 \n\t" - - -@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" - "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" - -- "prefetcht0 192(%4,%0,4) \n\t" -- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%5,%0,4) \n\t" -- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" - "prefetcht0 192(%6,%0,4) \n\t" -- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" -+ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" - "prefetcht0 192(%7,%0,4) \n\t" -- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" -+ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" -+ "prefetcht0 192(%8,%0,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" - ".align 2 \n\t" -- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" -- -- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" -- -- "prefetcht0 192(%4,%8,4) \n\t" -- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" -- "prefetcht0 192(%5,%8,4) \n\t" -- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" -- "prefetcht0 192(%6,%8,4) \n\t" -- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" -- "prefetcht0 192(%7,%8,4) \n\t" -- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" -- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" -+ -+ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" -+ -+ "prefetcht0 192(%5,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" -+ "prefetcht0 192(%6,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" -+ "prefetcht0 192(%7,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" -+ "prefetcht0 192(%8,%2,4) \n\t" -+ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" -+ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" - -- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" -- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" -- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" -+ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" -+ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" - -- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" -+ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" -+ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" -+ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" -+ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" - - "addq $16, %0 \n\t" -- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y -- "addq $16, %8 \n\t" -- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y -- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y -+ "addq $16, %2 \n\t" -+ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y -+ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y - - "subq $16, %1 \n\t" - "jnz 1b \n\t" -@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", - -From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 18:51:09 +0100 -Subject: [PATCH 4/4] Fix inline assembly constraints - ---- - dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ - 1 file changed, 247 insertions(+) - create mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c -new file mode 100644 -index 000000000..466931b82 ---- /dev/null -+++ b/dgemv_n_microk_piledriver-4.c -@@ -0,0 +1,247 @@ -+/*************************************************************************** -+Copyright (c) 2014, The OpenBLAS Project -+All rights reserved. -+Redistribution and use in source and binary forms, with or without -+modification, are permitted provided that the following conditions are -+met: -+1. Redistributions of source code must retain the above copyright -+notice, this list of conditions and the following disclaimer. -+2. Redistributions in binary form must reproduce the above copyright -+notice, this list of conditions and the following disclaimer in -+the documentation and/or other materials provided with the -+distribution. -+3. Neither the name of the OpenBLAS project nor the names of -+its contributors may be used to endorse or promote products -+derived from this software without specific prior written permission. -+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" -+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE -+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE -+ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE -+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL -+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR -+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER -+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, -+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE -+USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -+*****************************************************************************/ -+ -+ -+ -+#define HAVE_KERNEL_4x8 1 -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -+ -+ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %2 \n\t" -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "addq $8 , %0 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "addq $8 , %2 \n\t" -+ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -+ "subq $8 , %1 \n\t" -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -+ -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 -+ : -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 -+ "r" (alpha) // 9 -+ : "cc", -+ "%xmm0", "%xmm1", -+ "%xmm2", "%xmm3", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ -+ -+#define HAVE_KERNEL_4x4 1 -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -+ -+static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) -+{ -+ -+ BLASLONG register i = 0; -+ -+ __asm__ __volatile__ -+ ( -+ "vzeroupper \n\t" -+ "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -+ -+ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha -+ -+ "testq $0x04, %1 \n\t" -+ "jz 2f \n\t" -+ -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -+ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -+ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -+ -+ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ -+ "addq $4 , %0 \n\t" -+ "subq $4 , %1 \n\t" -+ -+ "2: \n\t" -+ -+ "cmpq $0, %1 \n\t" -+ "je 3f \n\t" -+ -+ -+ ".align 16 \n\t" -+ "1: \n\t" -+ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -+ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -+ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -+ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -+ -+ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -+ -+ "addq $8 , %0 \n\t" -+ "subq $8 , %1 \n\t" -+ "jnz 1b \n\t" -+ -+ "3: \n\t" -+ "vzeroupper \n\t" -+ -+ : -+ "+r" (i), // 0 -+ "+r" (n) // 1 -+ : -+ "r" (x), // 2 -+ "r" (y), // 3 -+ "r" (ap[0]), // 4 -+ "r" (ap[1]), // 5 -+ "r" (ap[2]), // 6 -+ "r" (ap[3]), // 7 -+ "r" (alpha) // 8 -+ : "cc", -+ "%xmm4", "%xmm5", -+ "%xmm6", "%xmm7", -+ "%xmm8", "%xmm9", -+ "%xmm12", "%xmm13", "%xmm14", "%xmm15", -+ "memory" -+ ); -+ -+} -+ -+ diff --git a/2024.patch b/2024.patch deleted file mode 100644 index 720a9e2..0000000 --- a/2024.patch +++ /dev/null @@ -1,1349 +0,0 @@ -From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Sat, 16 Feb 2019 20:06:48 +0100 -Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels - -rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). For #2009 ---- - kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- - kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- - kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- - kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- - kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- - 5 files changed, 356 insertions(+), 356 deletions(-) - -diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -index 54df5b359..35ed4cc01 100644 ---- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c -@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " prefetcht0 384(%3,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " prefetcht0 384(%7,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - " jz 2f \n\t" - -- " prefetcht0 384(%2,%1,8) \n\t" -- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b -- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " prefetcht0 384(%6,%1,8) \n\t" -+ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b -+ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vmovddup (%7), %%xmm1 \n\t" // read b -- " vmovddup 8(%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm1 \n\t" // read b -+ " vmovddup 8(%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $16 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 8 -+ " subq $16 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 8 - -- " vmovddup (%7), %%xmm0 \n\t" // read bb -+ " vmovddup (%3), %%xmm0 \n\t" // read bb - - " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -index 1b8991c6c..3cd215000 100644 ---- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c -@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - -- " subq $64 , %6 \n\t" // a -= m -- " subq $8 , %7 \n\t" // b -= n -+ " subq $64 , %2 \n\t" // a -= m -+ " subq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 - "r" (c), // 4 - "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -index 0623dddb0..a4a62491c 100644 ---- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" - -- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] -+ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] - " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] -+ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] - " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -215,23 +215,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] -+ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] - " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" - " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" -@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] -+ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] - " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] -+ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] - " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] -+ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] - " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] -+ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] - " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" - " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" -@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] -+ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] - " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] -+ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] - " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] -+ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] - " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] -+ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] - " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" - " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] -+ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] - " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] -+ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] - " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] -+ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] - " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] -+ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] - " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - -- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] -+ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] - " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" - " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" - -- " addq $64 , %6 \n\t" // a -= m -- " addq $8 , %7 \n\t" // b -= n -+ " addq $64 , %2 \n\t" // a -= m -+ " addq $8 , %3 \n\t" // b -= n - -- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] -+ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] - " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 - " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 - " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa - " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa - " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa - " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa -- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa -- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa -+ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa -+ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa - - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -index 4cc557d55..c11c84cec 100644 ---- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c -@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 0 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" - - " \n\t" // i = 1 -- " addq $8 , %7 \n\t" // b = b + 2 -- " addq $64 , %6 \n\t" // a = a + 16 -+ " addq $8 , %3 \n\t" // b = b + 2 -+ " addq $64 , %2 \n\t" // a = a + 16 - -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", -diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -index 73f6e8a95..326ca2976 100644 ---- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c -+++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c -@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " .align 16 \n\t" - "1: \n\t" - -- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b -- " vmovups (%2,%1,8), %%xmm4 \n\t" -- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" -- " vmovups 16(%2,%1,8), %%xmm5 \n\t" -- " vmovups 32(%2,%1,8), %%xmm6 \n\t" -- " vmovups 48(%2,%1,8), %%xmm7 \n\t" -+ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b -+ " vmovups (%6,%1,8), %%xmm4 \n\t" -+ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" -+ " vmovups 16(%6,%1,8), %%xmm5 \n\t" -+ " vmovups 32(%6,%1,8), %%xmm6 \n\t" -+ " vmovups 48(%6,%1,8), %%xmm7 \n\t" - - " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" - " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" -@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - - "3: \n\t" // i = 1 - -- " vbroadcastss (%7), %%xmm1 \n\t" // read b -- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm1 \n\t" // read b -+ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb - " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb - " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb - " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb - -- " vmovups %%xmm12 , (%6) \n\t" // write a -- " vmovups %%xmm13 , 16(%6) \n\t" // write a -- " vmovups %%xmm14 , 32(%6) \n\t" // write a -- " vmovups %%xmm15 , 48(%6) \n\t" // write a -+ " vmovups %%xmm12 , (%2) \n\t" // write a -+ " vmovups %%xmm13 , 16(%2) \n\t" // write a -+ " vmovups %%xmm14 , 32(%2) \n\t" // write a -+ " vmovups %%xmm15 , 48(%2) \n\t" // write a - - " vmovups %%xmm12 , (%5) \n\t" // write c1 - " vmovups %%xmm13 , 16(%5) \n\t" -@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" - - " \n\t" // i = 0 -- " subq $8 , %7 \n\t" // b = b - 2 -- " subq $64 , %6 \n\t" // a = a - 16 -+ " subq $8 , %3 \n\t" // b = b - 2 -+ " subq $64 , %2 \n\t" // a = a - 16 - -- " vbroadcastss (%7), %%xmm0 \n\t" // read bb -+ " vbroadcastss (%3), %%xmm0 \n\t" // read bb - - " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb - " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" - " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" - " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" - -- " vmovups %%xmm8 , (%6) \n\t" // write a -- " vmovups %%xmm9 , 16(%6) \n\t" -- " vmovups %%xmm10 , 32(%6) \n\t" -- " vmovups %%xmm11 , 48(%6) \n\t" -+ " vmovups %%xmm8 , (%2) \n\t" // write a -+ " vmovups %%xmm9 , 16(%2) \n\t" -+ " vmovups %%xmm10 , 32(%2) \n\t" -+ " vmovups %%xmm11 , 48(%2) \n\t" - - " vmovups %%xmm8 , (%4) \n\t" // write c0 - " vmovups %%xmm9 , 16(%4) \n\t" -@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON - " vzeroupper \n\t" - - : -+ "+r" (n1), // 0 -+ "+a" (i), // 1 -+ "+r" (as), // 2 -+ "+r" (bs) // 3 - : -- "r" (n1), // 0 -- "a" (i), // 1 -- "r" (a), // 2 -- "r" (b), // 3 -- "r" (c), // 4 -- "r" (c1), // 5 -- "r" (as), // 6 -- "r" (bs) // 7 -+ "r" (c), // 4 -+ "r" (c1), // 5 -+ "r" (a), // 6 -+ "r" (b) // 7 - : "cc", - "%xmm0", "%xmm1", "%xmm2", "%xmm3", - "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2028.patch b/2028.patch deleted file mode 100644 index 64d050f..0000000 --- a/2028.patch +++ /dev/null @@ -1,412 +0,0 @@ -From 6eee1beac524b5582a6c6de14d9d35a78c1ece74 Mon Sep 17 00:00:00 2001 -From: Andrew <16061801+brada4@users.noreply.github.com> -Date: Sun, 24 Feb 2019 20:41:02 +0200 -Subject: [PATCH 2/2] move fix to right place - ---- - dgemv_n_microk_piledriver-4.c | 247 -------------------- - kernel/x86_64/dgemv_n_microk_piledriver-4.c | 98 ++++---- - 2 files changed, 49 insertions(+), 296 deletions(-) - delete mode 100644 dgemv_n_microk_piledriver-4.c - -diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c -deleted file mode 100644 -index 466931b82..000000000 ---- a/dgemv_n_microk_piledriver-4.c -+++ /dev/null -@@ -1,247 +0,0 @@ --/*************************************************************************** --Copyright (c) 2014, The OpenBLAS Project --All rights reserved. --Redistribution and use in source and binary forms, with or without --modification, are permitted provided that the following conditions are --met: --1. Redistributions of source code must retain the above copyright --notice, this list of conditions and the following disclaimer. --2. Redistributions in binary form must reproduce the above copyright --notice, this list of conditions and the following disclaimer in --the documentation and/or other materials provided with the --distribution. --3. Neither the name of the OpenBLAS project nor the names of --its contributors may be used to endorse or promote products --derived from this software without specific prior written permission. --THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" --AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE --IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE --ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE --LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL --DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR --SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER --CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, --OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE --USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. --*****************************************************************************/ -- -- -- --#define HAVE_KERNEL_4x8 1 --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 -- -- "vbroadcastsd (%9), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- -- "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y -- -- "addq $4 , %2 \n\t" -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -- "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "addq $8 , %2 \n\t" -- "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y -- "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y -- -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n), // 1 -- "+r" (lda4) // 2 -- : -- "r" (x), // 3 -- "r" (y), // 4 -- "r" (ap[0]), // 5 -- "r" (ap[1]), // 6 -- "r" (ap[2]), // 7 -- "r" (ap[3]), // 8 -- "r" (alpha) // 9 -- : "cc", -- "%xmm0", "%xmm1", -- "%xmm2", "%xmm3", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -- --#define HAVE_KERNEL_4x4 1 --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); -- --static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) --{ -- -- BLASLONG register i = 0; -- -- __asm__ __volatile__ -- ( -- "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- -- "vbroadcastsd (%8), %%ymm6 \n\t" // alpha -- -- "testq $0x04, %1 \n\t" -- "jz 2f \n\t" -- -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" -- "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" -- "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" -- -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -- -- "addq $4 , %0 \n\t" -- "subq $4 , %1 \n\t" -- -- "2: \n\t" -- -- "cmpq $0, %1 \n\t" -- "je 3f \n\t" -- -- -- ".align 16 \n\t" -- "1: \n\t" -- "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" -- "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" -- "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" -- -- "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y -- "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y -- -- "addq $8 , %0 \n\t" -- "subq $8 , %1 \n\t" -- "jnz 1b \n\t" -- -- "3: \n\t" -- "vzeroupper \n\t" -- -- : -- "+r" (i), // 0 -- "+r" (n) // 1 -- : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (alpha) // 8 -- : "cc", -- "%xmm4", "%xmm5", -- "%xmm6", "%xmm7", -- "%xmm8", "%xmm9", -- "%xmm12", "%xmm13", "%xmm14", "%xmm15", -- "memory" -- ); -- --} -- -- -diff --git a/kernel/x86_64/dgemv_n_microk_piledriver-4.c b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -index 530780bab..466931b82 100644 ---- a/kernel/x86_64/dgemv_n_microk_piledriver-4.c -+++ b/kernel/x86_64/dgemv_n_microk_piledriver-4.c -@@ -38,42 +38,42 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - __asm__ __volatile__ - ( - "vzeroupper \n\t" -- "vbroadcastsd (%2), %%ymm12 \n\t" // x0 -- "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 -- "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 -- "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 -- "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 -- "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 -- "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 -- "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 -+ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 -+ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 -+ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 -+ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 -+ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 -+ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 -+ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 -+ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 - - "vbroadcastsd (%9), %%ymm6 \n\t" // alpha - - "testq $0x04, %1 \n\t" - "jz 2f \n\t" - -- "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y -+ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" - -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" - "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" - "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" - - -- "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y - -- "addq $4 , %8 \n\t" -+ "addq $4 , %2 \n\t" - "addq $4 , %0 \n\t" - "subq $4 , %1 \n\t" - -@@ -88,35 +88,35 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" - "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" -- "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y -- "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y -- -- "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" -- "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" -- "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" -- "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" -- "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" -- -- "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" -+ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y -+ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y -+ -+ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" -+ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" -+ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" -+ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" -+ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" -+ -+ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" - "addq $8 , %0 \n\t" -- "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" -- "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" -- "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" -- "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" -- "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" -- "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" -- "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" -+ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" -+ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" -+ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" -+ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" -+ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" - - "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" - "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" - -- "addq $8 , %8 \n\t" -+ "addq $8 , %2 \n\t" - "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y - "subq $8 , %1 \n\t" -- "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y -+ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y - - "jnz 1b \n\t" - -@@ -125,15 +125,15 @@ static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO - - : - "+r" (i), // 0 -- "+r" (n) // 1 -+ "+r" (n), // 1 -+ "+r" (lda4) // 2 - : -- "r" (x), // 2 -- "r" (y), // 3 -- "r" (ap[0]), // 4 -- "r" (ap[1]), // 5 -- "r" (ap[2]), // 6 -- "r" (ap[3]), // 7 -- "r" (lda4), // 8 -+ "r" (x), // 3 -+ "r" (y), // 4 -+ "r" (ap[0]), // 5 -+ "r" (ap[1]), // 6 -+ "r" (ap[2]), // 7 -+ "r" (ap[3]), // 8 - "r" (alpha) // 9 - : "cc", - "%xmm0", "%xmm1", diff --git a/openblas.spec b/openblas.spec index 45cc85f..e28699d 100644 --- a/openblas.spec +++ b/openblas.spec @@ -14,8 +14,8 @@ # "obsoleted" features are still kept in the spec. Name: openblas -Version: 0.3.5 -Release: 5%{?dist} +Version: 0.3.6 +Release: 1%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,18 +29,6 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch -# Fix assembly code -Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch -Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch -Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch -Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch -Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch -Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch -Patch16: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2028.patch -Patch17: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1965.patch -Patch18: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1966.patch -Patch19: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/1967.patch - BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel @@ -251,17 +239,6 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests -%patch10 -p1 -%patch11 -p1 -%patch12 -p1 -%patch13 -p1 -%patch14 -p1 -%patch15 -p1 -%patch16 -p1 -%patch17 -p1 -%patch18 -p1 -%patch19 -p1 - # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -697,6 +674,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Tue Apr 30 2019 Susi Lehtola - 0.3.6-1 +- Update to 0.3.6. + * Tue Feb 26 2019 Susi Lehtola - 0.3.5-5 - Even more assembly kernel patches. diff --git a/sources b/sources index e303585..a1a5ace 100644 --- a/sources +++ b/sources @@ -1 +1 @@ -SHA512 (openblas-0.3.5.tar.gz) = 91b3074eb922453bf843158b4281cde65db9e8bbdd7590e75e9e6cdcb486157f7973f2936f327bb3eb4f1702ce0ba51ae6729d8d4baf2d986c50771e8f696df0 +SHA512 (openblas-0.3.6.tar.gz) = 1ad980176a51f70d8b0b2d158da8c01f30f77b7cf385b24a6340d3c5feb1513bd04b9390487d05cc9557db7dc5f7c135b1688dec9f17ebef35dba884ef7ddee9