From 2a8a1574d78ffdd825b263cbebbf7f200db65e57 Mon Sep 17 00:00:00 2001 From: Susi Lehtola Date: Sun, 17 Feb 2019 21:36:32 +0100 Subject: [PATCH] Import patches from upstream to fix gcc 9 compatibility. --- 2010.patch | 499 ++++++++++++++++++ 2018.patch | 27 + 2019.patch | 274 ++++++++++ 2021.patch | 255 ++++++++++ 2023.patch | 874 ++++++++++++++++++++++++++++++++ 2024.patch | 1349 +++++++++++++++++++++++++++++++++++++++++++++++++ openblas.spec | 20 +- 7 files changed, 3297 insertions(+), 1 deletion(-) create mode 100644 2010.patch create mode 100644 2018.patch create mode 100644 2019.patch create mode 100644 2021.patch create mode 100644 2023.patch create mode 100644 2024.patch diff --git a/2010.patch b/2010.patch new file mode 100644 index 0000000..2393325 --- /dev/null +++ b/2010.patch @@ -0,0 +1,499 @@ +From dc6ac9eab0c59bcf56c1c512c099723215609fb2 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 15:33:48 +0100 +Subject: [PATCH 1/4] Fix declaration of input arguments in the x86_64 + s/dGEMV_T and s/dGEMV_N kernels + +Arguments 0 and 1 need to be tagged as both input and output +--- + kernel/x86_64/dgemv_n_4.c | 10 +++++----- + kernel/x86_64/dgemv_t_4.c | 18 +++++++++--------- + kernel/x86_64/sgemv_n_4.c | 14 +++++++------- + kernel/x86_64/sgemv_t_4.c | 18 +++++++++--------- + 4 files changed, 30 insertions(+), 30 deletions(-) + +diff --git a/kernel/x86_64/dgemv_n_4.c b/kernel/x86_64/dgemv_n_4.c +index 6d2530e81..6d33641e9 100644 +--- a/kernel/x86_64/dgemv_n_4.c ++++ b/kernel/x86_64/dgemv_n_4.c +@@ -111,9 +111,9 @@ static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 +@@ -166,9 +166,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a + "jnz 1b \n\t" + + : ++ "+r" (i), // 0 ++ "+r" (n) // 1 + : +- "r" (i), // 0 +- "r" (n), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 +diff --git a/kernel/x86_64/dgemv_t_4.c b/kernel/x86_64/dgemv_t_4.c +index a7478e3a8..ed672a757 100644 +--- a/kernel/x86_64/dgemv_t_4.c ++++ b/kernel/x86_64/dgemv_t_4.c +@@ -127,9 +127,9 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT + "movsd %%xmm11,8(%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 +@@ -195,9 +195,9 @@ static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) + "movsd %%xmm10, (%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 +@@ -259,9 +259,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (&da), // 2 + "r" (src), // 3 + "r" (dest) // 4 +diff --git a/kernel/x86_64/sgemv_n_4.c b/kernel/x86_64/sgemv_n_4.c +index 65305ac59..63697970f 100644 +--- a/kernel/x86_64/sgemv_n_4.c ++++ b/kernel/x86_64/sgemv_n_4.c +@@ -149,9 +149,9 @@ static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (ap[0]), // 4 +@@ -223,9 +223,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *a + + "3: \n\t" + : ++ "+r" (i), // 0 
++ "+r" (n1) // 1 + : +- "r" (i), // 0 +- "r" (n1), // 1 + "r" (x), // 2 + "r" (y), // 3 + "r" (ap), // 4 +@@ -277,9 +277,9 @@ static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) + "jnz 1b \n\t" + + : ++ "+r" (i), // 0 ++ "+r" (n) // 1 + : +- "r" (i), // 0 +- "r" (n), // 1 + "r" (src), // 2 + "r" (dest) // 3 + : "cc", +diff --git a/kernel/x86_64/sgemv_t_4.c b/kernel/x86_64/sgemv_t_4.c +index 065e5b385..86ecaf516 100644 +--- a/kernel/x86_64/sgemv_t_4.c ++++ b/kernel/x86_64/sgemv_t_4.c +@@ -139,9 +139,9 @@ static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT + "movss %%xmm11,4(%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap0), // 3 + "r" (ap1), // 4 +@@ -208,9 +208,9 @@ static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) + "movss %%xmm10, (%2) \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (y), // 2 + "r" (ap), // 3 + "r" (x) // 4 +@@ -272,9 +272,9 @@ static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_d + "jnz 1b \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (&da), // 2 + "r" (src), // 3 + "r" (dest) // 4 + +From 91481a3e4e88b26be920aff7d5c9e72ee82d6abc Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 15:51:43 +0100 +Subject: [PATCH 2/4] Fix declaration of input arguments in inline assembly + +Argument 0 is modified as it doubles as a counter +--- + kernel/x86_64/dscal.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/x86_64/dscal.c b/kernel/x86_64/dscal.c +index ef9a0a6ba..d0d7801fd 100644 +--- a/kernel/x86_64/dscal.c ++++ b/kernel/x86_64/dscal.c +@@ -136,8 +136,8 @@ static void dscal_kernel_inc_8(BLASLONG n, FLOAT *alpha, FLOAT *x, BLASLONG inc_ + "jnz 1b \n\t" + + : ++ "+r" (n) // 0 + : +- "r" (n), // 0 + "r" (x), // 1 + "r" (x1), // 2 + "r" (alpha), // 3 + +From b824fa70ebdd0b66ed045dbb17c08519525af782 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 16:00:18 +0100 +Subject: [PATCH 3/4] Fix declaration of assembly arguments in SSYMV and DSYMV + microkernels + +Arguments 0 and 1 are both input and output +--- + kernel/x86_64/dsymv_U_microk_bulldozer-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_haswell-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_nehalem-2.c | 6 +++--- + kernel/x86_64/dsymv_U_microk_sandy-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_bulldozer-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_haswell-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_nehalem-2.c | 6 +++--- + kernel/x86_64/ssymv_U_microk_sandy-2.c | 6 +++--- + 8 files changed, 24 insertions(+), 24 deletions(-) + +diff --git a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +index d7166fe4b..ae287b6d8 100644 +--- a/kernel/x86_64/dsymv_U_microk_bulldozer-2.c ++++ b/kernel/x86_64/dsymv_U_microk_bulldozer-2.c +@@ -106,9 +106,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_haswell-2.c b/kernel/x86_64/dsymv_U_microk_haswell-2.c +index d83d20f8e..4778f644a 100644 +--- a/kernel/x86_64/dsymv_U_microk_haswell-2.c ++++ b/kernel/x86_64/dsymv_U_microk_haswell-2.c +@@ -107,9 +107,9 @@ static 
void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_nehalem-2.c b/kernel/x86_64/dsymv_U_microk_nehalem-2.c +index 1344c75f7..065182286 100644 +--- a/kernel/x86_64/dsymv_U_microk_nehalem-2.c ++++ b/kernel/x86_64/dsymv_U_microk_nehalem-2.c +@@ -101,9 +101,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/dsymv_U_microk_sandy-2.c b/kernel/x86_64/dsymv_U_microk_sandy-2.c +index 1ef6fbafd..d84e703bd 100644 +--- a/kernel/x86_64/dsymv_U_microk_sandy-2.c ++++ b/kernel/x86_64/dsymv_U_microk_sandy-2.c +@@ -116,9 +116,9 @@ static void dsymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +index 8c01ab806..4a4f4d68d 100644 +--- a/kernel/x86_64/ssymv_U_microk_bulldozer-2.c ++++ b/kernel/x86_64/ssymv_U_microk_bulldozer-2.c +@@ -90,9 +90,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_haswell-2.c b/kernel/x86_64/ssymv_U_microk_haswell-2.c +index a32e59b44..e6a09ccf8 100644 +--- a/kernel/x86_64/ssymv_U_microk_haswell-2.c ++++ b/kernel/x86_64/ssymv_U_microk_haswell-2.c +@@ -112,9 +112,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_nehalem-2.c b/kernel/x86_64/ssymv_U_microk_nehalem-2.c +index b8e6ee732..c56ff3b15 100644 +--- a/kernel/x86_64/ssymv_U_microk_nehalem-2.c ++++ b/kernel/x86_64/ssymv_U_microk_nehalem-2.c +@@ -106,9 +106,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 +diff --git a/kernel/x86_64/ssymv_U_microk_sandy-2.c b/kernel/x86_64/ssymv_U_microk_sandy-2.c +index e8650650c..c4919a39a 100644 +--- a/kernel/x86_64/ssymv_U_microk_sandy-2.c ++++ b/kernel/x86_64/ssymv_U_microk_sandy-2.c +@@ -120,9 +120,9 @@ static void ssymv_kernel_4x4(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *a2, FLOAT + "vzeroupper \n\t" + + : +- : +- "r" (i), // 0 +- "r" (n), // 1 ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : + "r" (x), // 2 + "r" (y), // 3 + "r" (a0), // 4 + +From ab1630f9fac57245fbbfc20af91a060354e41c71 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Tue, 12 Feb 2019 16:14:02 +0100 +Subject: [PATCH 4/4] Fix declaration of arguments in inline assembly + +Argument 0 is modified so should be input and output +--- + kernel/x86_64/dsymv_L_microk_bulldozer-2.c | 4 ++-- + 
kernel/x86_64/dsymv_L_microk_haswell-2.c | 4 ++-- + kernel/x86_64/dsymv_L_microk_nehalem-2.c | 4 ++-- + kernel/x86_64/dsymv_L_microk_sandy-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_bulldozer-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_haswell-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_nehalem-2.c | 4 ++-- + kernel/x86_64/ssymv_L_microk_sandy-2.c | 8 ++++---- + 8 files changed, 18 insertions(+), 18 deletions(-) + +diff --git a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +index d84470cc4..bfa07b6d0 100644 +--- a/kernel/x86_64/dsymv_L_microk_bulldozer-2.c ++++ b/kernel/x86_64/dsymv_L_microk_bulldozer-2.c +@@ -113,8 +113,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vmovsd %%xmm3 ,24(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_haswell-2.c b/kernel/x86_64/dsymv_L_microk_haswell-2.c +index 866782ee6..6241879d5 100644 +--- a/kernel/x86_64/dsymv_L_microk_haswell-2.c ++++ b/kernel/x86_64/dsymv_L_microk_haswell-2.c +@@ -105,8 +105,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_nehalem-2.c b/kernel/x86_64/dsymv_L_microk_nehalem-2.c +index 38479f77a..a161dcd8b 100644 +--- a/kernel/x86_64/dsymv_L_microk_nehalem-2.c ++++ b/kernel/x86_64/dsymv_L_microk_nehalem-2.c +@@ -108,8 +108,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "movsd %%xmm3 , 24(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/dsymv_L_microk_sandy-2.c b/kernel/x86_64/dsymv_L_microk_sandy-2.c +index b4e6ab369..b205b1019 100644 +--- a/kernel/x86_64/dsymv_L_microk_sandy-2.c ++++ b/kernel/x86_64/dsymv_L_microk_sandy-2.c +@@ -114,8 +114,8 @@ static void dsymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +index 9002228f3..602c3edf2 100644 +--- a/kernel/x86_64/ssymv_L_microk_bulldozer-2.c ++++ b/kernel/x86_64/ssymv_L_microk_bulldozer-2.c +@@ -98,8 +98,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vmovss %%xmm3 ,12(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_haswell-2.c b/kernel/x86_64/ssymv_L_microk_haswell-2.c +index 69db008b6..fdfe4349a 100644 +--- a/kernel/x86_64/ssymv_L_microk_haswell-2.c ++++ b/kernel/x86_64/ssymv_L_microk_haswell-2.c +@@ -99,8 +99,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_nehalem-2.c b/kernel/x86_64/ssymv_L_microk_nehalem-2.c +index c0fe5d640..6bb9c02f6 100644 +--- a/kernel/x86_64/ssymv_L_microk_nehalem-2.c ++++ b/kernel/x86_64/ssymv_L_microk_nehalem-2.c +@@ -113,8 +113,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, 
FLOAT *x, F + "movss %%xmm3 , 12(%9) \n\t" // save temp2 + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +diff --git a/kernel/x86_64/ssymv_L_microk_sandy-2.c b/kernel/x86_64/ssymv_L_microk_sandy-2.c +index 093ca8073..0c78212e7 100644 +--- a/kernel/x86_64/ssymv_L_microk_sandy-2.c ++++ b/kernel/x86_64/ssymv_L_microk_sandy-2.c +@@ -109,8 +109,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 +@@ -217,8 +217,8 @@ static void ssymv_kernel_4x4(BLASLONG from, BLASLONG to, FLOAT **a, FLOAT *x, FL + "vzeroupper \n\t" + + : +- : +- "r" (from), // 0 ++ "+r" (from) // 0 ++ : + "r" (to), // 1 + "r" (x), // 2 + "r" (y), // 3 diff --git a/2018.patch b/2018.patch new file mode 100644 index 0000000..594a4c4 --- /dev/null +++ b/2018.patch @@ -0,0 +1,27 @@ +From 69a97ca7b9d7bbbb9b9f018592586e3c17b51a57 Mon Sep 17 00:00:00 2001 +From: Bart Oldeman +Date: Thu, 14 Feb 2019 16:19:41 +0000 +Subject: [PATCH] dgemv_kernel_4x4(Haswell): add missing clobbers for + xmm0,xmm1,xmm2,xmm3 + +This fixes a crash in dblat2 when OpenBLAS is compiled using +-march=znver1 -ftree-vectorize -O2 + +See also: +https://github.com/easybuilders/easybuild-easyconfigs/issues/7180 +--- + kernel/x86_64/dgemv_n_microk_haswell-4.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/x86_64/dgemv_n_microk_haswell-4.c b/kernel/x86_64/dgemv_n_microk_haswell-4.c +index 584a6c6b5..da0fa2fff 100644 +--- a/kernel/x86_64/dgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/dgemv_n_microk_haswell-4.c +@@ -104,6 +104,7 @@ static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + "r" (ap[3]), // 7 + "r" (alpha) // 8 + : "cc", ++ "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", diff --git a/2019.patch b/2019.patch new file mode 100644 index 0000000..a3aa674 --- /dev/null +++ b/2019.patch @@ -0,0 +1,274 @@ +From 46e415b1405044b038586537d213e4f2f04b8536 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Thu, 14 Feb 2019 22:43:18 +0100 +Subject: [PATCH 1/2] Save and restore input argument 8 (lda4) + +Fixes miscompilation with gcc9 -ftree-vectorize (related to issue #2009) +--- + kernel/x86_64/sgemv_n_microk_haswell-4.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c +index 2c90f8aa9..e89a16785 100644 +--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c +@@ -26,7 +26,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ *****************************************************************************/ + + +- + #define HAVE_KERNEL_4x8 1 + static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); + +@@ -49,6 +48,8 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + ++ "movq %8, %%xmm10 \n\t" //save lda ++ + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + +@@ -151,6 +152,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "4: \n\t" + "vzeroupper \n\t" ++ "movq %%xmm10, %8 \n\t" //restore lda + + : + "+r" (i), // 0 +@@ -170,6 +172,7 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", ++ "%xmm10", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); +@@ -177,7 +180,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + } + + +- + #define HAVE_KERNEL_4x4 1 + static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); + +@@ -196,6 +198,7 @@ static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT + + "vbroadcastss (%8), %%ymm6 \n\t" // alpha + ++ + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + + +From 4255a58cd22d5395dbd6573683298849bd3a23b5 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 15 Feb 2019 10:10:04 +0100 +Subject: [PATCH 2/2] Rename operands to put lda on the input/output constraint + list + +--- + kernel/x86_64/sgemv_n_microk_haswell-4.c | 126 +++++++++++------------ + 1 file changed, 61 insertions(+), 65 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_haswell-4.c b/kernel/x86_64/sgemv_n_microk_haswell-4.c +index e89a16785..93e1e26e8 100644 +--- a/kernel/x86_64/sgemv_n_microk_haswell-4.c ++++ b/kernel/x86_64/sgemv_n_microk_haswell-4.c +@@ -37,43 +37,41 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + __asm__ __volatile__ + ( + "vzeroupper \n\t" +- "vbroadcastss (%2), %%ymm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 ++ "vbroadcastss (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + +- "movq %8, %%xmm10 \n\t" //save lda +- + "testq $0x04, %1 \n\t" + "jz 2f \n\t" + +- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" + +- "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" ++ "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmadd231ps (%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmadd231ps (%8,%0,4), %%xmm15, %%xmm5 \n\t" + +- "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" 
+- "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmadd231ps (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmadd231ps (%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmadd231ps (%8,%2,4), %%xmm3 , %%xmm5 \n\t" + + "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" + "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" + "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" + +- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + +- "addq $4 , %8 \n\t" ++ "addq $4 , %2 \n\t" + "addq $4 , %0 \n\t" + "subq $4 , %1 \n\t" + +@@ -82,28 +80,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "testq $0x08, %1 \n\t" + "jz 3f \n\t" + +- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y ++ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" + +- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" +- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" +- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" ++ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm5 \n\t" + +- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" +- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" +- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" ++ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm5 \n\t" + + "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" + "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" + "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" + + +- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + +- "addq $8 , %8 \n\t" ++ "addq $8 , %2 \n\t" + "addq $8 , %0 \n\t" + "subq $8 , %1 \n\t" + +@@ -118,53 +116,52 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" +- "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y +- "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y +- +- "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" +- "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" +- "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" +- "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" +- "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" +- "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" +- "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" +- "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" +- +- "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" ++ "vmovups (%4,%0,4), %%ymm8 \n\t" // 8 * y ++ "vmovups 32(%4,%0,4), %%ymm9 \n\t" // 8 * y ++ ++ "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231ps 32(%5,%0,4), %%ymm12, %%ymm5 \n\t" ++ "vfmadd231ps (%6,%0,4), %%ymm13, %%ymm4 \n\t" ++ "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231ps (%7,%0,4), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231ps 32(%7,%0,4), %%ymm14, %%ymm5 \n\t" ++ "vfmadd231ps (%8,%0,4), %%ymm15, %%ymm4 \n\t" ++ "vfmadd231ps 32(%8,%0,4), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231ps (%5,%2,4), %%ymm0 , %%ymm4 \n\t" + "addq $16, %0 \n\t" +- "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" +- "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" +- "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" +- "vfmadd231ps (%6,%8,4), 
%%ymm2 , %%ymm4 \n\t" +- "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" +- "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" +- "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" ++ "vfmadd231ps 32(%5,%2,4), %%ymm0 , %%ymm5 \n\t" ++ "vfmadd231ps (%6,%2,4), %%ymm1 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%6,%2,4), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231ps (%7,%2,4), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%7,%2,4), %%ymm2 , %%ymm5 \n\t" ++ "vfmadd231ps (%8,%2,4), %%ymm3 , %%ymm4 \n\t" ++ "vfmadd231ps 32(%8,%2,4), %%ymm3 , %%ymm5 \n\t" + + "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" + "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" + +- "addq $16, %8 \n\t" +- "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y ++ "addq $16, %2 \n\t" ++ "vmovups %%ymm8,-64(%4,%0,4) \n\t" // 8 * y + "subq $16, %1 \n\t" +- "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm9,-32(%4,%0,4) \n\t" // 8 * y + + "jnz 1b \n\t" + + "4: \n\t" + "vzeroupper \n\t" +- "movq %%xmm10, %8 \n\t" //restore lda + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", +@@ -172,7 +169,6 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "%xmm4", "%xmm5", + "%xmm6", "%xmm7", + "%xmm8", "%xmm9", +- "%xmm10", + "%xmm12", "%xmm13", "%xmm14", "%xmm15", + "memory" + ); diff --git a/2021.patch b/2021.patch new file mode 100644 index 0000000..7724f38 --- /dev/null +++ b/2021.patch @@ -0,0 +1,255 @@ +From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Fri, 15 Feb 2019 15:08:16 +0100 +Subject: [PATCH] Fix wrong constraints in inline assembly + +for #2009 +--- + kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++------------- + 1 file changed, 49 insertions(+), 49 deletions(-) + +diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c +index fcab8e2c7..9ab78fc8e 100644 +--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c ++++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c +@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " cmpq $0, %0 \n\t" + " je 4f \n\t" + +- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a +- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 +- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 ++ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a ++ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 ++ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + + + " addq $8, %1 \n\t" +@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .p2align 4 \n\t" + "1: \n\t" + +- " vmovups (%2,%1,4), %%ymm4 \n\t" // read a ++ " vmovups (%8,%1,4), %%ymm4 \n\t" // read a + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t" + " vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t" + +- " vmovups (%3,%1,8), %%ymm5 \n\t" // read b0 ++ " vmovups (%9,%1,8), %%ymm5 \n\t" // read b0 + " vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t" + " vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t" +- " vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1 ++ " vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1 + " vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t" + " vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t" + " vfmadd231pd 
%%ymm0 , %%ymm2 , %%ymm14 \n\t" +@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 22f \n\t" + +- " vmovups (%2,%1,4), %%ymm0 \n\t" // read a ++ " vmovups (%8,%1,4), %%ymm0 \n\t" // read a + + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t" + + " vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t" +- " vmovups (%3,%1,8), %%ymm1 \n\t" // read b0 ++ " vmovups (%9,%1,8), %%ymm1 \n\t" // read b0 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t" + + " vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t" +- " vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1 ++ " vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1 + " vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t" + " vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t" + +@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7 + + " vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t" +- " vmovups (%9), %%ymm0 \n\t" ++ " vmovups (%3), %%ymm0 \n\t" + " vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t" + " vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t" + " vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t" +@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t" + + " vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t" +- " vmovups 32(%9), %%ymm4 \n\t" ++ " vmovups 32(%3), %%ymm4 \n\t" + " vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t" + " vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t" + " vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t" +@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "5: \n\t" // i = 0 + +- " addq $64, %9 \n\t" // b=b+8 ++ " addq $64, %3 \n\t" // b=b+8 + + " vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups %%ymm8 , (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups %%ymm8 , (%2) \n\t" // write a + " vmovups %%ymm8 , (%4) \n\t" // write c + + " vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t" + " vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t" + " vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t" +@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + + " vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm9 , (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm9 , (%2) \n\t" // write a + " vmovups %%ymm9 , (%4,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t" +@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb +- " vmovups (%9), %%ymm0 \n\t" +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm10, (%8) \n\t" // write a ++ " vmovups (%3), %%ymm0 \n\t" ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " 
vmovups %%ymm10, (%2) \n\t" // write a + " vmovups %%ymm10, (%4,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t" +@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + + " vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm11, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm11, (%2) \n\t" // write a + " vmovups %%ymm11, (%5) \n\t" // write c + + " vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t" +@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + " vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm12, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm12, (%2) \n\t" // write a + " vmovups %%ymm12, (%5,%7,1) \n\t" // write c + + " vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t" +@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xff , %%ymm1 , %%ymm7 \n\t" + " vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t" + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm13, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm13, (%2) \n\t" // write a + " vmovups %%ymm13, (%5,%7,2) \n\t" // write c + + " vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t" +@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t" + + +- " addq $64, %9 \n\t" // b=b+8 +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $64, %3 \n\t" // b=b+8 ++ " addq $32, %2 \n\t" // a=a+8 + + + " vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb +- " vmovups 32(%9), %%ymm1 \n\t" +- " vmovups %%ymm14, (%8) \n\t" // write a ++ " vmovups 32(%3), %%ymm1 \n\t" ++ " vmovups %%ymm14, (%2) \n\t" // write a + " vmovups %%ymm14, (%6) \n\t" // write c + + " vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t" + + " vpermpd $0xff , %%ymm1 , %%ymm0 \n\t" + +- " addq $32, %8 \n\t" // a=a+8 ++ " addq $32, %2 \n\t" // a=a+8 + + " vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb +- " vmovups %%ymm15, (%8) \n\t" // write a ++ " vmovups %%ymm15, (%2) \n\t" // write a + " vmovups %%ymm15, (%6,%7,1) \n\t" // write c + + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c3), // 5 + "r" (c6), // 6 + "r" (ldc), // 7 +- "r" (as), // 8 +- "r" (bs) // 9 ++ "r" (a), // 8 ++ "r" (b) // 9 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/2023.patch b/2023.patch new file mode 100644 index 0000000..225a8a2 --- /dev/null +++ b/2023.patch @@ -0,0 +1,874 @@ +From 9d8be1578983d9fec6a1a7ae81d4ef9c1ac4c08c Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:24:11 +0100 +Subject: [PATCH 1/4] Fix inline assembly constraints + 
+rework indices to allow marking argument lda4 as input and output. For #2009 +--- + kernel/x86_64/sgemv_n_microk_nehalem-4.c | 54 ++++++++++++------------ + 1 file changed, 27 insertions(+), 27 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_nehalem-4.c b/kernel/x86_64/sgemv_n_microk_nehalem-4.c +index 11a3e943b..d21232bfa 100644 +--- a/kernel/x86_64/sgemv_n_microk_nehalem-4.c ++++ b/kernel/x86_64/sgemv_n_microk_nehalem-4.c +@@ -37,19 +37,19 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + __asm__ __volatile__ + ( +- "movss (%2), %%xmm12 \n\t" // x0 +- "movss 4(%2), %%xmm13 \n\t" // x1 +- "movss 8(%2), %%xmm14 \n\t" // x2 +- "movss 12(%2), %%xmm15 \n\t" // x3 ++ "movss (%3), %%xmm12 \n\t" // x0 ++ "movss 4(%3), %%xmm13 \n\t" // x1 ++ "movss 8(%3), %%xmm14 \n\t" // x2 ++ "movss 12(%3), %%xmm15 \n\t" // x3 + "shufps $0, %%xmm12, %%xmm12\n\t" + "shufps $0, %%xmm13, %%xmm13\n\t" + "shufps $0, %%xmm14, %%xmm14\n\t" + "shufps $0, %%xmm15, %%xmm15\n\t" + +- "movss 16(%2), %%xmm0 \n\t" // x4 +- "movss 20(%2), %%xmm1 \n\t" // x5 +- "movss 24(%2), %%xmm2 \n\t" // x6 +- "movss 28(%2), %%xmm3 \n\t" // x7 ++ "movss 16(%3), %%xmm0 \n\t" // x4 ++ "movss 20(%3), %%xmm1 \n\t" // x5 ++ "movss 24(%3), %%xmm2 \n\t" // x6 ++ "movss 28(%3), %%xmm3 \n\t" // x7 + "shufps $0, %%xmm0 , %%xmm0 \n\t" + "shufps $0, %%xmm1 , %%xmm1 \n\t" + "shufps $0, %%xmm2 , %%xmm2 \n\t" +@@ -63,13 +63,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "1: \n\t" + "xorps %%xmm4 , %%xmm4 \n\t" + "xorps %%xmm5 , %%xmm5 \n\t" +- "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "movups (%4,%0,4), %%xmm7 \n\t" // 4 * y + + ".p2align 1 \n\t" +- "movups (%4,%0,4), %%xmm8 \n\t" +- "movups (%5,%0,4), %%xmm9 \n\t" +- "movups (%6,%0,4), %%xmm10 \n\t" +- "movups (%7,%0,4), %%xmm11 \n\t" ++ "movups (%5,%0,4), %%xmm8 \n\t" ++ "movups (%6,%0,4), %%xmm9 \n\t" ++ "movups (%7,%0,4), %%xmm10 \n\t" ++ "movups (%8,%0,4), %%xmm11 \n\t" + ".p2align 1 \n\t" + "mulps %%xmm12, %%xmm8 \n\t" + "mulps %%xmm13, %%xmm9 \n\t" +@@ -80,10 +80,10 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + +- "movups (%4,%8,4), %%xmm8 \n\t" +- "movups (%5,%8,4), %%xmm9 \n\t" +- "movups (%6,%8,4), %%xmm10 \n\t" +- "movups (%7,%8,4), %%xmm11 \n\t" ++ "movups (%5,%2,4), %%xmm8 \n\t" ++ "movups (%6,%2,4), %%xmm9 \n\t" ++ "movups (%7,%2,4), %%xmm10 \n\t" ++ "movups (%8,%2,4), %%xmm11 \n\t" + ".p2align 1 \n\t" + "mulps %%xmm0 , %%xmm8 \n\t" + "mulps %%xmm1 , %%xmm9 \n\t" +@@ -94,28 +94,28 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "addps %%xmm10, %%xmm4 \n\t" + "addps %%xmm11, %%xmm5 \n\t" + +- "addq $4 , %8 \n\t" ++ "addq $4 , %2 \n\t" + "addps %%xmm5 , %%xmm4 \n\t" + "addq $4 , %0 \n\t" + "mulps %%xmm6 , %%xmm4 \n\t" + "subq $4 , %1 \n\t" + "addps %%xmm4 , %%xmm7 \n\t" + +- "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y ++ "movups %%xmm7 , -16(%4,%0,4) \n\t" // 4 * y + + "jnz 1b \n\t" + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From e976557d2965efb687aaaf88e7829bdd9438a7a6 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: 
Sat, 16 Feb 2019 18:36:39 +0100 +Subject: [PATCH 2/4] Fix inline assembly constraints + +rework indices to allow marking argument lda as input and output. +--- + kernel/x86_64/sgemv_n_microk_sandy-4.c | 130 ++++++++++++------------- + 1 file changed, 65 insertions(+), 65 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_sandy-4.c b/kernel/x86_64/sgemv_n_microk_sandy-4.c +index b35daa35b..3fc46542b 100644 +--- a/kernel/x86_64/sgemv_n_microk_sandy-4.c ++++ b/kernel/x86_64/sgemv_n_microk_sandy-4.c +@@ -39,14 +39,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + __asm__ __volatile__ + ( + "vzeroupper \n\t" +- "vbroadcastss (%2), %%ymm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 ++ "vbroadcastss (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastss 16(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%ymm3 \n\t" // x7 + + "vbroadcastss (%9), %%ymm6 \n\t" // alpha + +@@ -55,21 +55,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" +- "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y ++ "vmovups (%4,%0,4), %%xmm7 \n\t" // 4 * y + +- "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" +- "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" +- "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" +- "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" ++ "vmulps (%5,%0,4), %%xmm12, %%xmm8 \n\t" ++ "vmulps (%6,%0,4), %%xmm13, %%xmm10 \n\t" ++ "vmulps (%7,%0,4), %%xmm14, %%xmm9 \n\t" ++ "vmulps (%8,%0,4), %%xmm15, %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" + +- "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" +- "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" +- "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" +- "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" ++ "vmulps (%5,%2,4), %%xmm0 , %%xmm8 \n\t" ++ "vmulps (%6,%2,4), %%xmm1 , %%xmm10 \n\t" ++ "vmulps (%7,%2,4), %%xmm2 , %%xmm9 \n\t" ++ "vmulps (%8,%2,4), %%xmm3 , %%xmm11 \n\t" + "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" + "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" + "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" +@@ -79,9 +79,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" + "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" + +- "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, (%4,%0,4) \n\t" // 4 * y + +- "addq $4, %8 \n\t" ++ "addq $4, %2 \n\t" + "addq $4, %0 \n\t" + "subq $4, %1 \n\t" + +@@ -92,21 +92,21 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" +- "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y ++ "vmovups (%4,%0,4), %%ymm7 \n\t" // 8 * y + +- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" +- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" +- "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" +- "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" ++ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" ++ "vmulps 
(%6,%0,4), %%ymm13, %%ymm10 \n\t" ++ "vmulps (%7,%0,4), %%ymm14, %%ymm9 \n\t" ++ "vmulps (%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" +- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" +- "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" +- "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" ++ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" ++ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" ++ "vmulps (%7,%2,4), %%ymm2 , %%ymm9 \n\t" ++ "vmulps (%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" +@@ -116,9 +116,9 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" + "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" + +- "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, (%4,%0,4) \n\t" // 8 * y + +- "addq $8, %8 \n\t" ++ "addq $8, %2 \n\t" + "addq $8, %0 \n\t" + "subq $8, %1 \n\t" + +@@ -134,45 +134,45 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" + "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" + +- "prefetcht0 192(%4,%0,4) \n\t" +- "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" +- "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" + "prefetcht0 192(%5,%0,4) \n\t" +- "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" +- "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" ++ "vmulps (%5,%0,4), %%ymm12, %%ymm8 \n\t" ++ "vmulps 32(%5,%0,4), %%ymm12, %%ymm9 \n\t" ++ "prefetcht0 192(%6,%0,4) \n\t" ++ "vmulps (%6,%0,4), %%ymm13, %%ymm10 \n\t" ++ "vmulps 32(%6,%0,4), %%ymm13, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%6,%0,4) \n\t" +- "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" +- "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" +- "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" +- "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" ++ "vmulps (%7,%0,4), %%ymm14, %%ymm8 \n\t" ++ "vmulps 32(%7,%0,4), %%ymm14, %%ymm9 \n\t" ++ "prefetcht0 192(%8,%0,4) \n\t" ++ "vmulps (%8,%0,4), %%ymm15, %%ymm10 \n\t" ++ "vmulps 32(%8,%0,4), %%ymm15, %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%4,%8,4) \n\t" +- "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" +- "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" +- "prefetcht0 192(%5,%8,4) \n\t" +- "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" +- "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" ++ "prefetcht0 192(%5,%2,4) \n\t" ++ "vmulps (%5,%2,4), %%ymm0 , %%ymm8 \n\t" ++ "vmulps 32(%5,%2,4), %%ymm0 , %%ymm9 \n\t" ++ "prefetcht0 192(%6,%2,4) \n\t" ++ "vmulps (%6,%2,4), %%ymm1 , %%ymm10 \n\t" ++ "vmulps 32(%6,%2,4), %%ymm1 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" + +- "prefetcht0 192(%6,%8,4) \n\t" +- "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" +- "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" +- "prefetcht0 192(%7,%8,4) \n\t" +- "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" +- "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" ++ "prefetcht0 192(%7,%2,4) \n\t" ++ "vmulps (%7,%2,4), %%ymm2 , %%ymm8 
\n\t" ++ "vmulps 32(%7,%2,4), %%ymm2 , %%ymm9 \n\t" ++ "prefetcht0 192(%8,%2,4) \n\t" ++ "vmulps (%8,%2,4), %%ymm3 , %%ymm10 \n\t" ++ "vmulps 32(%8,%2,4), %%ymm3 , %%ymm11 \n\t" + "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" + "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" + "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" +@@ -181,13 +181,13 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" + "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" + +- "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y +- "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y ++ "vaddps (%4,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y ++ "vaddps 32(%4,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y + +- "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y +- "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm4, (%4,%0,4) \n\t" // 8 * y ++ "vmovups %%ymm5, 32(%4,%0,4) \n\t" // 8 * y + +- "addq $16, %8 \n\t" ++ "addq $16, %2 \n\t" + "addq $16, %0 \n\t" + "subq $16, %1 \n\t" + "jnz 1b \n\t" +@@ -197,15 +197,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From efb9038f7273cddc1ef30fce6ed4df7967a2fb03 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:46:17 +0100 +Subject: [PATCH 3/4] Fix inline assembly constraints + +--- + kernel/x86_64/sgemv_n_microk_bulldozer-4.c | 194 ++++++++++----------- + 1 file changed, 97 insertions(+), 97 deletions(-) + +diff --git a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +index 31001c7f3..bbf06c84b 100644 +--- a/kernel/x86_64/sgemv_n_microk_bulldozer-4.c ++++ b/kernel/x86_64/sgemv_n_microk_bulldozer-4.c +@@ -37,14 +37,14 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + __asm__ __volatile__ + ( +- "vbroadcastss (%2), %%xmm12 \n\t" // x0 +- "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 +- "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 +- "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 +- "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 +- "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 +- "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 +- "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 ++ "vbroadcastss (%3), %%xmm12 \n\t" // x0 ++ "vbroadcastss 4(%3), %%xmm13 \n\t" // x1 ++ "vbroadcastss 8(%3), %%xmm14 \n\t" // x2 ++ "vbroadcastss 12(%3), %%xmm15 \n\t" // x3 ++ "vbroadcastss 16(%3), %%xmm0 \n\t" // x4 ++ "vbroadcastss 20(%3), %%xmm1 \n\t" // x5 ++ "vbroadcastss 24(%3), %%xmm2 \n\t" // x6 ++ "vbroadcastss 28(%3), %%xmm3 \n\t" // x7 + + "vbroadcastss (%9), %%xmm8 \n\t" // alpha + +@@ -54,22 +54,22 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%8,%0,4), %%xmm15, %%xmm5 \n\t" + "addq $4 
, %0 \n\t" + +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" +- "addq $4 , %8 \n\t" ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, (%8,%2,4), %%xmm3 , %%xmm5 \n\t" ++ "addq $4 , %2 \n\t" + + "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" +- "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" ++ "vfmaddps -16(%4,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" + "subq $4 , %1 \n\t" +- "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm6, -16(%4,%0,4) \n\t" // 4 * y + + "2: \n\t" + +@@ -79,31 +79,31 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" + "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" + +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" +- +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" +- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" ++ ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + +- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" +- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" +- "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y ++ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" ++ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" ++ "vmovups %%xmm4, (%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5, 16(%4,%0,4) \n\t" // 4 * y + + "addq $8 , %0 \n\t" +- "addq $8 , %8 \n\t" ++ "addq $8 , %2 \n\t" + "subq $8 , %1 \n\t" + + +@@ -120,62 +120,62 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" + "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" + +- "prefetcht0 192(%4,%0,4) \n\t" +- "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" + 
"prefetcht0 192(%5,%0,4) \n\t" +- "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%5,%0,4), %%xmm12, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm12, %%xmm5 \n\t" + "prefetcht0 192(%6,%0,4) \n\t" +- "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" ++ "vfmaddps %%xmm4, (%6,%0,4), %%xmm13, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm13, %%xmm5 \n\t" + "prefetcht0 192(%7,%0,4) \n\t" +- "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" ++ "vfmaddps %%xmm4, (%7,%0,4), %%xmm14, %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm14, %%xmm5 \n\t" ++ "prefetcht0 192(%8,%0,4) \n\t" ++ "vfmaddps %%xmm4, (%8,%0,4), %%xmm15, %%xmm4 \n\t" + ".align 2 \n\t" +- "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" +- +- "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" +- +- "prefetcht0 192(%4,%8,4) \n\t" +- "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" +- "prefetcht0 192(%5,%8,4) \n\t" +- "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" +- "prefetcht0 192(%6,%8,4) \n\t" +- "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" +- "prefetcht0 192(%7,%8,4) \n\t" +- "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" +- "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%0,4), %%xmm15, %%xmm5 \n\t" ++ ++ "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm12, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm12, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm13, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm13, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm14, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm14, %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%8,%0,4), %%xmm15, %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%8,%0,4), %%xmm15, %%xmm7 \n\t" ++ ++ "prefetcht0 192(%5,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%5,%2,4), %%xmm0 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%5,%2,4), %%xmm0 , %%xmm5 \n\t" ++ "prefetcht0 192(%6,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%6,%2,4), %%xmm1 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%6,%2,4), %%xmm1 , %%xmm5 \n\t" ++ "prefetcht0 192(%7,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%7,%2,4), %%xmm2 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%7,%2,4), %%xmm2 , %%xmm5 \n\t" ++ "prefetcht0 192(%8,%2,4) \n\t" ++ "vfmaddps %%xmm4, (%8,%2,4), %%xmm3 , %%xmm4 \n\t" ++ "vfmaddps %%xmm5, 16(%8,%2,4), %%xmm3 , %%xmm5 \n\t" + +- "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" +- "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" +- "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%5,%2,4), %%xmm0 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%5,%2,4), %%xmm0 , %%xmm7 \n\t" ++ "vfmaddps 
%%xmm6, 32(%6,%2,4), %%xmm1 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%6,%2,4), %%xmm1 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%7,%2,4), %%xmm2 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%7,%2,4), %%xmm2 , %%xmm7 \n\t" ++ "vfmaddps %%xmm6, 32(%8,%2,4), %%xmm3 , %%xmm6 \n\t" ++ "vfmaddps %%xmm7, 48(%8,%2,4), %%xmm3 , %%xmm7 \n\t" + +- "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" +- "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" +- "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" +- "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" ++ "vfmaddps (%4,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" ++ "vfmaddps 16(%4,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" ++ "vfmaddps 32(%4,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" ++ "vfmaddps 48(%4,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" + + "addq $16, %0 \n\t" +- "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y +- "addq $16, %8 \n\t" +- "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y +- "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm4,-64(%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm5,-48(%4,%0,4) \n\t" // 4 * y ++ "addq $16, %2 \n\t" ++ "vmovups %%xmm6,-32(%4,%0,4) \n\t" // 4 * y ++ "vmovups %%xmm7,-16(%4,%0,4) \n\t" // 4 * y + + "subq $16, %1 \n\t" + "jnz 1b \n\t" +@@ -184,15 +184,15 @@ static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLO + + : + "+r" (i), // 0 +- "+r" (n) // 1 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 + : +- "r" (x), // 2 +- "r" (y), // 3 +- "r" (ap[0]), // 4 +- "r" (ap[1]), // 5 +- "r" (ap[2]), // 6 +- "r" (ap[3]), // 7 +- "r" (lda4), // 8 ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 + "r" (alpha) // 9 + : "cc", + "%xmm0", "%xmm1", + +From 8242b1fe3f6c3a49b342d99157cd04632267c009 Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 18:51:09 +0100 +Subject: [PATCH 4/4] Fix inline assembly constraints + +--- + dgemv_n_microk_piledriver-4.c | 247 ++++++++++++++++++++++++++++++++++ + 1 file changed, 247 insertions(+) + create mode 100644 dgemv_n_microk_piledriver-4.c + +diff --git a/dgemv_n_microk_piledriver-4.c b/dgemv_n_microk_piledriver-4.c +new file mode 100644 +index 000000000..466931b82 +--- /dev/null ++++ b/dgemv_n_microk_piledriver-4.c +@@ -0,0 +1,247 @@ ++/*************************************************************************** ++Copyright (c) 2014, The OpenBLAS Project ++All rights reserved. ++Redistribution and use in source and binary forms, with or without ++modification, are permitted provided that the following conditions are ++met: ++1. Redistributions of source code must retain the above copyright ++notice, this list of conditions and the following disclaimer. ++2. Redistributions in binary form must reproduce the above copyright ++notice, this list of conditions and the following disclaimer in ++the documentation and/or other materials provided with the ++distribution. ++3. Neither the name of the OpenBLAS project nor the names of ++its contributors may be used to endorse or promote products ++derived from this software without specific prior written permission. ++THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++ARE DISCLAIMED. 
IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE ++LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL ++DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR ++SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER ++CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, ++OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE ++USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. ++*****************************************************************************/ ++ ++ ++ ++#define HAVE_KERNEL_4x8 1 ++static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); ++ ++static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) ++{ ++ ++ BLASLONG register i = 0; ++ ++ __asm__ __volatile__ ++ ( ++ "vzeroupper \n\t" ++ "vbroadcastsd (%3), %%ymm12 \n\t" // x0 ++ "vbroadcastsd 8(%3), %%ymm13 \n\t" // x1 ++ "vbroadcastsd 16(%3), %%ymm14 \n\t" // x2 ++ "vbroadcastsd 24(%3), %%ymm15 \n\t" // x3 ++ "vbroadcastsd 32(%3), %%ymm0 \n\t" // x4 ++ "vbroadcastsd 40(%3), %%ymm1 \n\t" // x5 ++ "vbroadcastsd 48(%3), %%ymm2 \n\t" // x6 ++ "vbroadcastsd 56(%3), %%ymm3 \n\t" // x7 ++ ++ "vbroadcastsd (%9), %%ymm6 \n\t" // alpha ++ ++ "testq $0x04, %1 \n\t" ++ "jz 2f \n\t" ++ ++ "vmovupd (%4,%0,8), %%ymm7 \n\t" // 4 * y ++ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" ++ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" ++ ++ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" ++ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm5 \n\t" ++ ++ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" ++ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" ++ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" ++ ++ ++ "vmovupd %%ymm5, (%4,%0,8) \n\t" // 4 * y ++ ++ "addq $4 , %2 \n\t" ++ "addq $4 , %0 \n\t" ++ "subq $4 , %1 \n\t" ++ ++ "2: \n\t" ++ ++ "cmpq $0, %1 \n\t" ++ "je 3f \n\t" ++ ++ ++ ".align 16 \n\t" ++ "1: \n\t" ++ ++ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" ++ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" ++ "vmovupd (%4,%0,8), %%ymm8 \n\t" // 4 * y ++ "vmovupd 32(%4,%0,8), %%ymm9 \n\t" // 4 * y ++ ++ "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd 32(%5,%0,8), %%ymm12, %%ymm5 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm13, %%ymm4 \n\t" ++ "vfmadd231pd 32(%6,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd 32(%7,%0,8), %%ymm14, %%ymm5 \n\t" ++ "vfmadd231pd (%8,%0,8), %%ymm15, %%ymm4 \n\t" ++ "vfmadd231pd 32(%8,%0,8), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231pd (%5,%2,8), %%ymm0 , %%ymm4 \n\t" ++ "addq $8 , %0 \n\t" ++ "vfmadd231pd 32(%5,%2,8), %%ymm0 , %%ymm5 \n\t" ++ "vfmadd231pd (%6,%2,8), %%ymm1 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%6,%2,8), %%ymm1 , %%ymm5 \n\t" ++ "vfmadd231pd (%7,%2,8), %%ymm2 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%7,%2,8), %%ymm2 , %%ymm5 \n\t" ++ "vfmadd231pd (%8,%2,8), %%ymm3 , %%ymm4 \n\t" ++ "vfmadd231pd 32(%8,%2,8), %%ymm3 , %%ymm5 \n\t" ++ ++ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" ++ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" ++ ++ "addq $8 , %2 \n\t" ++ "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y ++ "subq $8 , %1 \n\t" ++ "vmovupd %%ymm9,-32(%4,%0,8) \n\t" // 4 * y ++ ++ "jnz 
1b \n\t" ++ ++ "3: \n\t" ++ "vzeroupper \n\t" ++ ++ : ++ "+r" (i), // 0 ++ "+r" (n), // 1 ++ "+r" (lda4) // 2 ++ : ++ "r" (x), // 3 ++ "r" (y), // 4 ++ "r" (ap[0]), // 5 ++ "r" (ap[1]), // 6 ++ "r" (ap[2]), // 7 ++ "r" (ap[3]), // 8 ++ "r" (alpha) // 9 ++ : "cc", ++ "%xmm0", "%xmm1", ++ "%xmm2", "%xmm3", ++ "%xmm4", "%xmm5", ++ "%xmm6", "%xmm7", ++ "%xmm8", "%xmm9", ++ "%xmm12", "%xmm13", "%xmm14", "%xmm15", ++ "memory" ++ ); ++ ++} ++ ++ ++ ++#define HAVE_KERNEL_4x4 1 ++static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); ++ ++static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) ++{ ++ ++ BLASLONG register i = 0; ++ ++ __asm__ __volatile__ ++ ( ++ "vzeroupper \n\t" ++ "vbroadcastsd (%2), %%ymm12 \n\t" // x0 ++ "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 ++ "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 ++ "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 ++ ++ "vbroadcastsd (%8), %%ymm6 \n\t" // alpha ++ ++ "testq $0x04, %1 \n\t" ++ "jz 2f \n\t" ++ ++ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" ++ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" ++ "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y ++ ++ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" ++ ++ "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" ++ "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" ++ "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" ++ ++ "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y ++ ++ "addq $4 , %0 \n\t" ++ "subq $4 , %1 \n\t" ++ ++ "2: \n\t" ++ ++ "cmpq $0, %1 \n\t" ++ "je 3f \n\t" ++ ++ ++ ".align 16 \n\t" ++ "1: \n\t" ++ "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" ++ "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" ++ "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y ++ "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y ++ ++ "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" ++ "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" ++ "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" ++ "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" ++ "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" ++ "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" ++ "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" ++ "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" ++ ++ "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" ++ "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" ++ ++ "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y ++ "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y ++ ++ "addq $8 , %0 \n\t" ++ "subq $8 , %1 \n\t" ++ "jnz 1b \n\t" ++ ++ "3: \n\t" ++ "vzeroupper \n\t" ++ ++ : ++ "+r" (i), // 0 ++ "+r" (n) // 1 ++ : ++ "r" (x), // 2 ++ "r" (y), // 3 ++ "r" (ap[0]), // 4 ++ "r" (ap[1]), // 5 ++ "r" (ap[2]), // 6 ++ "r" (ap[3]), // 7 ++ "r" (alpha) // 8 ++ : "cc", ++ "%xmm4", "%xmm5", ++ "%xmm6", "%xmm7", ++ "%xmm8", "%xmm9", ++ "%xmm12", "%xmm13", "%xmm14", "%xmm15", ++ "memory" ++ ); ++ ++} ++ ++ diff --git a/2024.patch b/2024.patch new file mode 100644 index 0000000..720a9e2 --- /dev/null +++ b/2024.patch @@ -0,0 +1,1349 @@ +From f9bb76d29af48f448a8ab2bdfffc962d9623a3df Mon Sep 17 00:00:00 2001 +From: Martin Kroeker +Date: Sat, 16 Feb 2019 20:06:48 +0100 +Subject: [PATCH] Fix inline assembly constraints in Bulldozer TRSM kernels + +rework indices to allow marking i,as and bs as both input and output (marked operand n1 as well for simplicity). 
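The constraint problem that this series of patches corrects is easiest to see outside the BLAS kernels. Below is a minimal standalone sketch, not taken from any patch in this series; count_down() and its operand names are hypothetical, invented for illustration only. An asm body that modifies an operand must declare it as input/output ("+r") in the output list; with a plain input constraint ("r"), the compiler is entitled to assume the register still holds its original value after the asm statement, an assumption GCC 9's more aggressive optimizer actually exploits.

#include <stdio.h>

// Minimal sketch, hypothetical code (not OpenBLAS): the asm writes both
// i and n, so both appear in the output list with "+r".  Passing them as
// plain "r" inputs, as the old kernels did, is invalid and can miscompile
// under GCC 9.
static long count_down(long n)
{
        long i = 0;

        __asm__ __volatile__
        (
        "1:                    \n\t"
        "addq   $1 , %0        \n\t"   // i++
        "subq   $1 , %1        \n\t"   // n--, sets ZF for the branch
        "jnz    1b             \n\t"

        :
          "+r" (i),     // 0   read and written -> output list, "+r"
          "+r" (n)      // 1   read and written -> output list, "+r"
        :
        : "cc"
        );

        return i;
}

int main(void)
{
        printf("%ld\n", count_down(5));        // prints 5
        return 0;
}

The hunks below apply exactly this rewrite: the operands the asm writes (the counter i and the running pointers as and bs, plus n1 for simplicity) move to the front of the operand list as "+r"/"+a" outputs, while the read-only pointers (c, c1, a, b) keep plain "r" input constraints with renumbered indices.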
For #2009 +--- + kernel/x86_64/dtrsm_kernel_RT_bulldozer.c | 96 ++++---- + kernel/x86_64/strsm_kernel_LN_bulldozer.c | 252 ++++++++++----------- + kernel/x86_64/strsm_kernel_LT_bulldozer.c | 256 +++++++++++----------- + kernel/x86_64/strsm_kernel_RN_bulldozer.c | 54 ++--- + kernel/x86_64/strsm_kernel_RT_bulldozer.c | 54 ++--- + 5 files changed, 356 insertions(+), 356 deletions(-) + +diff --git a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +index 54df5b359..35ed4cc01 100644 +--- a/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c ++++ b/kernel/x86_64/dtrsm_kernel_RT_bulldozer.c +@@ -125,14 +125,14 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " prefetcht0 384(%3,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " prefetcht0 384(%7,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -147,13 +147,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -168,13 +168,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovddup 8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -189,13 +189,13 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + " jz 2f \n\t" + +- " prefetcht0 384(%2,%1,8) \n\t" +- " vmovddup (%3,%1,2), %%xmm0 \n\t" // read b +- " vmovddup 8(%3,%1,2), %%xmm1 \n\t" +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " prefetcht0 384(%6,%1,8) \n\t" ++ " vmovddup (%7,%1,2), %%xmm0 \n\t" // read b ++ " vmovddup 
8(%7,%1,2), %%xmm1 \n\t" ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddpd %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddpd %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -235,18 +235,18 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 1 + +- " vmovddup (%7), %%xmm1 \n\t" // read b +- " vmovddup 8(%7), %%xmm0 \n\t" // read bb ++ " vmovddup (%3), %%xmm1 \n\t" // read b ++ " vmovddup 8(%3), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulpd %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulpd %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulpd %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -259,20 +259,20 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddpd %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" + + " \n\t" // i = 0 +- " subq $16 , %7 \n\t" // b = b - 2 +- " subq $64 , %6 \n\t" // a = a - 8 ++ " subq $16 , %3 \n\t" // b = b - 2 ++ " subq $64 , %2 \n\t" // a = a - 8 + +- " vmovddup (%7), %%xmm0 \n\t" // read bb ++ " vmovddup (%3), %%xmm0 \n\t" // read bb + + " vmulpd %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulpd %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulpd %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulpd %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups %%xmm11 , 48(%6) \n\t" ++ " vmovups %%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -282,15 +282,15 @@ static void dtrsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_LN_bulldozer.c b/kernel/x86_64/strsm_kernel_LN_bulldozer.c +index 1b8991c6c..3cd215000 100644 +--- a/kernel/x86_64/strsm_kernel_LN_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_LN_bulldozer.c +@@ -126,12 +126,12 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 
\n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -171,20 +171,20 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" + +- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] ++ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -194,23 +194,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] ++ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -220,23 +220,23 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] ++ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] 
+ " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -246,22 +246,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] ++ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -269,22 +269,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] ++ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] 
= bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -292,22 +292,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] ++ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -315,22 +315,22 @@ static void strsm_LN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9 , read aa[i] ++ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9 , read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -338,179 +338,179 @@ static void strsm_LN_solve_opt(BLASLONG n, 
FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8 , read aa[i] ++ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8 , read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7 , read aa[i] ++ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7 , read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6 , read aa[i] ++ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6 , read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // 
read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5 , read aa[i] ++ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5 , read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4 , read aa[i] ++ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4 , read aa[i] + " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3 , read aa[i] ++ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3 , read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " 
subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2 , read aa[i] ++ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2 , read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1 , read aa[i] ++ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1 , read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + +- " subq $64 , %6 \n\t" // a -= m +- " subq $8 , %7 \n\t" // b -= n ++ " subq $64 , %2 \n\t" // a -= m ++ " subq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0 , read aa[i] ++ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0 , read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 + "r" (c), // 4 + "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_LT_bulldozer.c b/kernel/x86_64/strsm_kernel_LT_bulldozer.c +index 0623dddb0..a4a62491c 100644 +--- a/kernel/x86_64/strsm_kernel_LT_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_LT_bulldozer.c +@@ -121,12 +121,12 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" 
+ "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -166,20 +166,20 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" + +- " vbroadcastss 0(%6) , %%xmm0 \n\t" // i=0, read aa[i] ++ " vbroadcastss 0(%2) , %%xmm0 \n\t" // i=0, read aa[i] + " vshufps $0x00 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 0(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 0(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -189,23 +189,23 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 4(%6) , %%xmm0 \n\t" // i=1, read aa[i] ++ " vbroadcastss 4(%2) , %%xmm0 \n\t" // i=1, read aa[i] + " vshufps $0x55 , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 4(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 4(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -215,23 +215,23 @@ 
static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 8(%6) , %%xmm0 \n\t" // i=2, read aa[i] ++ " vbroadcastss 8(%2) , %%xmm0 \n\t" // i=2, read aa[i] + " vshufps $0xaa , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 8(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 8(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 0(%6) , %%xmm4 \n\t" // read a[k] +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 0(%2) , %%xmm4 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm8 , %%xmm1 , %%xmm4 , %%xmm8 \n\t" + " vfnmaddps %%xmm12 , %%xmm2 , %%xmm4 , %%xmm12 \n\t" + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" +@@ -241,22 +241,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 12(%6) , %%xmm0 \n\t" // i=3, read aa[i] ++ " vbroadcastss 12(%2) , %%xmm0 \n\t" // i=3, read aa[i] + " vshufps $0xff , %%xmm8 , %%xmm8 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm12 , %%xmm12 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 12(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 12(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -264,22 +264,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 16(%6) , %%xmm0 \n\t" // i=4, read aa[i] ++ " vbroadcastss 16(%2) , %%xmm0 \n\t" // i=4, read aa[i] + " vshufps $0x00 , %%xmm9 , %%xmm9 , %%xmm1 
\n\t" // extract bb0 + " vshufps $0x00 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 16(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 16(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -287,22 +287,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 20(%6) , %%xmm0 \n\t" // i=5, read aa[i] ++ " vbroadcastss 20(%2) , %%xmm0 \n\t" // i=5, read aa[i] + " vshufps $0x55 , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 20(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 20(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -310,22 +310,22 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 24(%6) , %%xmm0 \n\t" // i=6, read aa[i] ++ " vbroadcastss 24(%2) , %%xmm0 \n\t" // i=6, read aa[i] + " vshufps $0xaa , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 24(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 24(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 16(%6) , %%xmm5 \n\t" // read a[k] +- " vmovups 32(%6) , %%xmm6 
\n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 16(%2) , %%xmm5 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm9 , %%xmm1 , %%xmm5 , %%xmm9 \n\t" + " vfnmaddps %%xmm13 , %%xmm2 , %%xmm5 , %%xmm13 \n\t" + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" +@@ -333,179 +333,179 @@ static void strsm_LT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 28(%6) , %%xmm0 \n\t" // i=7, read aa[i] ++ " vbroadcastss 28(%2) , %%xmm0 \n\t" // i=7, read aa[i] + " vshufps $0xff , %%xmm9 , %%xmm9 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm13 , %%xmm13 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 28(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 28(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 32(%6) , %%xmm0 \n\t" // i=8, read aa[i] ++ " vbroadcastss 32(%2) , %%xmm0 \n\t" // i=8, read aa[i] + " vshufps $0x00 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 32(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 32(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 36(%6) , %%xmm0 \n\t" // i=9, read aa[i] ++ " vbroadcastss 36(%2) , %%xmm0 \n\t" // i=9, read aa[i] + " vshufps $0x55 , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // 
bb1 * aa + " vmovss %%xmm1 , 36(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 36(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 40(%6) , %%xmm0 \n\t" // i=10, read aa[i] ++ " vbroadcastss 40(%2) , %%xmm0 \n\t" // i=10, read aa[i] + " vshufps $0xaa , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 40(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 40(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 32(%6) , %%xmm6 \n\t" // read a[k] +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 32(%2) , %%xmm6 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm10 , %%xmm1 , %%xmm6 , %%xmm10 \n\t" + " vfnmaddps %%xmm14 , %%xmm2 , %%xmm6 , %%xmm14 \n\t" + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 44(%6) , %%xmm0 \n\t" // i=11, read aa[i] ++ " vbroadcastss 44(%2) , %%xmm0 \n\t" // i=11, read aa[i] + " vshufps $0xff , %%xmm10 , %%xmm10 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm14 , %%xmm14 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 44(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 44(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 48(%6) , %%xmm0 \n\t" // i=12, read aa[i] ++ " vbroadcastss 48(%2) , %%xmm0 \n\t" // i=12, read aa[i] + " vshufps $0x00 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x00 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 48(%4) \n\t" // c[i] = bb0 * aa 
+ " vmovss %%xmm2 , 48(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 52(%6) , %%xmm0 \n\t" // i=13, read aa[i] ++ " vbroadcastss 52(%2) , %%xmm0 \n\t" // i=13, read aa[i] + " vshufps $0x55 , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0x55 , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 52(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 52(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 56(%6) , %%xmm0 \n\t" // i=14, read aa[i] ++ " vbroadcastss 56(%2) , %%xmm0 \n\t" // i=14, read aa[i] + " vshufps $0xaa , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xaa , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 56(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 56(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + +- " vmovups 48(%6) , %%xmm7 \n\t" // read a[k] ++ " vmovups 48(%2) , %%xmm7 \n\t" // read a[k] + " vfnmaddps %%xmm11 , %%xmm1 , %%xmm7 , %%xmm11 \n\t" + " vfnmaddps %%xmm15 , %%xmm2 , %%xmm7 , %%xmm15 \n\t" + +- " addq $64 , %6 \n\t" // a -= m +- " addq $8 , %7 \n\t" // b -= n ++ " addq $64 , %2 \n\t" // a -= m ++ " addq $8 , %3 \n\t" // b -= n + +- " vbroadcastss 60(%6) , %%xmm0 \n\t" // i=15, read aa[i] ++ " vbroadcastss 60(%2) , %%xmm0 \n\t" // i=15, read aa[i] + " vshufps $0xff , %%xmm11 , %%xmm11 , %%xmm1 \n\t" // extract bb0 + " vshufps $0xff , %%xmm15 , %%xmm15 , %%xmm2 \n\t" // extract bb1 + " vmulps %%xmm0 , %%xmm1 , %%xmm1 \n\t" // bb0 * aa + " vmulps %%xmm0 , %%xmm2 , %%xmm2 \n\t" // bb1 * aa + " vmovss %%xmm1 , 60(%4) \n\t" // c[i] = bb0 * aa + " vmovss %%xmm2 , 60(%5) \n\t" // c[i] = bb1 * aa +- " vmovss %%xmm1 , (%7) \n\t" // b[0] = bb0 * aa +- " vmovss %%xmm2 , 4(%7) \n\t" // b[1] = bb1 * aa ++ " vmovss %%xmm1 , (%3) \n\t" // b[0] = bb0 * aa ++ " vmovss %%xmm2 , 4(%3) \n\t" // b[1] = bb1 * aa + + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" 
(as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_RN_bulldozer.c b/kernel/x86_64/strsm_kernel_RN_bulldozer.c +index 4cc557d55..c11c84cec 100644 +--- a/kernel/x86_64/strsm_kernel_RN_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_RN_bulldozer.c +@@ -121,12 +121,12 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -166,18 +166,18 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 0 + +- " vbroadcastss (%7), %%xmm0 \n\t" // read bb +- " vbroadcastss 4(%7), %%xmm1 \n\t" // read b ++ " vbroadcastss (%3), %%xmm0 \n\t" // read bb ++ " vbroadcastss 4(%3), %%xmm1 \n\t" // read b + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups %%xmm11 , 48(%6) \n\t" ++ " vmovups %%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -190,20 +190,20 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm15 , %%xmm11 , %%xmm1 , %%xmm15 \n\t" + + " \n\t" // i = 1 +- " addq $8 , %7 \n\t" // b = b + 2 +- " addq $64 , %6 \n\t" // a = a + 16 ++ " addq $8 , %3 \n\t" // b = b + 2 ++ " addq $64 , %2 \n\t" // a = a + 16 + +- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -213,15 +213,15 @@ static void strsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" 
(a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", + "%xmm4", "%xmm5", "%xmm6", "%xmm7", +diff --git a/kernel/x86_64/strsm_kernel_RT_bulldozer.c b/kernel/x86_64/strsm_kernel_RT_bulldozer.c +index 73f6e8a95..326ca2976 100644 +--- a/kernel/x86_64/strsm_kernel_RT_bulldozer.c ++++ b/kernel/x86_64/strsm_kernel_RT_bulldozer.c +@@ -125,12 +125,12 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " .align 16 \n\t" + "1: \n\t" + +- " vbroadcastss (%3,%1,1), %%xmm0 \n\t" // read b +- " vmovups (%2,%1,8), %%xmm4 \n\t" +- " vbroadcastss 4(%3,%1,1), %%xmm1 \n\t" +- " vmovups 16(%2,%1,8), %%xmm5 \n\t" +- " vmovups 32(%2,%1,8), %%xmm6 \n\t" +- " vmovups 48(%2,%1,8), %%xmm7 \n\t" ++ " vbroadcastss (%7,%1,1), %%xmm0 \n\t" // read b ++ " vmovups (%6,%1,8), %%xmm4 \n\t" ++ " vbroadcastss 4(%7,%1,1), %%xmm1 \n\t" ++ " vmovups 16(%6,%1,8), %%xmm5 \n\t" ++ " vmovups 32(%6,%1,8), %%xmm6 \n\t" ++ " vmovups 48(%6,%1,8), %%xmm7 \n\t" + + " vfmaddps %%xmm8 , %%xmm0 , %%xmm4 , %%xmm8 \n\t" + " vfmaddps %%xmm12, %%xmm1 , %%xmm4 , %%xmm12 \n\t" +@@ -170,18 +170,18 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + + "3: \n\t" // i = 1 + +- " vbroadcastss (%7), %%xmm1 \n\t" // read b +- " vbroadcastss 4(%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss (%3), %%xmm1 \n\t" // read b ++ " vbroadcastss 4(%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm12 , %%xmm0 , %%xmm12 \n\t" // aa * bb + " vmulps %%xmm13 , %%xmm0 , %%xmm13 \n\t" // aa * bb + " vmulps %%xmm14 , %%xmm0 , %%xmm14 \n\t" // aa * bb + " vmulps %%xmm15 , %%xmm0 , %%xmm15 \n\t" // aa * bb + +- " vmovups %%xmm12 , (%6) \n\t" // write a +- " vmovups %%xmm13 , 16(%6) \n\t" // write a +- " vmovups %%xmm14 , 32(%6) \n\t" // write a +- " vmovups %%xmm15 , 48(%6) \n\t" // write a ++ " vmovups %%xmm12 , (%2) \n\t" // write a ++ " vmovups %%xmm13 , 16(%2) \n\t" // write a ++ " vmovups %%xmm14 , 32(%2) \n\t" // write a ++ " vmovups %%xmm15 , 48(%2) \n\t" // write a + + " vmovups %%xmm12 , (%5) \n\t" // write c1 + " vmovups %%xmm13 , 16(%5) \n\t" +@@ -194,20 +194,20 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vfnmaddps %%xmm11 , %%xmm15 , %%xmm1 , %%xmm11 \n\t" + + " \n\t" // i = 0 +- " subq $8 , %7 \n\t" // b = b - 2 +- " subq $64 , %6 \n\t" // a = a - 16 ++ " subq $8 , %3 \n\t" // b = b - 2 ++ " subq $64 , %2 \n\t" // a = a - 16 + +- " vbroadcastss (%7), %%xmm0 \n\t" // read bb ++ " vbroadcastss (%3), %%xmm0 \n\t" // read bb + + " vmulps %%xmm8 , %%xmm0 , %%xmm8 \n\t" // aa * bb + " vmulps %%xmm9 , %%xmm0 , %%xmm9 \n\t" + " vmulps %%xmm10 , %%xmm0 , %%xmm10 \n\t" + " vmulps %%xmm11 , %%xmm0 , %%xmm11 \n\t" + +- " vmovups %%xmm8 , (%6) \n\t" // write a +- " vmovups %%xmm9 , 16(%6) \n\t" +- " vmovups %%xmm10 , 32(%6) \n\t" +- " vmovups %%xmm11 , 48(%6) \n\t" ++ " vmovups %%xmm8 , (%2) \n\t" // write a ++ " vmovups %%xmm9 , 16(%2) \n\t" ++ " vmovups %%xmm10 , 32(%2) \n\t" ++ " vmovups %%xmm11 , 48(%2) \n\t" + + " vmovups %%xmm8 , (%4) \n\t" // write c0 + " vmovups %%xmm9 , 16(%4) \n\t" +@@ -217,15 +217,15 @@ static void strsm_RT_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON + " vzeroupper \n\t" + + : ++ "+r" (n1), // 0 ++ "+a" (i), // 1 ++ "+r" (as), // 2 ++ "+r" (bs) // 3 + : +- "r" (n1), // 0 +- "a" (i), // 1 +- "r" (a), // 2 +- "r" (b), // 3 +- "r" (c), // 4 +- "r" (c1), // 5 +- "r" (as), // 6 +- "r" (bs) // 7 ++ "r" (c), // 4 ++ "r" (c1), // 5 ++ "r" (a), // 6 ++ "r" (b) // 7 + : "cc", + "%xmm0", "%xmm1", "%xmm2", "%xmm3", 
+ "%xmm4", "%xmm5", "%xmm6", "%xmm7", diff --git a/openblas.spec b/openblas.spec index 495358c..6d8fd95 100644 --- a/openblas.spec +++ b/openblas.spec @@ -15,7 +15,7 @@ Name: openblas Version: 0.3.5 -Release: 2%{?dist} +Release: 3%{?dist} Summary: An optimized BLAS library based on GotoBLAS2 License: BSD URL: https://github.com/xianyi/OpenBLAS/ @@ -29,6 +29,14 @@ Patch2: openblas-0.2.15-constructor.patch # Supply the proper flags to the test makefile Patch3: openblas-0.3.2-tests.patch +# Fix assembly code +Patch10: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2010.patch +Patch11: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2018.patch +Patch12: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2019.patch +Patch13: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2021.patch +Patch14: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2023.patch +Patch15: https://patch-diff.githubusercontent.com/raw/xianyi/OpenBLAS/pull/2024.patch + BuildRequires: gcc BuildRequires: gcc-gfortran BuildRequires: perl-devel @@ -239,6 +247,13 @@ cd OpenBLAS-%{version} %endif %patch3 -p1 -b .tests +%patch10 -p0 +%patch11 -p0 +%patch12 -p0 +%patch13 -p0 +%patch14 -p0 +%patch15 -p0 + # Fix source permissions find -name \*.f -exec chmod 644 {} \; @@ -674,6 +689,9 @@ rm -rf %{buildroot}%{_libdir}/pkgconfig %endif %changelog +* Sun Feb 17 2019 Susi Lehtola - 0.3.5-3 +- Patch assembly kernels to satisfy gcc 9 demands. + * Fri Feb 01 2019 Fedora Release Engineering - 0.3.5-2 - Rebuilt for https://fedoraproject.org/wiki/Fedora_30_Mass_Rebuild