From c26c0b77a7ef7f1e71b7415efeae15a0e61a244a Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
Date: Fri, 15 Feb 2019 15:08:16 +0100
Subject: [PATCH] Fix wrong constraints in inline assembly

for #2009
---
 kernel/x86_64/dtrsm_kernel_RN_haswell.c | 98 ++++++++++++-------------
 1 file changed, 49 insertions(+), 49 deletions(-)

diff --git a/kernel/x86_64/dtrsm_kernel_RN_haswell.c b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
index fcab8e2c7..9ab78fc8e 100644
--- a/kernel/x86_64/dtrsm_kernel_RN_haswell.c
+++ b/kernel/x86_64/dtrsm_kernel_RN_haswell.c
@@ -119,9 +119,9 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" cmpq $0, %0 \n\t"
 	" je 4f \n\t"

-	" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
-	" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
-	" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+	" vmovups (%8,%1,4), %%ymm0 \n\t" // read a
+	" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
+	" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1

 	" addq $8, %1 \n\t"
@@ -131,18 +131,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" .p2align 4 \n\t"
 	"1: \n\t"

-	" vmovups (%2,%1,4), %%ymm4 \n\t" // read a
+	" vmovups (%8,%1,4), %%ymm4 \n\t" // read a
 	" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"

 	" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm8 \n\t"
 	" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm12 \n\t"

-	" vmovups (%3,%1,8), %%ymm5 \n\t" // read b0
+	" vmovups (%9,%1,8), %%ymm5 \n\t" // read b0
 	" vfmadd231pd %%ymm3 , %%ymm1 , %%ymm9 \n\t"
 	" vfmadd231pd %%ymm3 , %%ymm2 , %%ymm13 \n\t"

 	" vpermpd $0x1b , %%ymm3 , %%ymm0 \n\t"
-	" vmovups 32(%3,%1,8), %%ymm6 \n\t" // read b1
+	" vmovups 32(%9,%1,8), %%ymm6 \n\t" // read b1
 	" vpermpd $0xb1 , %%ymm0 , %%ymm3 \n\t"
 	" vfmadd231pd %%ymm0 , %%ymm1 , %%ymm10 \n\t"
 	" vfmadd231pd %%ymm0 , %%ymm2 , %%ymm14 \n\t"
@@ -155,18 +155,18 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

 	" jz 22f \n\t"

-	" vmovups (%2,%1,4), %%ymm0 \n\t" // read a
+	" vmovups (%8,%1,4), %%ymm0 \n\t" // read a

 	" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm8 \n\t"
 	" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm12 \n\t"

 	" vpermpd $0xb1 , %%ymm4 , %%ymm4 \n\t"
-	" vmovups (%3,%1,8), %%ymm1 \n\t" // read b0
+	" vmovups (%9,%1,8), %%ymm1 \n\t" // read b0
 	" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm9 \n\t"
 	" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm13 \n\t"

 	" vpermpd $0x1b , %%ymm4 , %%ymm4 \n\t"
-	" vmovups 32(%3,%1,8), %%ymm2 \n\t" // read b1
+	" vmovups 32(%9,%1,8), %%ymm2 \n\t" // read b1
 	" vfmadd231pd %%ymm4 , %%ymm5 , %%ymm10 \n\t"
 	" vfmadd231pd %%ymm4 , %%ymm6 , %%ymm14 \n\t"
@@ -268,7 +268,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vmovups (%6,%7,1) , %%ymm7 \n\t" // read c7

 	" vsubpd %%ymm8 , %%ymm0 , %%ymm8 \n\t"
-	" vmovups (%9), %%ymm0 \n\t"
+	" vmovups (%3), %%ymm0 \n\t"
 	" vsubpd %%ymm9 , %%ymm1 , %%ymm9 \n\t"
 	" vpermpd $0x55 , %%ymm0 , %%ymm1 \n\t"
 	" vsubpd %%ymm10, %%ymm2 , %%ymm10 \n\t"
@@ -278,7 +278,7 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0x00 , %%ymm0 , %%ymm0 \n\t"

 	" vsubpd %%ymm12, %%ymm4 , %%ymm12 \n\t"
-	" vmovups 32(%9), %%ymm4 \n\t"
+	" vmovups 32(%3), %%ymm4 \n\t"
 	" vsubpd %%ymm13, %%ymm5 , %%ymm13 \n\t"
 	" vpermpd $0x55 , %%ymm4 , %%ymm5 \n\t"
 	" vsubpd %%ymm14, %%ymm6 , %%ymm14 \n\t"
@@ -290,15 +290,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON

 	"5: \n\t" // i = 0

-	" addq $64, %9 \n\t" // b=b+8
+	" addq $64, %3 \n\t" // b=b+8

 	" vmulpd %%ymm8 , %%ymm0, %%ymm8 \n\t" // a *bb
-	" vmovups (%9), %%ymm0 \n\t"
-	" vmovups %%ymm8 , (%8) \n\t" // write a
+	" vmovups (%3), %%ymm0 \n\t"
+	" vmovups %%ymm8 , (%2) \n\t" // write a
 	" vmovups %%ymm8 , (%4) \n\t" // write c

 	" vfnmadd231pd %%ymm8 , %%ymm1 , %%ymm9 \n\t"
-	" vmovups 32(%9), %%ymm1 \n\t"
+	" vmovups 32(%3), %%ymm1 \n\t"
 	" vfnmadd231pd %%ymm8 , %%ymm2 , %%ymm10 \n\t"
 	" vpermpd $0xaa , %%ymm0 , %%ymm2 \n\t"
 	" vfnmadd231pd %%ymm8 , %%ymm3 , %%ymm11 \n\t"
@@ -313,15 +313,15 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
 	" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"

-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8


 	" vmulpd %%ymm9 , %%ymm0, %%ymm9 \n\t" // a *bb
-	" vmovups (%9), %%ymm0 \n\t"
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm9 , (%8) \n\t" // write a
+	" vmovups (%3), %%ymm0 \n\t"
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm9 , (%2) \n\t" // write a
 	" vmovups %%ymm9 , (%4,%7,1) \n\t" // write c

 	" vfnmadd231pd %%ymm9 , %%ymm2 , %%ymm10 \n\t"
@@ -337,13 +337,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
 	" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"

-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8

 	" vmulpd %%ymm10, %%ymm0, %%ymm10 \n\t" // a *bb
-	" vmovups (%9), %%ymm0 \n\t"
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm10, (%8) \n\t" // write a
+	" vmovups (%3), %%ymm0 \n\t"
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm10, (%2) \n\t" // write a
 	" vmovups %%ymm10, (%4,%7,2) \n\t" // write c

 	" vfnmadd231pd %%ymm10, %%ymm3 , %%ymm11 \n\t"
@@ -358,14 +358,14 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0x00 , %%ymm1 , %%ymm4 \n\t"


-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8


 	" vmulpd %%ymm11, %%ymm0, %%ymm11 \n\t" // a *bb
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm11, (%8) \n\t" // write a
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm11, (%2) \n\t" // write a
 	" vmovups %%ymm11, (%5) \n\t" // write c

 	" vfnmadd231pd %%ymm11, %%ymm4 , %%ymm12 \n\t"
@@ -378,13 +378,13 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0x00 , %%ymm1 , %%ymm0 \n\t"


-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8

 	" vmulpd %%ymm12, %%ymm0, %%ymm12 \n\t" // a *bb
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm12, (%8) \n\t" // write a
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm12, (%2) \n\t" // write a
 	" vmovups %%ymm12, (%5,%7,1) \n\t" // write c

 	" vfnmadd231pd %%ymm12, %%ymm5 , %%ymm13 \n\t"
@@ -394,12 +394,12 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0xff , %%ymm1 , %%ymm7 \n\t"
 	" vpermpd $0x55 , %%ymm1 , %%ymm0 \n\t"

-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8

 	" vmulpd %%ymm13, %%ymm0, %%ymm13 \n\t" // a *bb
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm13, (%8) \n\t" // write a
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm13, (%2) \n\t" // write a
 	" vmovups %%ymm13, (%5,%7,2) \n\t" // write c

 	" vfnmadd231pd %%ymm13, %%ymm6 , %%ymm14 \n\t"
@@ -408,39 +408,39 @@ static void dtrsm_RN_solve_opt(BLASLONG n, FLOAT *a, FLOAT *b, FLOAT *c, BLASLON
 	" vpermpd $0xaa , %%ymm1 , %%ymm0 \n\t"


-	" addq $64, %9 \n\t" // b=b+8
-	" addq $32, %8 \n\t" // a=a+8
+	" addq $64, %3 \n\t" // b=b+8
+	" addq $32, %2 \n\t" // a=a+8


 	" vmulpd %%ymm14, %%ymm0, %%ymm14 \n\t" // a *bb
-	" vmovups 32(%9), %%ymm1 \n\t"
-	" vmovups %%ymm14, (%8) \n\t" // write a
+	" vmovups 32(%3), %%ymm1 \n\t"
+	" vmovups %%ymm14, (%2) \n\t" // write a
 	" vmovups %%ymm14, (%6) \n\t" // write c

 	" vfnmadd231pd %%ymm14, %%ymm7 , %%ymm15 \n\t"

 	" vpermpd $0xff , %%ymm1 , %%ymm0 \n\t"

-	" addq $32, %8 \n\t" // a=a+8
+	" addq $32, %2 \n\t" // a=a+8

 	" vmulpd %%ymm15, %%ymm0, %%ymm15 \n\t" // a *bb
-	" vmovups %%ymm15, (%8) \n\t" // write a
+	" vmovups %%ymm15, (%2) \n\t" // write a
 	" vmovups %%ymm15, (%6,%7,1) \n\t" // write c

 	" vzeroupper \n\t"

 	:
+	  "+r" (n1),  // 0
+	  "+a" (i),   // 1
+	  "+r" (as),  // 2
+	  "+r" (bs)   // 3
 	:
-	  "r" (n1),   // 0
-	  "a" (i),    // 1
-	  "r" (a),    // 2
-	  "r" (b),    // 3
 	  "r" (c),    // 4
 	  "r" (c3),   // 5
 	  "r" (c6),   // 6
 	  "r" (ldc),  // 7
-	  "r" (as),   // 8
-	  "r" (bs)    // 9
+	  "r" (a),    // 8
+	  "r" (b)     // 9
 	: "cc",
 	  "%xmm0", "%xmm1", "%xmm2", "%xmm3",
 	  "%xmm4", "%xmm5", "%xmm6", "%xmm7",