diff --git a/openblas-0.2.19-fix_register_clobbers.patch b/openblas-0.2.19-fix_register_clobbers.patch index 5dc5598..b51d646 100644 --- a/openblas-0.2.19-fix_register_clobbers.patch +++ b/openblas-0.2.19-fix_register_clobbers.patch @@ -1,38 +1,107 @@ -From b8c0a1f7e25aa18d97e8a330764fc5464939b036 Mon Sep 17 00:00:00 2001 -From: Martin Kroeker -Date: Fri, 3 Feb 2017 21:17:33 +0100 -Subject: [PATCH] Fix register clobbers +From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001 +From: Alan Modra +Date: Thu, 9 Feb 2017 08:41:51 +1030 +Subject: [PATCH] Fix power8 asm() -Remove PIC registers and memory from clobber list, add vector registers to list - fixes accidental overwriting of callee saved registers and compilation with gcc7 -Copied from patch provided by Alan Modra in #1078 ---- - kernel/power/sasum_microk_power8.c | 233 ++++++++++++++++++------------------- - 1 file changed, 112 insertions(+), 121 deletions(-) +Lots of issues here. +- The vsx regs weren't listed as clobbered. +- Poor choice of vsx regs, which along with the lack of clobbers led to + trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all + temp vsx regs, but asms currently have a limit of 30 i/o parms. +- Other regs were clobbered unnecessarily, seemingly in an attempt to + clobber inputs, with gcc-7 complaining about the clobber of r2. + (Changed inputs should be also listed as outputs or as an i/o.) +- "r" constraint used instead of "b" for gprs used in insns where the + r0 encoding means zero rather than r0. +- There were unused asm inputs too. +- All memory was clobbered rather than hooking up memory outputs with + proper memory constraints, and that and the lack of proper memory + input constraints meant the asms needed to be volatile and their + containing function noinline. +- Some parameters were being passed unnecessarily via memory. +- When a copy of a pointer input parm was needed, the value passed to + the asm was incremented in C and decremented in asm, rather than + using i/o parms, an early clobber constraint, or a temp output reg + copied in the asm. In most cases a small change to assembly could + be made that obviated the need for the extra pointer. +- A number of functions did not compute the final sum or dot-product + in assembly, instead using scalar code in C. +- dcbt was bogus. -diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c -index 847fffe..f28eb49 100644 ---- a/kernel/power/sasum_microk_power8.c -+++ b/kernel/power/sasum_microk_power8.c -@@ -38,9 +38,6 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ (( +I've also fixed formatting of the asm. + +diff --git a/kernel/power/casum.c b/kernel/power/casum.c +index aeed0ca..d110858 100644 +--- a/kernel/power/casum.c ++++ b/kernel/power/casum.c +@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) + #ifndef HAVE_KERNEL_16 + +-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) ++static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1) + { + + BLASLONG i=0; +@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec) + + } + +- svec[0] = sum0+sum1+sum2+sum3; +- svec[1] = 0.0; +- svec[2] = 0.0; +- svec[3] = 0.0; +- ++ return sum0+sum1+sum2+sum3; + } + + #endif +@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) + BLASLONG i=0; + BLASLONG ip=0; + FLOAT sumf = 0.0; +- FLOAT svec[4] __attribute__ ((aligned (16)));; + BLASLONG n1; + BLASLONG inc_x2; + +@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x) + if ( n1 > 0 ) + { + +- casum_kernel_16(n1, x, svec); +- sumf = svec[0] + svec[1]+svec[2]+svec[3]; ++ sumf = casum_kernel_16(n1, x); + i=n1; + ip = 2 * n1; + } +diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c +index cb50234..38a1143 100644 +--- a/kernel/power/casum_microk_power8.c ++++ b/kernel/power/casum_microk_power8.c +@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + **************************************************************************************/ + + #define HAVE_KERNEL_16 1 +-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline)); + +-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) ++static float casum_kernel_16 (long n, float *x) { - - - BLASLONG i = n; - BLASLONG o16 = 16; - BLASLONG o32 = 32; - BLASLONG o48 = 48; -@@ -48,130 +45,124 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) - BLASLONG o80 = 80; - BLASLONG o96 = 96; - BLASLONG o112 = 112; +- BLASLONG o16 = 16; +- BLASLONG o32 = 32; +- BLASLONG o48 = 48; +- BLASLONG o64 = 64; +- BLASLONG o80 = 80; +- BLASLONG o96 = 96; +- BLASLONG o112 = 112; - FLOAT *x1=x; - BLASLONG pre = 384; - +- BLASLONG pre = 384; +- - __asm__ __volatile__ -+ __asm__ - ( +- ( - - "dcbt %2 , %4 \n\t" - @@ -56,7 +125,7 @@ index 847fffe..f28eb49 100644 - - "addi %2, %2, 128 \n\t" - -- "addic. %0 , %0 , -32 \n\t" +- "addic. %0 , %0 , -16 \n\t" - "ble 2f \n\t" - - ".align 5 \n\t" @@ -95,7 +164,7 @@ index 847fffe..f28eb49 100644 - "addi %2, %2, 128 \n\t" - "xvaddsp 36, 36, 52 \n\t" - "xvaddsp 37, 37, 53 \n\t" -- "addic. %0 , %0 , -32 \n\t" +- "addic. %0 , %0 , -16 \n\t" - "xvaddsp 38, 38, 54 \n\t" - "xvaddsp 39, 39, 55 \n\t" - @@ -134,103 +203,8 @@ index 847fffe..f28eb49 100644 - - - "stxvw4x 32, 0, %3 \n\t" -+ "dcbt %1, %3 \n\t" -+ -+ "xxlxor 32, 32, 32 \n\t" -+ "xxlxor 33, 33, 33 \n\t" -+ "xxlxor 34, 34, 34 \n\t" -+ "xxlxor 35, 35, 35 \n\t" -+ "xxlxor 36, 36, 36 \n\t" -+ "xxlxor 37, 37, 37 \n\t" -+ "xxlxor 38, 38, 38 \n\t" -+ "xxlxor 39, 39, 39 \n\t" -+ -+ "lxvw4x 40, 0, %1 \n\t" -+ "lxvw4x 41, %4, %1 \n\t" -+ "lxvw4x 42, %5, %1 \n\t" -+ "lxvw4x 43, %6, %1 \n\t" -+ "lxvw4x 44, %7, %1 \n\t" -+ "lxvw4x 45, %8, %1 \n\t" -+ "lxvw4x 46, %9, %1 \n\t" -+ "lxvw4x 47, %10, %1 \n\t" -+ -+ "addi %1, %1, 128 \n\t" -+ "addic. %2, %2, -32 \n\t" -+ "ble 2f \n\t" -+ -+ ".p2align 5 \n\t" -+ "1: \n\t" -+ "dcbt %1, %3 \n\t" -+ -+ "xvabssp 48, 40 \n\t" -+ "xvabssp 49, 41 \n\t" -+ "xvabssp 50, 42 \n\t" -+ "xvabssp 51, 43 \n\t" -+ -+ "lxvw4x 40, 0, %1 \n\t" -+ "lxvw4x 41, %4, %1 \n\t" -+ -+ "xvabssp 52, 44 \n\t" -+ "xvabssp 53, 45 \n\t" -+ -+ "lxvw4x 42, %5, %1 \n\t" -+ "lxvw4x 43, %6, %1 \n\t" -+ -+ "xvabssp 54, 46 \n\t" -+ "xvabssp 55, 47 \n\t" -+ -+ "lxvw4x 44, %7, %1 \n\t" -+ "lxvw4x 45, %8, %1 \n\t" -+ -+ "xvaddsp 32, 32, 48 \n\t" -+ "xvaddsp 33, 33, 49 \n\t" -+ -+ "lxvw4x 46, %9, %1 \n\t" -+ "lxvw4x 47, %10, %1 \n\t" -+ -+ "xvaddsp 34, 34, 50 \n\t" -+ "xvaddsp 35, 35, 51 \n\t" -+ "addi %1, %1, 128 \n\t" -+ "xvaddsp 36, 36, 52 \n\t" -+ "xvaddsp 37, 37, 53 \n\t" -+ "addic. %2, %2, -32 \n\t" -+ "xvaddsp 38, 38, 54 \n\t" -+ "xvaddsp 39, 39, 55 \n\t" -+ -+ "bgt 1b \n\t" -+ -+ "2: \n\t" -+ "xvabssp 48, 40 \n\t" -+ "xvabssp 49, 41 \n\t" -+ "xvabssp 50, 42 \n\t" -+ "xvabssp 51, 43 \n\t" -+ "xvabssp 52, 44 \n\t" -+ "xvabssp 53, 45 \n\t" -+ "xvabssp 54, 46 \n\t" -+ "xvabssp 55, 47 \n\t" -+ -+ "xvaddsp 32, 32, 48 \n\t" -+ "xvaddsp 33, 33, 49 \n\t" -+ "xvaddsp 34, 34, 50 \n\t" -+ "xvaddsp 35, 35, 51 \n\t" -+ "xvaddsp 36, 36, 52 \n\t" -+ "xvaddsp 37, 37, 53 \n\t" -+ "xvaddsp 38, 38, 54 \n\t" -+ "xvaddsp 39, 39, 55 \n\t" -+ -+ "xvaddsp 32, 32, 33 \n\t" -+ "xvaddsp 34, 34, 35 \n\t" -+ "xvaddsp 36, 36, 37 \n\t" -+ "xvaddsp 38, 38, 39 \n\t" -+ -+ "xvaddsp 32, 32, 34 \n\t" -+ "xvaddsp 36, 36, 38 \n\t" -+ -+ "xvaddsp 32, 32, 36 \n\t" -+ -+ "stxvw4x 32, %y0 \n\t" - - : +- +- : - : - "r" (i), // 0 - "r" (n), // 1 @@ -245,28 +219,22 @@ index 847fffe..f28eb49 100644 - "r" (o96), // 10 - "r" (o112) // 11 - : "cr0", "%0", "%2", "memory" -+ "=m" (*svec), // 0 -+ "+b" (x), // 1 -+ "+r" (n) // 2 -+ : -+ "r" (pre), // 3 -+ "r" (o16), // 4 -+ "r" (o32), // 5 -+ "r" (o48), // 6 -+ "r" (o64), // 7 -+ "r" (o80), // 8 -+ "r" (o96), // 9 -+ "r" (o112) // 10 -+ : -+ "cr0","32","33","34","35","36","37","38","39", -+ "40","41","42","43","44","45","46","47", -+ "48","49","50","51","52","53","54","55" - ); +- ); - - } - - - 32 \n\t" +-} +- +- ++ float sum; ++ __vector float t0; ++ __vector float t1; ++ __vector float t2; ++ __vector float t3; ++ ++ __asm__ ++ ( ++ "dcbt 0, %2 \n\t" ++ ++ "xxlxor 32, 32, 32 \n\t" + "xxlxor 33, 33, 33 \n\t" + "xxlxor 34, 34, 34 \n\t" + "xxlxor 35, 35, 35 \n\t"