Fix-up patch

This commit is contained in:
Björn Esser 2017-02-13 22:49:31 +01:00
parent 33992f7d55
commit fe8187055b

View File

@ -1,38 +1,107 @@
From b8c0a1f7e25aa18d97e8a330764fc5464939b036 Mon Sep 17 00:00:00 2001 From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de> From: Alan Modra <amodra@gmail.com>
Date: Fri, 3 Feb 2017 21:17:33 +0100 Date: Thu, 9 Feb 2017 08:41:51 +1030
Subject: [PATCH] Fix register clobbers Subject: [PATCH] Fix power8 asm()
Remove PIC registers and memory from clobber list, add vector registers to list - fixes accidental overwriting of callee saved registers and compilation with gcc7 Lots of issues here.
Copied from patch provided by Alan Modra in #1078 - The vsx regs weren't listed as clobbered.
--- - Poor choice of vsx regs, which along with the lack of clobbers led to
kernel/power/sasum_microk_power8.c | 233 ++++++++++++++++++------------------- trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all
1 file changed, 112 insertions(+), 121 deletions(-) temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
clobber inputs, with gcc-7 complaining about the clobber of r2.
(Changed inputs should be also listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
proper memory constraints, and that and the lack of proper memory
input constraints meant the asms needed to be volatile and their
containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a pointer input parm was needed, the value passed to
the asm was incremented in C and decremented in asm, rather than
using i/o parms, an early clobber constraint, or a temp output reg
copied in the asm. In most cases a small change to assembly could
be made that obviated the need for the extra pointer.
- A number of functions did not compute the final sum or dot-product
in assembly, instead using scalar code in C.
- dcbt was bogus.
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c I've also fixed formatting of the asm.
index 847fffe..f28eb49 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -38,9 +38,6 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) diff --git a/kernel/power/casum.c b/kernel/power/casum.c
index aeed0ca..d110858 100644
--- a/kernel/power/casum.c
+++ b/kernel/power/casum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- casum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = casum_kernel_16(n1, x);
i=n1;
ip = 2 * n1;
}
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index cb50234..38a1143 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float casum_kernel_16 (long n, float *x)
{ {
- -
- -
- BLASLONG i = n; - BLASLONG i = n;
BLASLONG o16 = 16; - BLASLONG o16 = 16;
BLASLONG o32 = 32; - BLASLONG o32 = 32;
BLASLONG o48 = 48; - BLASLONG o48 = 48;
@@ -48,130 +45,124 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) - BLASLONG o64 = 64;
BLASLONG o80 = 80; - BLASLONG o80 = 80;
BLASLONG o96 = 96; - BLASLONG o96 = 96;
BLASLONG o112 = 112; - BLASLONG o112 = 112;
- FLOAT *x1=x; - FLOAT *x1=x;
BLASLONG pre = 384; - BLASLONG pre = 384;
-
- __asm__ __volatile__ - __asm__ __volatile__
+ __asm__ - (
(
- -
- "dcbt %2 , %4 \n\t" - "dcbt %2 , %4 \n\t"
- -
@ -56,7 +125,7 @@ index 847fffe..f28eb49 100644
- -
- "addi %2, %2, 128 \n\t" - "addi %2, %2, 128 \n\t"
- -
- "addic. %0 , %0 , -32 \n\t" - "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t" - "ble 2f \n\t"
- -
- ".align 5 \n\t" - ".align 5 \n\t"
@ -95,7 +164,7 @@ index 847fffe..f28eb49 100644
- "addi %2, %2, 128 \n\t" - "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t" - "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t" - "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -32 \n\t" - "addic. %0 , %0 , -16 \n\t"
- "xvaddsp 38, 38, 54 \n\t" - "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t" - "xvaddsp 39, 39, 55 \n\t"
- -
@ -134,103 +203,8 @@ index 847fffe..f28eb49 100644
- -
- -
- "stxvw4x 32, 0, %3 \n\t" - "stxvw4x 32, 0, %3 \n\t"
+ "dcbt %1, %3 \n\t" -
+ - :
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %1 \n\t"
+ "lxvw4x 41, %4, %1 \n\t"
+ "lxvw4x 42, %5, %1 \n\t"
+ "lxvw4x 43, %6, %1 \n\t"
+ "lxvw4x 44, %7, %1 \n\t"
+ "lxvw4x 45, %8, %1 \n\t"
+ "lxvw4x 46, %9, %1 \n\t"
+ "lxvw4x 47, %10, %1 \n\t"
+
+ "addi %1, %1, 128 \n\t"
+ "addic. %2, %2, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n\t"
+ "1: \n\t"
+ "dcbt %1, %3 \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %1 \n\t"
+ "lxvw4x 41, %4, %1 \n\t"
+
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+
+ "lxvw4x 42, %5, %1 \n\t"
+ "lxvw4x 43, %6, %1 \n\t"
+
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "lxvw4x 44, %7, %1 \n\t"
+ "lxvw4x 45, %8, %1 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %9, %1 \n\t"
+ "lxvw4x 47, %10, %1 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %1, %1, 128 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "addic. %2, %2, -32 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "bgt 1b \n\t"
+
+ "2: \n\t"
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp 52, 44 \n\t"
+ "xvabssp 53, 45 \n\t"
+ "xvabssp 54, 46 \n\t"
+ "xvabssp 55, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, 52 \n\t"
+ "xvaddsp 37, 37, 53 \n\t"
+ "xvaddsp 38, 38, 54 \n\t"
+ "xvaddsp 39, 39, 55 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "stxvw4x 32, %y0 \n\t"
:
- : - :
- "r" (i), // 0 - "r" (i), // 0
- "r" (n), // 1 - "r" (n), // 1
@ -245,28 +219,22 @@ index 847fffe..f28eb49 100644
- "r" (o96), // 10 - "r" (o96), // 10
- "r" (o112) // 11 - "r" (o112) // 11
- : "cr0", "%0", "%2", "memory" - : "cr0", "%0", "%2", "memory"
+ "=m" (*svec), // 0 - );
+ "+b" (x), // 1
+ "+r" (n) // 2
+ :
+ "r" (pre), // 3
+ "r" (o16), // 4
+ "r" (o32), // 5
+ "r" (o48), // 6
+ "r" (o64), // 7
+ "r" (o80), // 8
+ "r" (o96), // 9
+ "r" (o112) // 10
+ :
+ "cr0","32","33","34","35","36","37","38","39",
+ "40","41","42","43","44","45","46","47",
+ "48","49","50","51","52","53","54","55"
);
- -
} -}
-
-
32 \n\t" + float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t" + "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t" + "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t" + "xxlxor 35, 35, 35 \n\t"