Fix-up patch
This commit is contained in:
parent
33992f7d55
commit
fe8187055b
@ -1,38 +1,107 @@
|
|||||||
From b8c0a1f7e25aa18d97e8a330764fc5464939b036 Mon Sep 17 00:00:00 2001
|
From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
|
||||||
From: Martin Kroeker <martin@ruby.chemie.uni-freiburg.de>
|
From: Alan Modra <amodra@gmail.com>
|
||||||
Date: Fri, 3 Feb 2017 21:17:33 +0100
|
Date: Thu, 9 Feb 2017 08:41:51 +1030
|
||||||
Subject: [PATCH] Fix register clobbers
|
Subject: [PATCH] Fix power8 asm()
|
||||||
|
|
||||||
Remove PIC registers and memory from the clobber list and add the vector registers to it. This fixes accidental overwriting of callee-saved registers and fixes compilation with gcc7.
|
Lots of issues here.
|
||||||
Copied from patch provided by Alan Modra in #1078
|
- The vsx regs weren't listed as clobbered.
|
||||||
---
|
- Poor choice of vsx regs, which along with the lack of clobbers led to
|
||||||
kernel/power/sasum_microk_power8.c | 233 ++++++++++++++++++-------------------
|
trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all
|
||||||
1 file changed, 112 insertions(+), 121 deletions(-)
|
temp vsx regs, but asms currently have a limit of 30 i/o parms.
|
||||||
|
- Other regs were clobbered unnecessarily, seemingly in an attempt to
|
||||||
|
clobber inputs, with gcc-7 complaining about the clobber of r2.
|
||||||
|
(Changed inputs should also be listed as outputs or as i/o operands.)
|
||||||
|
- "r" constraint used instead of "b" for gprs used in insns where the
|
||||||
|
r0 encoding means zero rather than r0.
|
||||||
|
- There were unused asm inputs too.
|
||||||
|
- All memory was clobbered rather than hooking up memory outputs with
|
||||||
|
proper memory constraints, and that and the lack of proper memory
|
||||||
|
input constraints meant the asms needed to be volatile and their
|
||||||
|
containing function noinline.
|
||||||
|
- Some parameters were being passed unnecessarily via memory.
|
||||||
|
- When a copy of a pointer input parm was needed, the value passed to
|
||||||
|
the asm was incremented in C and decremented in asm, rather than
|
||||||
|
using i/o parms, an early clobber constraint, or a temp output reg
|
||||||
|
copied in the asm. In most cases a small change to assembly could
|
||||||
|
be made that obviated the need for the extra pointer.
|
||||||
|
- A number of functions did not compute the final sum or dot-product
|
||||||
|
in assembly, instead using scalar code in C.
|
||||||
|
- dcbt was bogus.
|
||||||
|
|
||||||
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
|
I've also fixed formatting of the asm.
|
||||||
index 847fffe..f28eb49 100644
|
|
||||||
--- a/kernel/power/sasum_microk_power8.c
|
|
||||||
+++ b/kernel/power/sasum_microk_power8.c
|
|
||||||
@@ -38,9 +38,6 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((
|
|
||||||
|
|
||||||
static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
|
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
|
||||||
|
index aeed0ca..d110858 100644
|
||||||
|
--- a/kernel/power/casum.c
|
||||||
|
+++ b/kernel/power/casum.c
|
||||||
|
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
|
||||||
|
#ifndef HAVE_KERNEL_16
|
||||||
|
|
||||||
|
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
|
||||||
|
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
|
||||||
|
{
|
||||||
|
|
||||||
|
BLASLONG i=0;
|
||||||
|
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
|
||||||
|
|
||||||
|
}
|
||||||
|
|
||||||
|
- svec[0] = sum0+sum1+sum2+sum3;
|
||||||
|
- svec[1] = 0.0;
|
||||||
|
- svec[2] = 0.0;
|
||||||
|
- svec[3] = 0.0;
|
||||||
|
-
|
||||||
|
+ return sum0+sum1+sum2+sum3;
|
||||||
|
}
|
||||||
|
|
||||||
|
#endif
|
||||||
|
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
BLASLONG i=0;
|
||||||
|
BLASLONG ip=0;
|
||||||
|
FLOAT sumf = 0.0;
|
||||||
|
- FLOAT svec[4] __attribute__ ((aligned (16)));;
|
||||||
|
BLASLONG n1;
|
||||||
|
BLASLONG inc_x2;
|
||||||
|
|
||||||
|
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
|
||||||
|
if ( n1 > 0 )
|
||||||
|
{
|
||||||
|
|
||||||
|
- casum_kernel_16(n1, x, svec);
|
||||||
|
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
|
||||||
|
+ sumf = casum_kernel_16(n1, x);
|
||||||
|
i=n1;
|
||||||
|
ip = 2 * n1;
|
||||||
|
}
|
||||||
|
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
|
||||||
|
index cb50234..38a1143 100644
|
||||||
|
--- a/kernel/power/casum_microk_power8.c
|
||||||
|
+++ b/kernel/power/casum_microk_power8.c
|
||||||
|
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
||||||
|
**************************************************************************************/
|
||||||
|
|
||||||
|
#define HAVE_KERNEL_16 1
|
||||||
|
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
|
||||||
|
|
||||||
|
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
|
||||||
|
+static float casum_kernel_16 (long n, float *x)
|
||||||
{
|
{
|
||||||
-
|
-
|
||||||
-
|
-
|
||||||
- BLASLONG i = n;
|
- BLASLONG i = n;
|
||||||
BLASLONG o16 = 16;
|
- BLASLONG o16 = 16;
|
||||||
BLASLONG o32 = 32;
|
- BLASLONG o32 = 32;
|
||||||
BLASLONG o48 = 48;
|
- BLASLONG o48 = 48;
|
||||||
@@ -48,130 +45,124 @@ static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
|
- BLASLONG o64 = 64;
|
||||||
BLASLONG o80 = 80;
|
- BLASLONG o80 = 80;
|
||||||
BLASLONG o96 = 96;
|
- BLASLONG o96 = 96;
|
||||||
BLASLONG o112 = 112;
|
- BLASLONG o112 = 112;
|
||||||
- FLOAT *x1=x;
|
- FLOAT *x1=x;
|
||||||
BLASLONG pre = 384;
|
- BLASLONG pre = 384;
|
||||||
|
-
|
||||||
- __asm__ __volatile__
|
- __asm__ __volatile__
|
||||||
+ __asm__
|
- (
|
||||||
(
|
|
||||||
-
|
-
|
||||||
- "dcbt %2 , %4 \n\t"
|
- "dcbt %2 , %4 \n\t"
|
||||||
-
|
-
|
||||||
@ -56,7 +125,7 @@ index 847fffe..f28eb49 100644
|
|||||||
-
|
-
|
||||||
- "addi %2, %2, 128 \n\t"
|
- "addi %2, %2, 128 \n\t"
|
||||||
-
|
-
|
||||||
- "addic. %0 , %0 , -32 \n\t"
|
- "addic. %0 , %0 , -16 \n\t"
|
||||||
- "ble 2f \n\t"
|
- "ble 2f \n\t"
|
||||||
-
|
-
|
||||||
- ".align 5 \n\t"
|
- ".align 5 \n\t"
|
||||||
@ -95,7 +164,7 @@ index 847fffe..f28eb49 100644
|
|||||||
- "addi %2, %2, 128 \n\t"
|
- "addi %2, %2, 128 \n\t"
|
||||||
- "xvaddsp 36, 36, 52 \n\t"
|
- "xvaddsp 36, 36, 52 \n\t"
|
||||||
- "xvaddsp 37, 37, 53 \n\t"
|
- "xvaddsp 37, 37, 53 \n\t"
|
||||||
- "addic. %0 , %0 , -32 \n\t"
|
- "addic. %0 , %0 , -16 \n\t"
|
||||||
- "xvaddsp 38, 38, 54 \n\t"
|
- "xvaddsp 38, 38, 54 \n\t"
|
||||||
- "xvaddsp 39, 39, 55 \n\t"
|
- "xvaddsp 39, 39, 55 \n\t"
|
||||||
-
|
-
|
||||||
@ -134,103 +203,8 @@ index 847fffe..f28eb49 100644
|
|||||||
-
|
-
|
||||||
-
|
-
|
||||||
- "stxvw4x 32, 0, %3 \n\t"
|
- "stxvw4x 32, 0, %3 \n\t"
|
||||||
+ "dcbt %1, %3 \n\t"
|
-
|
||||||
+
|
- :
|
||||||
+ "xxlxor 32, 32, 32 \n\t"
|
|
||||||
+ "xxlxor 33, 33, 33 \n\t"
|
|
||||||
+ "xxlxor 34, 34, 34 \n\t"
|
|
||||||
+ "xxlxor 35, 35, 35 \n\t"
|
|
||||||
+ "xxlxor 36, 36, 36 \n\t"
|
|
||||||
+ "xxlxor 37, 37, 37 \n\t"
|
|
||||||
+ "xxlxor 38, 38, 38 \n\t"
|
|
||||||
+ "xxlxor 39, 39, 39 \n\t"
|
|
||||||
+
|
|
||||||
+ "lxvw4x 40, 0, %1 \n\t"
|
|
||||||
+ "lxvw4x 41, %4, %1 \n\t"
|
|
||||||
+ "lxvw4x 42, %5, %1 \n\t"
|
|
||||||
+ "lxvw4x 43, %6, %1 \n\t"
|
|
||||||
+ "lxvw4x 44, %7, %1 \n\t"
|
|
||||||
+ "lxvw4x 45, %8, %1 \n\t"
|
|
||||||
+ "lxvw4x 46, %9, %1 \n\t"
|
|
||||||
+ "lxvw4x 47, %10, %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "addi %1, %1, 128 \n\t"
|
|
||||||
+ "addic. %2, %2, -32 \n\t"
|
|
||||||
+ "ble 2f \n\t"
|
|
||||||
+
|
|
||||||
+ ".p2align 5 \n\t"
|
|
||||||
+ "1: \n\t"
|
|
||||||
+ "dcbt %1, %3 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvabssp 48, 40 \n\t"
|
|
||||||
+ "xvabssp 49, 41 \n\t"
|
|
||||||
+ "xvabssp 50, 42 \n\t"
|
|
||||||
+ "xvabssp 51, 43 \n\t"
|
|
||||||
+
|
|
||||||
+ "lxvw4x 40, 0, %1 \n\t"
|
|
||||||
+ "lxvw4x 41, %4, %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvabssp 52, 44 \n\t"
|
|
||||||
+ "xvabssp 53, 45 \n\t"
|
|
||||||
+
|
|
||||||
+ "lxvw4x 42, %5, %1 \n\t"
|
|
||||||
+ "lxvw4x 43, %6, %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvabssp 54, 46 \n\t"
|
|
||||||
+ "xvabssp 55, 47 \n\t"
|
|
||||||
+
|
|
||||||
+ "lxvw4x 44, %7, %1 \n\t"
|
|
||||||
+ "lxvw4x 45, %8, %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 32, 32, 48 \n\t"
|
|
||||||
+ "xvaddsp 33, 33, 49 \n\t"
|
|
||||||
+
|
|
||||||
+ "lxvw4x 46, %9, %1 \n\t"
|
|
||||||
+ "lxvw4x 47, %10, %1 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 34, 34, 50 \n\t"
|
|
||||||
+ "xvaddsp 35, 35, 51 \n\t"
|
|
||||||
+ "addi %1, %1, 128 \n\t"
|
|
||||||
+ "xvaddsp 36, 36, 52 \n\t"
|
|
||||||
+ "xvaddsp 37, 37, 53 \n\t"
|
|
||||||
+ "addic. %2, %2, -32 \n\t"
|
|
||||||
+ "xvaddsp 38, 38, 54 \n\t"
|
|
||||||
+ "xvaddsp 39, 39, 55 \n\t"
|
|
||||||
+
|
|
||||||
+ "bgt 1b \n\t"
|
|
||||||
+
|
|
||||||
+ "2: \n\t"
|
|
||||||
+ "xvabssp 48, 40 \n\t"
|
|
||||||
+ "xvabssp 49, 41 \n\t"
|
|
||||||
+ "xvabssp 50, 42 \n\t"
|
|
||||||
+ "xvabssp 51, 43 \n\t"
|
|
||||||
+ "xvabssp 52, 44 \n\t"
|
|
||||||
+ "xvabssp 53, 45 \n\t"
|
|
||||||
+ "xvabssp 54, 46 \n\t"
|
|
||||||
+ "xvabssp 55, 47 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 32, 32, 48 \n\t"
|
|
||||||
+ "xvaddsp 33, 33, 49 \n\t"
|
|
||||||
+ "xvaddsp 34, 34, 50 \n\t"
|
|
||||||
+ "xvaddsp 35, 35, 51 \n\t"
|
|
||||||
+ "xvaddsp 36, 36, 52 \n\t"
|
|
||||||
+ "xvaddsp 37, 37, 53 \n\t"
|
|
||||||
+ "xvaddsp 38, 38, 54 \n\t"
|
|
||||||
+ "xvaddsp 39, 39, 55 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 32, 32, 33 \n\t"
|
|
||||||
+ "xvaddsp 34, 34, 35 \n\t"
|
|
||||||
+ "xvaddsp 36, 36, 37 \n\t"
|
|
||||||
+ "xvaddsp 38, 38, 39 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 32, 32, 34 \n\t"
|
|
||||||
+ "xvaddsp 36, 36, 38 \n\t"
|
|
||||||
+
|
|
||||||
+ "xvaddsp 32, 32, 36 \n\t"
|
|
||||||
+
|
|
||||||
+ "stxvw4x 32, %y0 \n\t"
|
|
||||||
|
|
||||||
:
|
|
||||||
- :
|
- :
|
||||||
- "r" (i), // 0
|
- "r" (i), // 0
|
||||||
- "r" (n), // 1
|
- "r" (n), // 1
|
||||||
@ -245,28 +219,22 @@ index 847fffe..f28eb49 100644
|
|||||||
- "r" (o96), // 10
|
- "r" (o96), // 10
|
||||||
- "r" (o112) // 11
|
- "r" (o112) // 11
|
||||||
- : "cr0", "%0", "%2", "memory"
|
- : "cr0", "%0", "%2", "memory"
|
||||||
+ "=m" (*svec), // 0
|
- );
|
||||||
+ "+b" (x), // 1
|
|
||||||
+ "+r" (n) // 2
|
|
||||||
+ :
|
|
||||||
+ "r" (pre), // 3
|
|
||||||
+ "r" (o16), // 4
|
|
||||||
+ "r" (o32), // 5
|
|
||||||
+ "r" (o48), // 6
|
|
||||||
+ "r" (o64), // 7
|
|
||||||
+ "r" (o80), // 8
|
|
||||||
+ "r" (o96), // 9
|
|
||||||
+ "r" (o112) // 10
|
|
||||||
+ :
|
|
||||||
+ "cr0","32","33","34","35","36","37","38","39",
|
|
||||||
+ "40","41","42","43","44","45","46","47",
|
|
||||||
+ "48","49","50","51","52","53","54","55"
|
|
||||||
);
|
|
||||||
-
|
-
|
||||||
}
|
-}
|
||||||
|
-
|
||||||
|
-
|
||||||
32 \n\t"
|
+ float sum;
|
||||||
|
+ __vector float t0;
|
||||||
|
+ __vector float t1;
|
||||||
|
+ __vector float t2;
|
||||||
|
+ __vector float t3;
|
||||||
|
+
|
||||||
|
+ __asm__
|
||||||
|
+ (
|
||||||
|
+ "dcbt 0, %2 \n\t"
|
||||||
|
+
|
||||||
|
+ "xxlxor 32, 32, 32 \n\t"
|
||||||
+ "xxlxor 33, 33, 33 \n\t"
|
+ "xxlxor 33, 33, 33 \n\t"
|
||||||
+ "xxlxor 34, 34, 34 \n\t"
|
+ "xxlxor 34, 34, 34 \n\t"
|
||||||
+ "xxlxor 35, 35, 35 \n\t"
|
+ "xxlxor 35, 35, 35 \n\t"
|
||||||
|
Loading…
Reference in New Issue
Block a user