openblas/openblas-0.2.19-fix_register_clobbers.patch
2017-02-13 22:49:31 +01:00


From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
From: Alan Modra <amodra@gmail.com>
Date: Thu, 9 Feb 2017 08:41:51 +1030
Subject: [PATCH] Fix power8 asm()

Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all
temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
clobber inputs, with gcc-7 complaining about the clobber of r2.
  (Changed inputs should also be listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
proper memory constraints, and that and the lack of proper memory
input constraints meant the asms needed to be volatile and their
containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a pointer input parm was needed, the value passed to
the asm was incremented in C and decremented in asm, rather than
using i/o parms, an early clobber constraint, or a temp output reg
copied in the asm. In most cases a small change to assembly could
be made that obviated the need for the extra pointer.
- A number of functions did not compute the final sum or dot-product
in assembly, instead using scalar code in C.
- dcbt was bogus.
I've also fixed formatting of the asm.
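
For readers less familiar with gcc extended asm, the sketch below is not
part of the patch; it is a minimal, hypothetical kernel (sum of n doubles,
n assumed to be a positive multiple of 2) written in the style the patch
converts the kernels to: the counter and pointer are "+" input/output
operands, the pointer gets the "b" constraint so gcc never picks r0 where
the instruction would read that encoding as zero, the memory read is
described with an "m" input instead of a blanket "memory" clobber, the
scalar result comes back through a register output rather than a store
through a pointer, and only the VSX registers actually touched are listed
as clobbers.  With real outputs and memory operands the asm no longer
needs to be volatile and the containing function no longer needs to be
noinline.

static double sum2_kernel (long n, const double *x)
{
  double sum;

  __asm__
    (
     "xxlxor  32, 32, 32   \n\t"   // vs32 = running sum = 0
     "1:                   \n\t"
     "lxvd2x  33, 0, %2    \n\t"   // load x[0], x[1]
     "addi    %2, %2, 16   \n\t"   // x += 2 doubles
     "xvadddp 32, 32, 33   \n\t"   // accumulate both lanes
     "addic.  %1, %1, -2   \n\t"   // n -= 2, set cr0
     "bgt     1b           \n\t"
     "xxswapd 33, 32       \n\t"   // swap the two lanes
     "xsadddp %x0, 32, 33  \n"     // sum = lane0 + lane1
     :
       "=d" (sum),   // 0: scalar result, returned in an FPR
       "+r" (n),     // 1: counter, read and written
       "+b" (x)      // 2: pointer; "b" because addi reads r0 as zero
     :
       "m" (*x)      // the asm reads *x (same idiom as the patch)
     :
       "cr0", "vs32", "vs33"
    );

  return sum;
}
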
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
index aeed0ca..d110858 100644
--- a/kernel/power/casum.c
+++ b/kernel/power/casum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- casum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = casum_kernel_16(n1, x);
i=n1;
ip = 2 * n1;
}
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index cb50234..38a1143 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float casum_kernel_16 (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
-
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "xvaddsp 32, 32, 33 \n\t"
- "xvaddsp 34, 34, 35 \n\t"
- "xvaddsp 36, 36, 37 \n\t"
- "xvaddsp 38, 38, 39 \n\t"
-
- "xvaddsp 32, 32, 34 \n\t"
- "xvaddsp 36, 36, 38 \n\t"
-
- "xvaddsp 32, 32, 36 \n\t"
-
-
- "stxvw4x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -16 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %0, 32 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=f" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index 95b3559..b2b1bea 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void ccopy_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 50, 0, %2 \n\t"
- "lxvw4x 51, %5, %2 \n\t"
- "lxvw4x 52, %6, %2 \n\t"
- "lxvw4x 53, %7, %2 \n\t"
- "lxvw4x 54, %8, %2 \n\t"
- "lxvw4x 55, %9, %2 \n\t"
- "lxvw4x 56, %10, %2 \n\t"
- "lxvw4x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvw4x 50, 0, %1 \n\t"
- "stxvw4x 51, %5, %1 \n\t"
- "lxvw4x 50, 0, %2 \n\t"
- "lxvw4x 51, %5, %2 \n\t"
- "stxvw4x 52, %6, %1 \n\t"
- "stxvw4x 53, %7, %1 \n\t"
- "lxvw4x 52, %6, %2 \n\t"
- "lxvw4x 53, %7, %2 \n\t"
- "stxvw4x 54, %8, %1 \n\t"
- "stxvw4x 55, %9, %1 \n\t"
- "lxvw4x 54, %8, %2 \n\t"
- "lxvw4x 55, %9, %2 \n\t"
- "stxvw4x 56, %10, %1 \n\t"
- "stxvw4x 57, %11, %1 \n\t"
- "lxvw4x 56, %10, %2 \n\t"
- "lxvw4x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 50, 0, %1 \n\t"
- "stxvw4x 51, %5, %1 \n\t"
- "stxvw4x 52, %6, %1 \n\t"
- "stxvw4x 53, %7, %1 \n\t"
- "stxvw4x 54, %8, %1 \n\t"
- "stxvw4x 55, %9, %1 \n\t"
- "stxvw4x 56, %10, %1 \n\t"
- "stxvw4x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 90ab59c..1dd03dc 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -35,146 +35,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void cswap_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -4 \n\t"
- "addi %4, %4, -4 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 48, 0, %1 \n\t"
- "lxvw4x 49, %5, %1 \n\t"
- "lxvw4x 50, %6, %1 \n\t"
- "lxvw4x 51, %7, %1 \n\t"
- "lxvw4x 52, %8, %1 \n\t"
- "lxvw4x 53, %9, %1 \n\t"
- "lxvw4x 54, %10, %1 \n\t"
- "lxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvw4x 56, 0, %1 \n\t"
- "lxvw4x 57, %5, %1 \n\t"
- "lxvw4x 58, %6, %1 \n\t"
- "lxvw4x 59, %7, %1 \n\t"
- "lxvw4x 60, %8, %1 \n\t"
- "lxvw4x 61, %9, %1 \n\t"
- "lxvw4x 62, %10, %1 \n\t"
- "lxvw4x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 52, %8, %4 \n\t"
- "stxvw4x 53, %9, %4 \n\t"
- "stxvw4x 54, %10, %4 \n\t"
- "stxvw4x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvw4x 56, 0, %4 \n\t"
- "stxvw4x 57, %5, %4 \n\t"
- "stxvw4x 58, %6, %4 \n\t"
- "stxvw4x 59, %7, %4 \n\t"
- "stxvw4x 60, %8, %4 \n\t"
- "stxvw4x 61, %9, %4 \n\t"
- "stxvw4x 62, %10, %4 \n\t"
- "stxvw4x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %4 \n\t"
+ "lxvw4x 33, %5, %4 \n\t"
+ "lxvw4x 34, %6, %4 \n\t"
+ "lxvw4x 35, %7, %4 \n\t"
+ "lxvw4x 36, %8, %4 \n\t"
+ "lxvw4x 37, %9, %4 \n\t"
+ "lxvw4x 38, %10, %4 \n\t"
+ "lxvw4x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvw4x 40, 0, %4 \n\t"
+ "lxvw4x 41, %5, %4 \n\t"
+ "lxvw4x 42, %6, %4 \n\t"
+ "lxvw4x 43, %7, %4 \n\t"
+ "lxvw4x 44, %8, %4 \n\t"
+ "lxvw4x 45, %9, %4 \n\t"
+ "lxvw4x 46, %10, %4 \n\t"
+ "lxvw4x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvw4x 48, 0, %3 \n\t"
+ "lxvw4x 49, %5, %3 \n\t"
+ "lxvw4x 50, %6, %3 \n\t"
+ "lxvw4x 51, %7, %3 \n\t"
+ "lxvw4x 0, %8, %3 \n\t"
+ "lxvw4x 1, %9, %3 \n\t"
+ "lxvw4x 2, %10, %3 \n\t"
+ "lxvw4x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvw4x 4, 0, %3 \n\t"
+ "lxvw4x 5, %5, %3 \n\t"
+ "lxvw4x 6, %6, %3 \n\t"
+ "lxvw4x 7, %7, %3 \n\t"
+ "lxvw4x 8, %8, %3 \n\t"
+ "lxvw4x 9, %9, %3 \n\t"
+ "lxvw4x 10, %10, %3 \n\t"
+ "lxvw4x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 48, 0, %4 \n\t"
+ "stxvw4x 49, %5, %4 \n\t"
+ "stxvw4x 50, %6, %4 \n\t"
+ "stxvw4x 51, %7, %4 \n\t"
+ "stxvw4x 0, %8, %4 \n\t"
+ "stxvw4x 1, %9, %4 \n\t"
+ "stxvw4x 2, %10, %4 \n\t"
+ "stxvw4x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvw4x 4, 0, %4 \n\t"
+ "stxvw4x 5, %5, %4 \n\t"
+ "stxvw4x 6, %6, %4 \n\t"
+ "stxvw4x 7, %7, %4 \n\t"
+ "stxvw4x 8, %8, %4 \n\t"
+ "stxvw4x 9, %9, %4 \n\t"
+ "stxvw4x 10, %10, %4 \n\t"
+ "stxvw4x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
index 77f5345..73962c2 100644
--- a/kernel/power/dasum.c
+++ b/kernel/power/dasum.c
@@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
-#define ABS fabsf
+#error supports double only
#endif
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,9 +92,7 @@ static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -103,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
- FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf);
@@ -115,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- dasum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1];
+ sumf = dasum_kernel_16(n1, x);
i=n1;
}
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
index cc38c4f..880d7d2 100644
--- a/kernel/power/dasum_microk_power8.c
+++ b/kernel/power/dasum_microk_power8.c
@@ -34,144 +34,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static double dasum_kernel_16 (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
-
- "stxvd2x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
+ double sum;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -16 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+ "xsadddp %x0, 32, 33 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=d" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c
index 4365bd8..df0572e 100644
--- a/kernel/power/daxpy.c
+++ b/kernel/power/daxpy.c
@@ -43,21 +43,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
BLASLONG register i = 0;
- FLOAT a = *alpha;
while(i < n)
{
- y[i] += a * x[i];
- y[i+1] += a * x[i+1];
- y[i+2] += a * x[i+2];
- y[i+3] += a * x[i+3];
- y[i+4] += a * x[i+4];
- y[i+5] += a * x[i+5];
- y[i+6] += a * x[i+6];
- y[i+7] += a * x[i+7];
+ y[i] += alpha * x[i];
+ y[i+1] += alpha * x[i+1];
+ y[i+2] += alpha * x[i+2];
+ y[i+3] += alpha * x[i+3];
+ y[i+4] += alpha * x[i+4];
+ y[i+5] += alpha * x[i+5];
+ y[i+6] += alpha * x[i+6];
+ y[i+7] += alpha * x[i+7];
i+=8 ;
}
@@ -70,11 +69,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT a2[4];
- a2[0]=da;
- a2[1]=da;
- a2[2]=da;
- a2[3]=da;
if ( n <= 0 ) return(0);
@@ -84,7 +78,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 )
- daxpy_kernel_8(n1, x, y , a2 );
+ daxpy_kernel_8(n1, x, y, da);
i = n1;
while(i < n)
diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c
index bb3f73a..fb714a3 100644
--- a/kernel/power/daxpy_microk_power8.c
+++ b/kernel/power/daxpy_microk_power8.c
@@ -35,167 +35,183 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
{
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+ __vector double t12;
+ __vector double t13;
+ __vector double t14;
+ __vector double t15;
+ __vector double t16;
+ __asm__
+ (
+ "xxspltd %x4, %x22, 0 \n\t"
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
- __asm__ __volatile__
- (
+ "lxvd2x %x5, 0, %2 \n\t"
+ "lxvd2x %x6, %23, %2 \n\t"
+ "lxvd2x %x7, %24, %2 \n\t"
+ "lxvd2x %x8, %25, %2 \n\t"
- "lxsdx 33, %5, %4 \n\t"
- "xxspltd 32, 33, 0 \n\t"
- "addi %8, %8, -8 \n\t"
+ "lxvd2x %x13, 0, %3 \n\t"
+ "lxvd2x %x14, %23, %3 \n\t"
+ "lxvd2x %x15, %24, %3 \n\t"
+ "lxvd2x %x16, %25, %3 \n\t"
- "dcbt %2, %9 \n\t"
- "dcbt %3, %9 \n\t"
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x %x9, 0, %2 \n\t"
+ "lxvd2x %x10, %23, %2 \n\t"
+ "lxvd2x %x11, %24, %2 \n\t"
+ "lxvd2x %x12, %25, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t"
- "lxvd2x 45, %5, %2 \n\t"
- "lxvd2x 46, %6, %2 \n\t"
- "lxvd2x 47, %7, %2 \n\t"
-
- "lxvd2x 52, 0, %3 \n\t"
- "lxvd2x 53, %5, %3 \n\t"
- "lxvd2x 54, %6, %3 \n\t"
- "lxvd2x 55, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %9 \n\t"
- "dcbt %3, %9 \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t"
- "xvmaddadp 49, 41, 32 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
-
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %8, %8, 64 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "addi %3, %3, 64 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t"
- "lxvd2x 45, %5, %2 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
-
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "lxvd2x 46, %6, %2 \n\t"
- "lxvd2x 47, %7, %2 \n\t"
-
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %8, %8, 64 \n\t"
-
- "lxvd2x 52, 0, %3 \n\t"
- "lxvd2x 53, %5, %3 \n\t"
- "lxvd2x 54, %6, %3 \n\t"
- "lxvd2x 55, %7, %3 \n\t"
-
- "addi %3, %3, 64 \n\t"
-
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvmaddadp 48, 40, 32 \n\t"
- "xvmaddadp 49, 41, 32 \n\t"
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (alpha), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (y2), // 8
- "r" (pre) // 9
- : "cr0", "%0", "%2" , "%3", "%8", "memory"
- );
-
-}
+ "lxvd2x %x17, 0, %3 \n\t"
+ "lxvd2x %x18, %23, %3 \n\t"
+ "lxvd2x %x19, %24, %3 \n\t"
+ "lxvd2x %x20, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, -64 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp %x13, %x5, %x4 \n\t"
+ "xvmaddadp %x14, %x6, %x4 \n\t"
+
+ "lxvd2x %x5, 0, %2 \n\t"
+ "lxvd2x %x6, %23, %2 \n\t"
+
+ "stxvd2x %x13, 0, %3 \n\t"
+ "stxvd2x %x14, %23, %3 \n\t"
+
+ "xvmaddadp %x15, %x7, %x4 \n\t"
+ "xvmaddadp %x16, %x8, %x4 \n\t"
+
+ "lxvd2x %x7, %24, %2 \n\t"
+ "lxvd2x %x8, %25, %2 \n\t"
+
+ "stxvd2x %x15, %24, %3 \n\t"
+ "stxvd2x %x16, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x %x13, 0, %3 \n\t"
+ "lxvd2x %x14, %23, %3 \n\t"
+ "lxvd2x %x15, %24, %3 \n\t"
+ "lxvd2x %x16, %25, %3 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+
+ "xvmaddadp %x17, %x9, %x4 \n\t"
+ "xvmaddadp %x18, %x10, %x4 \n\t"
+
+ "lxvd2x %x9, 0, %2 \n\t"
+ "lxvd2x %x10, %23, %2 \n\t"
+
+ "stxvd2x %x17, 0, %3 \n\t"
+ "stxvd2x %x18, %23, %3 \n\t"
+
+ "xvmaddadp %x19, %x11, %x4 \n\t"
+ "xvmaddadp %x20, %x12, %x4 \n\t"
+
+ "lxvd2x %x11, %24, %2 \n\t"
+ "lxvd2x %x12, %25, %2 \n\t"
+
+ "stxvd2x %x19, %24, %3 \n\t"
+ "stxvd2x %x20, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x %x17, 0, %3 \n\t"
+ "lxvd2x %x18, %23, %3 \n\t"
+ "lxvd2x %x19, %24, %3 \n\t"
+ "lxvd2x %x20, %25, %3 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp %x13, %x5, %x4 \n\t"
+ "xvmaddadp %x14, %x6, %x4 \n\t"
+ "xvmaddadp %x15, %x7, %x4 \n\t"
+ "xvmaddadp %x16, %x8, %x4 \n\t"
+
+ "xvmaddadp %x17, %x9, %x4 \n\t"
+ "xvmaddadp %x18, %x10, %x4 \n\t"
+ "xvmaddadp %x19, %x11, %x4 \n\t"
+ "xvmaddadp %x20, %x12, %x4 \n\t"
+
+ "stxvd2x %x13, 0, %3 \n\t"
+ "stxvd2x %x14, %23, %3 \n\t"
+ "stxvd2x %x15, %24, %3 \n\t"
+ "stxvd2x %x16, %25, %3 \n\t"
+
+ "addi %3, %3, 64 \n\t"
+
+ "stxvd2x %x17, 0, %3 \n\t"
+ "stxvd2x %x18, %23, %3 \n\t"
+ "stxvd2x %x19, %24, %3 \n\t"
+ "stxvd2x %x20, %25, %3 \n"
+
+ "#n=%1 x=%21=%2 y=%0=%3 alpha=%22 o16=%23 o32=%24 o48=%25\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15 t12=%x16 t13=%x17 t14=%x18 t15=%x19 t16=%x20"
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3), // 7
+ "=wa" (t4), // 8
+ "=wa" (t5), // 9
+ "=wa" (t6), // 10
+ "=wa" (t7), // 11
+ "=wa" (t8), // 12
+ "=wa" (t9), // 13
+ "=wa" (t10), // 14
+ "=wa" (t11), // 15
+ "=wa" (t12), // 16
+ "=wa" (t13), // 17
+ "=wa" (t14), // 18
+ "=wa" (t15), // 19
+ "=wa" (t16) // 20
+ :
+ "m" (*x),
+ "d" (alpha), // 22
+ "b" (16), // 23
+ "b" (32), // 24
+ "b" (48) // 25
+ :
+ "cr0"
+ );
+
+}
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
index 04f7db5..261dc04 100644
--- a/kernel/power/dcopy_microk_power8.c
+++ b/kernel/power/dcopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void dcopy_kernel_32 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c
index cef60a2..e43470e 100644
--- a/kernel/power/ddot.c
+++ b/kernel/power/ddot.c
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
@@ -62,8 +62,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ;
}
- *d += dot;
-
+ return dot;
}
#endif
@@ -83,7 +82,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -16;
if ( n1 )
- ddot_kernel_8(n1, x, y , &dot );
+ dot = ddot_kernel_8(n1, x, y);
i = n1;
while(i < n)
diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c
index b880492..4e6bc29 100644
--- a/kernel/power/ddot_microk_power8.c
+++ b/kernel/power/ddot_microk_power8.c
@@ -34,145 +34,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static double ddot_kernel_8 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 52, %8, %3 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 53, %9, %3 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 54, %10, %3 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
- "lxvd2x 55, %11, %3 \n\t"
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "xvmaddadp 33, 41, 49 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "xvmaddadp 34, 42, 50 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "xvmaddadp 35, 43, 51 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
- "xvmaddadp 36, 44, 52 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 52, %8, %3 \n\t"
- "xvmaddadp 37, 45, 53 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 53, %9, %3 \n\t"
- "xvmaddadp 38, 46, 54 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 54, %10, %3 \n\t"
- "xvmaddadp 39, 47, 55 \n\t"
-
- "lxvd2x 47, %11, %2 \n\t"
- "lxvd2x 55, %11, %3 \n\t"
-
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t"
- "xvmaddadp 33, 41, 49 \n\t"
- "xvmaddadp 34, 42, 50 \n\t"
- "xvmaddadp 35, 43, 51 \n\t"
- "xvmaddadp 36, 44, 52 \n\t"
- "xvmaddadp 37, 45, 53 \n\t"
- "xvmaddadp 38, 46, 54 \n\t"
- "xvmaddadp 39, 47, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
- "xxswapd 33, 32 \n\t"
-
- "xsadddp 32, 32, 33 \n\t"
-
- "stxsdx 32, 0, %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (dot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112), // 11
- "r" (pre) // 12
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
-}
-
-
+ double dot;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "xvmaddadp 36, 44, %x4 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "xvmaddadp 37, 45, %x5 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "xvmaddadp 38, 46, %x6 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "xvmaddadp 39, 47, %x7 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "xvmaddadp 36, 44, %x4 \n\t"
+ "xvmaddadp 37, 45, %x5 \n\t"
+ "xvmaddadp 38, 46, %x6 \n\t"
+ "xvmaddadp 39, 47, %x7 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+
+ "xsadddp %x0, 32, 33 \n"
+
+ "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
+ :
+ "=d" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3) // 7
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (16), // 10
+ "b" (32), // 11
+ "b" (48), // 12
+ "b" (64), // 13
+ "b" (80), // 14
+ "b" (96), // 15
+ "b" (112) // 16
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return dot;
+}
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
index 812d09d..57f9f9e 100644
--- a/kernel/power/dgemv_n.c
+++ b/kernel/power/dgemv_n.c
@@ -47,18 +47,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_4x4
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
+ FLOAT *a0 = a_ptr;
+ FLOAT *a1 = a0 + lda;
+ FLOAT *a2 = a1 + lda;
+ FLOAT *a3 = a2 + lda;
+
for ( i=0; i<4; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -73,16 +73,13 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x2
-static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0,*a1;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0];
- a1 = ap[1];
for ( i=0; i<2; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -98,15 +95,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x1
-static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap;
for ( i=0; i<1; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -141,7 +136,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{
BLASLONG i;
- BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
@@ -151,13 +145,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
- FLOAT *ap[4] __attribute__ ((aligned (16)));;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
- FLOAT alpha_r[4] __attribute__ ((aligned (16)));;
FLOAT *ybuffer;
- alpha_r[0] = alpha;
-
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
@@ -187,11 +177,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr = a;
x_ptr = x;
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
-
if ( inc_y != 1 )
memset(ybuffer,0,NB*8);
else
@@ -203,18 +188,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
for( i = 0; i < n1 ; i++)
{
- dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
- ap[0] += lda4;
- ap[1] += lda4;
- ap[2] += lda4;
- ap[3] += lda4;
+ dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
- dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
+ dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
a_ptr += lda*2;
x_ptr += 2;
}
@@ -222,7 +203,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n2 & 1 )
{
- dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
+ dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
a_ptr += lda;
x_ptr += 1;
@@ -243,11 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
- dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
- ap[0] += lda4;
- ap[1] += lda4;
- ap[2] += lda4;
- ap[3] += lda4;
+ dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
a_ptr += lda4;
}
@@ -255,7 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
- dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
+ dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
a_ptr += lda;
}
diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c
index 9eabe55..5b42bbb 100644
--- a/kernel/power/dgemv_n_microk_power8.c
+++ b/kernel/power/dgemv_n_microk_power8.c
@@ -35,267 +35,264 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4x4 1
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
-
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
- BLASLONG i=n;
- BLASLONG o8 = 8;
- BLASLONG o16 = 16;
- BLASLONG o24 = 24;
- BLASLONG pre = 384;
-
- FLOAT *a0,*a1,*a2,*a3;
- FLOAT *y1=y+1;
- FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0]+1;
- a1 = ap[1]+1;
- a2 = ap[2]+1;
- a3 = ap[3]+1;
-
- x[0]=xo[0] * *alpha;
- x[1]=xo[1] * *alpha;
- x[2]=xo[2] * *alpha;
- x[3]=xo[3] * *alpha;
+ double *a0;
+ double *a1;
+ double *a2;
+ double *a3;
+
+ __asm__
+ (
+ "lxvd2x 34, 0, %9 \n\t" // x0, x1
+ "lxvd2x 35, %10, %9 \n\t" // x2, x3
+ "xxspltd 32, %x8, 0 \n\t" // alpha, alpha
+
+ "sldi %6, %4, 3 \n\t" // lda * sizeof (double)
+
+ "xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha
+ "xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha
+
+ "add %4, %3, %6 \n\t" // a1 = a0 + lda
+ "add %6, %6, %6 \n\t" // 2 * lda
+
+ "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
+ "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
+ "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
+ "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
+
+ "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
+ "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
+
+ "dcbt 0, %3 \n\t"
+ "dcbt 0, %4 \n\t"
+ "dcbt 0, %5 \n\t"
+ "dcbt 0, %6 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- __asm__ __volatile__
- (
- "lxvdsx 32, 0 , %1 \n\t" // x0
- "lxvdsx 33,%3 , %1 \n\t" // x1
- "lxvdsx 34,%4 , %1 \n\t" // x2
- "lxvdsx 35,%5 , %1 \n\t" // x3
- "addi %2 , %2 , -8 \n\t"
- "addi %6 , %6 , -8 \n\t"
- "addi %7 , %7 , -8 \n\t"
- "addi %8 , %8 , -8 \n\t"
- "addi %9 , %9 , -8 \n\t"
-
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
-
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addi %6, %6, 32 \n\t"
- "addi %7, %7, 32 \n\t"
- "addi %8, %8, 32 \n\t"
- "addi %9, %9, 32 \n\t"
+ "dcbt 0, %2 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addi %3, %3, 32 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "addi %6, %6, 32 \n\t"
- ".align 5 \n\t"
- "1: \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "dcbt %2, %10 \n\t"
+ ".p2align 5 \n"
+ "1: \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "dcbt %6, %10 \n\t"
- "dcbt %7, %10 \n\t"
- "dcbt %8, %10 \n\t"
- "dcbt %9, %10 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "bgt 1b \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "2: \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "bgt 1b \n"
- "xvmaddadp 40, 50, 33 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "2: \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 54, 35 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 42, 33 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- :
- :
- "r" (i), // 0
- "r" (x), // 1
- "r" (y1), // 2
- "r" (o8), // 3
- "r" (o16), // 4
- "r" (o24), // 5
- "r" (a0), // 6
- "r" (a1), // 7
- "r" (a2), // 8
- "r" (a3), // 9
- "r" (pre) // 10
- : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory"
- );
+ "xvmaddadp 36, 44, 34 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
-}
+ "xvmaddadp 36, 46, 35 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n" // y2, y3
+ "#n=%1 ap=%11 lda=%12 x=%7=%9 y=%0=%2 alpha=%8 o16=%10\n"
+ "#a0=%3 a1=%4 a2=%5 a3=%6"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (y), // 2
+ "=b" (a0), // 3
+ "=b" (a1), // 4
+ "=&b" (a2), // 5
+ "=&b" (a3) // 6
+ :
+ "m" (*x),
+ "d" (alpha), // 8
+ "r" (x), // 9
+ "b" (16), // 10
+ "3" (ap), // 11
+ "4" (lda) // 12
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/drot.c b/kernel/power/drot.c
index c93f69b..3e10748 100644
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@@ -56,8 +56,6 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03;
FLOAT *x1=x;
FLOAT *y1=y;
- FLOAT c1=*c;
- FLOAT s1=*s;
while ( i<n )
{
@@ -71,14 +69,14 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3];
y03 = y1[3];
- f0 = c1*x00 + s1*y00;
- g0 = c1*y00 - s1*x00;
- f1 = c1*x01 + s1*y01;
- g1 = c1*y01 - s1*x01;
- f2 = c1*x02 + s1*y02;
- g2 = c1*y02 - s1*x02;
- f3 = c1*x03 + s1*y03;
- g3 = c1*y03 - s1*x03;
+ f0 = c*x00 + s*y00;
+ g0 = c*y00 - s*x00;
+ f1 = c*x01 + s*y01;
+ g1 = c*y01 - s*x01;
+ f2 = c*x02 + s*y02;
+ g2 = c*y02 - s*x02;
+ f3 = c*x03 + s*y03;
+ g3 = c*y03 - s*x03;
x1[0] = f0;
y1[0] = g0;
@@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT c1[4] __attribute__ ((aligned (16)));;
- FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;
@@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- c1[0]=c;
- c1[1]=c;
- c1[2]=c;
- c1[3]=c;
- s1[0]=s;
- s1[1]=s;
- s1[2]=s;
- s1[3]=s;
- drot_kernel_16(n1, x1, y1, c1, s1);
+ drot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c
index 4444ac7..016b776 100644
--- a/kernel/power/drot_microk_power8.c
+++ b/kernel/power/drot_microk_power8.c
@@ -38,174 +38,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
-
-static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
{
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __asm__
+ (
+ "xxspltd 36, %x13, 0 \n\t" // load c to both dwords
+ "xxspltd 37, %x14, 0 \n\t" // load s to both dwords
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
-
- __asm__ __volatile__
- (
-
- "lxsdx 36 , %5, %3 \n\t" // load c
- "lxsdx 37 , %5, %4 \n\t" // load s
- "addi %8 , %8, -8 \n\t"
- "addi %9 , %9, -8 \n\t"
-
- "xxspltd 36 , 36, 0 \n\t"
- "xxspltd 37 , 37, 0 \n\t"
-
- "lxvd2x 32, 0, %1 \n\t" // load x
- "lxvd2x 33, %5, %1 \n\t"
- "lxvd2x 34, %6, %1 \n\t"
- "lxvd2x 35, %7, %1 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // load y
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "xvmuldp 48, 32, 36 \n\t" // c * x
- "xvmuldp 49, 33, 36 \n\t"
- "xvmuldp 50, 34, 36 \n\t"
- "xvmuldp 51, 35, 36 \n\t"
-
- "xvmuldp 56, 40, 36 \n\t" // c * y
- "xvmuldp 57, 41, 36 \n\t"
- "xvmuldp 58, 42, 36 \n\t"
- "xvmuldp 59, 43, 36 \n\t"
-
- "xvmuldp 52, 32, 37 \n\t" // s * x
- "xvmuldp 53, 33, 37 \n\t"
-
- "lxvd2x 32, 0, %1 \n\t" // load x
- "lxvd2x 33, %5, %1 \n\t"
-
- "xvmuldp 54, 34, 37 \n\t"
- "xvmuldp 55, 35, 37 \n\t"
-
- "lxvd2x 34, %6, %1 \n\t"
- "lxvd2x 35, %7, %1 \n\t"
-
- "xvmuldp 60, 40, 37 \n\t" // s * y
- "xvmuldp 61, 41, 37 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // load y
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvmuldp 62, 42, 37 \n\t"
- "xvmuldp 63, 43, 37 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
- "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
- "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvd2x 48, 0, %8 \n\t" // store x
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "stxvd2x 56, 0, %9 \n\t" // store y
- "stxvd2x 57, %5, %9 \n\t"
- "stxvd2x 58, %6, %9 \n\t"
- "stxvd2x 59, %7, %9 \n\t"
-
- "addi %8, %8, 64 \n\t"
- "addi %9, %9, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 32, 36 \n\t" // c * x
- "xvmuldp 49, 33, 36 \n\t"
- "xvmuldp 50, 34, 36 \n\t"
- "xvmuldp 51, 35, 36 \n\t"
-
- "xvmuldp 56, 40, 36 \n\t" // c * y
- "xvmuldp 57, 41, 36 \n\t"
- "xvmuldp 58, 42, 36 \n\t"
- "xvmuldp 59, 43, 36 \n\t"
-
- "xvmuldp 52, 32, 37 \n\t" // s * x
- "xvmuldp 53, 33, 37 \n\t"
- "xvmuldp 54, 34, 37 \n\t"
- "xvmuldp 55, 35, 37 \n\t"
-
- "xvmuldp 60, 40, 37 \n\t" // s * y
- "xvmuldp 61, 41, 37 \n\t"
- "xvmuldp 62, 42, 37 \n\t"
- "xvmuldp 63, 43, 37 \n\t"
-
- "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
- "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
- "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
- "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvd2x 48, 0, %8 \n\t" // store x
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
- "stxvd2x 56, 0, %9 \n\t" // store y
- "stxvd2x 57, %5, %9 \n\t"
- "stxvd2x 58, %6, %9 \n\t"
- "stxvd2x 59, %7, %9 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
+ "addi %3, %3, 64 \n\t"
+ "addi %4, %4, 64 \n\t"
+ "addic. %2, %2, -8 \n\t"
+ "ble 2f \n\t"
- :
- :
- "r" (i), // 0
- "r" (x1), // 1
- "r" (y1), // 2
- "r" (c), // 3
- "r" (s), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (x2), // 8
- "r" (y2) // 9
- : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
- );
+ ".p2align 5 \n"
+ "1: \n\t"
-}
+ "xvmuldp 40, 32, 36 \n\t" // c * x
+ "xvmuldp 41, 33, 36 \n\t"
+ "xvmuldp 42, 34, 36 \n\t"
+ "xvmuldp 43, 35, 36 \n\t"
+ "xvmuldp %x5, 48, 36 \n\t" // c * y
+ "xvmuldp %x6, 49, 36 \n\t"
+ "xvmuldp %x7, 50, 36 \n\t"
+ "xvmuldp %x8, 51, 36 \n\t"
+ "xvmuldp 44, 32, 37 \n\t" // s * x
+ "xvmuldp 45, 33, 37 \n\t"
+
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+
+ "xvmuldp 46, 34, 37 \n\t"
+ "xvmuldp 47, 35, 37 \n\t"
+
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
+
+ "xvmuldp %x9, 48, 37 \n\t" // s * y
+ "xvmuldp %x10, 49, 37 \n\t"
+
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+
+ "xvmuldp %x11, 50, 37 \n\t"
+ "xvmuldp %x12, 51, 37 \n\t"
+
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
+
+ "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
+
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 40, 32, 36 \n\t" // c * x
+ "xvmuldp 41, 33, 36 \n\t"
+ "xvmuldp 42, 34, 36 \n\t"
+ "xvmuldp 43, 35, 36 \n\t"
+
+ "xvmuldp %x5, 48, 36 \n\t" // c * y
+ "xvmuldp %x6, 49, 36 \n\t"
+ "xvmuldp %x7, 50, 36 \n\t"
+ "xvmuldp %x8, 51, 36 \n\t"
+
+ "xvmuldp 44, 32, 37 \n\t" // s * x
+ "xvmuldp 45, 33, 37 \n\t"
+ "xvmuldp 46, 34, 37 \n\t"
+ "xvmuldp 47, 35, 37 \n\t"
+
+ "xvmuldp %x9, 48, 37 \n\t" // s * y
+ "xvmuldp %x10, 49, 37 \n\t"
+ "xvmuldp %x11, 50, 37 \n\t"
+ "xvmuldp %x12, 51, 37 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
+ "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
+
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
+ "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y), // 4
+ "=wa" (t0), // 5
+ "=wa" (t1), // 6
+ "=wa" (t2), // 7
+ "=wa" (t3), // 8
+ "=wa" (t4), // 9
+ "=wa" (t5), // 10
+ "=wa" (t6), // 11
+ "=wa" (t7) // 12
+ :
+ "d" (c), // 13
+ "d" (s), // 14
+ "b" (16), // 15
+ "b" (32), // 16
+ "b" (48) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c
index c62a563..f32dc4b 100644
--- a/kernel/power/dscal.c
+++ b/kernel/power/dscal.c
@@ -41,11 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_8)
-static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
+static void dscal_kernel_8 (BLASLONG n, FLOAT *x, FLOAT alpha)
{
BLASLONG i;
- FLOAT alpha = *da;
for( i=0; i<n; i+=8 )
{
@@ -62,7 +61,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
}
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x )
+static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
{
BLASLONG i;
@@ -102,10 +101,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- FLOAT alpha[2];
- alpha[0]=da;
- alpha[1]=da;
- dscal_kernel_8_zero(n1 , alpha , x);
+ dscal_kernel_8_zero(n1, x);
j=n1;
}
@@ -123,10 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- FLOAT alpha[2];
- alpha[0]=da;
- alpha[1]=da;
- dscal_kernel_8(n1 , alpha , x);
+ dscal_kernel_8(n1, x, da);
j=n1;
}
while(j < n)
diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c
index d90c3d8..04898eb 100644
--- a/kernel/power/dscal_microk_power8.c
+++ b/kernel/power/dscal_microk_power8.c
@@ -35,185 +35,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+static void dscal_kernel_8 (long n, double *x, double alpha)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxsdx 33, 0, %3 \n\t"
- "xxspltd 32, 33, 0 \n\t"
- "addi %1, %1, -8 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmuldp 48, 40, 32 \n\t"
- "xvmuldp 49, 41, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 40, 32 \n\t"
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxspltd %x3, %x3, 0 \n\t"
+
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmuldp 40, 32, %x3 \n\t"
+ "xvmuldp 41, 33, %x3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "xvmuldp 42, 34, %x3 \n\t"
+ "xvmuldp 43, 35, %x3 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "xvmuldp 44, 36, %x3 \n\t"
+ "xvmuldp 45, 37, %x3 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "xvmuldp 46, 38, %x3 \n\t"
+ "xvmuldp 47, 39, %x3 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 40, 32, %x3 \n\t"
+ "xvmuldp 41, 33, %x3 \n\t"
+ "xvmuldp 42, 34, %x3 \n\t"
+ "xvmuldp 43, 35, %x3 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmuldp 44, 36, %x3 \n\t"
+ "xvmuldp 45, 37, %x3 \n\t"
+ "xvmuldp 46, 38, %x3 \n\t"
+ "xvmuldp 47, 39, %x3 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n"
+
+ "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x) // 2
+ :
+ "d" (alpha), // 3
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
+
+
+static void dscal_kernel_8_zero (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "xxlxor 32 , 32 , 32 \n\t"
- "addi %1, %1, -8 \n\t"
-
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 32, 0, %1 \n\t"
- "stxvd2x 32, %5, %1 \n\t"
- "stxvd2x 32, %6, %1 \n\t"
- "stxvd2x 32, %7, %1 \n\t"
- "stxvd2x 32, %8, %1 \n\t"
- "stxvd2x 32, %9, %1 \n\t"
- "stxvd2x 32, %10, %1 \n\t"
- "stxvd2x 32, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector double t0;
+
+ __asm__
+ (
+ "xxlxor %x3, %x3, %x3 \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x %x3, 0, %2 \n\t"
+ "stxvd2x %x3, %4, %2 \n\t"
+ "stxvd2x %x3, %5, %2 \n\t"
+ "stxvd2x %x3, %6, %2 \n\t"
+ "stxvd2x %x3, %7, %2 \n\t"
+ "stxvd2x %x3, %8, %2 \n\t"
+ "stxvd2x %x3, %9, %2 \n\t"
+ "stxvd2x %x3, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "=m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0"
+ );
+}
diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c
index 77747c3..31eff34 100644
--- a/kernel/power/dswap_microk_power8.c
+++ b/kernel/power/dswap_microk_power8.c
@@ -35,146 +35,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void dswap_kernel_32 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -8 \n\t"
- "addi %4, %4, -8 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvd2x 32, 0, %2 \n\t"
- "lxvd2x 33, %5, %2 \n\t"
- "lxvd2x 34, %6, %2 \n\t"
- "lxvd2x 35, %7, %2 \n\t"
- "lxvd2x 36, %8, %2 \n\t"
- "lxvd2x 37, %9, %2 \n\t"
- "lxvd2x 38, %10, %2 \n\t"
- "lxvd2x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 48, 0, %1 \n\t"
- "lxvd2x 49, %5, %1 \n\t"
- "lxvd2x 50, %6, %1 \n\t"
- "lxvd2x 51, %7, %1 \n\t"
- "lxvd2x 52, %8, %1 \n\t"
- "lxvd2x 53, %9, %1 \n\t"
- "lxvd2x 54, %10, %1 \n\t"
- "lxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvd2x 56, 0, %1 \n\t"
- "lxvd2x 57, %5, %1 \n\t"
- "lxvd2x 58, %6, %1 \n\t"
- "lxvd2x 59, %7, %1 \n\t"
- "lxvd2x 60, %8, %1 \n\t"
- "lxvd2x 61, %9, %1 \n\t"
- "lxvd2x 62, %10, %1 \n\t"
- "lxvd2x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 32, 0, %3 \n\t"
- "stxvd2x 33, %5, %3 \n\t"
- "stxvd2x 34, %6, %3 \n\t"
- "stxvd2x 35, %7, %3 \n\t"
- "stxvd2x 36, %8, %3 \n\t"
- "stxvd2x 37, %9, %3 \n\t"
- "stxvd2x 38, %10, %3 \n\t"
- "stxvd2x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 40, 0, %3 \n\t"
- "stxvd2x 41, %5, %3 \n\t"
- "stxvd2x 42, %6, %3 \n\t"
- "stxvd2x 43, %7, %3 \n\t"
- "stxvd2x 44, %8, %3 \n\t"
- "stxvd2x 45, %9, %3 \n\t"
- "stxvd2x 46, %10, %3 \n\t"
- "stxvd2x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 48, 0, %4 \n\t"
- "stxvd2x 49, %5, %4 \n\t"
- "stxvd2x 50, %6, %4 \n\t"
- "stxvd2x 51, %7, %4 \n\t"
- "stxvd2x 52, %8, %4 \n\t"
- "stxvd2x 53, %9, %4 \n\t"
- "stxvd2x 54, %10, %4 \n\t"
- "stxvd2x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvd2x 56, 0, %4 \n\t"
- "stxvd2x 57, %5, %4 \n\t"
- "stxvd2x 58, %6, %4 \n\t"
- "stxvd2x 59, %7, %4 \n\t"
- "stxvd2x 60, %8, %4 \n\t"
- "stxvd2x 61, %9, %4 \n\t"
- "stxvd2x 62, %10, %4 \n\t"
- "stxvd2x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c
index 43311f2..fb10b1d 100644
--- a/kernel/power/sasum.c
+++ b/kernel/power/sasum.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
-#define ABS fabs
+#error supports float only
#else
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_32
-static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -105,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf);
@@ -117,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- sasum_kernel_32(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = sasum_kernel_32(n1, x);
i=n1;
}
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 847fffe..25a969d 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_32 1
-static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float sasum_kernel_32 (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
-
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -32 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "xvaddsp 32, 32, 33 \n\t"
- "xvaddsp 34, 34, 35 \n\t"
- "xvaddsp 36, 36, 37 \n\t"
- "xvaddsp 38, 38, 39 \n\t"
-
- "xvaddsp 32, 32, 34 \n\t"
- "xvaddsp 36, 36, 38 \n\t"
-
- "xvaddsp 32, 32, 36 \n\t"
-
-
- "stxvw4x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -32 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %0, 32 \n"
+
+     "#n=%1 x=%7=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=f" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 2e08e35..444a6d4 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -35,97 +35,78 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void scopy_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c
index 52fb1fe..31f4734 100644
--- a/kernel/power/sdot.c
+++ b/kernel/power/sdot.c
@@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
@@ -61,8 +61,7 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ;
}
- *d += dot;
-
+ return dot;
}
#endif
@@ -82,8 +81,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -32;
if ( n1 )
- sdot_kernel_16(n1, x, y , &dot );
-
+ dot = sdot_kernel_16(n1, x, y);
i = n1;
while(i < n)
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index 6dd588a..7f7ccfa 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -34,146 +34,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static float sdot_kernel_16 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- FLOAT tempdot[4];
-
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 52, %8, %3 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 53, %9, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 54, %10, %3 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
- "lxvw4x 55, %11, %3 \n\t"
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "xvmaddasp 32, 40, 48 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "xvmaddasp 33, 41, 49 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "xvmaddasp 34, 42, 50 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "xvmaddasp 35, 43, 51 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "xvmaddasp 36, 44, 52 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 52, %8, %3 \n\t"
- "xvmaddasp 37, 45, 53 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 53, %9, %3 \n\t"
- "xvmaddasp 38, 46, 54 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 54, %10, %3 \n\t"
- "xvmaddasp 39, 47, 55 \n\t"
-
- "lxvw4x 47, %11, %2 \n\t"
- "lxvw4x 55, %11, %3 \n\t"
-
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddasp 32, 40, 48 \n\t"
- "xvmaddasp 33, 41, 49 \n\t"
- "xvmaddasp 34, 42, 50 \n\t"
- "xvmaddasp 35, 43, 51 \n\t"
- "xvmaddasp 36, 44, 52 \n\t"
- "xvmaddasp 37, 45, 53 \n\t"
- "xvmaddasp 38, 46, 54 \n\t"
- "xvmaddasp 39, 47, 55 \n\t"
-
- "xvaddsp 32, 32 , 33 \n\t"
- "xvaddsp 34, 34 , 35 \n\t"
- "xvaddsp 36, 36 , 37 \n\t"
- "xvaddsp 38, 38 , 39 \n\t"
-
- "xvaddsp 32, 32 , 34 \n\t"
- "xvaddsp 36, 36 , 38 \n\t"
-
- "xvaddsp 32, 32 , 36 \n\t"
-
- "stxvw4x 32, 0 , %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (tempdot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112), // 11
- "r" (pre) // 12
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
- *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3];
-
-
-}
-
-
+ float dot;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "lxvw4x 41, %10, %2 \n\t"
+ "lxvw4x 49, %10, %3 \n\t"
+ "lxvw4x 42, %11, %2 \n\t"
+ "lxvw4x 50, %11, %3 \n\t"
+ "lxvw4x 43, %12, %2 \n\t"
+ "lxvw4x 51, %12, %3 \n\t"
+ "lxvw4x 44, %13, %2 \n\t"
+ "lxvw4x %x4, %13, %3 \n\t"
+ "lxvw4x 45, %14, %2 \n\t"
+ "lxvw4x %x5, %14, %3 \n\t"
+ "lxvw4x 46, %15, %2 \n\t"
+ "lxvw4x %x6, %15, %3 \n\t"
+ "lxvw4x 47, %16, %2 \n\t"
+ "lxvw4x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "lxvw4x 41, %10, %2 \n\t"
+ "lxvw4x 49, %10, %3 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "lxvw4x 42, %11, %2 \n\t"
+ "lxvw4x 50, %11, %3 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "lxvw4x 43, %12, %2 \n\t"
+ "lxvw4x 51, %12, %3 \n\t"
+ "xvmaddasp 36, 44, %x4 \n\t"
+ "lxvw4x 44, %13, %2 \n\t"
+ "lxvw4x %x4, %13, %3 \n\t"
+ "xvmaddasp 37, 45, %x5 \n\t"
+ "lxvw4x 45, %14, %2 \n\t"
+ "lxvw4x %x5, %14, %3 \n\t"
+ "xvmaddasp 38, 46, %x6 \n\t"
+ "lxvw4x 46, %15, %2 \n\t"
+ "lxvw4x %x6, %15, %3 \n\t"
+ "xvmaddasp 39, 47, %x7 \n\t"
+ "lxvw4x 47, %16, %2 \n\t"
+ "lxvw4x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "xvmaddasp 36, 44, %x4 \n\t"
+ "xvmaddasp 37, 45, %x5 \n\t"
+ "xvmaddasp 38, 46, %x6 \n\t"
+ "xvmaddasp 39, 47, %x7 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %x0, 32 \n"
+
+     "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o112=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
+ :
+ "=f" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3) // 7
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (16), // 10
+ "b" (32), // 11
+ "b" (48), // 12
+ "b" (64), // 13
+ "b" (80), // 14
+ "b" (96), // 15
+ "b" (112) // 16
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return dot;
+}
diff --git a/kernel/power/srot.c b/kernel/power/srot.c
index d464846..d2910ff 100644
--- a/kernel/power/srot.c
+++ b/kernel/power/srot.c
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@@ -56,8 +56,6 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03;
FLOAT *x1=x;
FLOAT *y1=y;
- FLOAT c1=*c;
- FLOAT s1=*s;
while ( i<n )
{
@@ -71,14 +69,14 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3];
y03 = y1[3];
- f0 = c1*x00 + s1*y00;
- g0 = c1*y00 - s1*x00;
- f1 = c1*x01 + s1*y01;
- g1 = c1*y01 - s1*x01;
- f2 = c1*x02 + s1*y02;
- g2 = c1*y02 - s1*x02;
- f3 = c1*x03 + s1*y03;
- g3 = c1*y03 - s1*x03;
+ f0 = c*x00 + s*y00;
+ g0 = c*y00 - s*x00;
+ f1 = c*x01 + s*y01;
+ g1 = c*y01 - s*x01;
+ f2 = c*x02 + s*y02;
+ g2 = c*y02 - s*x02;
+ f3 = c*x03 + s*y03;
+ g3 = c*y03 - s*x03;
x1[0] = f0;
y1[0] = g0;
@@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT c1[4] __attribute__ ((aligned (16)));;
- FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;
@@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- c1[0]=c;
- c1[1]=c;
- c1[2]=c;
- c1[3]=c;
- s1[0]=s;
- s1[1]=s;
- s1[2]=s;
- s1[3]=s;
- srot_kernel_16(n1, x1, y1, c1, s1);
+ srot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index ade6550..0a18c16 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -38,171 +38,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
-
-static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
{
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+ __vector float t4;
+ __vector float t5;
+ __vector float t6;
+ __vector float t7;
+ __asm__
+ (
+ "xscvdpspn 36, %x13 \n\t" // load c to all words
+ "xxspltw 36, 36, 0 \n\t"
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 36 , 0, %3 \n\t" // load c
- "lxvw4x 37 , 0, %4 \n\t" // load s
- "addi %8 , %8, -4 \n\t"
- "addi %9 , %9, -4 \n\t"
-
- "lxvw4x 32, 0, %1 \n\t" // load x
- "lxvw4x 33, %5, %1 \n\t"
- "lxvw4x 34, %6, %1 \n\t"
- "lxvw4x 35, %7, %1 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t" // load y
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "xvmulsp 48, 32, 36 \n\t" // c * x
- "xvmulsp 49, 33, 36 \n\t"
- "xvmulsp 50, 34, 36 \n\t"
- "xvmulsp 51, 35, 36 \n\t"
-
- "xvmulsp 56, 40, 36 \n\t" // c * y
- "xvmulsp 57, 41, 36 \n\t"
- "xvmulsp 58, 42, 36 \n\t"
- "xvmulsp 59, 43, 36 \n\t"
-
- "xvmulsp 52, 32, 37 \n\t" // s * x
- "xvmulsp 53, 33, 37 \n\t"
-
- "lxvw4x 32, 0, %1 \n\t" // load x
- "lxvw4x 33, %5, %1 \n\t"
-
- "xvmulsp 54, 34, 37 \n\t"
- "xvmulsp 55, 35, 37 \n\t"
-
- "lxvw4x 34, %6, %1 \n\t"
- "lxvw4x 35, %7, %1 \n\t"
-
- "xvmulsp 60, 40, 37 \n\t" // s * y
- "xvmulsp 61, 41, 37 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t" // load y
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvmulsp 62, 42, 37 \n\t"
- "xvmulsp 63, 43, 37 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
- "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
- "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvw4x 48, 0, %8 \n\t" // store x
- "stxvw4x 49, %5, %8 \n\t"
- "stxvw4x 50, %6, %8 \n\t"
- "stxvw4x 51, %7, %8 \n\t"
-
- "stxvw4x 56, 0, %9 \n\t" // store y
- "stxvw4x 57, %5, %9 \n\t"
- "stxvw4x 58, %6, %9 \n\t"
- "stxvw4x 59, %7, %9 \n\t"
-
- "addi %8, %8, 64 \n\t"
- "addi %9, %9, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmulsp 48, 32, 36 \n\t" // c * x
- "xvmulsp 49, 33, 36 \n\t"
- "xvmulsp 50, 34, 36 \n\t"
- "xvmulsp 51, 35, 36 \n\t"
-
- "xvmulsp 56, 40, 36 \n\t" // c * y
- "xvmulsp 57, 41, 36 \n\t"
- "xvmulsp 58, 42, 36 \n\t"
- "xvmulsp 59, 43, 36 \n\t"
-
- "xvmulsp 52, 32, 37 \n\t" // s * x
- "xvmulsp 53, 33, 37 \n\t"
- "xvmulsp 54, 34, 37 \n\t"
- "xvmulsp 55, 35, 37 \n\t"
-
- "xvmulsp 60, 40, 37 \n\t" // s * y
- "xvmulsp 61, 41, 37 \n\t"
- "xvmulsp 62, 42, 37 \n\t"
- "xvmulsp 63, 43, 37 \n\t"
-
- "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
- "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
- "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
- "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
+ "xscvdpspn 37, %x14 \n\t" // load s to all words
+ "xxspltw 37, 37, 0 \n\t"
- "stxvw4x 48, 0, %8 \n\t" // store x
- "stxvw4x 49, %5, %8 \n\t"
- "stxvw4x 50, %6, %8 \n\t"
- "stxvw4x 51, %7, %8 \n\t"
+ "lxvw4x 32, 0, %3 \n\t" // load x
+ "lxvw4x 33, %15, %3 \n\t"
+ "lxvw4x 34, %16, %3 \n\t"
+ "lxvw4x 35, %17, %3 \n\t"
- "stxvw4x 56, 0, %9 \n\t" // store y
- "stxvw4x 57, %5, %9 \n\t"
- "stxvw4x 58, %6, %9 \n\t"
- "stxvw4x 59, %7, %9 \n\t"
+ "lxvw4x 48, 0, %4 \n\t" // load y
+ "lxvw4x 49, %15, %4 \n\t"
+ "lxvw4x 50, %16, %4 \n\t"
+ "lxvw4x 51, %17, %4 \n\t"
+ "addi %3, %3, 64 \n\t"
+ "addi %4, %4, 64 \n\t"
+ "addic. %2, %2, -16 \n\t"
+ "ble 2f \n\t"
- :
- :
- "r" (i), // 0
- "r" (x1), // 1
- "r" (y1), // 2
- "r" (c), // 3
- "r" (s), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (x2), // 8
- "r" (y2) // 9
- : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
- );
+ ".p2align 5 \n"
+ "1: \n\t"
-}
+ "xvmulsp 40, 32, 36 \n\t" // c * x
+ "xvmulsp 41, 33, 36 \n\t"
+ "xvmulsp 42, 34, 36 \n\t"
+ "xvmulsp 43, 35, 36 \n\t"
+ "xvmulsp %x5, 48, 36 \n\t" // c * y
+ "xvmulsp %x6, 49, 36 \n\t"
+ "xvmulsp %x7, 50, 36 \n\t"
+ "xvmulsp %x8, 51, 36 \n\t"
+ "xvmulsp 44, 32, 37 \n\t" // s * x
+ "xvmulsp 45, 33, 37 \n\t"
+
+ "lxvw4x 32, 0, %3 \n\t" // load x
+ "lxvw4x 33, %15, %3 \n\t"
+
+ "xvmulsp 46, 34, 37 \n\t"
+ "xvmulsp 47, 35, 37 \n\t"
+
+ "lxvw4x 34, %16, %3 \n\t"
+ "lxvw4x 35, %17, %3 \n\t"
+
+ "xvmulsp %x9, 48, 37 \n\t" // s * y
+ "xvmulsp %x10, 49, 37 \n\t"
+
+ "lxvw4x 48, 0, %4 \n\t" // load y
+ "lxvw4x 49, %15, %4 \n\t"
+
+ "xvmulsp %x11, 50, 37 \n\t"
+ "xvmulsp %x12, 51, 37 \n\t"
+
+ "lxvw4x 50, %16, %4 \n\t"
+ "lxvw4x 51, %17, %4 \n\t"
+
+ "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvw4x 40, 0, %3 \n\t" // store x
+ "stxvw4x 41, %15, %3 \n\t"
+ "stxvw4x 42, %16, %3 \n\t"
+ "stxvw4x 43, %17, %3 \n\t"
+
+ "stxvw4x %x5, 0, %4 \n\t" // store y
+ "stxvw4x %x6, %15, %4 \n\t"
+ "stxvw4x %x7, %16, %4 \n\t"
+ "stxvw4x %x8, %17, %4 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmulsp 40, 32, 36 \n\t" // c * x
+ "xvmulsp 41, 33, 36 \n\t"
+ "xvmulsp 42, 34, 36 \n\t"
+ "xvmulsp 43, 35, 36 \n\t"
+
+ "xvmulsp %x5, 48, 36 \n\t" // c * y
+ "xvmulsp %x6, 49, 36 \n\t"
+ "xvmulsp %x7, 50, 36 \n\t"
+ "xvmulsp %x8, 51, 36 \n\t"
+
+ "xvmulsp 44, 32, 37 \n\t" // s * x
+ "xvmulsp 45, 33, 37 \n\t"
+ "xvmulsp 46, 34, 37 \n\t"
+ "xvmulsp 47, 35, 37 \n\t"
+
+ "xvmulsp %x9, 48, 37 \n\t" // s * y
+ "xvmulsp %x10, 49, 37 \n\t"
+ "xvmulsp %x11, 50, 37 \n\t"
+ "xvmulsp %x12, 51, 37 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
+ "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvw4x 40, 0, %3 \n\t" // store x
+ "stxvw4x 41, %15, %3 \n\t"
+ "stxvw4x 42, %16, %3 \n\t"
+ "stxvw4x 43, %17, %3 \n\t"
+
+ "stxvw4x %x5, 0, %4 \n\t" // store y
+ "stxvw4x %x6, %15, %4 \n\t"
+ "stxvw4x %x7, %16, %4 \n\t"
+ "stxvw4x %x8, %17, %4 \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
+ "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y), // 4
+ "=wa" (t0), // 5
+ "=wa" (t1), // 6
+ "=wa" (t2), // 7
+ "=wa" (t3), // 8
+ "=wa" (t4), // 9
+ "=wa" (t5), // 10
+ "=wa" (t6), // 11
+ "=wa" (t7) // 12
+ :
+ "f" (c), // 13
+ "f" (s), // 14
+ "b" (16), // 15
+ "b" (32), // 16
+ "b" (48) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c
index c6ef5e9..bd5cdc4 100644
--- a/kernel/power/sscal.c
+++ b/kernel/power/sscal.c
@@ -42,11 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_16)
-static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
+static void sscal_kernel_16 (BLASLONG n, FLOAT *x, FLOAT alpha)
{
BLASLONG i;
- FLOAT alpha = *da;
for( i=0; i<n; i+=8 )
{
@@ -63,7 +62,7 @@ static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
}
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
+static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
{
BLASLONG i;
@@ -90,7 +89,6 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
- FLOAT alpha[4] __attribute__ ((aligned (16)));;
if ( n <= 0 || inc_x <=0 )
return(0);
@@ -105,11 +103,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
- alpha[0]=da;
- alpha[1]=da;
- alpha[2]=da;
- alpha[3]=da;
- sscal_kernel_16_zero(n1 , alpha , x);
+ sscal_kernel_16_zero(n1, x);
j=n1;
}
@@ -127,11 +121,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
- alpha[0]=da;
- alpha[1]=da;
- alpha[2]=da;
- alpha[3]=da;
- sscal_kernel_16(n1 , alpha , x);
+ sscal_kernel_16(n1, x, da);
j=n1;
}
while(j < n)
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 963cec7..49862a3 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -35,184 +35,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
+static void sscal_kernel_16 (long n, float *x, float alpha)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 32, 0, %3 \n\t"
- "addi %1, %1, -4 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmulsp 48, 40, 32 \n\t"
- "xvmulsp 49, 41, 32 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "xvmulsp 50, 42, 32 \n\t"
- "xvmulsp 51, 43, 32 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "xvmulsp 52, 44, 32 \n\t"
- "xvmulsp 53, 45, 32 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "xvmulsp 54, 46, 32 \n\t"
- "xvmulsp 55, 47, 32 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "stxvw4x 48, 0, %1 \n\t"
- "stxvw4x 49, %5, %1 \n\t"
- "stxvw4x 50, %6, %1 \n\t"
- "stxvw4x 51, %7, %1 \n\t"
- "stxvw4x 52, %8, %1 \n\t"
- "stxvw4x 53, %9, %1 \n\t"
- "stxvw4x 54, %10, %1 \n\t"
- "stxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmulsp 48, 40, 32 \n\t"
- "xvmulsp 49, 41, 32 \n\t"
- "xvmulsp 50, 42, 32 \n\t"
- "xvmulsp 51, 43, 32 \n\t"
- "xvmulsp 52, 44, 32 \n\t"
- "xvmulsp 53, 45, 32 \n\t"
- "xvmulsp 54, 46, 32 \n\t"
- "xvmulsp 55, 47, 32 \n\t"
-
- "stxvw4x 48, 0, %1 \n\t"
- "stxvw4x 49, %5, %1 \n\t"
- "stxvw4x 50, %6, %1 \n\t"
- "stxvw4x 51, %7, %1 \n\t"
- "stxvw4x 52, %8, %1 \n\t"
- "stxvw4x 53, %9, %1 \n\t"
- "stxvw4x 54, %10, %1 \n\t"
- "stxvw4x 55, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xscvdpspn %x3, %x3 \n\t"
+ "xxspltw %x3, %x3, 0 \n\t"
+
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %4, %2 \n\t"
+ "lxvw4x 34, %5, %2 \n\t"
+ "lxvw4x 35, %6, %2 \n\t"
+ "lxvw4x 36, %7, %2 \n\t"
+ "lxvw4x 37, %8, %2 \n\t"
+ "lxvw4x 38, %9, %2 \n\t"
+ "lxvw4x 39, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmulsp 40, 32, %x3 \n\t"
+ "xvmulsp 41, 33, %x3 \n\t"
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %4, %2 \n\t"
+ "xvmulsp 42, 34, %x3 \n\t"
+ "xvmulsp 43, 35, %x3 \n\t"
+ "lxvw4x 34, %5, %2 \n\t"
+ "lxvw4x 35, %6, %2 \n\t"
+ "xvmulsp 44, 36, %x3 \n\t"
+ "xvmulsp 45, 37, %x3 \n\t"
+ "lxvw4x 36, %7, %2 \n\t"
+ "lxvw4x 37, %8, %2 \n\t"
+ "xvmulsp 46, 38, %x3 \n\t"
+ "xvmulsp 47, 39, %x3 \n\t"
+ "lxvw4x 38, %9, %2 \n\t"
+ "lxvw4x 39, %10, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "stxvw4x 40, 0, %2 \n\t"
+ "stxvw4x 41, %4, %2 \n\t"
+ "stxvw4x 42, %5, %2 \n\t"
+ "stxvw4x 43, %6, %2 \n\t"
+ "stxvw4x 44, %7, %2 \n\t"
+ "stxvw4x 45, %8, %2 \n\t"
+ "stxvw4x 46, %9, %2 \n\t"
+ "stxvw4x 47, %10, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmulsp 40, 32, %x3 \n\t"
+ "xvmulsp 41, 33, %x3 \n\t"
+ "xvmulsp 42, 34, %x3 \n\t"
+ "xvmulsp 43, 35, %x3 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmulsp 44, 36, %x3 \n\t"
+ "xvmulsp 45, 37, %x3 \n\t"
+ "xvmulsp 46, 38, %x3 \n\t"
+ "xvmulsp 47, 39, %x3 \n\t"
+
+ "stxvw4x 40, 0, %2 \n\t"
+ "stxvw4x 41, %4, %2 \n\t"
+ "stxvw4x 42, %5, %2 \n\t"
+ "stxvw4x 43, %6, %2 \n\t"
+ "stxvw4x 44, %7, %2 \n\t"
+ "stxvw4x 45, %8, %2 \n\t"
+ "stxvw4x 46, %9, %2 \n\t"
+ "stxvw4x 47, %10, %2 \n"
+
+ "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+f" (alpha) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
+
+
+static void sscal_kernel_16_zero (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "xxlxor 32 , 32 , 32 \n\t"
- "addi %1, %1, -4 \n\t"
-
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 32, 0, %1 \n\t"
- "stxvw4x 32, %5, %1 \n\t"
- "stxvw4x 32, %6, %1 \n\t"
- "stxvw4x 32, %7, %1 \n\t"
- "stxvw4x 32, %8, %1 \n\t"
- "stxvw4x 32, %9, %1 \n\t"
- "stxvw4x 32, %10, %1 \n\t"
- "stxvw4x 32, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector float t0;
+
+ __asm__
+ (
+ "xxlxor %x3, %x3, %x3 \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x %x3, 0, %2 \n\t"
+ "stxvw4x %x3, %4, %2 \n\t"
+ "stxvw4x %x3, %5, %2 \n\t"
+ "stxvw4x %x3, %6, %2 \n\t"
+ "stxvw4x %x3, %7, %2 \n\t"
+ "stxvw4x %x3, %8, %2 \n\t"
+ "stxvw4x %x3, %9, %2 \n\t"
+ "stxvw4x %x3, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "=m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0"
+ );
+}
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index c48e743..d44f167 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -35,102 +35,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void sswap_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -4 \n\t"
- "addi %4, %4, -4 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 48, 0, %1 \n\t"
- "lxvw4x 49, %5, %1 \n\t"
- "lxvw4x 50, %6, %1 \n\t"
- "lxvw4x 51, %7, %1 \n\t"
- "lxvw4x 52, %8, %1 \n\t"
- "lxvw4x 53, %9, %1 \n\t"
- "lxvw4x 54, %10, %1 \n\t"
- "lxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 52, %8, %4 \n\t"
- "stxvw4x 53, %9, %4 \n\t"
- "stxvw4x 54, %10, %4 \n\t"
- "stxvw4x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %4 \n\t"
+ "lxvw4x 33, %5, %4 \n\t"
+ "lxvw4x 34, %6, %4 \n\t"
+ "lxvw4x 35, %7, %4 \n\t"
+ "lxvw4x 36, %8, %4 \n\t"
+ "lxvw4x 37, %9, %4 \n\t"
+ "lxvw4x 38, %10, %4 \n\t"
+ "lxvw4x 39, %11, %4 \n\t"
+
+ "lxvw4x 40, 0, %3 \n\t"
+ "lxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 42, %6, %3 \n\t"
+ "lxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 44, %8, %3 \n\t"
+ "lxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 46, %10, %3 \n\t"
+ "lxvw4x 47, %11, %3 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %4 \n\t"
+ "stxvw4x 41, %5, %4 \n\t"
+ "stxvw4x 42, %6, %4 \n\t"
+ "stxvw4x 43, %7, %4 \n\t"
+ "stxvw4x 44, %8, %4 \n\t"
+ "stxvw4x 45, %9, %4 \n\t"
+ "stxvw4x 46, %10, %4 \n\t"
+ "stxvw4x 47, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c
index abd6ec0..0b6b87d 100644
--- a/kernel/power/zasum.c
+++ b/kernel/power/zasum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT zasum_kernel_8(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,9 +92,7 @@ static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -104,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -117,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- zasum_kernel_8(n1, x, svec);
- sumf = svec[0] + svec[1];
+ sumf = zasum_kernel_8(n1, x);
i=n1;
ip=2*n1;
}
diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c
index b9f6c0a..8236690 100644
--- a/kernel/power/zasum_microk_power8.c
+++ b/kernel/power/zasum_microk_power8.c
@@ -34,144 +34,140 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
+static double zasum_kernel_8 (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -8 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
-
- "stxvd2x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ double sum;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -8 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+ "xsadddp %x0, 32, 33 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=d" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c
index 0ee0c1b..dd7ab6c 100644
--- a/kernel/power/zaxpy.c
+++ b/kernel/power/zaxpy.c
@@ -78,7 +78,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT da[4];
if ( n <= 0 ) return(0);
@@ -89,11 +88,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if ( n1 )
{
- da[0] = da_r;
- da[1] = da_r;
- da[2] = da_i;
- da[3] = da_i;
- zaxpy_kernel_4(n1, x, y , da );
+ zaxpy_kernel_4 (n1, x, y, da_r, da_i);
ix = 2 * n1;
}
i = n1;
diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c
index c8a529f..124614f 100644
--- a/kernel/power/zaxpy_microk_power8.c
+++ b/kernel/power/zaxpy_microk_power8.c
@@ -35,216 +35,225 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4 1
-static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
-
-static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void zaxpy_kernel_4 (long n, double *x, double *y,
+ double alpha_r, double alpha_i)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
-
#if !defined(CONJ)
- FLOAT mvec[2] = { -1.0, 1.0 };
+ static const double mvec[2] = { -1.0, 1.0 };
#else
- FLOAT mvec[2] = { 1.0, -1.0 };
+ static const double mvec[2] = { 1.0, -1.0 };
#endif
-
-
- __asm__ __volatile__
- (
-
- "lxsdx 34, 0 , %4 \n\t" // alpha_r
- "lxsdx 35, %5, %4 \n\t" // alpha_i
- "xxspltd 32, 34, 0 \n\t"
- "xxspltd 33, 35, 0 \n\t"
-
- "lxvd2x 36, 0, %9 \n\t" // mvec
+ const double *mvecp = mvec;
+
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+ long ytmp;
+
+ __asm__
+ (
+ "xxspltd 32, %x19, 0 \n\t" // alpha_r
+ "xxspltd 33, %x20, 0 \n\t" // alpha_i
+
+ "lxvd2x 36, 0, %21 \n\t" // mvec
#if !defined(CONJ)
- "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec
+ "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec
#else
- "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec
+ "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec
#endif
- "addi %8, %8, -8 \n\t"
-
- "dcbt %2, %10 \n\t"
- "dcbt %3, %10 \n\t"
-
-
- "lxvd2x 40, 0, %2 \n\t" // x0
- "lxvd2x 41, %5, %2 \n\t" // x1
- "lxvd2x 42, %6, %2 \n\t" // x2
- "lxvd2x 43, %7, %2 \n\t" // x3
-
- "lxvd2x 48, 0, %3 \n\t" // y0
- "lxvd2x 49, %5, %3 \n\t" // y1
- "lxvd2x 50, %6, %3 \n\t" // y2
- "lxvd2x 51, %7, %3 \n\t" // y3
-
- "xxswapd 56, 40 \n\t" // exchange real and imag part
- "xxswapd 57, 41 \n\t" // exchange real and imag part
- "xxswapd 58, 42 \n\t" // exchange real and imag part
- "xxswapd 59, 43 \n\t" // exchange real and imag part
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t" // x4
- "lxvd2x 45, %5, %2 \n\t" // x5
- "lxvd2x 46, %6, %2 \n\t" // x6
- "lxvd2x 47, %7, %2 \n\t" // x7
-
- "lxvd2x 52, 0, %3 \n\t" // y4
- "lxvd2x 53, %5, %3 \n\t" // y5
- "lxvd2x 54, %6, %3 \n\t" // y6
- "lxvd2x 55, %7, %3 \n\t" // y7
-
- "xxswapd 60, 44 \n\t" // exchange real and imag part
- "xxswapd 61, 45 \n\t" // exchange real and imag part
- "xxswapd 62, 46 \n\t" // exchange real and imag part
- "xxswapd 63, 47 \n\t" // exchange real and imag part
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %10 \n\t"
- "dcbt %3, %10 \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
- "xvmaddadp 49, 41, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t" // x0
- "lxvd2x 41, %5, %2 \n\t" // x1
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
- "lxvd2x 42, %6, %2 \n\t" // x2
- "lxvd2x 43, %7, %2 \n\t" // x3
-
- "xvmaddadp 52, 44, 32 \n\t"
- "addi %2, %2, 64 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "lxvd2x 44, 0, %2 \n\t" // x4
- "lxvd2x 45, %5, %2 \n\t" // x5
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
- "lxvd2x 46, %6, %2 \n\t" // x6
- "lxvd2x 47, %7, %2 \n\t" // x7
-
- "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
- "addi %2, %2, 64 \n\t"
- "xvmaddadp 49, 57, 33 \n\t"
- "xvmaddadp 50, 58, 33 \n\t"
- "xvmaddadp 51, 59, 33 \n\t"
-
- "xvmaddadp 52, 60, 33 \n\t"
- "xvmaddadp 53, 61, 33 \n\t"
- "xvmaddadp 54, 62, 33 \n\t"
- "xvmaddadp 55, 63, 33 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "xxswapd 56, 40 \n\t" // exchange real and imag part
- "xxswapd 57, 41 \n\t" // exchange real and imag part
- "lxvd2x 48, 0, %3 \n\t" // y0
- "lxvd2x 49, %5, %3 \n\t" // y1
- "xxswapd 58, 42 \n\t" // exchange real and imag part
- "xxswapd 59, 43 \n\t" // exchange real and imag part
- "lxvd2x 50, %6, %3 \n\t" // y2
- "lxvd2x 51, %7, %3 \n\t" // y3
-
- "xxswapd 60, 44 \n\t" // exchange real and imag part
- "addi %3, %3, 64 \n\t"
- "xxswapd 61, 45 \n\t" // exchange real and imag part
- "lxvd2x 52, 0, %3 \n\t" // y4
- "lxvd2x 53, %5, %3 \n\t" // y5
- "xxswapd 62, 46 \n\t" // exchange real and imag part
- "xxswapd 63, 47 \n\t" // exchange real and imag part
- "lxvd2x 54, %6, %3 \n\t" // y6
- "lxvd2x 55, %7, %3 \n\t" // y7
-
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
- "xvmaddadp 49, 41, 32 \n\t"
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
- "xvmaddadp 49, 57, 33 \n\t"
- "xvmaddadp 50, 58, 33 \n\t"
- "xvmaddadp 51, 59, 33 \n\t"
-
- "xvmaddadp 52, 60, 33 \n\t"
- "xvmaddadp 53, 61, 33 \n\t"
- "xvmaddadp 54, 62, 33 \n\t"
- "xvmaddadp 55, 63, 33 \n\t"
-
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (alpha), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (y2), // 8
- "r" (mvec), // 9
- "r" (pre) // 10
- : "cr0", "%0", "%2" , "%3", "%8", "memory"
- );
-
-}
-
-
+ "mr %16, %3 \n\t"
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %22, %2 \n\t" // x1
+ "lxvd2x 42, %23, %2 \n\t" // x2
+ "lxvd2x 43, %24, %2 \n\t" // x3
+
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %22, %3 \n\t" // y1
+ "lxvd2x 50, %23, %3 \n\t" // y2
+ "lxvd2x 51, %24, %3 \n\t" // y3
+
+ "xxswapd %x8, 40 \n\t" // exchange real and imag part
+ "xxswapd %x9, 41 \n\t" // exchange real and imag part
+ "xxswapd %x10, 42 \n\t" // exchange real and imag part
+ "xxswapd %x11, 43 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %22, %2 \n\t" // x5
+ "lxvd2x 46, %23, %2 \n\t" // x6
+ "lxvd2x 47, %24, %2 \n\t" // x7
+
+ "lxvd2x %x4, 0, %3 \n\t" // y4
+ "lxvd2x %x5, %22, %3 \n\t" // y5
+ "lxvd2x %x6, %23, %3 \n\t" // y6
+ "lxvd2x %x7, %24, %3 \n\t" // y7
+
+ "xxswapd %x12, 44 \n\t" // exchange real and imag part
+ "xxswapd %x13, 45 \n\t" // exchange real and imag part
+ "xxswapd %x14, 46 \n\t" // exchange real and imag part
+ "xxswapd %x15, 47 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %22, %2 \n\t" // x1
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+ "lxvd2x 42, %23, %2 \n\t" // x2
+ "lxvd2x 43, %24, %2 \n\t" // x3
+
+ "xvmaddadp %x4, 44, 32 \n\t"
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp %x5, 45, 32 \n\t"
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %22, %2 \n\t" // x5
+ "xvmaddadp %x6, 46, 32 \n\t"
+ "xvmaddadp %x7, 47, 32 \n\t"
+ "lxvd2x 46, %23, %2 \n\t" // x6
+ "lxvd2x 47, %24, %2 \n\t" // x7
+
+ "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp 49, %x9, 33 \n\t"
+ "xvmaddadp 50, %x10, 33 \n\t"
+ "xvmaddadp 51, %x11, 33 \n\t"
+
+ "xvmaddadp %x4, %x12, 33 \n\t"
+ "xvmaddadp %x5, %x13, 33 \n\t"
+ "xvmaddadp %x6, %x14, 33 \n\t"
+ "xvmaddadp %x7, %x15, 33 \n\t"
+
+ "stxvd2x 48, 0, %16 \n\t"
+ "stxvd2x 49, %22, %16 \n\t"
+ "stxvd2x 50, %23, %16 \n\t"
+ "stxvd2x 51, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "stxvd2x %x4, 0, %16 \n\t"
+ "stxvd2x %x5, %22, %16 \n\t"
+ "stxvd2x %x6, %23, %16 \n\t"
+ "stxvd2x %x7, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "xxswapd %x8, 40 \n\t" // exchange real and imag part
+ "xxswapd %x9, 41 \n\t" // exchange real and imag part
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %22, %3 \n\t" // y1
+ "xxswapd %x10, 42 \n\t" // exchange real and imag part
+ "xxswapd %x11, 43 \n\t" // exchange real and imag part
+ "lxvd2x 50, %23, %3 \n\t" // y2
+ "lxvd2x 51, %24, %3 \n\t" // y3
+
+ "xxswapd %x12, 44 \n\t" // exchange real and imag part
+ "addi %3, %3, 64 \n\t"
+ "xxswapd %x13, 45 \n\t" // exchange real and imag part
+ "lxvd2x %x4, 0, %3 \n\t" // y4
+ "lxvd2x %x5, %22, %3 \n\t" // y5
+ "xxswapd %x14, 46 \n\t" // exchange real and imag part
+ "xxswapd %x15, 47 \n\t" // exchange real and imag part
+ "lxvd2x %x6, %23, %3 \n\t" // y6
+ "lxvd2x %x7, %24, %3 \n\t" // y7
+
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+
+ "xvmaddadp %x4, 44, 32 \n\t"
+ "xvmaddadp %x5, 45, 32 \n\t"
+ "xvmaddadp %x6, 46, 32 \n\t"
+ "xvmaddadp %x7, 47, 32 \n\t"
+
+ "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "xvmaddadp 49, %x9, 33 \n\t"
+ "xvmaddadp 50, %x10, 33 \n\t"
+ "xvmaddadp 51, %x11, 33 \n\t"
+
+ "xvmaddadp %x4, %x12, 33 \n\t"
+ "xvmaddadp %x5, %x13, 33 \n\t"
+ "xvmaddadp %x6, %x14, 33 \n\t"
+ "xvmaddadp %x7, %x15, 33 \n\t"
+
+ "stxvd2x 48, 0, %16 \n\t"
+ "stxvd2x 49, %22, %16 \n\t"
+ "stxvd2x 50, %23, %16 \n\t"
+ "stxvd2x 51, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "stxvd2x %x4, 0, %16 \n\t"
+ "stxvd2x %x5, %22, %16 \n\t"
+ "stxvd2x %x6, %23, %16 \n\t"
+ "stxvd2x %x7, %24, %16 \n"
+
+ "#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15"
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3), // 7
+ "=wa" (t4), // 8
+ "=wa" (t5), // 9
+ "=wa" (t6), // 10
+ "=wa" (t7), // 11
+ "=wa" (t8), // 12
+ "=wa" (t9), // 13
+ "=wa" (t10), // 14
+ "=wa" (t11), // 15
+ "=b" (ytmp) // 16
+ :
+ "m" (*x),
+ "m" (*mvecp),
+ "d" (alpha_r), // 19
+ "d" (alpha_i), // 20
+ "16" (mvecp), // 21
+ "b" (16), // 22
+ "b" (32), // 23
+ "b" (48) // 24
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c
index 73abe08..5ca34b6 100644
--- a/kernel/power/zcopy_microk_power8.c
+++ b/kernel/power/zcopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c
index bc1a95e..b83f832 100644
--- a/kernel/power/zdot.c
+++ b/kernel/power/zdot.c
@@ -43,8 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
-
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
BLASLONG register i = 0;
diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c
index 296d3d4..71078b6 100644
--- a/kernel/power/zdot_microk_power8.c
+++ b/kernel/power/zdot_microk_power8.c
@@ -34,186 +34,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %8 \n\t"
- "dcbt %3, %8 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
- "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
- "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
- "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
- "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
- "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
- "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
-
- "xxswapd 52,48 \n\t" // y0_i, y0_r
- "xxswapd 53,49 \n\t" // y1_i, y1_r
- "xxswapd 54,50 \n\t" // y2_i, y2_r
- "xxswapd 55,51 \n\t" // y3_i, y3_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
-
- "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
- "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
- "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
- "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
- "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
- "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
- "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
-
- "xxswapd 60,56 \n\t" // y0_i, y0_r
- "xxswapd 61,57 \n\t" // y1_i, y1_r
- "xxswapd 62,58 \n\t" // y2_i, y2_r
- "xxswapd 63,59 \n\t" // y3_i, y3_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %8 \n\t"
- "dcbt %3, %8 \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
- "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
- "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
- "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
-
- "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
- "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
- "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
- "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
-
- "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
- "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
-
- "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
- "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
- "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
- "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
-
- "xxswapd 52,48 \n\t" // y0_i, y0_r
- "xxswapd 53,49 \n\t" // y1_i, y1_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "xxswapd 54,50 \n\t" // y2_i, y2_r
- "xxswapd 55,51 \n\t" // y3_i, y3_r
-
- "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
- "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
- "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
- "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
- "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
- "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
- "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
- "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
-
- "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
- "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
- "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
- "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
- "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
- "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
- "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
- "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
-
- "xxswapd 60,56 \n\t" // y0_i, y0_r
- "xxswapd 61,57 \n\t" // y1_i, y1_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "xxswapd 62,58 \n\t" // y2_i, y2_r
- "xxswapd 63,59 \n\t" // y3_i, y3_r
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
- "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
- "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
- "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
-
- "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
- "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
- "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
- "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
-
- "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
- "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
- "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
- "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
-
- "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
- "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
- "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
- "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
-
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 33, 33, 35 \n\t"
- "xvadddp 37, 37, 39 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
- "xvadddp 33, 33, 37 \n\t"
-
- "stxvd2x 32, 0, %4 \n\t"
- "stxvd2x 33, %5, %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (dot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (pre) // 8
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 0, 48 \n\t" // y0_i, y0_r
+ "xxswapd 1, 49 \n\t" // y1_i, y1_r
+ "xxswapd 2, 50 \n\t" // y2_i, y2_r
+ "xxswapd 3, 51 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 8, 4 \n\t" // y0_i, y0_r
+ "xxswapd 9, 5 \n\t" // y1_i, y1_r
+ "xxswapd 10, 6 \n\t" // y2_i, y2_r
+ "xxswapd 11, 7 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
+
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
+
+ "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 0,48 \n\t" // y0_i, y0_r
+ "xxswapd 1,49 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 2,50 \n\t" // y2_i, y2_r
+ "xxswapd 3,51 \n\t" // y3_i, y3_r
+
+ "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
+ "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
+ "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 8,4 \n\t" // y0_i, y0_r
+ "xxswapd 9,5 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 10,6 \n\t" // y2_i, y2_r
+ "xxswapd 11,7 \n\t" // y3_i, y3_r
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+ "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 33, 33, 35 \n\t"
+ "xvadddp 37, 37, 39 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+ "xvadddp 33, 33, 37 \n\t"
+
+ "stxvd2x 32, 0, %6 \n\t"
+ "stxvd2x 33, %7, %6 \n"
+
+ "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6 o16=%7 o32=%8 o48=%9"
+ :
+ "=m" (*dot),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (dot), // 6
+ "b" (16), // 7
+ "b" (32), // 8
+ "b" (48) // 9
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 410fc98..14d677f 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -47,15 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha)
+static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT da_r, FLOAT da_i)
{
BLASLONG i=0;
FLOAT *x1=x;
- FLOAT alpha_r1=alpha[0];
- FLOAT alpha_r2=alpha[1];
- FLOAT alpha_i1=alpha[2];
- FLOAT alpha_i2=alpha[3];
+ FLOAT alpha_r1=da_r;
+ FLOAT alpha_r2=da_r;
+ FLOAT alpha_i1=-da_i;
+ FLOAT alpha_i2=da_i;
FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31;
FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i;
@@ -116,7 +116,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
- FLOAT alpha[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
if ( n <= 0 )
@@ -147,11 +146,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
n1 = n & -8;
if ( n1 > 0 )
{
- alpha[0] = da_r;
- alpha[1] = da_r;
- alpha[2] = -da_i;
- alpha[3] = da_i;
- zscal_kernel_8(n1, x, alpha);
+ zscal_kernel_8(n1, x, da_r, da_i);
i=n1;
ip = n1 * 2;
diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c
index 5e09d8d..aba9029 100644
--- a/kernel/power/zscal_microk_power8.c
+++ b/kernel/power/zscal_microk_power8.c
@@ -38,187 +38,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline));
-
-static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
+static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r
- "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i
- "addi %1, %1, -8 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "xxswapd 56, 40 \n\t"
- "xxswapd 57, 41 \n\t"
- "xxswapd 58, 42 \n\t"
- "xxswapd 59, 43 \n\t"
- "xxswapd 60, 44 \n\t"
- "xxswapd 61, 45 \n\t"
- "xxswapd 62, 46 \n\t"
- "xxswapd 63, 47 \n\t"
-
- "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
- "xvmuldp 57, 57, 33 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvmuldp 58, 58, 33 \n\t"
- "xvmuldp 59, 59, 33 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvmuldp 60, 60, 33 \n\t"
- "xvmuldp 61, 61, 33 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvmuldp 62, 62, 33 \n\t"
- "xvmuldp 63, 63, 33 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 48, 48 , 56 \n\t"
- "xvadddp 49, 49 , 57 \n\t"
- "xvadddp 50, 50 , 58 \n\t"
- "xvadddp 51, 51 , 59 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
-
- "xvadddp 52, 52 , 60 \n\t"
- "xvadddp 53, 53 , 61 \n\t"
-
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
-
- "xvadddp 54, 54 , 62 \n\t"
- "xvadddp 55, 55 , 63 \n\t"
-
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "xxswapd 56, 40 \n\t"
- "xxswapd 57, 41 \n\t"
- "xxswapd 58, 42 \n\t"
- "xxswapd 59, 43 \n\t"
- "xxswapd 60, 44 \n\t"
- "xxswapd 61, 45 \n\t"
- "xxswapd 62, 46 \n\t"
- "xxswapd 63, 47 \n\t"
-
- "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
- "xvmuldp 57, 57, 33 \n\t"
- "xvmuldp 58, 58, 33 \n\t"
- "xvmuldp 59, 59, 33 \n\t"
- "xvmuldp 60, 60, 33 \n\t"
- "xvmuldp 61, 61, 33 \n\t"
- "xvmuldp 62, 62, 33 \n\t"
- "xvmuldp 63, 63, 33 \n\t"
-
- "xvadddp 48, 48 , 56 \n\t"
- "xvadddp 49, 49 , 57 \n\t"
- "xvadddp 50, 50 , 58 \n\t"
- "xvadddp 51, 51 , 59 \n\t"
- "xvadddp 52, 52 , 60 \n\t"
- "xvadddp 53, 53 , 61 \n\t"
- "xvadddp 54, 54 , 62 \n\t"
- "xvadddp 55, 55 , 63 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xsnegdp 33, %x16 \n\t" // -alpha_i
+ "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
+ "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %17, %2 \n\t"
+ "lxvd2x 42, %18, %2 \n\t"
+ "lxvd2x 43, %19, %2 \n\t"
+ "lxvd2x 44, %20, %2 \n\t"
+ "lxvd2x 45, %21, %2 \n\t"
+ "lxvd2x 46, %22, %2 \n\t"
+ "lxvd2x 47, %23, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp %x3, 44, 32 \n\t"
+ "xvmuldp %x4, 45, 32 \n\t"
+ "xvmuldp %x5, 46, 32 \n\t"
+ "xvmuldp %x6, 47, 32 \n\t"
+
+ "xxswapd %x7, 40 \n\t"
+ "xxswapd %x8, 41 \n\t"
+ "xxswapd %x9, 42 \n\t"
+ "xxswapd %x10, 43 \n\t"
+ "xxswapd %x11, 44 \n\t"
+ "xxswapd %x12, 45 \n\t"
+ "xxswapd %x13, 46 \n\t"
+ "xxswapd %x14, 47 \n\t"
+
+ "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp %x8, %x8, 33 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %17, %2 \n\t"
+
+ "xvmuldp %x9, %x9, 33 \n\t"
+ "xvmuldp %x10, %x10, 33 \n\t"
+
+ "lxvd2x 42, %18, %2 \n\t"
+ "lxvd2x 43, %19, %2 \n\t"
+
+ "xvmuldp %x11, %x11, 33 \n\t"
+ "xvmuldp %x12, %x12, 33 \n\t"
+
+ "lxvd2x 44, %20, %2 \n\t"
+ "lxvd2x 45, %21, %2 \n\t"
+
+ "xvmuldp %x13, %x13, 33 \n\t"
+ "xvmuldp %x14, %x14, 33 \n\t"
+
+ "lxvd2x 46, %22, %2 \n\t"
+ "lxvd2x 47, %23, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvadddp 48, 48, %x7 \n\t"
+ "xvadddp 49, 49, %x8 \n\t"
+ "xvadddp 50, 50, %x9 \n\t"
+ "xvadddp 51, 51, %x10 \n\t"
+
+ "stxvd2x 48, 0, %2 \n\t"
+ "stxvd2x 49, %17, %2 \n\t"
+
+ "xvadddp %x3, %x3, %x11 \n\t"
+ "xvadddp %x4, %x4, %x12 \n\t"
+
+ "stxvd2x 50, %18, %2 \n\t"
+ "stxvd2x 51, %19, %2 \n\t"
+
+ "xvadddp %x5, %x5, %x13 \n\t"
+ "xvadddp %x6, %x6, %x14 \n\t"
+
+ "stxvd2x %x3, %20, %2 \n\t"
+ "stxvd2x %x4, %21, %2 \n\t"
+ "stxvd2x %x5, %22, %2 \n\t"
+ "stxvd2x %x6, %23, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp %x3, 44, 32 \n\t"
+ "xvmuldp %x4, 45, 32 \n\t"
+ "xvmuldp %x5, 46, 32 \n\t"
+ "xvmuldp %x6, 47, 32 \n\t"
+
+ "xxswapd %x7, 40 \n\t"
+ "xxswapd %x8, 41 \n\t"
+ "xxswapd %x9, 42 \n\t"
+ "xxswapd %x10, 43 \n\t"
+ "xxswapd %x11, 44 \n\t"
+ "xxswapd %x12, 45 \n\t"
+ "xxswapd %x13, 46 \n\t"
+ "xxswapd %x14, 47 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp %x8, %x8, 33 \n\t"
+ "xvmuldp %x9, %x9, 33 \n\t"
+ "xvmuldp %x10, %x10, 33 \n\t"
+ "xvmuldp %x11, %x11, 33 \n\t"
+ "xvmuldp %x12, %x12, 33 \n\t"
+ "xvmuldp %x13, %x13, 33 \n\t"
+ "xvmuldp %x14, %x14, 33 \n\t"
+
+ "xvadddp 48, 48, %x7 \n\t"
+ "xvadddp 49, 49, %x8 \n\t"
+ "xvadddp 50, 50, %x9 \n\t"
+ "xvadddp 51, 51, %x10 \n\t"
+
+ "stxvd2x 48, 0, %2 \n\t"
+ "stxvd2x 49, %17, %2 \n\t"
+
+ "xvadddp %x3, %x3, %x11 \n\t"
+ "xvadddp %x4, %x4, %x12 \n\t"
+
+ "stxvd2x 50, %18, %2 \n\t"
+ "stxvd2x 51, %19, %2 \n\t"
+
+ "xvadddp %x5, %x5, %x13 \n\t"
+ "xvadddp %x6, %x6, %x14 \n\t"
+
+ "stxvd2x %x3, %20, %2 \n\t"
+ "stxvd2x %x4, %21, %2 \n\t"
+ "stxvd2x %x5, %22, %2 \n\t"
+ "stxvd2x %x6, %23, %2 \n"
+
+ "#n=%1 x=%0=%2 alpha=(%15,%16) o16=%17 o32=%18 o48=%19 o64=%20 o80=%21 o96=%22 o112=%23\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6 t4=%x7 t5=%x8 t6=%x9 t7=%x10 t8=%x11 t9=%x12 t10=%x13 t11=%x14"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3), // 6
+ "=wa" (t4), // 7
+ "=wa" (t5), // 8
+ "=wa" (t6), // 9
+ "=wa" (t7), // 10
+ "=wa" (t8), // 11
+ "=wa" (t9), // 12
+ "=wa" (t10), // 13
+ "=wa" (t11) // 14
+ :
+ "d" (alpha_r), // 15
+ "d" (alpha_i), // 16
+ "b" (16), // 17
+ "b" (32), // 18
+ "b" (48), // 19
+ "b" (64), // 20
+ "b" (80), // 21
+ "b" (96), // 22
+ "b" (112) // 23
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
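
For reference while reading the hunk above: the xvmuldp/xxswapd/xvadddp sequence implements an ordinary complex scale, x[i] *= (alpha_r + alpha_i*I), eight double-complex elements per loop iteration. A minimal scalar sketch of the same computation (illustrative only, not part of the patch; the name zscal_reference is hypothetical):

static void
zscal_reference (long n, double alpha_r, double alpha_i, double *x)
{
  /* x holds n complex doubles stored as (real, imag) pairs.  */
  for (long i = 0; i < n; i++)
    {
      double xr = x[2 * i];     /* x_r */
      double xi = x[2 * i + 1]; /* x_i */
      x[2 * i]     = xr * alpha_r - xi * alpha_i; /* new real part      */
      x[2 * i + 1] = xi * alpha_r + xr * alpha_i; /* new imaginary part */
    }
}

Note how the rewritten asm ties its twelve temporaries t0..t11 to "=wa" outputs (%x3..%x14), letting the compiler choose those VSX registers, so only the hard-coded vs32-vs51 need to appear in the clobber list.
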
diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c
index 9e56237..54391ba 100644
--- a/kernel/power/zswap_microk_power8.c
+++ b/kernel/power/zswap_microk_power8.c
@@ -35,146 +35,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+static void
+zswap_kernel_16 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -8 \n\t"
- "addi %4, %4, -8 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvd2x 32, 0, %2 \n\t"
- "lxvd2x 33, %5, %2 \n\t"
- "lxvd2x 34, %6, %2 \n\t"
- "lxvd2x 35, %7, %2 \n\t"
- "lxvd2x 36, %8, %2 \n\t"
- "lxvd2x 37, %9, %2 \n\t"
- "lxvd2x 38, %10, %2 \n\t"
- "lxvd2x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 48, 0, %1 \n\t"
- "lxvd2x 49, %5, %1 \n\t"
- "lxvd2x 50, %6, %1 \n\t"
- "lxvd2x 51, %7, %1 \n\t"
- "lxvd2x 52, %8, %1 \n\t"
- "lxvd2x 53, %9, %1 \n\t"
- "lxvd2x 54, %10, %1 \n\t"
- "lxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvd2x 56, 0, %1 \n\t"
- "lxvd2x 57, %5, %1 \n\t"
- "lxvd2x 58, %6, %1 \n\t"
- "lxvd2x 59, %7, %1 \n\t"
- "lxvd2x 60, %8, %1 \n\t"
- "lxvd2x 61, %9, %1 \n\t"
- "lxvd2x 62, %10, %1 \n\t"
- "lxvd2x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 32, 0, %3 \n\t"
- "stxvd2x 33, %5, %3 \n\t"
- "stxvd2x 34, %6, %3 \n\t"
- "stxvd2x 35, %7, %3 \n\t"
- "stxvd2x 36, %8, %3 \n\t"
- "stxvd2x 37, %9, %3 \n\t"
- "stxvd2x 38, %10, %3 \n\t"
- "stxvd2x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 40, 0, %3 \n\t"
- "stxvd2x 41, %5, %3 \n\t"
- "stxvd2x 42, %6, %3 \n\t"
- "stxvd2x 43, %7, %3 \n\t"
- "stxvd2x 44, %8, %3 \n\t"
- "stxvd2x 45, %9, %3 \n\t"
- "stxvd2x 46, %10, %3 \n\t"
- "stxvd2x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 48, 0, %4 \n\t"
- "stxvd2x 49, %5, %4 \n\t"
- "stxvd2x 50, %6, %4 \n\t"
- "stxvd2x 51, %7, %4 \n\t"
- "stxvd2x 52, %8, %4 \n\t"
- "stxvd2x 53, %9, %4 \n\t"
- "stxvd2x 54, %10, %4 \n\t"
- "stxvd2x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvd2x 56, 0, %4 \n\t"
- "stxvd2x 57, %5, %4 \n\t"
- "stxvd2x 58, %6, %4 \n\t"
- "stxvd2x 59, %7, %4 \n\t"
- "stxvd2x 60, %8, %4 \n\t"
- "stxvd2x 61, %9, %4 \n\t"
- "stxvd2x 62, %10, %4 \n\t"
- "stxvd2x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+ "addic. %2, %2, -16 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
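
In scalar terms the rewritten zswap_kernel_16 above is simply an element-wise exchange of the two arrays, sixteen double-complex elements per loop iteration. A minimal sketch (illustrative only, not part of the patch; zswap_reference is a hypothetical name):

static void
zswap_reference (long n, double *x, double *y)
{
  /* n complex doubles = 2*n doubles per array.  */
  for (long i = 0; i < 2 * n; i++)
    {
      double t = x[i];
      x[i] = y[i];
      y[i] = t;
    }
}

Because the arrays are tied to the asm through the "+m" (*x) and "+m" (*y) operands and every VSX register the code touches is listed as clobbered, the new statement no longer carries the __volatile__ qualifier, the blanket "memory" clobber, or the noinline attribute that the removed version needed.
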