openblas/openblas-0.2.19-fix_register_clobbers.patch
2017-02-13 22:49:31 +01:00


From 1e70600316ab080d80e318f32868c12eb7d1f2da Mon Sep 17 00:00:00 2001
From: Alan Modra <amodra@gmail.com>
Date: Thu, 9 Feb 2017 08:41:51 +1030
Subject: [PATCH] Fix power8 asm()

Lots of issues here.
- The vsx regs weren't listed as clobbered.
- Poor choice of vsx regs, which along with the lack of clobbers led to
trashing v0..v21 and fr14..fr23. Ideally you'd let gcc choose all
temp vsx regs, but asms currently have a limit of 30 i/o parms.
- Other regs were clobbered unnecessarily, seemingly in an attempt to
clobber inputs, with gcc-7 complaining about the clobber of r2.
  (Changed inputs should also be listed as outputs or as an i/o.)
- "r" constraint used instead of "b" for gprs used in insns where the
r0 encoding means zero rather than r0.
- There were unused asm inputs too.
- All memory was clobbered rather than hooking up memory outputs with
proper memory constraints, and that and the lack of proper memory
input constraints meant the asms needed to be volatile and their
containing function noinline.
- Some parameters were being passed unnecessarily via memory.
- When a copy of a pointer input parm was needed, the value passed to
the asm was incremented in C and decremented in asm, rather than
using i/o parms, an early clobber constraint, or a temp output reg
copied in the asm. In most cases a small change to assembly could
be made that obviated the need for the extra pointer.
- A number of functions did not compute the final sum or dot-product
in assembly, instead using scalar code in C.
- dcbt was bogus.
I've also fixed formatting of the asm.
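
For readers less familiar with gcc extended asm, the sketch below is not
part of the patch; it is a minimal, hypothetical kernel (sum of n doubles,
n assumed to be a positive multiple of 2) written in the style the patch
converts the kernels to: the counter and pointer are "+" input/output
operands, the pointer gets the "b" constraint so gcc never picks r0 where
the instruction would read that encoding as zero, the memory read is
described with an "m" input instead of a blanket "memory" clobber, the
scalar result comes back through a register output rather than a store
through a pointer, and only the VSX registers actually touched are listed
as clobbers.  With real outputs and memory operands the asm no longer
needs to be volatile and the containing function no longer needs to be
noinline.

static double sum2_kernel (long n, const double *x)
{
  double sum;

  __asm__
    (
     "xxlxor  32, 32, 32   \n\t"   // vs32 = running sum = 0
     "1:                   \n\t"
     "lxvd2x  33, 0, %2    \n\t"   // load x[0], x[1]
     "addi    %2, %2, 16   \n\t"   // x += 2 doubles
     "xvadddp 32, 32, 33   \n\t"   // accumulate both lanes
     "addic.  %1, %1, -2   \n\t"   // n -= 2, set cr0
     "bgt     1b           \n\t"
     "xxswapd 33, 32       \n\t"   // swap the two lanes
     "xsadddp %x0, 32, 33  \n"     // sum = lane0 + lane1
     :
       "=d" (sum),   // 0: scalar result, returned in an FPR
       "+r" (n),     // 1: counter, read and written
       "+b" (x)      // 2: pointer; "b" because addi reads r0 as zero
     :
       "m" (*x)      // the asm reads *x (same idiom as the patch)
     :
       "cr0", "vs32", "vs33"
    );

  return sum;
}
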
diff --git a/kernel/power/casum.c b/kernel/power/casum.c
index aeed0ca..d110858 100644
--- a/kernel/power/casum.c
+++ b/kernel/power/casum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT casum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void casum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -106,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -119,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- casum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = casum_kernel_16(n1, x);
i=n1;
ip = 2 * n1;
}
diff --git a/kernel/power/casum_microk_power8.c b/kernel/power/casum_microk_power8.c
index cb50234..38a1143 100644
--- a/kernel/power/casum_microk_power8.c
+++ b/kernel/power/casum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void casum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float casum_kernel_16 (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
-
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "xvaddsp 32, 32, 33 \n\t"
- "xvaddsp 34, 34, 35 \n\t"
- "xvaddsp 36, 36, 37 \n\t"
- "xvaddsp 38, 38, 39 \n\t"
-
- "xvaddsp 32, 32, 34 \n\t"
- "xvaddsp 36, 36, 38 \n\t"
-
- "xvaddsp 32, 32, 36 \n\t"
-
-
- "stxvw4x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -16 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %0, 32 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=f" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/ccopy_microk_power8.c b/kernel/power/ccopy_microk_power8.c
index 95b3559..b2b1bea 100644
--- a/kernel/power/ccopy_microk_power8.c
+++ b/kernel/power/ccopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void ccopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void ccopy_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 50, 0, %2 \n\t"
- "lxvw4x 51, %5, %2 \n\t"
- "lxvw4x 52, %6, %2 \n\t"
- "lxvw4x 53, %7, %2 \n\t"
- "lxvw4x 54, %8, %2 \n\t"
- "lxvw4x 55, %9, %2 \n\t"
- "lxvw4x 56, %10, %2 \n\t"
- "lxvw4x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvw4x 50, 0, %1 \n\t"
- "stxvw4x 51, %5, %1 \n\t"
- "lxvw4x 50, 0, %2 \n\t"
- "lxvw4x 51, %5, %2 \n\t"
- "stxvw4x 52, %6, %1 \n\t"
- "stxvw4x 53, %7, %1 \n\t"
- "lxvw4x 52, %6, %2 \n\t"
- "lxvw4x 53, %7, %2 \n\t"
- "stxvw4x 54, %8, %1 \n\t"
- "stxvw4x 55, %9, %1 \n\t"
- "lxvw4x 54, %8, %2 \n\t"
- "lxvw4x 55, %9, %2 \n\t"
- "stxvw4x 56, %10, %1 \n\t"
- "stxvw4x 57, %11, %1 \n\t"
- "lxvw4x 56, %10, %2 \n\t"
- "lxvw4x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 50, 0, %1 \n\t"
- "stxvw4x 51, %5, %1 \n\t"
- "stxvw4x 52, %6, %1 \n\t"
- "stxvw4x 53, %7, %1 \n\t"
- "stxvw4x 54, %8, %1 \n\t"
- "stxvw4x 55, %9, %1 \n\t"
- "stxvw4x 56, %10, %1 \n\t"
- "stxvw4x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %5, %2 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "lxvw4x 34, %6, %2 \n\t"
+ "lxvw4x 35, %7, %2 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "lxvw4x 36, %8, %2 \n\t"
+ "lxvw4x 37, %9, %2 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+ "lxvw4x 38, %10, %2 \n\t"
+ "lxvw4x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/cswap_microk_power8.c b/kernel/power/cswap_microk_power8.c
index 90ab59c..1dd03dc 100644
--- a/kernel/power/cswap_microk_power8.c
+++ b/kernel/power/cswap_microk_power8.c
@@ -35,146 +35,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void cswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void cswap_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -4 \n\t"
- "addi %4, %4, -4 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 48, 0, %1 \n\t"
- "lxvw4x 49, %5, %1 \n\t"
- "lxvw4x 50, %6, %1 \n\t"
- "lxvw4x 51, %7, %1 \n\t"
- "lxvw4x 52, %8, %1 \n\t"
- "lxvw4x 53, %9, %1 \n\t"
- "lxvw4x 54, %10, %1 \n\t"
- "lxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvw4x 56, 0, %1 \n\t"
- "lxvw4x 57, %5, %1 \n\t"
- "lxvw4x 58, %6, %1 \n\t"
- "lxvw4x 59, %7, %1 \n\t"
- "lxvw4x 60, %8, %1 \n\t"
- "lxvw4x 61, %9, %1 \n\t"
- "lxvw4x 62, %10, %1 \n\t"
- "lxvw4x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 40, 0, %3 \n\t"
- "stxvw4x 41, %5, %3 \n\t"
- "stxvw4x 42, %6, %3 \n\t"
- "stxvw4x 43, %7, %3 \n\t"
- "stxvw4x 44, %8, %3 \n\t"
- "stxvw4x 45, %9, %3 \n\t"
- "stxvw4x 46, %10, %3 \n\t"
- "stxvw4x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 52, %8, %4 \n\t"
- "stxvw4x 53, %9, %4 \n\t"
- "stxvw4x 54, %10, %4 \n\t"
- "stxvw4x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvw4x 56, 0, %4 \n\t"
- "stxvw4x 57, %5, %4 \n\t"
- "stxvw4x 58, %6, %4 \n\t"
- "stxvw4x 59, %7, %4 \n\t"
- "stxvw4x 60, %8, %4 \n\t"
- "stxvw4x 61, %9, %4 \n\t"
- "stxvw4x 62, %10, %4 \n\t"
- "stxvw4x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %4 \n\t"
+ "lxvw4x 33, %5, %4 \n\t"
+ "lxvw4x 34, %6, %4 \n\t"
+ "lxvw4x 35, %7, %4 \n\t"
+ "lxvw4x 36, %8, %4 \n\t"
+ "lxvw4x 37, %9, %4 \n\t"
+ "lxvw4x 38, %10, %4 \n\t"
+ "lxvw4x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvw4x 40, 0, %4 \n\t"
+ "lxvw4x 41, %5, %4 \n\t"
+ "lxvw4x 42, %6, %4 \n\t"
+ "lxvw4x 43, %7, %4 \n\t"
+ "lxvw4x 44, %8, %4 \n\t"
+ "lxvw4x 45, %9, %4 \n\t"
+ "lxvw4x 46, %10, %4 \n\t"
+ "lxvw4x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvw4x 48, 0, %3 \n\t"
+ "lxvw4x 49, %5, %3 \n\t"
+ "lxvw4x 50, %6, %3 \n\t"
+ "lxvw4x 51, %7, %3 \n\t"
+ "lxvw4x 0, %8, %3 \n\t"
+ "lxvw4x 1, %9, %3 \n\t"
+ "lxvw4x 2, %10, %3 \n\t"
+ "lxvw4x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvw4x 4, 0, %3 \n\t"
+ "lxvw4x 5, %5, %3 \n\t"
+ "lxvw4x 6, %6, %3 \n\t"
+ "lxvw4x 7, %7, %3 \n\t"
+ "lxvw4x 8, %8, %3 \n\t"
+ "lxvw4x 9, %9, %3 \n\t"
+ "lxvw4x 10, %10, %3 \n\t"
+ "lxvw4x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 48, 0, %4 \n\t"
+ "stxvw4x 49, %5, %4 \n\t"
+ "stxvw4x 50, %6, %4 \n\t"
+ "stxvw4x 51, %7, %4 \n\t"
+ "stxvw4x 0, %8, %4 \n\t"
+ "stxvw4x 1, %9, %4 \n\t"
+ "stxvw4x 2, %10, %4 \n\t"
+ "stxvw4x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvw4x 4, 0, %4 \n\t"
+ "stxvw4x 5, %5, %4 \n\t"
+ "stxvw4x 6, %6, %4 \n\t"
+ "stxvw4x 7, %7, %4 \n\t"
+ "stxvw4x 8, %8, %4 \n\t"
+ "stxvw4x 9, %9, %4 \n\t"
+ "stxvw4x 10, %10, %4 \n\t"
+ "stxvw4x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/dasum.c b/kernel/power/dasum.c
index 77f5345..73962c2 100644
--- a/kernel/power/dasum.c
+++ b/kernel/power/dasum.c
@@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#else
-#define ABS fabsf
+#error supports double only
#endif
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT dasum_kernel_16(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,9 +92,7 @@ static void dasum_kernel_16(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -103,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
- FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf);
@@ -115,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- dasum_kernel_16(n1, x, svec);
- sumf = svec[0] + svec[1];
+ sumf = dasum_kernel_16(n1, x);
i=n1;
}
diff --git a/kernel/power/dasum_microk_power8.c b/kernel/power/dasum_microk_power8.c
index cc38c4f..880d7d2 100644
--- a/kernel/power/dasum_microk_power8.c
+++ b/kernel/power/dasum_microk_power8.c
@@ -34,144 +34,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void dasum_kernel_16( BLASLONG n, FLOAT *x, FLOAT *svec)
+static double dasum_kernel_16 (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -16 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
-
- "stxvd2x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
+ double sum;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -16 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+ "xsadddp %x0, 32, 33 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=d" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/daxpy.c b/kernel/power/daxpy.c
index 4365bd8..df0572e 100644
--- a/kernel/power/daxpy.c
+++ b/kernel/power/daxpy.c
@@ -43,21 +43,20 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT alpha)
{
BLASLONG register i = 0;
- FLOAT a = *alpha;
while(i < n)
{
- y[i] += a * x[i];
- y[i+1] += a * x[i+1];
- y[i+2] += a * x[i+2];
- y[i+3] += a * x[i+3];
- y[i+4] += a * x[i+4];
- y[i+5] += a * x[i+5];
- y[i+6] += a * x[i+6];
- y[i+7] += a * x[i+7];
+ y[i] += alpha * x[i];
+ y[i+1] += alpha * x[i+1];
+ y[i+2] += alpha * x[i+2];
+ y[i+3] += alpha * x[i+3];
+ y[i+4] += alpha * x[i+4];
+ y[i+5] += alpha * x[i+5];
+ y[i+6] += alpha * x[i+6];
+ y[i+7] += alpha * x[i+7];
i+=8 ;
}
@@ -70,11 +69,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT a2[4];
- a2[0]=da;
- a2[1]=da;
- a2[2]=da;
- a2[3]=da;
if ( n <= 0 ) return(0);
@@ -84,7 +78,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 )
- daxpy_kernel_8(n1, x, y , a2 );
+ daxpy_kernel_8(n1, x, y, da);
i = n1;
while(i < n)
diff --git a/kernel/power/daxpy_microk_power8.c b/kernel/power/daxpy_microk_power8.c
index bb3f73a..fb714a3 100644
--- a/kernel/power/daxpy_microk_power8.c
+++ b/kernel/power/daxpy_microk_power8.c
@@ -35,167 +35,183 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
-static void daxpy_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void daxpy_kernel_8 (long n, double *x, double *y, double alpha)
{
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+ __vector double t12;
+ __vector double t13;
+ __vector double t14;
+ __vector double t15;
+ __vector double t16;
+ __asm__
+ (
+ "xxspltd %x4, %x22, 0 \n\t"
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
- __asm__ __volatile__
- (
+ "lxvd2x %x5, 0, %2 \n\t"
+ "lxvd2x %x6, %23, %2 \n\t"
+ "lxvd2x %x7, %24, %2 \n\t"
+ "lxvd2x %x8, %25, %2 \n\t"
- "lxsdx 33, %5, %4 \n\t"
- "xxspltd 32, 33, 0 \n\t"
- "addi %8, %8, -8 \n\t"
+ "lxvd2x %x13, 0, %3 \n\t"
+ "lxvd2x %x14, %23, %3 \n\t"
+ "lxvd2x %x15, %24, %3 \n\t"
+ "lxvd2x %x16, %25, %3 \n\t"
- "dcbt %2, %9 \n\t"
- "dcbt %3, %9 \n\t"
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x %x9, 0, %2 \n\t"
+ "lxvd2x %x10, %23, %2 \n\t"
+ "lxvd2x %x11, %24, %2 \n\t"
+ "lxvd2x %x12, %25, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t"
- "lxvd2x 45, %5, %2 \n\t"
- "lxvd2x 46, %6, %2 \n\t"
- "lxvd2x 47, %7, %2 \n\t"
-
- "lxvd2x 52, 0, %3 \n\t"
- "lxvd2x 53, %5, %3 \n\t"
- "lxvd2x 54, %6, %3 \n\t"
- "lxvd2x 55, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %9 \n\t"
- "dcbt %3, %9 \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t"
- "xvmaddadp 49, 41, 32 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
-
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %8, %8, 64 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "addi %3, %3, 64 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t"
- "lxvd2x 45, %5, %2 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
-
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "lxvd2x 46, %6, %2 \n\t"
- "lxvd2x 47, %7, %2 \n\t"
-
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %2, %2, 64 \n\t"
- "addi %8, %8, 64 \n\t"
-
- "lxvd2x 52, 0, %3 \n\t"
- "lxvd2x 53, %5, %3 \n\t"
- "lxvd2x 54, %6, %3 \n\t"
- "lxvd2x 55, %7, %3 \n\t"
-
- "addi %3, %3, 64 \n\t"
-
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvmaddadp 48, 40, 32 \n\t"
- "xvmaddadp 49, 41, 32 \n\t"
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (alpha), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (y2), // 8
- "r" (pre) // 9
- : "cr0", "%0", "%2" , "%3", "%8", "memory"
- );
-
-}
+ "lxvd2x %x17, 0, %3 \n\t"
+ "lxvd2x %x18, %23, %3 \n\t"
+ "lxvd2x %x19, %24, %3 \n\t"
+ "lxvd2x %x20, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, -64 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp %x13, %x5, %x4 \n\t"
+ "xvmaddadp %x14, %x6, %x4 \n\t"
+
+ "lxvd2x %x5, 0, %2 \n\t"
+ "lxvd2x %x6, %23, %2 \n\t"
+
+ "stxvd2x %x13, 0, %3 \n\t"
+ "stxvd2x %x14, %23, %3 \n\t"
+
+ "xvmaddadp %x15, %x7, %x4 \n\t"
+ "xvmaddadp %x16, %x8, %x4 \n\t"
+
+ "lxvd2x %x7, %24, %2 \n\t"
+ "lxvd2x %x8, %25, %2 \n\t"
+
+ "stxvd2x %x15, %24, %3 \n\t"
+ "stxvd2x %x16, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x %x13, 0, %3 \n\t"
+ "lxvd2x %x14, %23, %3 \n\t"
+ "lxvd2x %x15, %24, %3 \n\t"
+ "lxvd2x %x16, %25, %3 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+
+ "xvmaddadp %x17, %x9, %x4 \n\t"
+ "xvmaddadp %x18, %x10, %x4 \n\t"
+
+ "lxvd2x %x9, 0, %2 \n\t"
+ "lxvd2x %x10, %23, %2 \n\t"
+
+ "stxvd2x %x17, 0, %3 \n\t"
+ "stxvd2x %x18, %23, %3 \n\t"
+
+ "xvmaddadp %x19, %x11, %x4 \n\t"
+ "xvmaddadp %x20, %x12, %x4 \n\t"
+
+ "lxvd2x %x11, %24, %2 \n\t"
+ "lxvd2x %x12, %25, %2 \n\t"
+
+ "stxvd2x %x19, %24, %3 \n\t"
+ "stxvd2x %x20, %25, %3 \n\t"
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x %x17, 0, %3 \n\t"
+ "lxvd2x %x18, %23, %3 \n\t"
+ "lxvd2x %x19, %24, %3 \n\t"
+ "lxvd2x %x20, %25, %3 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp %x13, %x5, %x4 \n\t"
+ "xvmaddadp %x14, %x6, %x4 \n\t"
+ "xvmaddadp %x15, %x7, %x4 \n\t"
+ "xvmaddadp %x16, %x8, %x4 \n\t"
+
+ "xvmaddadp %x17, %x9, %x4 \n\t"
+ "xvmaddadp %x18, %x10, %x4 \n\t"
+ "xvmaddadp %x19, %x11, %x4 \n\t"
+ "xvmaddadp %x20, %x12, %x4 \n\t"
+
+ "stxvd2x %x13, 0, %3 \n\t"
+ "stxvd2x %x14, %23, %3 \n\t"
+ "stxvd2x %x15, %24, %3 \n\t"
+ "stxvd2x %x16, %25, %3 \n\t"
+
+ "addi %3, %3, 64 \n\t"
+
+ "stxvd2x %x17, 0, %3 \n\t"
+ "stxvd2x %x18, %23, %3 \n\t"
+ "stxvd2x %x19, %24, %3 \n\t"
+ "stxvd2x %x20, %25, %3 \n"
+
+ "#n=%1 x=%21=%2 y=%0=%3 alpha=%22 o16=%23 o32=%24 o48=%25\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15 t12=%x16 t13=%x17 t14=%x18 t15=%x19 t16=%x20"
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3), // 7
+ "=wa" (t4), // 8
+ "=wa" (t5), // 9
+ "=wa" (t6), // 10
+ "=wa" (t7), // 11
+ "=wa" (t8), // 12
+ "=wa" (t9), // 13
+ "=wa" (t10), // 14
+ "=wa" (t11), // 15
+ "=wa" (t12), // 16
+ "=wa" (t13), // 17
+ "=wa" (t14), // 18
+ "=wa" (t15), // 19
+ "=wa" (t16) // 20
+ :
+ "m" (*x),
+ "d" (alpha), // 22
+ "b" (16), // 23
+ "b" (32), // 24
+ "b" (48) // 25
+ :
+ "cr0"
+ );
+
+}
diff --git a/kernel/power/dcopy_microk_power8.c b/kernel/power/dcopy_microk_power8.c
index 04f7db5..261dc04 100644
--- a/kernel/power/dcopy_microk_power8.c
+++ b/kernel/power/dcopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void dcopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void dcopy_kernel_32 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/ddot.c b/kernel/power/ddot.c
index cef60a2..e43470e 100644
--- a/kernel/power/ddot.c
+++ b/kernel/power/ddot.c
@@ -43,7 +43,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+static FLOAT ddot_kernel_8 (BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
@@ -62,8 +62,7 @@ static void ddot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ;
}
- *d += dot;
-
+ return dot;
}
#endif
@@ -83,7 +82,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -16;
if ( n1 )
- ddot_kernel_8(n1, x, y , &dot );
+ dot = ddot_kernel_8(n1, x, y);
i = n1;
while(i < n)
diff --git a/kernel/power/ddot_microk_power8.c b/kernel/power/ddot_microk_power8.c
index b880492..4e6bc29 100644
--- a/kernel/power/ddot_microk_power8.c
+++ b/kernel/power/ddot_microk_power8.c
@@ -34,145 +34,138 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void ddot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static double ddot_kernel_8 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 52, %8, %3 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 53, %9, %3 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 54, %10, %3 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
- "lxvd2x 55, %11, %3 \n\t"
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 48, 0, %3 \n\t"
- "xvmaddadp 33, 41, 49 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 49, %5, %3 \n\t"
- "xvmaddadp 34, 42, 50 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 50, %6, %3 \n\t"
- "xvmaddadp 35, 43, 51 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 51, %7, %3 \n\t"
- "xvmaddadp 36, 44, 52 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 52, %8, %3 \n\t"
- "xvmaddadp 37, 45, 53 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 53, %9, %3 \n\t"
- "xvmaddadp 38, 46, 54 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 54, %10, %3 \n\t"
- "xvmaddadp 39, 47, 55 \n\t"
-
- "lxvd2x 47, %11, %2 \n\t"
- "lxvd2x 55, %11, %3 \n\t"
-
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t"
- "xvmaddadp 33, 41, 49 \n\t"
- "xvmaddadp 34, 42, 50 \n\t"
- "xvmaddadp 35, 43, 51 \n\t"
- "xvmaddadp 36, 44, 52 \n\t"
- "xvmaddadp 37, 45, 53 \n\t"
- "xvmaddadp 38, 46, 54 \n\t"
- "xvmaddadp 39, 47, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
- "xxswapd 33, 32 \n\t"
-
- "xsadddp 32, 32, 33 \n\t"
-
- "stxsdx 32, 0, %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (dot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112), // 11
- "r" (pre) // 12
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
-}
-
-
+ double dot;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 48, 0, %3 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "lxvd2x 41, %10, %2 \n\t"
+ "lxvd2x 49, %10, %3 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "lxvd2x 42, %11, %2 \n\t"
+ "lxvd2x 50, %11, %3 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "lxvd2x 43, %12, %2 \n\t"
+ "lxvd2x 51, %12, %3 \n\t"
+ "xvmaddadp 36, 44, %x4 \n\t"
+ "lxvd2x 44, %13, %2 \n\t"
+ "lxvd2x %x4, %13, %3 \n\t"
+ "xvmaddadp 37, 45, %x5 \n\t"
+ "lxvd2x 45, %14, %2 \n\t"
+ "lxvd2x %x5, %14, %3 \n\t"
+ "xvmaddadp 38, 46, %x6 \n\t"
+ "lxvd2x 46, %15, %2 \n\t"
+ "lxvd2x %x6, %15, %3 \n\t"
+ "xvmaddadp 39, 47, %x7 \n\t"
+ "lxvd2x 47, %16, %2 \n\t"
+ "lxvd2x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t"
+ "xvmaddadp 33, 41, 49 \n\t"
+ "xvmaddadp 34, 42, 50 \n\t"
+ "xvmaddadp 35, 43, 51 \n\t"
+ "xvmaddadp 36, 44, %x4 \n\t"
+ "xvmaddadp 37, 45, %x5 \n\t"
+ "xvmaddadp 38, 46, %x6 \n\t"
+ "xvmaddadp 39, 47, %x7 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+
+ "xsadddp %x0, 32, 33 \n"
+
+ "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o122=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
+ :
+ "=d" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3) // 7
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (16), // 10
+ "b" (32), // 11
+ "b" (48), // 12
+ "b" (64), // 13
+ "b" (80), // 14
+ "b" (96), // 15
+ "b" (112) // 16
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return dot;
+}
diff --git a/kernel/power/dgemv_n.c b/kernel/power/dgemv_n.c
index 812d09d..57f9f9e 100644
--- a/kernel/power/dgemv_n.c
+++ b/kernel/power/dgemv_n.c
@@ -47,18 +47,18 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_4x4
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x4(BLASLONG n, FLOAT *a_ptr, BLASLONG lda, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0,*a1,*a2,*a3;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0];
- a1 = ap[1];
- a2 = ap[2];
- a3 = ap[3];
+ FLOAT *a0 = a_ptr;
+ FLOAT *a1 = a0 + lda;
+ FLOAT *a2 = a1 + lda;
+ FLOAT *a3 = a2 + lda;
+
for ( i=0; i<4; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -73,16 +73,13 @@ static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x2
-static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x2(BLASLONG n, FLOAT *a0, FLOAT *a1, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0,*a1;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0];
- a1 = ap[1];
for ( i=0; i<2; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -98,15 +95,13 @@ static void dgemv_kernel_4x2(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT
#ifndef HAVE_KERNEL_4x1
-static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x1(BLASLONG n, FLOAT *a0, FLOAT *xo, FLOAT *y, FLOAT alpha)
{
BLASLONG i;
- FLOAT *a0;
FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap;
for ( i=0; i<1; i++)
- x[i] = xo[i] * *alpha;
+ x[i] = xo[i] * alpha;
for ( i=0; i< n; i+=4 )
{
@@ -141,7 +136,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{
BLASLONG i;
- BLASLONG j;
FLOAT *a_ptr;
FLOAT *x_ptr;
FLOAT *y_ptr;
@@ -151,13 +145,9 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
BLASLONG m3;
BLASLONG n2;
BLASLONG lda4 = lda << 2;
- FLOAT *ap[4] __attribute__ ((aligned (16)));;
FLOAT xbuffer[8] __attribute__ ((aligned (16)));;
- FLOAT alpha_r[4] __attribute__ ((aligned (16)));;
FLOAT *ybuffer;
- alpha_r[0] = alpha;
-
if ( m < 1 ) return(0);
if ( n < 1 ) return(0);
@@ -187,11 +177,6 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
a_ptr = a;
x_ptr = x;
- ap[0] = a_ptr;
- ap[1] = a_ptr + lda;
- ap[2] = ap[1] + lda;
- ap[3] = ap[2] + lda;
-
if ( inc_y != 1 )
memset(ybuffer,0,NB*8);
else
@@ -203,18 +188,14 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
for( i = 0; i < n1 ; i++)
{
- dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,alpha_r);
- ap[0] += lda4;
- ap[1] += lda4;
- ap[2] += lda4;
- ap[3] += lda4;
+ dgemv_kernel_4x4(NB,a_ptr,lda,x_ptr,ybuffer,alpha);
a_ptr += lda4;
x_ptr += 4;
}
if ( n2 & 2 )
{
- dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,alpha_r);
+ dgemv_kernel_4x2(NB,a_ptr,a_ptr+lda,x_ptr,ybuffer,alpha);
a_ptr += lda*2;
x_ptr += 2;
}
@@ -222,7 +203,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
if ( n2 & 1 )
{
- dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha_r);
+ dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,alpha);
a_ptr += lda;
x_ptr += 1;
@@ -243,11 +224,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
x_ptr += inc_x;
xbuffer[3] = x_ptr[0];
x_ptr += inc_x;
- dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,alpha_r);
- ap[0] += lda4;
- ap[1] += lda4;
- ap[2] += lda4;
- ap[3] += lda4;
+ dgemv_kernel_4x4(NB,a_ptr,lda,xbuffer,ybuffer,alpha);
a_ptr += lda4;
}
@@ -255,7 +232,7 @@ int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLO
{
xbuffer[0] = x_ptr[0];
x_ptr += inc_x;
- dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha_r);
+ dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,alpha);
a_ptr += lda;
}
diff --git a/kernel/power/dgemv_n_microk_power8.c b/kernel/power/dgemv_n_microk_power8.c
index 9eabe55..5b42bbb 100644
--- a/kernel/power/dgemv_n_microk_power8.c
+++ b/kernel/power/dgemv_n_microk_power8.c
@@ -35,267 +35,264 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4x4 1
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline));
-
-static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha)
+static void dgemv_kernel_4x4 (long n, double *ap, long lda, double *x, double *y, double alpha)
{
- BLASLONG i=n;
- BLASLONG o8 = 8;
- BLASLONG o16 = 16;
- BLASLONG o24 = 24;
- BLASLONG pre = 384;
-
- FLOAT *a0,*a1,*a2,*a3;
- FLOAT *y1=y+1;
- FLOAT x[4] __attribute__ ((aligned (16)));;
- a0 = ap[0]+1;
- a1 = ap[1]+1;
- a2 = ap[2]+1;
- a3 = ap[3]+1;
-
- x[0]=xo[0] * *alpha;
- x[1]=xo[1] * *alpha;
- x[2]=xo[2] * *alpha;
- x[3]=xo[3] * *alpha;
+ double *a0;
+ double *a1;
+ double *a2;
+ double *a3;
+
+ __asm__
+ (
+ "lxvd2x 34, 0, %9 \n\t" // x0, x1
+ "lxvd2x 35, %10, %9 \n\t" // x2, x3
+ "xxspltd 32, %x8, 0 \n\t" // alpha, alpha
+
+ "sldi %6, %4, 3 \n\t" // lda * sizeof (double)
+
+ "xvmuldp 34, 34, 32 \n\t" // x0 * alpha, x1 * alpha
+ "xvmuldp 35, 35, 32 \n\t" // x2 * alpha, x3 * alpha
+
+ "add %4, %3, %6 \n\t" // a1 = a0 + lda
+ "add %6, %6, %6 \n\t" // 2 * lda
+
+ "xxspltd 32, 34, 0 \n\t" // x0 * alpha, x0 * alpha
+ "xxspltd 33, 34, 1 \n\t" // x1 * alpha, x1 * alpha
+ "xxspltd 34, 35, 0 \n\t" // x2 * alpha, x2 * alpha
+ "xxspltd 35, 35, 1 \n\t" // x3 * alpha, x3 * alpha
+
+ "add %5, %3, %6 \n\t" // a2 = a0 + 2 * lda
+ "add %6, %4, %6 \n\t" // a3 = a1 + 2 * lda
+
+ "dcbt 0, %3 \n\t"
+ "dcbt 0, %4 \n\t"
+ "dcbt 0, %5 \n\t"
+ "dcbt 0, %6 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- __asm__ __volatile__
- (
- "lxvdsx 32, 0 , %1 \n\t" // x0
- "lxvdsx 33,%3 , %1 \n\t" // x1
- "lxvdsx 34,%4 , %1 \n\t" // x2
- "lxvdsx 35,%5 , %1 \n\t" // x3
- "addi %2 , %2 , -8 \n\t"
- "addi %6 , %6 , -8 \n\t"
- "addi %7 , %7 , -8 \n\t"
- "addi %8 , %8 , -8 \n\t"
- "addi %9 , %9 , -8 \n\t"
-
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
-
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addi %6, %6, 32 \n\t"
- "addi %7, %7, 32 \n\t"
- "addi %8, %8, 32 \n\t"
- "addi %9, %9, 32 \n\t"
+ "dcbt 0, %2 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addi %3, %3, 32 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "addi %6, %6, 32 \n\t"
- ".align 5 \n\t"
- "1: \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "dcbt %2, %10 \n\t"
+ ".p2align 5 \n"
+ "1: \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "dcbt %6, %10 \n\t"
- "dcbt %7, %10 \n\t"
- "dcbt %8, %10 \n\t"
- "dcbt %9, %10 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "addic. %0 , %0 , -4 \n\t"
- "ble 2f \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
-
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "ble 2f \n\t"
- "lxvd2x 48, 0, %6 \n\t" // a0[0], a0[1]
- "lxvd2x 49,%4, %6 \n\t" // a0[2], a0[3]
- "xvmaddadp 40, 50, 33 \n\t"
- "addi %6, %6, 32 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "lxvd2x 50, 0, %7 \n\t" // a1[0], a1[1]
- "lxvd2x 51,%4, %7 \n\t" // a1[2], a1[3]
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "addi %7, %7, 32 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 40, 0, %3 \n\t" // a0[0], a0[1]
+ "lxvd2x 41, %10, %3 \n\t" // a0[2], a0[3]
- "lxvd2x 52, 0, %8 \n\t" // a2[0], a2[1]
- "lxvd2x 53,%4, %8 \n\t" // a2[2], a2[3]
+ "xvmaddadp 36, 42, 33 \n\t"
+ "addi %3, %3, 32 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- "xvmaddadp 40, 54, 35 \n\t"
- "addi %8, %8, 32 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "lxvd2x 42, 0, %4 \n\t" // a1[0], a1[1]
+ "lxvd2x 43, %10, %4 \n\t" // a1[2], a1[3]
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 44, 34 \n\t"
+ "addi %4, %4, 32 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
- "lxvd2x 54, 0, %9 \n\t" // a3[0], a3[1]
- "lxvd2x 55,%4, %9 \n\t" // a3[2], a3[3]
+ "lxvd2x 44, 0, %5 \n\t" // a2[0], a2[1]
+ "lxvd2x 45, %10, %5 \n\t" // a2[2], a2[3]
- "addi %9, %9, 32 \n\t"
- "addi %2, %2, 32 \n\t"
+ "xvmaddadp 36, 46, 35 \n\t"
+ "addi %5, %5, 32 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
- "addic. %0 , %0 , -4 \n\t"
- "bgt 1b \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n\t" // y2, y3
- "2: \n\t"
+ "lxvd2x 46, 0, %6 \n\t" // a3[0], a3[1]
+ "lxvd2x 47, %10, %6 \n\t" // a3[2], a3[3]
- "lxvd2x 40, 0, %2 \n\t" // y0, y1
- "lxvd2x 41,%4, %2 \n\t" // y2, y3
+ "addi %6, %6, 32 \n\t"
+ "addi %2, %2, 32 \n\t"
- "xvmaddadp 40, 48, 32 \n\t"
- "xvmaddadp 41, 49, 32 \n\t"
+ "addic. %1, %1, -4 \n\t"
+ "bgt 1b \n"
- "xvmaddadp 40, 50, 33 \n\t"
- "xvmaddadp 41, 51, 33 \n\t"
+ "2: \n\t"
- "xvmaddadp 40, 52, 34 \n\t"
- "xvmaddadp 41, 53, 34 \n\t"
+ "lxvd2x 36, 0, %2 \n\t" // y0, y1
+ "lxvd2x 37, %10, %2 \n\t" // y2, y3
- "xvmaddadp 40, 54, 35 \n\t"
- "xvmaddadp 41, 55, 35 \n\t"
+ "xvmaddadp 36, 40, 32 \n\t"
+ "xvmaddadp 37, 41, 32 \n\t"
- "stxvd2x 40, 0, %2 \n\t" // y0, y1
- "stxvd2x 41,%4, %2 \n\t" // y2, y3
+ "xvmaddadp 36, 42, 33 \n\t"
+ "xvmaddadp 37, 43, 33 \n\t"
- :
- :
- "r" (i), // 0
- "r" (x), // 1
- "r" (y1), // 2
- "r" (o8), // 3
- "r" (o16), // 4
- "r" (o24), // 5
- "r" (a0), // 6
- "r" (a1), // 7
- "r" (a2), // 8
- "r" (a3), // 9
- "r" (pre) // 10
- : "cr0", "%0", "%2" , "%6", "%7", "%8", "%9", "memory"
- );
+ "xvmaddadp 36, 44, 34 \n\t"
+ "xvmaddadp 37, 45, 34 \n\t"
-}
+ "xvmaddadp 36, 46, 35 \n\t"
+ "xvmaddadp 37, 47, 35 \n\t"
+ "stxvd2x 36, 0, %2 \n\t" // y0, y1
+ "stxvd2x 37, %10, %2 \n" // y2, y3
+ "#n=%1 ap=%11 lda=%12 x=%7=%9 y=%0=%2 alpha=%8 o16=%10\n"
+ "#a0=%3 a1=%4 a2=%5 a3=%6"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (y), // 2
+ "=b" (a0), // 3
+ "=b" (a1), // 4
+ "=&b" (a2), // 5
+ "=&b" (a3) // 6
+ :
+ "m" (*x),
+ "d" (alpha), // 8
+ "r" (x), // 9
+ "b" (16), // 10
+ "3" (ap), // 11
+ "4" (lda) // 12
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/drot.c b/kernel/power/drot.c
index c93f69b..3e10748 100644
--- a/kernel/power/drot.c
+++ b/kernel/power/drot.c
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@@ -56,8 +56,6 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03;
FLOAT *x1=x;
FLOAT *y1=y;
- FLOAT c1=*c;
- FLOAT s1=*s;
while ( i<n )
{
@@ -71,14 +69,14 @@ static void drot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3];
y03 = y1[3];
- f0 = c1*x00 + s1*y00;
- g0 = c1*y00 - s1*x00;
- f1 = c1*x01 + s1*y01;
- g1 = c1*y01 - s1*x01;
- f2 = c1*x02 + s1*y02;
- g2 = c1*y02 - s1*x02;
- f3 = c1*x03 + s1*y03;
- g3 = c1*y03 - s1*x03;
+ f0 = c*x00 + s*y00;
+ g0 = c*y00 - s*x00;
+ f1 = c*x01 + s*y01;
+ g1 = c*y01 - s*x01;
+ f2 = c*x02 + s*y02;
+ g2 = c*y02 - s*x02;
+ f3 = c*x03 + s*y03;
+ g3 = c*y03 - s*x03;
x1[0] = f0;
y1[0] = g0;
@@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT c1[4] __attribute__ ((aligned (16)));;
- FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;
@@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- c1[0]=c;
- c1[1]=c;
- c1[2]=c;
- c1[3]=c;
- s1[0]=s;
- s1[1]=s;
- s1[2]=s;
- s1[3]=s;
- drot_kernel_16(n1, x1, y1, c1, s1);
+ drot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
diff --git a/kernel/power/drot_microk_power8.c b/kernel/power/drot_microk_power8.c
index 4444ac7..016b776 100644
--- a/kernel/power/drot_microk_power8.c
+++ b/kernel/power/drot_microk_power8.c
@@ -38,174 +38,176 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
-
-static void drot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void drot_kernel_16 (long n, double *x, double *y, double c, double s)
{
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __asm__
+ (
+ "xxspltd 36, %x13, 0 \n\t" // load c to both dwords
+ "xxspltd 37, %x14, 0 \n\t" // load s to both dwords
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
-
- __asm__ __volatile__
- (
-
- "lxsdx 36 , %5, %3 \n\t" // load c
- "lxsdx 37 , %5, %4 \n\t" // load s
- "addi %8 , %8, -8 \n\t"
- "addi %9 , %9, -8 \n\t"
-
- "xxspltd 36 , 36, 0 \n\t"
- "xxspltd 37 , 37, 0 \n\t"
-
- "lxvd2x 32, 0, %1 \n\t" // load x
- "lxvd2x 33, %5, %1 \n\t"
- "lxvd2x 34, %6, %1 \n\t"
- "lxvd2x 35, %7, %1 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // load y
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "xvmuldp 48, 32, 36 \n\t" // c * x
- "xvmuldp 49, 33, 36 \n\t"
- "xvmuldp 50, 34, 36 \n\t"
- "xvmuldp 51, 35, 36 \n\t"
-
- "xvmuldp 56, 40, 36 \n\t" // c * y
- "xvmuldp 57, 41, 36 \n\t"
- "xvmuldp 58, 42, 36 \n\t"
- "xvmuldp 59, 43, 36 \n\t"
-
- "xvmuldp 52, 32, 37 \n\t" // s * x
- "xvmuldp 53, 33, 37 \n\t"
-
- "lxvd2x 32, 0, %1 \n\t" // load x
- "lxvd2x 33, %5, %1 \n\t"
-
- "xvmuldp 54, 34, 37 \n\t"
- "xvmuldp 55, 35, 37 \n\t"
-
- "lxvd2x 34, %6, %1 \n\t"
- "lxvd2x 35, %7, %1 \n\t"
-
- "xvmuldp 60, 40, 37 \n\t" // s * y
- "xvmuldp 61, 41, 37 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // load y
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvmuldp 62, 42, 37 \n\t"
- "xvmuldp 63, 43, 37 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
- "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
- "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvd2x 48, 0, %8 \n\t" // store x
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "stxvd2x 56, 0, %9 \n\t" // store y
- "stxvd2x 57, %5, %9 \n\t"
- "stxvd2x 58, %6, %9 \n\t"
- "stxvd2x 59, %7, %9 \n\t"
-
- "addi %8, %8, 64 \n\t"
- "addi %9, %9, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 32, 36 \n\t" // c * x
- "xvmuldp 49, 33, 36 \n\t"
- "xvmuldp 50, 34, 36 \n\t"
- "xvmuldp 51, 35, 36 \n\t"
-
- "xvmuldp 56, 40, 36 \n\t" // c * y
- "xvmuldp 57, 41, 36 \n\t"
- "xvmuldp 58, 42, 36 \n\t"
- "xvmuldp 59, 43, 36 \n\t"
-
- "xvmuldp 52, 32, 37 \n\t" // s * x
- "xvmuldp 53, 33, 37 \n\t"
- "xvmuldp 54, 34, 37 \n\t"
- "xvmuldp 55, 35, 37 \n\t"
-
- "xvmuldp 60, 40, 37 \n\t" // s * y
- "xvmuldp 61, 41, 37 \n\t"
- "xvmuldp 62, 42, 37 \n\t"
- "xvmuldp 63, 43, 37 \n\t"
-
- "xvadddp 48, 48 , 60 \n\t" // c * x + s * y
- "xvadddp 49, 49 , 61 \n\t" // c * x + s * y
- "xvadddp 50, 50 , 62 \n\t" // c * x + s * y
- "xvadddp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubdp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubdp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubdp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubdp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvd2x 48, 0, %8 \n\t" // store x
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
- "stxvd2x 56, 0, %9 \n\t" // store y
- "stxvd2x 57, %5, %9 \n\t"
- "stxvd2x 58, %6, %9 \n\t"
- "stxvd2x 59, %7, %9 \n\t"
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
+ "addi %3, %3, 64 \n\t"
+ "addi %4, %4, 64 \n\t"
+ "addic. %2, %2, -8 \n\t"
+ "ble 2f \n\t"
- :
- :
- "r" (i), // 0
- "r" (x1), // 1
- "r" (y1), // 2
- "r" (c), // 3
- "r" (s), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (x2), // 8
- "r" (y2) // 9
- : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
- );
+ ".p2align 5 \n"
+ "1: \n\t"
-}
+ "xvmuldp 40, 32, 36 \n\t" // c * x
+ "xvmuldp 41, 33, 36 \n\t"
+ "xvmuldp 42, 34, 36 \n\t"
+ "xvmuldp 43, 35, 36 \n\t"
+ "xvmuldp %x5, 48, 36 \n\t" // c * y
+ "xvmuldp %x6, 49, 36 \n\t"
+ "xvmuldp %x7, 50, 36 \n\t"
+ "xvmuldp %x8, 51, 36 \n\t"
+ "xvmuldp 44, 32, 37 \n\t" // s * x
+ "xvmuldp 45, 33, 37 \n\t"
+
+ "lxvd2x 32, 0, %3 \n\t" // load x
+ "lxvd2x 33, %15, %3 \n\t"
+
+ "xvmuldp 46, 34, 37 \n\t"
+ "xvmuldp 47, 35, 37 \n\t"
+
+ "lxvd2x 34, %16, %3 \n\t"
+ "lxvd2x 35, %17, %3 \n\t"
+
+ "xvmuldp %x9, 48, 37 \n\t" // s * y
+ "xvmuldp %x10, 49, 37 \n\t"
+
+ "lxvd2x 48, 0, %4 \n\t" // load y
+ "lxvd2x 49, %15, %4 \n\t"
+
+ "xvmuldp %x11, 50, 37 \n\t"
+ "xvmuldp %x12, 51, 37 \n\t"
+
+ "lxvd2x 50, %16, %4 \n\t"
+ "lxvd2x 51, %17, %4 \n\t"
+
+ "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
+
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 40, 32, 36 \n\t" // c * x
+ "xvmuldp 41, 33, 36 \n\t"
+ "xvmuldp 42, 34, 36 \n\t"
+ "xvmuldp 43, 35, 36 \n\t"
+
+ "xvmuldp %x5, 48, 36 \n\t" // c * y
+ "xvmuldp %x6, 49, 36 \n\t"
+ "xvmuldp %x7, 50, 36 \n\t"
+ "xvmuldp %x8, 51, 36 \n\t"
+
+ "xvmuldp 44, 32, 37 \n\t" // s * x
+ "xvmuldp 45, 33, 37 \n\t"
+ "xvmuldp 46, 34, 37 \n\t"
+ "xvmuldp 47, 35, 37 \n\t"
+
+ "xvmuldp %x9, 48, 37 \n\t" // s * y
+ "xvmuldp %x10, 49, 37 \n\t"
+ "xvmuldp %x11, 50, 37 \n\t"
+ "xvmuldp %x12, 51, 37 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvadddp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvadddp 41, 41, %x10 \n\t" // c * x + s * y
+ "xvadddp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvadddp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubdp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubdp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubdp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubdp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvd2x 40, 0, %3 \n\t" // store x
+ "stxvd2x 41, %15, %3 \n\t"
+ "stxvd2x 42, %16, %3 \n\t"
+ "stxvd2x 43, %17, %3 \n\t"
+
+ "stxvd2x %x5, 0, %4 \n\t" // store y
+ "stxvd2x %x6, %15, %4 \n\t"
+ "stxvd2x %x7, %16, %4 \n\t"
+ "stxvd2x %x8, %17, %4 \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
+ "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y), // 4
+ "=wa" (t0), // 5
+ "=wa" (t1), // 6
+ "=wa" (t2), // 7
+ "=wa" (t3), // 8
+ "=wa" (t4), // 9
+ "=wa" (t5), // 10
+ "=wa" (t6), // 11
+ "=wa" (t7) // 12
+ :
+ "d" (c), // 13
+ "d" (s), // 14
+ "b" (16), // 15
+ "b" (32), // 16
+ "b" (48) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/dscal.c b/kernel/power/dscal.c
index c62a563..f32dc4b 100644
--- a/kernel/power/dscal.c
+++ b/kernel/power/dscal.c
@@ -41,11 +41,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_8)
-static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
+static void dscal_kernel_8 (BLASLONG n, FLOAT *x, FLOAT alpha)
{
BLASLONG i;
- FLOAT alpha = *da;
for( i=0; i<n; i+=8 )
{
@@ -62,7 +61,7 @@ static void dscal_kernel_8( BLASLONG n, FLOAT *da , FLOAT *x )
}
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *da , FLOAT *x )
+static void dscal_kernel_8_zero (BLASLONG n, FLOAT *x)
{
BLASLONG i;
@@ -102,10 +101,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- FLOAT alpha[2];
- alpha[0]=da;
- alpha[1]=da;
- dscal_kernel_8_zero(n1 , alpha , x);
+ dscal_kernel_8_zero(n1, x);
j=n1;
}
@@ -123,10 +119,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- FLOAT alpha[2];
- alpha[0]=da;
- alpha[1]=da;
- dscal_kernel_8(n1 , alpha , x);
+ dscal_kernel_8(n1, x, da);
j=n1;
}
while(j < n)
diff --git a/kernel/power/dscal_microk_power8.c b/kernel/power/dscal_microk_power8.c
index d90c3d8..04898eb 100644
--- a/kernel/power/dscal_microk_power8.c
+++ b/kernel/power/dscal_microk_power8.c
@@ -35,185 +35,149 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void dscal_kernel_8( BLASLONG n, FLOAT *alpha, FLOAT *x)
+static void dscal_kernel_8 (long n, double *x, double alpha)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxsdx 33, 0, %3 \n\t"
- "xxspltd 32, 33, 0 \n\t"
- "addi %1, %1, -8 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmuldp 48, 40, 32 \n\t"
- "xvmuldp 49, 41, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 40, 32 \n\t"
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void dscal_kernel_8_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxspltd %x3, %x3, 0 \n\t"
+
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmuldp 40, 32, %x3 \n\t"
+ "xvmuldp 41, 33, %x3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %4, %2 \n\t"
+ "xvmuldp 42, 34, %x3 \n\t"
+ "xvmuldp 43, 35, %x3 \n\t"
+ "lxvd2x 34, %5, %2 \n\t"
+ "lxvd2x 35, %6, %2 \n\t"
+ "xvmuldp 44, 36, %x3 \n\t"
+ "xvmuldp 45, 37, %x3 \n\t"
+ "lxvd2x 36, %7, %2 \n\t"
+ "lxvd2x 37, %8, %2 \n\t"
+ "xvmuldp 46, 38, %x3 \n\t"
+ "xvmuldp 47, 39, %x3 \n\t"
+ "lxvd2x 38, %9, %2 \n\t"
+ "lxvd2x 39, %10, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 40, 32, %x3 \n\t"
+ "xvmuldp 41, 33, %x3 \n\t"
+ "xvmuldp 42, 34, %x3 \n\t"
+ "xvmuldp 43, 35, %x3 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmuldp 44, 36, %x3 \n\t"
+ "xvmuldp 45, 37, %x3 \n\t"
+ "xvmuldp 46, 38, %x3 \n\t"
+ "xvmuldp 47, 39, %x3 \n\t"
+
+ "stxvd2x 40, 0, %2 \n\t"
+ "stxvd2x 41, %4, %2 \n\t"
+ "stxvd2x 42, %5, %2 \n\t"
+ "stxvd2x 43, %6, %2 \n\t"
+ "stxvd2x 44, %7, %2 \n\t"
+ "stxvd2x 45, %8, %2 \n\t"
+ "stxvd2x 46, %9, %2 \n\t"
+ "stxvd2x 47, %10, %2 \n"
+
+ "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x) // 2
+ :
+ "d" (alpha), // 3
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
+
+
+static void dscal_kernel_8_zero (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "xxlxor 32 , 32 , 32 \n\t"
- "addi %1, %1, -8 \n\t"
-
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 32, 0, %1 \n\t"
- "stxvd2x 32, %5, %1 \n\t"
- "stxvd2x 32, %6, %1 \n\t"
- "stxvd2x 32, %7, %1 \n\t"
- "stxvd2x 32, %8, %1 \n\t"
- "stxvd2x 32, %9, %1 \n\t"
- "stxvd2x 32, %10, %1 \n\t"
- "stxvd2x 32, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector double t0;
+
+ __asm__
+ (
+ "xxlxor %x3, %x3, %x3 \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x %x3, 0, %2 \n\t"
+ "stxvd2x %x3, %4, %2 \n\t"
+ "stxvd2x %x3, %5, %2 \n\t"
+ "stxvd2x %x3, %6, %2 \n\t"
+ "stxvd2x %x3, %7, %2 \n\t"
+ "stxvd2x %x3, %8, %2 \n\t"
+ "stxvd2x %x3, %9, %2 \n\t"
+ "stxvd2x %x3, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "=m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0"
+ );
+}
diff --git a/kernel/power/dswap_microk_power8.c b/kernel/power/dswap_microk_power8.c
index 77747c3..31eff34 100644
--- a/kernel/power/dswap_microk_power8.c
+++ b/kernel/power/dswap_microk_power8.c
@@ -35,146 +35,124 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void dswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void dswap_kernel_32 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -8 \n\t"
- "addi %4, %4, -8 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvd2x 32, 0, %2 \n\t"
- "lxvd2x 33, %5, %2 \n\t"
- "lxvd2x 34, %6, %2 \n\t"
- "lxvd2x 35, %7, %2 \n\t"
- "lxvd2x 36, %8, %2 \n\t"
- "lxvd2x 37, %9, %2 \n\t"
- "lxvd2x 38, %10, %2 \n\t"
- "lxvd2x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 48, 0, %1 \n\t"
- "lxvd2x 49, %5, %1 \n\t"
- "lxvd2x 50, %6, %1 \n\t"
- "lxvd2x 51, %7, %1 \n\t"
- "lxvd2x 52, %8, %1 \n\t"
- "lxvd2x 53, %9, %1 \n\t"
- "lxvd2x 54, %10, %1 \n\t"
- "lxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvd2x 56, 0, %1 \n\t"
- "lxvd2x 57, %5, %1 \n\t"
- "lxvd2x 58, %6, %1 \n\t"
- "lxvd2x 59, %7, %1 \n\t"
- "lxvd2x 60, %8, %1 \n\t"
- "lxvd2x 61, %9, %1 \n\t"
- "lxvd2x 62, %10, %1 \n\t"
- "lxvd2x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 32, 0, %3 \n\t"
- "stxvd2x 33, %5, %3 \n\t"
- "stxvd2x 34, %6, %3 \n\t"
- "stxvd2x 35, %7, %3 \n\t"
- "stxvd2x 36, %8, %3 \n\t"
- "stxvd2x 37, %9, %3 \n\t"
- "stxvd2x 38, %10, %3 \n\t"
- "stxvd2x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 40, 0, %3 \n\t"
- "stxvd2x 41, %5, %3 \n\t"
- "stxvd2x 42, %6, %3 \n\t"
- "stxvd2x 43, %7, %3 \n\t"
- "stxvd2x 44, %8, %3 \n\t"
- "stxvd2x 45, %9, %3 \n\t"
- "stxvd2x 46, %10, %3 \n\t"
- "stxvd2x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 48, 0, %4 \n\t"
- "stxvd2x 49, %5, %4 \n\t"
- "stxvd2x 50, %6, %4 \n\t"
- "stxvd2x 51, %7, %4 \n\t"
- "stxvd2x 52, %8, %4 \n\t"
- "stxvd2x 53, %9, %4 \n\t"
- "stxvd2x 54, %10, %4 \n\t"
- "stxvd2x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvd2x 56, 0, %4 \n\t"
- "stxvd2x 57, %5, %4 \n\t"
- "stxvd2x 58, %6, %4 \n\t"
- "stxvd2x 59, %7, %4 \n\t"
- "stxvd2x 60, %8, %4 \n\t"
- "stxvd2x 61, %9, %4 \n\t"
- "stxvd2x 62, %10, %4 \n\t"
- "stxvd2x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/sasum.c b/kernel/power/sasum.c
index 43311f2..fb10b1d 100644
--- a/kernel/power/sasum.c
+++ b/kernel/power/sasum.c
@@ -38,7 +38,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if defined(DOUBLE)
-#define ABS fabs
+#error supports float only
#else
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_32
-static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT sasum_kernel_32(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,11 +92,7 @@ static void sasum_kernel_32(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
- svec[2] = 0.0;
- svec[3] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -105,7 +101,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
{
BLASLONG i=0;
FLOAT sumf = 0.0;
- FLOAT svec[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
if (n <= 0 || inc_x <= 0) return(sumf);
@@ -117,8 +112,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- sasum_kernel_32(n1, x, svec);
- sumf = svec[0] + svec[1]+svec[2]+svec[3];
+ sumf = sasum_kernel_32(n1, x);
i=n1;
}
diff --git a/kernel/power/sasum_microk_power8.c b/kernel/power/sasum_microk_power8.c
index 847fffe..25a969d 100644
--- a/kernel/power/sasum_microk_power8.c
+++ b/kernel/power/sasum_microk_power8.c
@@ -34,144 +34,145 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_32 1
-static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void sasum_kernel_32( BLASLONG n, FLOAT *x, FLOAT *svec)
+static float sasum_kernel_32 (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
-
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -32 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabssp 48, 40 \n\t"
- "xvabssp 49, 41 \n\t"
- "xvabssp 50, 42 \n\t"
- "xvabssp 51, 43 \n\t"
- "xvabssp 52, 44 \n\t"
- "xvabssp 53, 45 \n\t"
- "xvabssp 54, 46 \n\t"
- "xvabssp 55, 47 \n\t"
-
- "xvaddsp 32, 32, 48 \n\t"
- "xvaddsp 33, 33, 49 \n\t"
- "xvaddsp 34, 34, 50 \n\t"
- "xvaddsp 35, 35, 51 \n\t"
- "xvaddsp 36, 36, 52 \n\t"
- "xvaddsp 37, 37, 53 \n\t"
- "xvaddsp 38, 38, 54 \n\t"
- "xvaddsp 39, 39, 55 \n\t"
-
- "xvaddsp 32, 32, 33 \n\t"
- "xvaddsp 34, 34, 35 \n\t"
- "xvaddsp 36, 36, 37 \n\t"
- "xvaddsp 38, 38, 39 \n\t"
-
- "xvaddsp 32, 32, 34 \n\t"
- "xvaddsp 36, 36, 38 \n\t"
-
- "xvaddsp 32, 32, 36 \n\t"
-
-
- "stxvw4x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ float sum;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %8, %2 \n\t"
+
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+
+ "lxvw4x 42, %9, %2 \n\t"
+ "lxvw4x 43, %10, %2 \n\t"
+
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "lxvw4x 44, %11, %2 \n\t"
+ "lxvw4x 45, %12, %2 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+
+ "lxvw4x 46, %13, %2 \n\t"
+ "lxvw4x 47, %14, %2 \n\t"
+
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -32 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabssp 48, 40 \n\t"
+ "xvabssp 49, 41 \n\t"
+ "xvabssp 50, 42 \n\t"
+ "xvabssp 51, 43 \n\t"
+ "xvabssp %x3, 44 \n\t"
+ "xvabssp %x4, 45 \n\t"
+ "xvabssp %x5, 46 \n\t"
+ "xvabssp %x6, 47 \n\t"
+
+ "xvaddsp 32, 32, 48 \n\t"
+ "xvaddsp 33, 33, 49 \n\t"
+ "xvaddsp 34, 34, 50 \n\t"
+ "xvaddsp 35, 35, 51 \n\t"
+ "xvaddsp 36, 36, %x3 \n\t"
+ "xvaddsp 37, 37, %x4 \n\t"
+ "xvaddsp 38, 38, %x5 \n\t"
+ "xvaddsp 39, 39, %x6 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %0, 32 \n"
+
+     "#n=%1 x=%7=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=f" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/scopy_microk_power8.c b/kernel/power/scopy_microk_power8.c
index 2e08e35..444a6d4 100644
--- a/kernel/power/scopy_microk_power8.c
+++ b/kernel/power/scopy_microk_power8.c
@@ -35,97 +35,78 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void scopy_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void scopy_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvw4x 40, 0, %1 \n\t"
- "stxvw4x 41, %5, %1 \n\t"
- "stxvw4x 42, %6, %1 \n\t"
- "stxvw4x 43, %7, %1 \n\t"
- "stxvw4x 44, %8, %1 \n\t"
- "stxvw4x 45, %9, %1 \n\t"
- "stxvw4x 46, %10, %1 \n\t"
- "stxvw4x 47, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 41, %5, %2 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 42, %6, %2 \n\t"
+ "lxvw4x 43, %7, %2 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 44, %8, %2 \n\t"
+ "lxvw4x 45, %9, %2 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n\t"
+ "lxvw4x 46, %10, %2 \n\t"
+ "lxvw4x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvw4x 40, 0, %3 \n\t"
+ "stxvw4x 41, %5, %3 \n\t"
+ "stxvw4x 42, %6, %3 \n\t"
+ "stxvw4x 43, %7, %3 \n\t"
+ "stxvw4x 44, %8, %3 \n\t"
+ "stxvw4x 45, %9, %3 \n\t"
+ "stxvw4x 46, %10, %3 \n\t"
+ "stxvw4x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/sdot.c b/kernel/power/sdot.c
index 52fb1fe..31f4734 100644
--- a/kernel/power/sdot.c
+++ b/kernel/power/sdot.c
@@ -42,7 +42,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
+static FLOAT sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y)
{
BLASLONG register i = 0;
FLOAT dot = 0.0;
@@ -61,8 +61,7 @@ static void sdot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
i+=8 ;
}
- *d += dot;
-
+ return dot;
}
#endif
@@ -82,8 +81,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y)
BLASLONG n1 = n & -32;
if ( n1 )
- sdot_kernel_16(n1, x, y , &dot );
-
+ dot = sdot_kernel_16(n1, x, y);
i = n1;
while(i < n)
diff --git a/kernel/power/sdot_microk_power8.c b/kernel/power/sdot_microk_power8.c
index 6dd588a..7f7ccfa 100644
--- a/kernel/power/sdot_microk_power8.c
+++ b/kernel/power/sdot_microk_power8.c
@@ -34,146 +34,142 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_16 1
-static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void sdot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static float sdot_kernel_16 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- FLOAT tempdot[4];
-
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 52, %8, %3 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 53, %9, %3 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 54, %10, %3 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
- "lxvw4x 55, %11, %3 \n\t"
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %12 \n\t"
- "dcbt %3, %12 \n\t"
-
- "xvmaddasp 32, 40, 48 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 48, 0, %3 \n\t"
- "xvmaddasp 33, 41, 49 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 49, %5, %3 \n\t"
- "xvmaddasp 34, 42, 50 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 50, %6, %3 \n\t"
- "xvmaddasp 35, 43, 51 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 51, %7, %3 \n\t"
- "xvmaddasp 36, 44, 52 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 52, %8, %3 \n\t"
- "xvmaddasp 37, 45, 53 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 53, %9, %3 \n\t"
- "xvmaddasp 38, 46, 54 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 54, %10, %3 \n\t"
- "xvmaddasp 39, 47, 55 \n\t"
-
- "lxvw4x 47, %11, %2 \n\t"
- "lxvw4x 55, %11, %3 \n\t"
-
-
- "addi %2, %2, 128 \n\t"
- "addi %3, %3, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddasp 32, 40, 48 \n\t"
- "xvmaddasp 33, 41, 49 \n\t"
- "xvmaddasp 34, 42, 50 \n\t"
- "xvmaddasp 35, 43, 51 \n\t"
- "xvmaddasp 36, 44, 52 \n\t"
- "xvmaddasp 37, 45, 53 \n\t"
- "xvmaddasp 38, 46, 54 \n\t"
- "xvmaddasp 39, 47, 55 \n\t"
-
- "xvaddsp 32, 32 , 33 \n\t"
- "xvaddsp 34, 34 , 35 \n\t"
- "xvaddsp 36, 36 , 37 \n\t"
- "xvaddsp 38, 38 , 39 \n\t"
-
- "xvaddsp 32, 32 , 34 \n\t"
- "xvaddsp 36, 36 , 38 \n\t"
-
- "xvaddsp 32, 32 , 36 \n\t"
-
- "stxvw4x 32, 0 , %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (tempdot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112), // 11
- "r" (pre) // 12
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
- *dot = tempdot[0] + tempdot[1] + tempdot[2] + tempdot[3];
-
-
-}
-
-
+ float dot;
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "lxvw4x 41, %10, %2 \n\t"
+ "lxvw4x 49, %10, %3 \n\t"
+ "lxvw4x 42, %11, %2 \n\t"
+ "lxvw4x 50, %11, %3 \n\t"
+ "lxvw4x 43, %12, %2 \n\t"
+ "lxvw4x 51, %12, %3 \n\t"
+ "lxvw4x 44, %13, %2 \n\t"
+ "lxvw4x %x4, %13, %3 \n\t"
+ "lxvw4x 45, %14, %2 \n\t"
+ "lxvw4x %x5, %14, %3 \n\t"
+ "lxvw4x 46, %15, %2 \n\t"
+ "lxvw4x %x6, %15, %3 \n\t"
+ "lxvw4x 47, %16, %2 \n\t"
+ "lxvw4x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "lxvw4x 40, 0, %2 \n\t"
+ "lxvw4x 48, 0, %3 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "lxvw4x 41, %10, %2 \n\t"
+ "lxvw4x 49, %10, %3 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "lxvw4x 42, %11, %2 \n\t"
+ "lxvw4x 50, %11, %3 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "lxvw4x 43, %12, %2 \n\t"
+ "lxvw4x 51, %12, %3 \n\t"
+ "xvmaddasp 36, 44, %x4 \n\t"
+ "lxvw4x 44, %13, %2 \n\t"
+ "lxvw4x %x4, %13, %3 \n\t"
+ "xvmaddasp 37, 45, %x5 \n\t"
+ "lxvw4x 45, %14, %2 \n\t"
+ "lxvw4x %x5, %14, %3 \n\t"
+ "xvmaddasp 38, 46, %x6 \n\t"
+ "lxvw4x 46, %15, %2 \n\t"
+ "lxvw4x %x6, %15, %3 \n\t"
+ "xvmaddasp 39, 47, %x7 \n\t"
+ "lxvw4x 47, %16, %2 \n\t"
+ "lxvw4x %x7, %16, %3 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+ "addi %3, %3, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddasp 32, 40, 48 \n\t"
+ "xvmaddasp 33, 41, 49 \n\t"
+ "xvmaddasp 34, 42, 50 \n\t"
+ "xvmaddasp 35, 43, 51 \n\t"
+ "xvmaddasp 36, 44, %x4 \n\t"
+ "xvmaddasp 37, 45, %x5 \n\t"
+ "xvmaddasp 38, 46, %x6 \n\t"
+ "xvmaddasp 39, 47, %x7 \n\t"
+
+ "xvaddsp 32, 32, 33 \n\t"
+ "xvaddsp 34, 34, 35 \n\t"
+ "xvaddsp 36, 36, 37 \n\t"
+ "xvaddsp 38, 38, 39 \n\t"
+
+ "xvaddsp 32, 32, 34 \n\t"
+ "xvaddsp 36, 36, 38 \n\t"
+
+ "xvaddsp 32, 32, 36 \n\t"
+
+ "xxsldwi 33, 32, 32, 2 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xxsldwi 33, 32, 32, 1 \n\t"
+ "xvaddsp 32, 32, 33 \n\t"
+
+ "xscvspdp %x0, 32 \n"
+
+     "#dot=%0 n=%1 x=%8=%2 y=%9=%3 o16=%10 o32=%11 o48=%12 o64=%13 o80=%14 o96=%15 o112=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7"
+ :
+ "=f" (dot), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3) // 7
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (16), // 10
+ "b" (32), // 11
+ "b" (48), // 12
+ "b" (64), // 13
+ "b" (80), // 14
+ "b" (96), // 15
+ "b" (112) // 16
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return dot;
+}
diff --git a/kernel/power/srot.c b/kernel/power/srot.c
index d464846..d2910ff 100644
--- a/kernel/power/srot.c
+++ b/kernel/power/srot.c
@@ -46,7 +46,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_16
-static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT c, FLOAT s)
{
BLASLONG i=0;
@@ -56,8 +56,6 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
FLOAT y00, y01, y02, y03;
FLOAT *x1=x;
FLOAT *y1=y;
- FLOAT c1=*c;
- FLOAT s1=*s;
while ( i<n )
{
@@ -71,14 +69,14 @@ static void srot_kernel_16(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
x03 = x1[3];
y03 = y1[3];
- f0 = c1*x00 + s1*y00;
- g0 = c1*y00 - s1*x00;
- f1 = c1*x01 + s1*y01;
- g1 = c1*y01 - s1*x01;
- f2 = c1*x02 + s1*y02;
- g2 = c1*y02 - s1*x02;
- f3 = c1*x03 + s1*y03;
- g3 = c1*y03 - s1*x03;
+ f0 = c*x00 + s*y00;
+ g0 = c*y00 - s*x00;
+ f1 = c*x01 + s*y01;
+ g1 = c*y01 - s*x01;
+ f2 = c*x02 + s*y02;
+ g2 = c*y02 - s*x02;
+ f3 = c*x03 + s*y03;
+ g3 = c*y03 - s*x03;
x1[0] = f0;
y1[0] = g0;
@@ -106,8 +104,6 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT c1[4] __attribute__ ((aligned (16)));;
- FLOAT s1[4] __attribute__ ((aligned (16)));;
FLOAT *x1=x;
FLOAT *y1=y;
FLOAT temp;
@@ -120,15 +116,7 @@ int CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT
BLASLONG n1 = n & -16;
if ( n1 > 0 )
{
- c1[0]=c;
- c1[1]=c;
- c1[2]=c;
- c1[3]=c;
- s1[0]=s;
- s1[1]=s;
- s1[2]=s;
- s1[3]=s;
- srot_kernel_16(n1, x1, y1, c1, s1);
+ srot_kernel_16(n1, x1, y1, c, s);
i=n1;
}
diff --git a/kernel/power/srot_microk_power8.c b/kernel/power/srot_microk_power8.c
index ade6550..0a18c16 100644
--- a/kernel/power/srot_microk_power8.c
+++ b/kernel/power/srot_microk_power8.c
@@ -38,171 +38,179 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s) __attribute__ ((noinline));
-
-static void srot_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *c, FLOAT *s)
+static void srot_kernel_16 (long n, float *x, float *y, float c, float s)
{
+ __vector float t0;
+ __vector float t1;
+ __vector float t2;
+ __vector float t3;
+ __vector float t4;
+ __vector float t5;
+ __vector float t6;
+ __vector float t7;
+ __asm__
+ (
+ "xscvdpspn 36, %x13 \n\t" // load c to all words
+ "xxspltw 36, 36, 0 \n\t"
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 36 , 0, %3 \n\t" // load c
- "lxvw4x 37 , 0, %4 \n\t" // load s
- "addi %8 , %8, -4 \n\t"
- "addi %9 , %9, -4 \n\t"
-
- "lxvw4x 32, 0, %1 \n\t" // load x
- "lxvw4x 33, %5, %1 \n\t"
- "lxvw4x 34, %6, %1 \n\t"
- "lxvw4x 35, %7, %1 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t" // load y
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "xvmulsp 48, 32, 36 \n\t" // c * x
- "xvmulsp 49, 33, 36 \n\t"
- "xvmulsp 50, 34, 36 \n\t"
- "xvmulsp 51, 35, 36 \n\t"
-
- "xvmulsp 56, 40, 36 \n\t" // c * y
- "xvmulsp 57, 41, 36 \n\t"
- "xvmulsp 58, 42, 36 \n\t"
- "xvmulsp 59, 43, 36 \n\t"
-
- "xvmulsp 52, 32, 37 \n\t" // s * x
- "xvmulsp 53, 33, 37 \n\t"
-
- "lxvw4x 32, 0, %1 \n\t" // load x
- "lxvw4x 33, %5, %1 \n\t"
-
- "xvmulsp 54, 34, 37 \n\t"
- "xvmulsp 55, 35, 37 \n\t"
-
- "lxvw4x 34, %6, %1 \n\t"
- "lxvw4x 35, %7, %1 \n\t"
-
- "xvmulsp 60, 40, 37 \n\t" // s * y
- "xvmulsp 61, 41, 37 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t" // load y
- "lxvw4x 41, %5, %2 \n\t"
-
- "xvmulsp 62, 42, 37 \n\t"
- "xvmulsp 63, 43, 37 \n\t"
-
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
-
- "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
- "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
-
- "addi %1, %1, 64 \n\t"
- "addi %2, %2, 64 \n\t"
-
- "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
- "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
-
- "stxvw4x 48, 0, %8 \n\t" // store x
- "stxvw4x 49, %5, %8 \n\t"
- "stxvw4x 50, %6, %8 \n\t"
- "stxvw4x 51, %7, %8 \n\t"
-
- "stxvw4x 56, 0, %9 \n\t" // store y
- "stxvw4x 57, %5, %9 \n\t"
- "stxvw4x 58, %6, %9 \n\t"
- "stxvw4x 59, %7, %9 \n\t"
-
- "addi %8, %8, 64 \n\t"
- "addi %9, %9, 64 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmulsp 48, 32, 36 \n\t" // c * x
- "xvmulsp 49, 33, 36 \n\t"
- "xvmulsp 50, 34, 36 \n\t"
- "xvmulsp 51, 35, 36 \n\t"
-
- "xvmulsp 56, 40, 36 \n\t" // c * y
- "xvmulsp 57, 41, 36 \n\t"
- "xvmulsp 58, 42, 36 \n\t"
- "xvmulsp 59, 43, 36 \n\t"
-
- "xvmulsp 52, 32, 37 \n\t" // s * x
- "xvmulsp 53, 33, 37 \n\t"
- "xvmulsp 54, 34, 37 \n\t"
- "xvmulsp 55, 35, 37 \n\t"
-
- "xvmulsp 60, 40, 37 \n\t" // s * y
- "xvmulsp 61, 41, 37 \n\t"
- "xvmulsp 62, 42, 37 \n\t"
- "xvmulsp 63, 43, 37 \n\t"
-
- "xvaddsp 48, 48 , 60 \n\t" // c * x + s * y
- "xvaddsp 49, 49 , 61 \n\t" // c * x + s * y
- "xvaddsp 50, 50 , 62 \n\t" // c * x + s * y
- "xvaddsp 51, 51 , 63 \n\t" // c * x + s * y
-
- "xvsubsp 56, 56 , 52 \n\t" // c * y - s * x
- "xvsubsp 57, 57 , 53 \n\t" // c * y - s * x
- "xvsubsp 58, 58 , 54 \n\t" // c * y - s * x
- "xvsubsp 59, 59 , 55 \n\t" // c * y - s * x
+ "xscvdpspn 37, %x14 \n\t" // load s to all words
+ "xxspltw 37, 37, 0 \n\t"
- "stxvw4x 48, 0, %8 \n\t" // store x
- "stxvw4x 49, %5, %8 \n\t"
- "stxvw4x 50, %6, %8 \n\t"
- "stxvw4x 51, %7, %8 \n\t"
+ "lxvw4x 32, 0, %3 \n\t" // load x
+ "lxvw4x 33, %15, %3 \n\t"
+ "lxvw4x 34, %16, %3 \n\t"
+ "lxvw4x 35, %17, %3 \n\t"
- "stxvw4x 56, 0, %9 \n\t" // store y
- "stxvw4x 57, %5, %9 \n\t"
- "stxvw4x 58, %6, %9 \n\t"
- "stxvw4x 59, %7, %9 \n\t"
+ "lxvw4x 48, 0, %4 \n\t" // load y
+ "lxvw4x 49, %15, %4 \n\t"
+ "lxvw4x 50, %16, %4 \n\t"
+ "lxvw4x 51, %17, %4 \n\t"
+ "addi %3, %3, 64 \n\t"
+ "addi %4, %4, 64 \n\t"
+ "addic. %2, %2, -16 \n\t"
+ "ble 2f \n\t"
- :
- :
- "r" (i), // 0
- "r" (x1), // 1
- "r" (y1), // 2
- "r" (c), // 3
- "r" (s), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (x2), // 8
- "r" (y2) // 9
- : "cr0", "%0", "%1" , "%2", "%8", "%9", "memory"
- );
+ ".p2align 5 \n"
+ "1: \n\t"
-}
+ "xvmulsp 40, 32, 36 \n\t" // c * x
+ "xvmulsp 41, 33, 36 \n\t"
+ "xvmulsp 42, 34, 36 \n\t"
+ "xvmulsp 43, 35, 36 \n\t"
+ "xvmulsp %x5, 48, 36 \n\t" // c * y
+ "xvmulsp %x6, 49, 36 \n\t"
+ "xvmulsp %x7, 50, 36 \n\t"
+ "xvmulsp %x8, 51, 36 \n\t"
+ "xvmulsp 44, 32, 37 \n\t" // s * x
+ "xvmulsp 45, 33, 37 \n\t"
+
+ "lxvw4x 32, 0, %3 \n\t" // load x
+ "lxvw4x 33, %15, %3 \n\t"
+
+ "xvmulsp 46, 34, 37 \n\t"
+ "xvmulsp 47, 35, 37 \n\t"
+
+ "lxvw4x 34, %16, %3 \n\t"
+ "lxvw4x 35, %17, %3 \n\t"
+
+ "xvmulsp %x9, 48, 37 \n\t" // s * y
+ "xvmulsp %x10, 49, 37 \n\t"
+
+ "lxvw4x 48, 0, %4 \n\t" // load y
+ "lxvw4x 49, %15, %4 \n\t"
+
+ "xvmulsp %x11, 50, 37 \n\t"
+ "xvmulsp %x12, 51, 37 \n\t"
+
+ "lxvw4x 50, %16, %4 \n\t"
+ "lxvw4x 51, %17, %4 \n\t"
+
+ "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvw4x 40, 0, %3 \n\t" // store x
+ "stxvw4x 41, %15, %3 \n\t"
+ "stxvw4x 42, %16, %3 \n\t"
+ "stxvw4x 43, %17, %3 \n\t"
+
+ "stxvw4x %x5, 0, %4 \n\t" // store y
+ "stxvw4x %x6, %15, %4 \n\t"
+ "stxvw4x %x7, %16, %4 \n\t"
+ "stxvw4x %x8, %17, %4 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmulsp 40, 32, 36 \n\t" // c * x
+ "xvmulsp 41, 33, 36 \n\t"
+ "xvmulsp 42, 34, 36 \n\t"
+ "xvmulsp 43, 35, 36 \n\t"
+
+ "xvmulsp %x5, 48, 36 \n\t" // c * y
+ "xvmulsp %x6, 49, 36 \n\t"
+ "xvmulsp %x7, 50, 36 \n\t"
+ "xvmulsp %x8, 51, 36 \n\t"
+
+ "xvmulsp 44, 32, 37 \n\t" // s * x
+ "xvmulsp 45, 33, 37 \n\t"
+ "xvmulsp 46, 34, 37 \n\t"
+ "xvmulsp 47, 35, 37 \n\t"
+
+ "xvmulsp %x9, 48, 37 \n\t" // s * y
+ "xvmulsp %x10, 49, 37 \n\t"
+ "xvmulsp %x11, 50, 37 \n\t"
+ "xvmulsp %x12, 51, 37 \n\t"
+
+ "addi %3, %3, -64 \n\t"
+ "addi %4, %4, -64 \n\t"
+
+ "xvaddsp 40, 40, %x9 \n\t" // c * x + s * y
+ "xvaddsp 41, 41, %x10 \n\t" // c * x + s * y
+ "xvaddsp 42, 42, %x11 \n\t" // c * x + s * y
+ "xvaddsp 43, 43, %x12 \n\t" // c * x + s * y
+
+ "xvsubsp %x5, %x5, 44 \n\t" // c * y - s * x
+ "xvsubsp %x6, %x6, 45 \n\t" // c * y - s * x
+ "xvsubsp %x7, %x7, 46 \n\t" // c * y - s * x
+ "xvsubsp %x8, %x8, 47 \n\t" // c * y - s * x
+
+ "stxvw4x 40, 0, %3 \n\t" // store x
+ "stxvw4x 41, %15, %3 \n\t"
+ "stxvw4x 42, %16, %3 \n\t"
+ "stxvw4x 43, %17, %3 \n\t"
+
+ "stxvw4x %x5, 0, %4 \n\t" // store y
+ "stxvw4x %x6, %15, %4 \n\t"
+ "stxvw4x %x7, %16, %4 \n\t"
+ "stxvw4x %x8, %17, %4 \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 c=%13 s=%14 o16=%15 o32=%16 o48=%17\n"
+ "#t0=%x5 t1=%x6 t2=%x7 t3=%x8 t4=%x9 t5=%x10 t6=%x11 t7=%x12"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y), // 4
+ "=wa" (t0), // 5
+ "=wa" (t1), // 6
+ "=wa" (t2), // 7
+ "=wa" (t3), // 8
+ "=wa" (t4), // 9
+ "=wa" (t5), // 10
+ "=wa" (t6), // 11
+ "=wa" (t7) // 12
+ :
+ "f" (c), // 13
+ "f" (s), // 14
+ "b" (16), // 15
+ "b" (32), // 16
+ "b" (48) // 17
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/sscal.c b/kernel/power/sscal.c
index c6ef5e9..bd5cdc4 100644
--- a/kernel/power/sscal.c
+++ b/kernel/power/sscal.c
@@ -42,11 +42,10 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#if !defined(HAVE_KERNEL_16)
-static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
+static void sscal_kernel_16 (BLASLONG n, FLOAT *x, FLOAT alpha)
{
BLASLONG i;
- FLOAT alpha = *da;
for( i=0; i<n; i+=8 )
{
@@ -63,7 +62,7 @@ static void sscal_kernel_16( BLASLONG n, FLOAT *da , FLOAT *x )
}
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
+static void sscal_kernel_16_zero( BLASLONG n, FLOAT *x )
{
BLASLONG i;
@@ -90,7 +89,6 @@ static void sscal_kernel_16_zero( BLASLONG n, FLOAT *da , FLOAT *x )
int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *dummy, BLASLONG dummy2)
{
BLASLONG i=0,j=0;
- FLOAT alpha[4] __attribute__ ((aligned (16)));;
if ( n <= 0 || inc_x <=0 )
return(0);
@@ -105,11 +103,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
- alpha[0]=da;
- alpha[1]=da;
- alpha[2]=da;
- alpha[3]=da;
- sscal_kernel_16_zero(n1 , alpha , x);
+ sscal_kernel_16_zero(n1, x);
j=n1;
}
@@ -127,11 +121,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da, FLOAT *x, BLAS
BLASLONG n1 = n & -32;
if ( n1 > 0 )
{
- alpha[0]=da;
- alpha[1]=da;
- alpha[2]=da;
- alpha[3]=da;
- sscal_kernel_16(n1 , alpha , x);
+ sscal_kernel_16(n1, x, da);
j=n1;
}
while(j < n)
diff --git a/kernel/power/sscal_microk_power8.c b/kernel/power/sscal_microk_power8.c
index 963cec7..49862a3 100644
--- a/kernel/power/sscal_microk_power8.c
+++ b/kernel/power/sscal_microk_power8.c
@@ -35,184 +35,150 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void sscal_kernel_16( BLASLONG n, FLOAT *alpha, FLOAT *x)
+static void sscal_kernel_16 (long n, float *x, float alpha)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxvw4x 32, 0, %3 \n\t"
- "addi %1, %1, -4 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmulsp 48, 40, 32 \n\t"
- "xvmulsp 49, 41, 32 \n\t"
- "lxvw4x 40, 0, %2 \n\t"
- "lxvw4x 41, %5, %2 \n\t"
- "xvmulsp 50, 42, 32 \n\t"
- "xvmulsp 51, 43, 32 \n\t"
- "lxvw4x 42, %6, %2 \n\t"
- "lxvw4x 43, %7, %2 \n\t"
- "xvmulsp 52, 44, 32 \n\t"
- "xvmulsp 53, 45, 32 \n\t"
- "lxvw4x 44, %8, %2 \n\t"
- "lxvw4x 45, %9, %2 \n\t"
- "xvmulsp 54, 46, 32 \n\t"
- "xvmulsp 55, 47, 32 \n\t"
- "lxvw4x 46, %10, %2 \n\t"
- "lxvw4x 47, %11, %2 \n\t"
-
- "stxvw4x 48, 0, %1 \n\t"
- "stxvw4x 49, %5, %1 \n\t"
- "stxvw4x 50, %6, %1 \n\t"
- "stxvw4x 51, %7, %1 \n\t"
- "stxvw4x 52, %8, %1 \n\t"
- "stxvw4x 53, %9, %1 \n\t"
- "stxvw4x 54, %10, %1 \n\t"
- "stxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmulsp 48, 40, 32 \n\t"
- "xvmulsp 49, 41, 32 \n\t"
- "xvmulsp 50, 42, 32 \n\t"
- "xvmulsp 51, 43, 32 \n\t"
- "xvmulsp 52, 44, 32 \n\t"
- "xvmulsp 53, 45, 32 \n\t"
- "xvmulsp 54, 46, 32 \n\t"
- "xvmulsp 55, 47, 32 \n\t"
-
- "stxvw4x 48, 0, %1 \n\t"
- "stxvw4x 49, %5, %1 \n\t"
- "stxvw4x 50, %6, %1 \n\t"
- "stxvw4x 51, %7, %1 \n\t"
- "stxvw4x 52, %8, %1 \n\t"
- "stxvw4x 53, %9, %1 \n\t"
- "stxvw4x 54, %10, %1 \n\t"
- "stxvw4x 55, %11, %1 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x) __attribute__ ((noinline));
-
-static void sscal_kernel_16_zero( BLASLONG n, FLOAT *alpha, FLOAT *x)
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xscvdpspn %x3, %x3 \n\t"
+ "xxspltw %x3, %x3, 0 \n\t"
+
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %4, %2 \n\t"
+ "lxvw4x 34, %5, %2 \n\t"
+ "lxvw4x 35, %6, %2 \n\t"
+ "lxvw4x 36, %7, %2 \n\t"
+ "lxvw4x 37, %8, %2 \n\t"
+ "lxvw4x 38, %9, %2 \n\t"
+ "lxvw4x 39, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmulsp 40, 32, %x3 \n\t"
+ "xvmulsp 41, 33, %x3 \n\t"
+ "lxvw4x 32, 0, %2 \n\t"
+ "lxvw4x 33, %4, %2 \n\t"
+ "xvmulsp 42, 34, %x3 \n\t"
+ "xvmulsp 43, 35, %x3 \n\t"
+ "lxvw4x 34, %5, %2 \n\t"
+ "lxvw4x 35, %6, %2 \n\t"
+ "xvmulsp 44, 36, %x3 \n\t"
+ "xvmulsp 45, 37, %x3 \n\t"
+ "lxvw4x 36, %7, %2 \n\t"
+ "lxvw4x 37, %8, %2 \n\t"
+ "xvmulsp 46, 38, %x3 \n\t"
+ "xvmulsp 47, 39, %x3 \n\t"
+ "lxvw4x 38, %9, %2 \n\t"
+ "lxvw4x 39, %10, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "stxvw4x 40, 0, %2 \n\t"
+ "stxvw4x 41, %4, %2 \n\t"
+ "stxvw4x 42, %5, %2 \n\t"
+ "stxvw4x 43, %6, %2 \n\t"
+ "stxvw4x 44, %7, %2 \n\t"
+ "stxvw4x 45, %8, %2 \n\t"
+ "stxvw4x 46, %9, %2 \n\t"
+ "stxvw4x 47, %10, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmulsp 40, 32, %x3 \n\t"
+ "xvmulsp 41, 33, %x3 \n\t"
+ "xvmulsp 42, 34, %x3 \n\t"
+ "xvmulsp 43, 35, %x3 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmulsp 44, 36, %x3 \n\t"
+ "xvmulsp 45, 37, %x3 \n\t"
+ "xvmulsp 46, 38, %x3 \n\t"
+ "xvmulsp 47, 39, %x3 \n\t"
+
+ "stxvw4x 40, 0, %2 \n\t"
+ "stxvw4x 41, %4, %2 \n\t"
+ "stxvw4x 42, %5, %2 \n\t"
+ "stxvw4x 43, %6, %2 \n\t"
+ "stxvw4x 44, %7, %2 \n\t"
+ "stxvw4x 45, %8, %2 \n\t"
+ "stxvw4x 46, %9, %2 \n\t"
+ "stxvw4x 47, %10, %2 \n"
+
+ "#n=%1 alpha=%3 x=%0=%2 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+f" (alpha) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
+
+
+static void sscal_kernel_16_zero (long n, float *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "xxlxor 32 , 32 , 32 \n\t"
- "addi %1, %1, -4 \n\t"
-
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvw4x 32, 0, %1 \n\t"
- "stxvw4x 32, %5, %1 \n\t"
- "stxvw4x 32, %6, %1 \n\t"
- "stxvw4x 32, %7, %1 \n\t"
- "stxvw4x 32, %8, %1 \n\t"
- "stxvw4x 32, %9, %1 \n\t"
- "stxvw4x 32, %10, %1 \n\t"
- "stxvw4x 32, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector float t0;
+
+ __asm__
+ (
+ "xxlxor %x3, %x3, %x3 \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvw4x %x3, 0, %2 \n\t"
+ "stxvw4x %x3, %4, %2 \n\t"
+ "stxvw4x %x3, %5, %2 \n\t"
+ "stxvw4x %x3, %6, %2 \n\t"
+ "stxvw4x %x3, %7, %2 \n\t"
+ "stxvw4x %x3, %8, %2 \n\t"
+ "stxvw4x %x3, %9, %2 \n\t"
+ "stxvw4x %x3, %10, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%1 x=%0=%2 t0=%x3 o16=%4 o32=%5 o48=%6 o64=%7 o80=%8 o96=%9 o112=%10"
+ :
+ "=m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0) // 3
+ :
+ "b" (16), // 4
+ "b" (32), // 5
+ "b" (48), // 6
+ "b" (64), // 7
+ "b" (80), // 8
+ "b" (96), // 9
+ "b" (112) // 10
+ :
+ "cr0"
+ );
+}
diff --git a/kernel/power/sswap_microk_power8.c b/kernel/power/sswap_microk_power8.c
index c48e743..d44f167 100644
--- a/kernel/power/sswap_microk_power8.c
+++ b/kernel/power/sswap_microk_power8.c
@@ -35,102 +35,74 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_32 1
-static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void sswap_kernel_32( BLASLONG n, FLOAT *x, FLOAT *y)
+static void sswap_kernel_32 (long n, float *x, float *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -4 \n\t"
- "addi %4, %4, -4 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvw4x 32, 0, %2 \n\t"
- "lxvw4x 33, %5, %2 \n\t"
- "lxvw4x 34, %6, %2 \n\t"
- "lxvw4x 35, %7, %2 \n\t"
- "lxvw4x 36, %8, %2 \n\t"
- "lxvw4x 37, %9, %2 \n\t"
- "lxvw4x 38, %10, %2 \n\t"
- "lxvw4x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvw4x 48, 0, %1 \n\t"
- "lxvw4x 49, %5, %1 \n\t"
- "lxvw4x 50, %6, %1 \n\t"
- "lxvw4x 51, %7, %1 \n\t"
- "lxvw4x 52, %8, %1 \n\t"
- "lxvw4x 53, %9, %1 \n\t"
- "lxvw4x 54, %10, %1 \n\t"
- "lxvw4x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvw4x 32, 0, %3 \n\t"
- "stxvw4x 33, %5, %3 \n\t"
- "stxvw4x 34, %6, %3 \n\t"
- "stxvw4x 35, %7, %3 \n\t"
- "stxvw4x 36, %8, %3 \n\t"
- "stxvw4x 37, %9, %3 \n\t"
- "stxvw4x 38, %10, %3 \n\t"
- "stxvw4x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvw4x 48, 0, %4 \n\t"
- "stxvw4x 49, %5, %4 \n\t"
- "stxvw4x 50, %6, %4 \n\t"
- "stxvw4x 51, %7, %4 \n\t"
- "stxvw4x 52, %8, %4 \n\t"
- "stxvw4x 53, %9, %4 \n\t"
- "stxvw4x 54, %10, %4 \n\t"
- "stxvw4x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -32 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "lxvw4x 32, 0, %4 \n\t"
+ "lxvw4x 33, %5, %4 \n\t"
+ "lxvw4x 34, %6, %4 \n\t"
+ "lxvw4x 35, %7, %4 \n\t"
+ "lxvw4x 36, %8, %4 \n\t"
+ "lxvw4x 37, %9, %4 \n\t"
+ "lxvw4x 38, %10, %4 \n\t"
+ "lxvw4x 39, %11, %4 \n\t"
+
+ "lxvw4x 40, 0, %3 \n\t"
+ "lxvw4x 41, %5, %3 \n\t"
+ "lxvw4x 42, %6, %3 \n\t"
+ "lxvw4x 43, %7, %3 \n\t"
+ "lxvw4x 44, %8, %3 \n\t"
+ "lxvw4x 45, %9, %3 \n\t"
+ "lxvw4x 46, %10, %3 \n\t"
+ "lxvw4x 47, %11, %3 \n\t"
+
+ "stxvw4x 32, 0, %3 \n\t"
+ "stxvw4x 33, %5, %3 \n\t"
+ "stxvw4x 34, %6, %3 \n\t"
+ "stxvw4x 35, %7, %3 \n\t"
+ "stxvw4x 36, %8, %3 \n\t"
+ "stxvw4x 37, %9, %3 \n\t"
+ "stxvw4x 38, %10, %3 \n\t"
+ "stxvw4x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvw4x 40, 0, %4 \n\t"
+ "stxvw4x 41, %5, %4 \n\t"
+ "stxvw4x 42, %6, %4 \n\t"
+ "stxvw4x 43, %7, %4 \n\t"
+ "stxvw4x 44, %8, %4 \n\t"
+ "stxvw4x 45, %9, %4 \n\t"
+ "stxvw4x 46, %10, %4 \n\t"
+ "stxvw4x 47, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "addic. %2, %2, -32 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/zasum.c b/kernel/power/zasum.c
index abd6ec0..0b6b87d 100644
--- a/kernel/power/zasum.c
+++ b/kernel/power/zasum.c
@@ -53,7 +53,7 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
+static FLOAT zasum_kernel_8(BLASLONG n, FLOAT *x1)
{
BLASLONG i=0;
@@ -92,9 +92,7 @@ static void zasum_kernel_8(BLASLONG n, FLOAT *x1, FLOAT *svec)
}
- svec[0] = sum0+sum1+sum2+sum3;
- svec[1] = 0.0;
-
+ return sum0+sum1+sum2+sum3;
}
#endif
@@ -104,7 +102,6 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
BLASLONG i=0;
BLASLONG ip=0;
FLOAT sumf = 0.0;
- FLOAT svec[2] __attribute__ ((aligned (16)));;
BLASLONG n1;
BLASLONG inc_x2;
@@ -117,8 +114,7 @@ FLOAT CNAME(BLASLONG n, FLOAT *x, BLASLONG inc_x)
if ( n1 > 0 )
{
- zasum_kernel_8(n1, x, svec);
- sumf = svec[0] + svec[1];
+ sumf = zasum_kernel_8(n1, x);
i=n1;
ip=2*n1;
}
diff --git a/kernel/power/zasum_microk_power8.c b/kernel/power/zasum_microk_power8.c
index b9f6c0a..8236690 100644
--- a/kernel/power/zasum_microk_power8.c
+++ b/kernel/power/zasum_microk_power8.c
@@ -34,144 +34,140 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec) __attribute__ ((noinline));
-static void zasum_kernel_8( BLASLONG n, FLOAT *x, FLOAT *svec)
+static double zasum_kernel_8 (long n, double *x)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "dcbt %2 , %4 \n\t"
-
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2 , %4 \n\t"
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "addi %2, %2, 128 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "addic. %0 , %0 , -8 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "bgt 1b \n\t"
-
- "2: \n\t"
-
-
- "xvabsdp 48, 40 \n\t"
- "xvabsdp 49, 41 \n\t"
- "xvabsdp 50, 42 \n\t"
- "xvabsdp 51, 43 \n\t"
- "xvabsdp 52, 44 \n\t"
- "xvabsdp 53, 45 \n\t"
- "xvabsdp 54, 46 \n\t"
- "xvabsdp 55, 47 \n\t"
-
- "xvadddp 32, 32, 48 \n\t"
- "xvadddp 33, 33, 49 \n\t"
- "xvadddp 34, 34, 50 \n\t"
- "xvadddp 35, 35, 51 \n\t"
- "xvadddp 36, 36, 52 \n\t"
- "xvadddp 37, 37, 53 \n\t"
- "xvadddp 38, 38, 54 \n\t"
- "xvadddp 39, 39, 55 \n\t"
-
- "xvadddp 32, 32, 33 \n\t"
- "xvadddp 34, 34, 35 \n\t"
- "xvadddp 36, 36, 37 \n\t"
- "xvadddp 38, 38, 39 \n\t"
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
-
-
- "stxvd2x 32, 0, %3 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (svec), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2", "memory"
- );
-
-}
-
-
+ double sum;
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %8, %2 \n\t"
+
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+
+ "lxvd2x 42, %9, %2 \n\t"
+ "lxvd2x 43, %10, %2 \n\t"
+
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "lxvd2x 44, %11, %2 \n\t"
+ "lxvd2x 45, %12, %2 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+
+ "lxvd2x 46, %13, %2 \n\t"
+ "lxvd2x 47, %14, %2 \n\t"
+
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "addi %2, %2, 128 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "addic. %1, %1, -8 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvabsdp 48, 40 \n\t"
+ "xvabsdp 49, 41 \n\t"
+ "xvabsdp 50, 42 \n\t"
+ "xvabsdp 51, 43 \n\t"
+ "xvabsdp %x3, 44 \n\t"
+ "xvabsdp %x4, 45 \n\t"
+ "xvabsdp %x5, 46 \n\t"
+ "xvabsdp %x6, 47 \n\t"
+
+ "xvadddp 32, 32, 48 \n\t"
+ "xvadddp 33, 33, 49 \n\t"
+ "xvadddp 34, 34, 50 \n\t"
+ "xvadddp 35, 35, 51 \n\t"
+ "xvadddp 36, 36, %x3 \n\t"
+ "xvadddp 37, 37, %x4 \n\t"
+ "xvadddp 38, 38, %x5 \n\t"
+ "xvadddp 39, 39, %x6 \n\t"
+
+ "xvadddp 32, 32, 33 \n\t"
+ "xvadddp 34, 34, 35 \n\t"
+ "xvadddp 36, 36, 37 \n\t"
+ "xvadddp 38, 38, 39 \n\t"
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+
+ "xxswapd 33, 32 \n\t"
+ "xsadddp %x0, 32, 33 \n"
+
+ "#n=%1 x=%3=%2 sum=%0 o16=%8 o32=%9 o48=%10 o64=%11 o80=%12 o96=%13 o112=%14\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6"
+ :
+ "=d" (sum), // 0
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3) // 6
+ :
+ "m" (*x),
+ "b" (16), // 8
+ "b" (32), // 9
+ "b" (48), // 10
+ "b" (64), // 11
+ "b" (80), // 12
+ "b" (96), // 13
+ "b" (112) // 14
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+
+ return sum;
+}
diff --git a/kernel/power/zaxpy.c b/kernel/power/zaxpy.c
index 0ee0c1b..dd7ab6c 100644
--- a/kernel/power/zaxpy.c
+++ b/kernel/power/zaxpy.c
@@ -78,7 +78,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
{
BLASLONG i=0;
BLASLONG ix=0,iy=0;
- FLOAT da[4];
if ( n <= 0 ) return(0);
@@ -89,11 +88,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r, FLOAT da_i,
if ( n1 )
{
- da[0] = da_r;
- da[1] = da_r;
- da[2] = da_i;
- da[3] = da_i;
- zaxpy_kernel_4(n1, x, y , da );
+ zaxpy_kernel_4 (n1, x, y, da_r, da_i);
ix = 2 * n1;
}
i = n1;
diff --git a/kernel/power/zaxpy_microk_power8.c b/kernel/power/zaxpy_microk_power8.c
index c8a529f..124614f 100644
--- a/kernel/power/zaxpy_microk_power8.c
+++ b/kernel/power/zaxpy_microk_power8.c
@@ -35,216 +35,225 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_4 1
-static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *alpha) __attribute__ ((noinline));
-
-static void zaxpy_kernel_4( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *alpha)
+static void zaxpy_kernel_4 (long n, double *x, double *y,
+ double alpha_r, double alpha_i)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
-
#if !defined(CONJ)
- FLOAT mvec[2] = { -1.0, 1.0 };
+ static const double mvec[2] = { -1.0, 1.0 };
#else
- FLOAT mvec[2] = { 1.0, -1.0 };
+ static const double mvec[2] = { 1.0, -1.0 };
#endif
-
-
- __asm__ __volatile__
- (
-
- "lxsdx 34, 0 , %4 \n\t" // alpha_r
- "lxsdx 35, %5, %4 \n\t" // alpha_i
- "xxspltd 32, 34, 0 \n\t"
- "xxspltd 33, 35, 0 \n\t"
-
- "lxvd2x 36, 0, %9 \n\t" // mvec
+ const double *mvecp = mvec;
+
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+ long ytmp;
+
+ __asm__
+ (
+ "xxspltd 32, %x19, 0 \n\t" // alpha_r
+ "xxspltd 33, %x20, 0 \n\t" // alpha_i
+
+ "lxvd2x 36, 0, %21 \n\t" // mvec
#if !defined(CONJ)
- "xvmuldp 33, 33 , 36 \n\t" // alpha_i * mvec
+ "xvmuldp 33, 33, 36 \n\t" // alpha_i * mvec
#else
- "xvmuldp 32, 32 , 36 \n\t" // alpha_r * mvec
+ "xvmuldp 32, 32, 36 \n\t" // alpha_r * mvec
#endif
- "addi %8, %8, -8 \n\t"
-
- "dcbt %2, %10 \n\t"
- "dcbt %3, %10 \n\t"
-
-
- "lxvd2x 40, 0, %2 \n\t" // x0
- "lxvd2x 41, %5, %2 \n\t" // x1
- "lxvd2x 42, %6, %2 \n\t" // x2
- "lxvd2x 43, %7, %2 \n\t" // x3
-
- "lxvd2x 48, 0, %3 \n\t" // y0
- "lxvd2x 49, %5, %3 \n\t" // y1
- "lxvd2x 50, %6, %3 \n\t" // y2
- "lxvd2x 51, %7, %3 \n\t" // y3
-
- "xxswapd 56, 40 \n\t" // exchange real and imag part
- "xxswapd 57, 41 \n\t" // exchange real and imag part
- "xxswapd 58, 42 \n\t" // exchange real and imag part
- "xxswapd 59, 43 \n\t" // exchange real and imag part
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "lxvd2x 44, 0, %2 \n\t" // x4
- "lxvd2x 45, %5, %2 \n\t" // x5
- "lxvd2x 46, %6, %2 \n\t" // x6
- "lxvd2x 47, %7, %2 \n\t" // x7
-
- "lxvd2x 52, 0, %3 \n\t" // y4
- "lxvd2x 53, %5, %3 \n\t" // y5
- "lxvd2x 54, %6, %3 \n\t" // y6
- "lxvd2x 55, %7, %3 \n\t" // y7
-
- "xxswapd 60, 44 \n\t" // exchange real and imag part
- "xxswapd 61, 45 \n\t" // exchange real and imag part
- "xxswapd 62, 46 \n\t" // exchange real and imag part
- "xxswapd 63, 47 \n\t" // exchange real and imag part
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %10 \n\t"
- "dcbt %3, %10 \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
- "xvmaddadp 49, 41, 32 \n\t"
- "lxvd2x 40, 0, %2 \n\t" // x0
- "lxvd2x 41, %5, %2 \n\t" // x1
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
- "lxvd2x 42, %6, %2 \n\t" // x2
- "lxvd2x 43, %7, %2 \n\t" // x3
-
- "xvmaddadp 52, 44, 32 \n\t"
- "addi %2, %2, 64 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "lxvd2x 44, 0, %2 \n\t" // x4
- "lxvd2x 45, %5, %2 \n\t" // x5
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
- "lxvd2x 46, %6, %2 \n\t" // x6
- "lxvd2x 47, %7, %2 \n\t" // x7
-
- "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
- "addi %2, %2, 64 \n\t"
- "xvmaddadp 49, 57, 33 \n\t"
- "xvmaddadp 50, 58, 33 \n\t"
- "xvmaddadp 51, 59, 33 \n\t"
-
- "xvmaddadp 52, 60, 33 \n\t"
- "xvmaddadp 53, 61, 33 \n\t"
- "xvmaddadp 54, 62, 33 \n\t"
- "xvmaddadp 55, 63, 33 \n\t"
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "xxswapd 56, 40 \n\t" // exchange real and imag part
- "xxswapd 57, 41 \n\t" // exchange real and imag part
- "lxvd2x 48, 0, %3 \n\t" // y0
- "lxvd2x 49, %5, %3 \n\t" // y1
- "xxswapd 58, 42 \n\t" // exchange real and imag part
- "xxswapd 59, 43 \n\t" // exchange real and imag part
- "lxvd2x 50, %6, %3 \n\t" // y2
- "lxvd2x 51, %7, %3 \n\t" // y3
-
- "xxswapd 60, 44 \n\t" // exchange real and imag part
- "addi %3, %3, 64 \n\t"
- "xxswapd 61, 45 \n\t" // exchange real and imag part
- "lxvd2x 52, 0, %3 \n\t" // y4
- "lxvd2x 53, %5, %3 \n\t" // y5
- "xxswapd 62, 46 \n\t" // exchange real and imag part
- "xxswapd 63, 47 \n\t" // exchange real and imag part
- "lxvd2x 54, %6, %3 \n\t" // y6
- "lxvd2x 55, %7, %3 \n\t" // y7
-
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
- "xvmaddadp 49, 41, 32 \n\t"
- "xvmaddadp 50, 42, 32 \n\t"
- "xvmaddadp 51, 43, 32 \n\t"
-
- "xvmaddadp 52, 44, 32 \n\t"
- "xvmaddadp 53, 45, 32 \n\t"
- "xvmaddadp 54, 46, 32 \n\t"
- "xvmaddadp 55, 47, 32 \n\t"
-
- "xvmaddadp 48, 56, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
- "xvmaddadp 49, 57, 33 \n\t"
- "xvmaddadp 50, 58, 33 \n\t"
- "xvmaddadp 51, 59, 33 \n\t"
-
- "xvmaddadp 52, 60, 33 \n\t"
- "xvmaddadp 53, 61, 33 \n\t"
- "xvmaddadp 54, 62, 33 \n\t"
- "xvmaddadp 55, 63, 33 \n\t"
-
-
- "stxvd2x 48, 0, %8 \n\t"
- "stxvd2x 49, %5, %8 \n\t"
- "stxvd2x 50, %6, %8 \n\t"
- "stxvd2x 51, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- "stxvd2x 52, 0, %8 \n\t"
- "stxvd2x 53, %5, %8 \n\t"
- "stxvd2x 54, %6, %8 \n\t"
- "stxvd2x 55, %7, %8 \n\t"
-
- "addi %8, %8, 64 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (alpha), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (y2), // 8
- "r" (mvec), // 9
- "r" (pre) // 10
- : "cr0", "%0", "%2" , "%3", "%8", "memory"
- );
-
-}
-
-
+ "mr %16, %3 \n\t"
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %22, %2 \n\t" // x1
+ "lxvd2x 42, %23, %2 \n\t" // x2
+ "lxvd2x 43, %24, %2 \n\t" // x3
+
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %22, %3 \n\t" // y1
+ "lxvd2x 50, %23, %3 \n\t" // y2
+ "lxvd2x 51, %24, %3 \n\t" // y3
+
+ "xxswapd %x8, 40 \n\t" // exchange real and imag part
+ "xxswapd %x9, 41 \n\t" // exchange real and imag part
+ "xxswapd %x10, 42 \n\t" // exchange real and imag part
+ "xxswapd %x11, 43 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %22, %2 \n\t" // x5
+ "lxvd2x 46, %23, %2 \n\t" // x6
+ "lxvd2x 47, %24, %2 \n\t" // x7
+
+ "lxvd2x %x4, 0, %3 \n\t" // y4
+ "lxvd2x %x5, %22, %3 \n\t" // y5
+ "lxvd2x %x6, %23, %3 \n\t" // y6
+ "lxvd2x %x7, %24, %3 \n\t" // y7
+
+ "xxswapd %x12, 44 \n\t" // exchange real and imag part
+ "xxswapd %x13, 45 \n\t" // exchange real and imag part
+ "xxswapd %x14, 46 \n\t" // exchange real and imag part
+ "xxswapd %x15, 47 \n\t" // exchange real and imag part
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "lxvd2x 40, 0, %2 \n\t" // x0
+ "lxvd2x 41, %22, %2 \n\t" // x1
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+ "lxvd2x 42, %23, %2 \n\t" // x2
+ "lxvd2x 43, %24, %2 \n\t" // x3
+
+ "xvmaddadp %x4, 44, 32 \n\t"
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp %x5, 45, 32 \n\t"
+ "lxvd2x 44, 0, %2 \n\t" // x4
+ "lxvd2x 45, %22, %2 \n\t" // x5
+ "xvmaddadp %x6, 46, 32 \n\t"
+ "xvmaddadp %x7, 47, 32 \n\t"
+ "lxvd2x 46, %23, %2 \n\t" // x6
+ "lxvd2x 47, %24, %2 \n\t" // x7
+
+ "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "addi %2, %2, 64 \n\t"
+ "xvmaddadp 49, %x9, 33 \n\t"
+ "xvmaddadp 50, %x10, 33 \n\t"
+ "xvmaddadp 51, %x11, 33 \n\t"
+
+ "xvmaddadp %x4, %x12, 33 \n\t"
+ "xvmaddadp %x5, %x13, 33 \n\t"
+ "xvmaddadp %x6, %x14, 33 \n\t"
+ "xvmaddadp %x7, %x15, 33 \n\t"
+
+ "stxvd2x 48, 0, %16 \n\t"
+ "stxvd2x 49, %22, %16 \n\t"
+ "stxvd2x 50, %23, %16 \n\t"
+ "stxvd2x 51, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "stxvd2x %x4, 0, %16 \n\t"
+ "stxvd2x %x5, %22, %16 \n\t"
+ "stxvd2x %x6, %23, %16 \n\t"
+ "stxvd2x %x7, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "xxswapd %x8, 40 \n\t" // exchange real and imag part
+ "xxswapd %x9, 41 \n\t" // exchange real and imag part
+ "lxvd2x 48, 0, %3 \n\t" // y0
+ "lxvd2x 49, %22, %3 \n\t" // y1
+ "xxswapd %x10, 42 \n\t" // exchange real and imag part
+ "xxswapd %x11, 43 \n\t" // exchange real and imag part
+ "lxvd2x 50, %23, %3 \n\t" // y2
+ "lxvd2x 51, %24, %3 \n\t" // y3
+
+ "xxswapd %x12, 44 \n\t" // exchange real and imag part
+ "addi %3, %3, 64 \n\t"
+ "xxswapd %x13, 45 \n\t" // exchange real and imag part
+ "lxvd2x %x4, 0, %3 \n\t" // y4
+ "lxvd2x %x5, %22, %3 \n\t" // y5
+ "xxswapd %x14, 46 \n\t" // exchange real and imag part
+ "xxswapd %x15, 47 \n\t" // exchange real and imag part
+ "lxvd2x %x6, %23, %3 \n\t" // y6
+ "lxvd2x %x7, %24, %3 \n\t" // y7
+
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 48, 40, 32 \n\t" // alpha_r * x0_r , alpha_r * x0_i
+ "xvmaddadp 49, 41, 32 \n\t"
+ "xvmaddadp 50, 42, 32 \n\t"
+ "xvmaddadp 51, 43, 32 \n\t"
+
+ "xvmaddadp %x4, 44, 32 \n\t"
+ "xvmaddadp %x5, 45, 32 \n\t"
+ "xvmaddadp %x6, 46, 32 \n\t"
+ "xvmaddadp %x7, 47, 32 \n\t"
+
+ "xvmaddadp 48, %x8, 33 \n\t" // alpha_i * x0_i , alpha_i * x0_r
+ "xvmaddadp 49, %x9, 33 \n\t"
+ "xvmaddadp 50, %x10, 33 \n\t"
+ "xvmaddadp 51, %x11, 33 \n\t"
+
+ "xvmaddadp %x4, %x12, 33 \n\t"
+ "xvmaddadp %x5, %x13, 33 \n\t"
+ "xvmaddadp %x6, %x14, 33 \n\t"
+ "xvmaddadp %x7, %x15, 33 \n\t"
+
+ "stxvd2x 48, 0, %16 \n\t"
+ "stxvd2x 49, %22, %16 \n\t"
+ "stxvd2x 50, %23, %16 \n\t"
+ "stxvd2x 51, %24, %16 \n\t"
+
+ "addi %16, %16, 64 \n\t"
+
+ "stxvd2x %x4, 0, %16 \n\t"
+ "stxvd2x %x5, %22, %16 \n\t"
+ "stxvd2x %x6, %23, %16 \n\t"
+ "stxvd2x %x7, %24, %16 \n"
+
+ "#n=%1 x=%17=%2 y=%0=%3 alpha=(%19,%20) mvecp=%18=%16 o16=%22 o32=%23 o48=%24 ytmp=%16\n"
+ "#t0=%x4 t1=%x5 t2=%x6 t3=%x7 t4=%x8 t5=%x9 t6=%x10 t7=%x11 t8=%x12 t9=%x13 t10=%x14 t11=%x15"
+ :
+ "+m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y), // 3
+ "=wa" (t0), // 4
+ "=wa" (t1), // 5
+ "=wa" (t2), // 6
+ "=wa" (t3), // 7
+ "=wa" (t4), // 8
+ "=wa" (t5), // 9
+ "=wa" (t6), // 10
+ "=wa" (t7), // 11
+ "=wa" (t8), // 12
+ "=wa" (t9), // 13
+ "=wa" (t10), // 14
+ "=wa" (t11), // 15
+ "=b" (ytmp) // 16
+ :
+ "m" (*x),
+ "m" (*mvecp),
+ "d" (alpha_r), // 19
+ "d" (alpha_i), // 20
+ "16" (mvecp), // 21
+ "b" (16), // 22
+ "b" (32), // 23
+ "b" (48) // 24
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
diff --git a/kernel/power/zcopy_microk_power8.c b/kernel/power/zcopy_microk_power8.c
index 73abe08..5ca34b6 100644
--- a/kernel/power/zcopy_microk_power8.c
+++ b/kernel/power/zcopy_microk_power8.c
@@ -35,140 +35,121 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zcopy_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+static void zcopy_kernel_16 (long n, FLOAT *x, FLOAT *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "lxvd2x 50, 0, %2 \n\t"
- "lxvd2x 51, %5, %2 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "lxvd2x 52, %6, %2 \n\t"
- "lxvd2x 53, %7, %2 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "lxvd2x 54, %8, %2 \n\t"
- "lxvd2x 55, %9, %2 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
- "lxvd2x 56, %10, %2 \n\t"
- "lxvd2x 57, %11, %2 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "stxvd2x 40, 0, %1 \n\t"
- "stxvd2x 41, %5, %1 \n\t"
- "stxvd2x 42, %6, %1 \n\t"
- "stxvd2x 43, %7, %1 \n\t"
- "stxvd2x 44, %8, %1 \n\t"
- "stxvd2x 45, %9, %1 \n\t"
- "stxvd2x 46, %10, %1 \n\t"
- "stxvd2x 47, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 50, 0, %1 \n\t"
- "stxvd2x 51, %5, %1 \n\t"
- "stxvd2x 52, %6, %1 \n\t"
- "stxvd2x 53, %7, %1 \n\t"
- "stxvd2x 54, %8, %1 \n\t"
- "stxvd2x 55, %9, %1 \n\t"
- "stxvd2x 56, %10, %1 \n\t"
- "stxvd2x 57, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "lxvd2x 32, 0, %2 \n\t"
+ "lxvd2x 33, %5, %2 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "lxvd2x 34, %6, %2 \n\t"
+ "lxvd2x 35, %7, %2 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "lxvd2x 36, %8, %2 \n\t"
+ "lxvd2x 37, %9, %2 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+ "lxvd2x 38, %10, %2 \n\t"
+ "lxvd2x 39, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "lxvd2x 40, 0, %2 \n\t"
+ "lxvd2x 41, %5, %2 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "lxvd2x 42, %6, %2 \n\t"
+ "lxvd2x 43, %7, %2 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "lxvd2x 44, %8, %2 \n\t"
+ "lxvd2x 45, %9, %2 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+ "lxvd2x 46, %10, %2 \n\t"
+ "lxvd2x 47, %11, %2 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -16 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n"
+
+ "#n=%1 x=%4=%2 y=%0=%3 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "=m" (*y),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47"
+ );
+}
diff --git a/kernel/power/zdot.c b/kernel/power/zdot.c
index bc1a95e..b83f832 100644
--- a/kernel/power/zdot.c
+++ b/kernel/power/zdot.c
@@ -43,8 +43,6 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d) __attribute__ ((noinline));
-
static void zdot_kernel_8(BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *d)
{
BLASLONG register i = 0;
diff --git a/kernel/power/zdot_microk_power8.c b/kernel/power/zdot_microk_power8.c
index 296d3d4..71078b6 100644
--- a/kernel/power/zdot_microk_power8.c
+++ b/kernel/power/zdot_microk_power8.c
@@ -34,186 +34,174 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
**************************************************************************************/
#define HAVE_KERNEL_8 1
-static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y , FLOAT *dot) __attribute__ ((noinline));
-static void zdot_kernel_8( BLASLONG n, FLOAT *x, FLOAT *y, FLOAT *dot)
+static void zdot_kernel_8 (long n, double *x, double *y, double *dot)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- FLOAT *x1=x;
- FLOAT *y1=y;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
- "xxlxor 32,32,32 \n\t"
- "xxlxor 33,33,33 \n\t"
- "xxlxor 34,34,34 \n\t"
- "xxlxor 35,35,35 \n\t"
- "xxlxor 36,36,36 \n\t"
- "xxlxor 37,37,37 \n\t"
- "xxlxor 38,38,38 \n\t"
- "xxlxor 39,39,39 \n\t"
-
- "dcbt %2, %8 \n\t"
- "dcbt %3, %8 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
- "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
- "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
- "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
- "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
- "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
- "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
-
- "xxswapd 52,48 \n\t" // y0_i, y0_r
- "xxswapd 53,49 \n\t" // y1_i, y1_r
- "xxswapd 54,50 \n\t" // y2_i, y2_r
- "xxswapd 55,51 \n\t" // y3_i, y3_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
-
- "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
- "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
- "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
- "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
- "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
- "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
- "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
-
- "xxswapd 60,56 \n\t" // y0_i, y0_r
- "xxswapd 61,57 \n\t" // y1_i, y1_r
- "xxswapd 62,58 \n\t" // y2_i, y2_r
- "xxswapd 63,59 \n\t" // y3_i, y3_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %8 \n\t"
- "dcbt %3, %8 \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
- "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
- "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
- "lxvd2x 49, %5, %3 \n\t" // y1_r, y1_i
-
- "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
- "lxvd2x 50, %6, %3 \n\t" // y2_r, y2_i
- "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
- "lxvd2x 51, %7, %3 \n\t" // y3_r, y3_i
-
- "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
- "lxvd2x 41, %5, %2 \n\t" // x1_r, x1_i
-
- "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
- "lxvd2x 42, %6, %2 \n\t" // x2_r, x2_i
- "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
- "lxvd2x 43, %7, %2 \n\t" // x3_r, x3_i
-
- "xxswapd 52,48 \n\t" // y0_i, y0_r
- "xxswapd 53,49 \n\t" // y1_i, y1_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "xxswapd 54,50 \n\t" // y2_i, y2_r
- "xxswapd 55,51 \n\t" // y3_i, y3_r
-
- "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
- "lxvd2x 56, 0, %3 \n\t" // y0_r, y0_i
- "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
- "lxvd2x 57, %5, %3 \n\t" // y1_r, y1_i
- "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
- "lxvd2x 58, %6, %3 \n\t" // y2_r, y2_i
- "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
- "lxvd2x 59, %7, %3 \n\t" // y3_r, y3_i
-
- "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
- "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
- "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
- "lxvd2x 45, %5, %2 \n\t" // x1_r, x1_i
- "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
- "lxvd2x 46, %6, %2 \n\t" // x2_r, x2_i
- "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
- "lxvd2x 47, %7, %2 \n\t" // x3_r, x3_i
-
- "xxswapd 60,56 \n\t" // y0_i, y0_r
- "xxswapd 61,57 \n\t" // y1_i, y1_r
-
- "addi %2, %2, 64 \n\t"
- "addi %3, %3, 64 \n\t"
-
- "xxswapd 62,58 \n\t" // y2_i, y2_r
- "xxswapd 63,59 \n\t" // y3_i, y3_r
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
- "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
- "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
- "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
-
- "xvmaddadp 33, 40, 52 \n\t" // x0_r * y0_i , x0_i * y0_r
- "xvmaddadp 35, 41, 53 \n\t" // x1_r * y1_i , x1_i * y1_r
- "xvmaddadp 37, 42, 54 \n\t" // x2_r * y2_i , x2_i * y2_r
- "xvmaddadp 39, 43, 55 \n\t" // x3_r * y3_i , x3_i * y3_r
-
- "xvmaddadp 32, 44, 56 \n\t" // x0_r * y0_r , x0_i * y0_i
- "xvmaddadp 34, 45, 57 \n\t" // x1_r * y1_r , x1_i * y1_i
- "xvmaddadp 36, 46, 58 \n\t" // x2_r * y2_r , x2_i * y2_i
- "xvmaddadp 38, 47, 59 \n\t" // x3_r * y3_r , x3_i * y3_i
-
- "xvmaddadp 33, 44, 60 \n\t" // x0_r * y0_i , x0_i * y0_r
- "xvmaddadp 35, 45, 61 \n\t" // x1_r * y1_i , x1_i * y1_r
- "xvmaddadp 37, 46, 62 \n\t" // x2_r * y2_i , x2_i * y2_r
- "xvmaddadp 39, 47, 63 \n\t" // x3_r * y3_i , x3_i * y3_r
-
-
- "xvadddp 32, 32, 34 \n\t"
- "xvadddp 36, 36, 38 \n\t"
-
- "xvadddp 33, 33, 35 \n\t"
- "xvadddp 37, 37, 39 \n\t"
-
- "xvadddp 32, 32, 36 \n\t"
- "xvadddp 33, 33, 37 \n\t"
-
- "stxvd2x 32, 0, %4 \n\t"
- "stxvd2x 33, %5, %4 \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (n), // 1
- "r" (x1), // 2
- "r" (y1), // 3
- "r" (dot), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (pre) // 8
- : "cr0", "%0", "%2" , "%3", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+ "dcbt 0, %3 \n\t"
+
+ "xxlxor 32, 32, 32 \n\t"
+ "xxlxor 33, 33, 33 \n\t"
+ "xxlxor 34, 34, 34 \n\t"
+ "xxlxor 35, 35, 35 \n\t"
+ "xxlxor 36, 36, 36 \n\t"
+ "xxlxor 37, 37, 37 \n\t"
+ "xxlxor 38, 38, 38 \n\t"
+ "xxlxor 39, 39, 39 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 0, 48 \n\t" // y0_i, y0_r
+ "xxswapd 1, 49 \n\t" // y1_i, y1_r
+ "xxswapd 2, 50 \n\t" // y2_i, y2_r
+ "xxswapd 3, 51 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
+ "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
+ "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
+ "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
+ "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
+ "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
+ "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
+
+ "xxswapd 8, 4 \n\t" // y0_i, y0_r
+ "xxswapd 9, 5 \n\t" // y1_i, y1_r
+ "xxswapd 10, 6 \n\t" // y2_i, y2_r
+ "xxswapd 11, 7 \n\t" // y3_i, y3_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 48, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 49, %7, %3 \n\t" // y1_r, y1_i
+
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 50, %8, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 51, %9, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 41, %7, %2 \n\t" // x1_r, x1_i
+
+ "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 42, %8, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 43, %9, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 0,48 \n\t" // y0_i, y0_r
+ "xxswapd 1,49 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 2,50 \n\t" // y2_i, y2_r
+ "xxswapd 3,51 \n\t" // y3_i, y3_r
+
+ "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "lxvd2x 4, 0, %3 \n\t" // y0_r, y0_i
+ "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "lxvd2x 5, %7, %3 \n\t" // y1_r, y1_i
+ "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "lxvd2x 6, %8, %3 \n\t" // y2_r, y2_i
+ "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
+ "lxvd2x 7, %9, %3 \n\t" // y3_r, y3_i
+
+ "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "lxvd2x 44, 0, %2 \n\t" // x0_r, x0_i
+ "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "lxvd2x 45, %7, %2 \n\t" // x1_r, x1_i
+ "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "lxvd2x 46, %8, %2 \n\t" // x2_r, x2_i
+ "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
+ "lxvd2x 47, %9, %2 \n\t" // x3_r, x3_i
+
+ "xxswapd 8,4 \n\t" // y0_i, y0_r
+ "xxswapd 9,5 \n\t" // y1_i, y1_r
+
+ "addi %2, %2, 64 \n\t"
+ "addi %3, %3, 64 \n\t"
+
+ "xxswapd 10,6 \n\t" // y2_i, y2_r
+ "xxswapd 11,7 \n\t" // y3_i, y3_r
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmaddadp 32, 40, 48 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 41, 49 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 42, 50 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 43, 51 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 40, 0 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 41, 1 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 42, 2 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 43, 3 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+ "xvmaddadp 32, 44, 4 \n\t" // x0_r * y0_r , x0_i * y0_i
+ "xvmaddadp 34, 45, 5 \n\t" // x1_r * y1_r , x1_i * y1_i
+ "xvmaddadp 36, 46, 6 \n\t" // x2_r * y2_r , x2_i * y2_i
+ "xvmaddadp 38, 47, 7 \n\t" // x3_r * y3_r , x3_i * y3_i
+
+ "xvmaddadp 33, 44, 8 \n\t" // x0_r * y0_i , x0_i * y0_r
+ "xvmaddadp 35, 45, 9 \n\t" // x1_r * y1_i , x1_i * y1_r
+ "xvmaddadp 37, 46, 10 \n\t" // x2_r * y2_i , x2_i * y2_r
+ "xvmaddadp 39, 47, 11 \n\t" // x3_r * y3_i , x3_i * y3_r
+
+ "xvadddp 32, 32, 34 \n\t"
+ "xvadddp 36, 36, 38 \n\t"
+
+ "xvadddp 33, 33, 35 \n\t"
+ "xvadddp 37, 37, 39 \n\t"
+
+ "xvadddp 32, 32, 36 \n\t"
+ "xvadddp 33, 33, 37 \n\t"
+
+ "stxvd2x 32, 0, %6 \n\t"
+ "stxvd2x 33, %7, %6 \n"
+
+ "#n=%1 x=%4=%2 y=%5=%3 dot=%0=%6 o16=%7 o32=%8 o48=%9"
+ :
+ "=m" (*dot),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "+b" (y) // 3
+ :
+ "m" (*x),
+ "m" (*y),
+ "b" (dot), // 6
+ "b" (16), // 7
+ "b" (32), // 8
+ "b" (48) // 9
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
diff --git a/kernel/power/zscal.c b/kernel/power/zscal.c
index 410fc98..14d677f 100644
--- a/kernel/power/zscal.c
+++ b/kernel/power/zscal.c
@@ -47,15 +47,15 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#ifndef HAVE_KERNEL_8
-static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT *alpha)
+static void zscal_kernel_8(BLASLONG n, FLOAT *x, FLOAT da_r, FLOAT da_i)
{
BLASLONG i=0;
FLOAT *x1=x;
- FLOAT alpha_r1=alpha[0];
- FLOAT alpha_r2=alpha[1];
- FLOAT alpha_i1=alpha[2];
- FLOAT alpha_i2=alpha[3];
+ FLOAT alpha_r1=da_r;
+ FLOAT alpha_r2=da_r;
+ FLOAT alpha_i1=-da_i;
+ FLOAT alpha_i2=da_i;
FLOAT temp00, temp01, temp10, temp11, temp20, temp21, temp30, temp31;
FLOAT x0_r, x0_i, x1_r, x1_i, x2_r, x2_i, x3_r, x3_i;
@@ -116,7 +116,6 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
BLASLONG inc_x2;
BLASLONG ip = 0;
FLOAT temp;
- FLOAT alpha[4] __attribute__ ((aligned (16)));;
BLASLONG n1;
if ( n <= 0 )
@@ -147,11 +146,7 @@ int CNAME(BLASLONG n, BLASLONG dummy0, BLASLONG dummy1, FLOAT da_r,FLOAT da_i, F
n1 = n & -8;
if ( n1 > 0 )
{
- alpha[0] = da_r;
- alpha[1] = da_r;
- alpha[2] = -da_i;
- alpha[3] = da_i;
- zscal_kernel_8(n1, x, alpha);
+ zscal_kernel_8(n1, x, da_r, da_i);
i=n1;
ip = n1 * 2;
diff --git a/kernel/power/zscal_microk_power8.c b/kernel/power/zscal_microk_power8.c
index 5e09d8d..aba9029 100644
--- a/kernel/power/zscal_microk_power8.c
+++ b/kernel/power/zscal_microk_power8.c
@@ -38,187 +38,202 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_8 1
-static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha) __attribute__ ((noinline));
-
-static void zscal_kernel_8( BLASLONG n, FLOAT *x, FLOAT *alpha)
+static void zscal_kernel_8 (long n, double *x, double alpha_r, double alpha_i)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *x2=x+1;
- BLASLONG pre = 384;
-
- __asm__ __volatile__
- (
-
- "lxvd2x 32, 0, %3 \n\t" // alpha_r , alpha_r
- "lxvd2x 33, %5, %3 \n\t" // -alpha_i , alpha_i
- "addi %1, %1, -8 \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "ble 2f \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "dcbt %2, %4 \n\t"
-
- "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "xxswapd 56, 40 \n\t"
- "xxswapd 57, 41 \n\t"
- "xxswapd 58, 42 \n\t"
- "xxswapd 59, 43 \n\t"
- "xxswapd 60, 44 \n\t"
- "xxswapd 61, 45 \n\t"
- "xxswapd 62, 46 \n\t"
- "xxswapd 63, 47 \n\t"
-
- "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
- "xvmuldp 57, 57, 33 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
- "lxvd2x 41, %5, %2 \n\t"
-
- "xvmuldp 58, 58, 33 \n\t"
- "xvmuldp 59, 59, 33 \n\t"
-
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
-
- "xvmuldp 60, 60, 33 \n\t"
- "xvmuldp 61, 61, 33 \n\t"
-
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
-
- "xvmuldp 62, 62, 33 \n\t"
- "xvmuldp 63, 63, 33 \n\t"
-
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "xvadddp 48, 48 , 56 \n\t"
- "xvadddp 49, 49 , 57 \n\t"
- "xvadddp 50, 50 , 58 \n\t"
- "xvadddp 51, 51 , 59 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
-
- "xvadddp 52, 52 , 60 \n\t"
- "xvadddp 53, 53 , 61 \n\t"
-
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
-
- "xvadddp 54, 54 , 62 \n\t"
- "xvadddp 55, 55 , 63 \n\t"
-
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
- "addi %2, %2, 128 \n\t"
-
- "addic. %0 , %0 , -8 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
- "xvmuldp 49, 41, 32 \n\t"
- "xvmuldp 50, 42, 32 \n\t"
- "xvmuldp 51, 43, 32 \n\t"
- "xvmuldp 52, 44, 32 \n\t"
- "xvmuldp 53, 45, 32 \n\t"
- "xvmuldp 54, 46, 32 \n\t"
- "xvmuldp 55, 47, 32 \n\t"
-
- "xxswapd 56, 40 \n\t"
- "xxswapd 57, 41 \n\t"
- "xxswapd 58, 42 \n\t"
- "xxswapd 59, 43 \n\t"
- "xxswapd 60, 44 \n\t"
- "xxswapd 61, 45 \n\t"
- "xxswapd 62, 46 \n\t"
- "xxswapd 63, 47 \n\t"
-
- "xvmuldp 56, 56, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
- "xvmuldp 57, 57, 33 \n\t"
- "xvmuldp 58, 58, 33 \n\t"
- "xvmuldp 59, 59, 33 \n\t"
- "xvmuldp 60, 60, 33 \n\t"
- "xvmuldp 61, 61, 33 \n\t"
- "xvmuldp 62, 62, 33 \n\t"
- "xvmuldp 63, 63, 33 \n\t"
-
- "xvadddp 48, 48 , 56 \n\t"
- "xvadddp 49, 49 , 57 \n\t"
- "xvadddp 50, 50 , 58 \n\t"
- "xvadddp 51, 51 , 59 \n\t"
- "xvadddp 52, 52 , 60 \n\t"
- "xvadddp 53, 53 , 61 \n\t"
- "xvadddp 54, 54 , 62 \n\t"
- "xvadddp 55, 55 , 63 \n\t"
-
- "stxvd2x 48, 0, %1 \n\t"
- "stxvd2x 49, %5, %1 \n\t"
- "stxvd2x 50, %6, %1 \n\t"
- "stxvd2x 51, %7, %1 \n\t"
- "stxvd2x 52, %8, %1 \n\t"
- "stxvd2x 53, %9, %1 \n\t"
- "stxvd2x 54, %10, %1 \n\t"
- "stxvd2x 55, %11, %1 \n\t"
-
-
- :
- :
- "r" (i), // 0
- "r" (x2), // 1
- "r" (x1), // 2
- "r" (alpha), // 3
- "r" (pre), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "memory"
- );
-
-}
-
-
+ __vector double t0;
+ __vector double t1;
+ __vector double t2;
+ __vector double t3;
+ __vector double t4;
+ __vector double t5;
+ __vector double t6;
+ __vector double t7;
+ __vector double t8;
+ __vector double t9;
+ __vector double t10;
+ __vector double t11;
+
+ __asm__
+ (
+ "dcbt 0, %2 \n\t"
+
+ "xsnegdp 33, %x16 \n\t" // -alpha_i
+ "xxspltd 32, %x15, 0 \n\t" // alpha_r , alpha_r
+ "xxmrghd 33, 33, %x16 \n\t" // -alpha_i , alpha_i
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %17, %2 \n\t"
+ "lxvd2x 42, %18, %2 \n\t"
+ "lxvd2x 43, %19, %2 \n\t"
+ "lxvd2x 44, %20, %2 \n\t"
+ "lxvd2x 45, %21, %2 \n\t"
+ "lxvd2x 46, %22, %2 \n\t"
+ "lxvd2x 47, %23, %2 \n\t"
+
+ "addi %2, %2, 128 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "ble 2f \n\t"
+
+ ".p2align 5 \n"
+ "1: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp %x3, 44, 32 \n\t"
+ "xvmuldp %x4, 45, 32 \n\t"
+ "xvmuldp %x5, 46, 32 \n\t"
+ "xvmuldp %x6, 47, 32 \n\t"
+
+ "xxswapd %x7, 40 \n\t"
+ "xxswapd %x8, 41 \n\t"
+ "xxswapd %x9, 42 \n\t"
+ "xxswapd %x10, 43 \n\t"
+ "xxswapd %x11, 44 \n\t"
+ "xxswapd %x12, 45 \n\t"
+ "xxswapd %x13, 46 \n\t"
+ "xxswapd %x14, 47 \n\t"
+
+ "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp %x8, %x8, 33 \n\t"
+
+ "lxvd2x 40, 0, %2 \n\t" // x0_r, x0_i
+ "lxvd2x 41, %17, %2 \n\t"
+
+ "xvmuldp %x9, %x9, 33 \n\t"
+ "xvmuldp %x10, %x10, 33 \n\t"
+
+ "lxvd2x 42, %18, %2 \n\t"
+ "lxvd2x 43, %19, %2 \n\t"
+
+ "xvmuldp %x11, %x11, 33 \n\t"
+ "xvmuldp %x12, %x12, 33 \n\t"
+
+ "lxvd2x 44, %20, %2 \n\t"
+ "lxvd2x 45, %21, %2 \n\t"
+
+ "xvmuldp %x13, %x13, 33 \n\t"
+ "xvmuldp %x14, %x14, 33 \n\t"
+
+ "lxvd2x 46, %22, %2 \n\t"
+ "lxvd2x 47, %23, %2 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvadddp 48, 48, %x7 \n\t"
+ "xvadddp 49, 49, %x8 \n\t"
+ "xvadddp 50, 50, %x9 \n\t"
+ "xvadddp 51, 51, %x10 \n\t"
+
+ "stxvd2x 48, 0, %2 \n\t"
+ "stxvd2x 49, %17, %2 \n\t"
+
+ "xvadddp %x3, %x3, %x11 \n\t"
+ "xvadddp %x4, %x4, %x12 \n\t"
+
+ "stxvd2x 50, %18, %2 \n\t"
+ "stxvd2x 51, %19, %2 \n\t"
+
+ "xvadddp %x5, %x5, %x13 \n\t"
+ "xvadddp %x6, %x6, %x14 \n\t"
+
+ "stxvd2x %x3, %20, %2 \n\t"
+ "stxvd2x %x4, %21, %2 \n\t"
+ "stxvd2x %x5, %22, %2 \n\t"
+ "stxvd2x %x6, %23, %2 \n\t"
+
+ "addi %2, %2, 256 \n\t"
+
+ "addic. %1, %1, -8 \n\t"
+ "bgt 1b \n"
+
+ "2: \n\t"
+
+ "xvmuldp 48, 40, 32 \n\t" // x0_r * alpha_r, x0_i * alpha_r
+ "xvmuldp 49, 41, 32 \n\t"
+ "xvmuldp 50, 42, 32 \n\t"
+ "xvmuldp 51, 43, 32 \n\t"
+ "xvmuldp %x3, 44, 32 \n\t"
+ "xvmuldp %x4, 45, 32 \n\t"
+ "xvmuldp %x5, 46, 32 \n\t"
+ "xvmuldp %x6, 47, 32 \n\t"
+
+ "xxswapd %x7, 40 \n\t"
+ "xxswapd %x8, 41 \n\t"
+ "xxswapd %x9, 42 \n\t"
+ "xxswapd %x10, 43 \n\t"
+ "xxswapd %x11, 44 \n\t"
+ "xxswapd %x12, 45 \n\t"
+ "xxswapd %x13, 46 \n\t"
+ "xxswapd %x14, 47 \n\t"
+
+ "addi %2, %2, -128 \n\t"
+
+ "xvmuldp %x7, %x7, 33 \n\t" // x0_i * -alpha_i, x0_r * alpha_i
+ "xvmuldp %x8, %x8, 33 \n\t"
+ "xvmuldp %x9, %x9, 33 \n\t"
+ "xvmuldp %x10, %x10, 33 \n\t"
+ "xvmuldp %x11, %x11, 33 \n\t"
+ "xvmuldp %x12, %x12, 33 \n\t"
+ "xvmuldp %x13, %x13, 33 \n\t"
+ "xvmuldp %x14, %x14, 33 \n\t"
+
+ "xvadddp 48, 48, %x7 \n\t"
+ "xvadddp 49, 49, %x8 \n\t"
+ "xvadddp 50, 50, %x9 \n\t"
+ "xvadddp 51, 51, %x10 \n\t"
+
+ "stxvd2x 48, 0, %2 \n\t"
+ "stxvd2x 49, %17, %2 \n\t"
+
+ "xvadddp %x3, %x3, %x11 \n\t"
+ "xvadddp %x4, %x4, %x12 \n\t"
+
+ "stxvd2x 50, %18, %2 \n\t"
+ "stxvd2x 51, %19, %2 \n\t"
+
+ "xvadddp %x5, %x5, %x13 \n\t"
+ "xvadddp %x6, %x6, %x14 \n\t"
+
+ "stxvd2x %x3, %20, %2 \n\t"
+ "stxvd2x %x4, %21, %2 \n\t"
+ "stxvd2x %x5, %22, %2 \n\t"
+ "stxvd2x %x6, %23, %2 \n"
+
+ "#n=%1 x=%0=%2 alpha=(%15,%16) o16=%17 o32=%18 o48=%19 o64=%20 o80=%21 o96=%22 o112=%23\n"
+ "#t0=%x3 t1=%x4 t2=%x5 t3=%x6 t4=%x7 t5=%x8 t6=%x9 t7=%x10 t8=%x11 t9=%x12 t10=%x13 t11=%x14"
+ :
+ "+m" (*x),
+ "+r" (n), // 1
+ "+b" (x), // 2
+ "=wa" (t0), // 3
+ "=wa" (t1), // 4
+ "=wa" (t2), // 5
+ "=wa" (t3), // 6
+ "=wa" (t4), // 7
+ "=wa" (t5), // 8
+ "=wa" (t6), // 9
+ "=wa" (t7), // 10
+ "=wa" (t8), // 11
+ "=wa" (t9), // 12
+ "=wa" (t10), // 13
+ "=wa" (t11) // 14
+ :
+ "d" (alpha_r), // 15
+ "d" (alpha_i), // 16
+ "b" (16), // 17
+ "b" (32), // 18
+ "b" (48), // 19
+ "b" (64), // 20
+ "b" (80), // 21
+ "b" (96), // 22
+ "b" (112) // 23
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51"
+ );
+}
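
For reference while reading the hunk above: the xvmuldp/xxswapd/xvadddp sequence implements an ordinary complex scale, x[i] *= (alpha_r + alpha_i*I), eight double-complex elements per loop iteration. A minimal scalar sketch of the same computation (illustrative only, not part of the patch; the name zscal_reference is hypothetical):

static void
zscal_reference (long n, double alpha_r, double alpha_i, double *x)
{
  /* x holds n complex doubles stored as (real, imag) pairs.  */
  for (long i = 0; i < n; i++)
    {
      double xr = x[2 * i];     /* x_r */
      double xi = x[2 * i + 1]; /* x_i */
      x[2 * i]     = xr * alpha_r - xi * alpha_i; /* new real part      */
      x[2 * i + 1] = xi * alpha_r + xr * alpha_i; /* new imaginary part */
    }
}

Note how the rewritten asm ties its twelve temporaries t0..t11 to "=wa" outputs (%x3..%x14), letting the compiler choose those VSX registers, so only the hard-coded vs32-vs51 need to appear in the clobber list.
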
diff --git a/kernel/power/zswap_microk_power8.c b/kernel/power/zswap_microk_power8.c
index 9e56237..54391ba 100644
--- a/kernel/power/zswap_microk_power8.c
+++ b/kernel/power/zswap_microk_power8.c
@@ -35,146 +35,123 @@ USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#define HAVE_KERNEL_16 1
-static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y) __attribute__ ((noinline));
-
-static void zswap_kernel_16( BLASLONG n, FLOAT *x, FLOAT *y)
+static void
+zswap_kernel_16 (long n, double *x, double *y)
{
-
-
- BLASLONG i = n;
- BLASLONG o16 = 16;
- BLASLONG o32 = 32;
- BLASLONG o48 = 48;
- BLASLONG o64 = 64;
- BLASLONG o80 = 80;
- BLASLONG o96 = 96;
- BLASLONG o112 = 112;
- FLOAT *x1=x;
- FLOAT *y1=y;
- FLOAT *x2=x+1;
- FLOAT *y2=y+1;
- BLASLONG pre = 384;
- BLASLONG alpha=0;
-
- __asm__ __volatile__
- (
-
- "addi %3, %3, -8 \n\t"
- "addi %4, %4, -8 \n\t"
-
- ".align 5 \n\t"
- "1: \n\t"
-
- "lxvd2x 32, 0, %2 \n\t"
- "lxvd2x 33, %5, %2 \n\t"
- "lxvd2x 34, %6, %2 \n\t"
- "lxvd2x 35, %7, %2 \n\t"
- "lxvd2x 36, %8, %2 \n\t"
- "lxvd2x 37, %9, %2 \n\t"
- "lxvd2x 38, %10, %2 \n\t"
- "lxvd2x 39, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 40, 0, %2 \n\t"
- "lxvd2x 41, %5, %2 \n\t"
- "lxvd2x 42, %6, %2 \n\t"
- "lxvd2x 43, %7, %2 \n\t"
- "lxvd2x 44, %8, %2 \n\t"
- "lxvd2x 45, %9, %2 \n\t"
- "lxvd2x 46, %10, %2 \n\t"
- "lxvd2x 47, %11, %2 \n\t"
-
- "addi %2, %2, 128 \n\t"
-
- "lxvd2x 48, 0, %1 \n\t"
- "lxvd2x 49, %5, %1 \n\t"
- "lxvd2x 50, %6, %1 \n\t"
- "lxvd2x 51, %7, %1 \n\t"
- "lxvd2x 52, %8, %1 \n\t"
- "lxvd2x 53, %9, %1 \n\t"
- "lxvd2x 54, %10, %1 \n\t"
- "lxvd2x 55, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "lxvd2x 56, 0, %1 \n\t"
- "lxvd2x 57, %5, %1 \n\t"
- "lxvd2x 58, %6, %1 \n\t"
- "lxvd2x 59, %7, %1 \n\t"
- "lxvd2x 60, %8, %1 \n\t"
- "lxvd2x 61, %9, %1 \n\t"
- "lxvd2x 62, %10, %1 \n\t"
- "lxvd2x 63, %11, %1 \n\t"
-
- "addi %1, %1, 128 \n\t"
-
- "stxvd2x 32, 0, %3 \n\t"
- "stxvd2x 33, %5, %3 \n\t"
- "stxvd2x 34, %6, %3 \n\t"
- "stxvd2x 35, %7, %3 \n\t"
- "stxvd2x 36, %8, %3 \n\t"
- "stxvd2x 37, %9, %3 \n\t"
- "stxvd2x 38, %10, %3 \n\t"
- "stxvd2x 39, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 40, 0, %3 \n\t"
- "stxvd2x 41, %5, %3 \n\t"
- "stxvd2x 42, %6, %3 \n\t"
- "stxvd2x 43, %7, %3 \n\t"
- "stxvd2x 44, %8, %3 \n\t"
- "stxvd2x 45, %9, %3 \n\t"
- "stxvd2x 46, %10, %3 \n\t"
- "stxvd2x 47, %11, %3 \n\t"
-
- "addi %3, %3, 128 \n\t"
-
- "stxvd2x 48, 0, %4 \n\t"
- "stxvd2x 49, %5, %4 \n\t"
- "stxvd2x 50, %6, %4 \n\t"
- "stxvd2x 51, %7, %4 \n\t"
- "stxvd2x 52, %8, %4 \n\t"
- "stxvd2x 53, %9, %4 \n\t"
- "stxvd2x 54, %10, %4 \n\t"
- "stxvd2x 55, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "stxvd2x 56, 0, %4 \n\t"
- "stxvd2x 57, %5, %4 \n\t"
- "stxvd2x 58, %6, %4 \n\t"
- "stxvd2x 59, %7, %4 \n\t"
- "stxvd2x 60, %8, %4 \n\t"
- "stxvd2x 61, %9, %4 \n\t"
- "stxvd2x 62, %10, %4 \n\t"
- "stxvd2x 63, %11, %4 \n\t"
-
- "addi %4, %4, 128 \n\t"
-
- "addic. %0 , %0 , -16 \n\t"
- "bgt 1b \n\t"
-
- "2: \n\t"
-
- :
- :
- "r" (i), // 0
- "r" (y1), // 1
- "r" (x1), // 2
- "r" (y2), // 3
- "r" (x2), // 4
- "r" (o16), // 5
- "r" (o32), // 6
- "r" (o48), // 7
- "r" (o64), // 8
- "r" (o80), // 9
- "r" (o96), // 10
- "r" (o112) // 11
- : "cr0", "%0", "%2" , "%1", "%3", "%4", "memory"
- );
-
-}
-
-
+ __asm__
+ (
+ ".p2align 5 \n"
+ "1: \n\t"
+ "lxvd2x 32, 0, %4 \n\t"
+ "lxvd2x 33, %5, %4 \n\t"
+ "lxvd2x 34, %6, %4 \n\t"
+ "lxvd2x 35, %7, %4 \n\t"
+ "lxvd2x 36, %8, %4 \n\t"
+ "lxvd2x 37, %9, %4 \n\t"
+ "lxvd2x 38, %10, %4 \n\t"
+ "lxvd2x 39, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "lxvd2x 40, 0, %4 \n\t"
+ "lxvd2x 41, %5, %4 \n\t"
+ "lxvd2x 42, %6, %4 \n\t"
+ "lxvd2x 43, %7, %4 \n\t"
+ "lxvd2x 44, %8, %4 \n\t"
+ "lxvd2x 45, %9, %4 \n\t"
+ "lxvd2x 46, %10, %4 \n\t"
+ "lxvd2x 47, %11, %4 \n\t"
+
+ "addi %4, %4, -128 \n\t"
+
+ "lxvd2x 48, 0, %3 \n\t"
+ "lxvd2x 49, %5, %3 \n\t"
+ "lxvd2x 50, %6, %3 \n\t"
+ "lxvd2x 51, %7, %3 \n\t"
+ "lxvd2x 0, %8, %3 \n\t"
+ "lxvd2x 1, %9, %3 \n\t"
+ "lxvd2x 2, %10, %3 \n\t"
+ "lxvd2x 3, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "lxvd2x 4, 0, %3 \n\t"
+ "lxvd2x 5, %5, %3 \n\t"
+ "lxvd2x 6, %6, %3 \n\t"
+ "lxvd2x 7, %7, %3 \n\t"
+ "lxvd2x 8, %8, %3 \n\t"
+ "lxvd2x 9, %9, %3 \n\t"
+ "lxvd2x 10, %10, %3 \n\t"
+ "lxvd2x 11, %11, %3 \n\t"
+
+ "addi %3, %3, -128 \n\t"
+
+ "stxvd2x 32, 0, %3 \n\t"
+ "stxvd2x 33, %5, %3 \n\t"
+ "stxvd2x 34, %6, %3 \n\t"
+ "stxvd2x 35, %7, %3 \n\t"
+ "stxvd2x 36, %8, %3 \n\t"
+ "stxvd2x 37, %9, %3 \n\t"
+ "stxvd2x 38, %10, %3 \n\t"
+ "stxvd2x 39, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 40, 0, %3 \n\t"
+ "stxvd2x 41, %5, %3 \n\t"
+ "stxvd2x 42, %6, %3 \n\t"
+ "stxvd2x 43, %7, %3 \n\t"
+ "stxvd2x 44, %8, %3 \n\t"
+ "stxvd2x 45, %9, %3 \n\t"
+ "stxvd2x 46, %10, %3 \n\t"
+ "stxvd2x 47, %11, %3 \n\t"
+
+ "addi %3, %3, 128 \n\t"
+
+ "stxvd2x 48, 0, %4 \n\t"
+ "stxvd2x 49, %5, %4 \n\t"
+ "stxvd2x 50, %6, %4 \n\t"
+ "stxvd2x 51, %7, %4 \n\t"
+ "stxvd2x 0, %8, %4 \n\t"
+ "stxvd2x 1, %9, %4 \n\t"
+ "stxvd2x 2, %10, %4 \n\t"
+ "stxvd2x 3, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+
+ "stxvd2x 4, 0, %4 \n\t"
+ "stxvd2x 5, %5, %4 \n\t"
+ "stxvd2x 6, %6, %4 \n\t"
+ "stxvd2x 7, %7, %4 \n\t"
+ "stxvd2x 8, %8, %4 \n\t"
+ "stxvd2x 9, %9, %4 \n\t"
+ "stxvd2x 10, %10, %4 \n\t"
+ "stxvd2x 11, %11, %4 \n\t"
+
+ "addi %4, %4, 128 \n\t"
+ "addic. %2, %2, -16 \n\t"
+ "bgt 1b \n"
+
+ "#n=%2 x=%0=%3 y=%1=%4 o16=%5 o32=%6 o48=%7 o64=%8 o80=%9 o96=%10 o112=%11"
+ :
+ "+m" (*x),
+ "+m" (*y),
+ "+r" (n), // 2
+ "+b" (x), // 3
+ "+b" (y) // 4
+ :
+ "b" (16), // 5
+ "b" (32), // 6
+ "b" (48), // 7
+ "b" (64), // 8
+ "b" (80), // 9
+ "b" (96), // 10
+ "b" (112) // 11
+ :
+ "cr0",
+ "vs32","vs33","vs34","vs35","vs36","vs37","vs38","vs39",
+ "vs40","vs41","vs42","vs43","vs44","vs45","vs46","vs47",
+ "vs48","vs49","vs50","vs51","vs0","vs1","vs2","vs3",
+ "vs4","vs5","vs6","vs7","vs8","vs9","vs10","vs11"
+ );
+}
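
In scalar terms the rewritten zswap_kernel_16 above is simply an element-wise exchange of the two arrays, sixteen double-complex elements per loop iteration. A minimal sketch (illustrative only, not part of the patch; zswap_reference is a hypothetical name):

static void
zswap_reference (long n, double *x, double *y)
{
  /* n complex doubles = 2*n doubles per array.  */
  for (long i = 0; i < 2 * n; i++)
    {
      double t = x[i];
      x[i] = y[i];
      y[i] = t;
    }
}

Because the arrays are tied to the asm through the "+m" (*x) and "+m" (*y) operands and every VSX register the code touches is listed as clobbered, the new statement no longer carries the __volatile__ qualifier, the blanket "memory" clobber, or the noinline attribute that the removed version needed.
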