A lot of optimizations for gemv kernels (tags/v0.2.12^2)
| @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ | |||
| blasint inc_x=1,inc_y=1; | |||
| blasint n=0; | |||
| int has_param_n = 0; | |||
| int has_param_m = 0; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| @@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| int tomax = to; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; | |||
| if ((p = getenv("OPENBLAS_PARAM_N"))) { | |||
| n = atoi(p); | |||
| if ((n>0) && (n<=to)) has_param_n = 1; | |||
| if ((n>0)) has_param_n = 1; | |||
| if ( n > tomax ) tomax = n; | |||
| } | |||
| if ( has_param_n == 0 ) | |||
| if ((p = getenv("OPENBLAS_PARAM_M"))) { | |||
| m = atoi(p); | |||
| if ((m>0)) has_param_m = 1; | |||
| if ( m > tomax ) tomax = m; | |||
| } | |||
| if ( has_param_n == 1 ) | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); | |||
| else | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| @@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| if (has_param_m == 0) | |||
| { | |||
| timeg=0; | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| if ( has_param_n == 0 ) n = m; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| if ( has_param_n == 0 ) n = m; | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| timeg /= loops; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| for(n = from; n <= to; n += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| timeg += time1; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| } | |||
| timeg /= loops; | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| } | |||
| } | |||
| return 0; | |||
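For reference, the MFlops figure printed by the loops above counts 2·m·n floating-point operations per real GEMV call, scaled by COMPSIZE² for complex data. A minimal sketch of the same bookkeeping (the helper name is made up for illustration, not part of this commit):

/* MFlops as the benchmark reports it: flops / (seconds * 1e6).
   A real m x n gemv does m*n multiplies + m*n adds = 2*m*n flops;
   complex data (COMPSIZE == 2) costs roughly four times as much. */
double gemv_mflops(double m, double n, double seconds, int compsize)
{
    double flops = (double)compsize * (double)compsize * 2.0 * m * n;
    return flops / seconds * 1.0e-6;
}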
| @@ -0,0 +1,42 @@ | |||
| # ********************************************************************************** | |||
| # Copyright (c) 2014, The OpenBLAS Project | |||
| # All rights reserved. | |||
| # Redistribution and use in source and binary forms, with or without | |||
| # modification, are permitted provided that the following conditions are | |||
| # met: | |||
| # 1. Redistributions of source code must retain the above copyright | |||
| # notice, this list of conditions and the following disclaimer. | |||
| # 2. Redistributions in binary form must reproduce the above copyright | |||
| # notice, this list of conditions and the following disclaimer in | |||
| # the documentation and/or other materials provided with the | |||
| # distribution. | |||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||
| # its contributors may be used to endorse or promote products | |||
| # derived from this software without specific prior written permission. | |||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # ********************************************************************************** | |||
| set term x11 font sans; | |||
| set ylabel "MFlops"; | |||
| set xlabel "Size"; | |||
| set grid xtics; | |||
| set grid ytics; | |||
| set key left; | |||
| set timestamp "generated on %Y-%m-%d by `whoami`" | |||
| set title "Sgemv\nTRANS=T\nBulldozer" | |||
| plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; | |||
| set output "print.png"; | |||
| show title; | |||
| show plot; | |||
| show output; | |||
| @@ -46,6 +46,7 @@ | |||
| #define __volatile__ | |||
| #endif | |||
| /* | |||
| #ifdef HAVE_SSE2 | |||
| #define MB __asm__ __volatile__ ("mfence"); | |||
| #define WMB __asm__ __volatile__ ("sfence"); | |||
| @@ -53,6 +54,10 @@ | |||
| #define MB | |||
| #define WMB | |||
| #endif | |||
| */ | |||
| #define MB | |||
| #define WMB | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| @@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| : "0" (op)); | |||
| } | |||
| /* | |||
| #define WHEREAMI | |||
| */ | |||
| static inline int WhereAmI(void){ | |||
| int eax, ebx, ecx, edx; | |||
| @@ -111,6 +118,7 @@ static inline int WhereAmI(void){ | |||
| return apicid; | |||
| } | |||
| #ifdef CORE_BARCELONA | |||
| #define IFLUSH gotoblas_iflush() | |||
| #define IFLUSH_HALF gotoblas_iflush_half() | |||
| @@ -251,7 +251,11 @@ void blas_set_parameter(void){ | |||
| env_var_t p; | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| #endif | |||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) | |||
| size >>= 7; | |||
| @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| int nthreads_avail = nthreads_max; | |||
| double MNK = (double) m * (double) n; | |||
| if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) | |||
| nthreads_max = 1; | |||
| if ( nthreads_max > nthreads_avail ) | |||
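The hunk above changes the point at which interface/gemv.c falls back to a single thread. A worked comparison, assuming GEMM_MULTITHREAD_THRESHOLD = 4 (the usual Makefile.rule default; that value is an assumption, not part of this diff):

/* Old cutoff: 500.0 * 100.0 * 4     = 200000 -> roughly 447x447 and below ran single-threaded.
   New cutoff: 24.0 * 24.0 * (4 * 4) = 9216   -> roughly 96x96 and below run single-threaded,
   so the new gemv kernels are spread across threads at much smaller sizes. */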
| @@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| @@ -1,8 +1,8 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| @@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| @@ -1,5 +1,5 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| @@ -1,5 +1,5 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| @@ -0,0 +1,548 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 2048 | |||
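| /* Row-block size: m is processed in tiles of at most NBMAX rows, presumably | |||
| so the active slice of y stays cache-resident across the column passes. */ | |||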
| #ifndef HAVE_KERNEL_4x8 | |||
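| /* Portable C fallback, compiled only when no microkernel defines HAVE_KERNEL_4x8: | |||
| accumulates eight columns into y -- four addressed through ap[0..3] and four more | |||
| at ap[0..3]+lda4. x is pre-scaled by *alpha into a local buffer, and n (the row | |||
| count of the block) is a multiple of 4 by construction in the caller. */ | |||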
| static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT *b0,*b1,*b2,*b3; | |||
| FLOAT *x4; | |||
| FLOAT x[8]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x4 = x + 4; | |||
| for ( i=0; i<8; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; | |||
| y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; | |||
| y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; | |||
| y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT x[4]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2) , %%xmm12 \n\t" // x0 | |||
| "movsd (%6) , %%xmm4 \n\t" // alpha | |||
| "movsd 8(%2) , %%xmm13 \n\t" // x1 | |||
| "mulsd %%xmm4 , %%xmm12 \n\t" // alpha | |||
| "mulsd %%xmm4 , %%xmm13 \n\t" // alpha | |||
| "shufpd $0, %%xmm12, %%xmm12 \n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm5 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y | |||
| "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "mulsd (%5), %%xmm12 \n\t" // alpha | |||
| "shufpd $0, %%xmm12, %%xmm12 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm12, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y | |||
| "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (alpha) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
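| /* Nothing to do for inc_dest == 1: in that case the caller aliases ybuffer | |||
| to y itself, so the kernels have already accumulated into y in place. */ | |||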
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8],*ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*8); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
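To make the blocked driver above easier to follow: for the non-transposed path it computes y += alpha * A * x on a column-major A (beta is applied earlier, in the interface layer). A minimal reference sketch under those assumptions — dgemv_n_ref and the sizes are illustrative, not OpenBLAS code:

#include <stdio.h>
#include <stdlib.h>

/* Reference column-major dgemv, NoTrans: y += alpha * A * x.
   This is what the blocked kernel driver above is expected to
   produce for inc_x == inc_y == 1. */
static void dgemv_n_ref(long m, long n, double alpha, const double *a,
                        long lda, const double *x, double *y)
{
    for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++)
            y[i] += alpha * a[i + j * lda] * x[j];
}

int main(void)
{
    long m = 7, n = 5, lda = 7;
    double a[35], x[5], y[7];
    for (long k = 0; k < m * n; k++) a[k] = (double)rand() / RAND_MAX - 0.5;
    for (long k = 0; k < n; k++)     x[k] = (double)rand() / RAND_MAX - 0.5;
    for (long k = 0; k < m; k++)     y[k] = 0.0;
    dgemv_n_ref(m, n, 0.5, a, lda, x, y);
    for (long k = 0; k < m; k++) printf("y[%ld] = %f\n", k, y[k]);
    return 0;
}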
| @@ -0,0 +1,247 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
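| /* AVX2/FMA microkernel: x0..x7 and alpha are broadcast once; the testq $0x04 | |||
| peels a 4-row chunk when the row count is not a multiple of 8, then the main | |||
| loop retires 8 rows of y per iteration, gathering all eight columns with | |||
| vfmadd231pd and applying alpha when merging into y. */ | |||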
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastsd (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L8END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y | |||
| "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L8END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,265 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
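| /* SSE2 variant for Nehalem (no AVX broadcast available): x0..x7 and alpha are | |||
| splatted with movsd + shufpd, and each loop iteration updates 4 rows of y in | |||
| two 2-double halves. */ | |||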
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "movsd 8(%2), %%xmm13 \n\t" // x1 | |||
| "movsd 16(%2), %%xmm14 \n\t" // x2 | |||
| "movsd 24(%2), %%xmm15 \n\t" // x3 | |||
| "shufpd $0, %%xmm12, %%xmm12\n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13\n\t" | |||
| "shufpd $0, %%xmm14, %%xmm14\n\t" | |||
| "shufpd $0, %%xmm15, %%xmm15\n\t" | |||
| "movsd 32(%2), %%xmm0 \n\t" // x4 | |||
| "movsd 40(%2), %%xmm1 \n\t" // x5 | |||
| "movsd 48(%2), %%xmm2 \n\t" // x6 | |||
| "movsd 56(%2), %%xmm3 \n\t" // x7 | |||
| "shufpd $0, %%xmm0 , %%xmm0 \n\t" | |||
| "shufpd $0, %%xmm1 , %%xmm1 \n\t" | |||
| "shufpd $0, %%xmm2 , %%xmm2 \n\t" | |||
| "shufpd $0, %%xmm3 , %%xmm3 \n\t" | |||
| "movsd (%9), %%xmm6 \n\t" // alpha | |||
| "shufpd $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| ".align 2 \n\t" | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "movups (%6,%0,8), %%xmm10 \n\t" | |||
| "movups (%7,%0,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "movups (%4,%8,8), %%xmm8 \n\t" | |||
| "movups (%5,%8,8), %%xmm9 \n\t" | |||
| "movups (%6,%8,8), %%xmm10 \n\t" | |||
| "movups (%7,%8,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm0 , %%xmm8 \n\t" | |||
| "mulpd %%xmm1 , %%xmm9 \n\t" | |||
| "mulpd %%xmm2 , %%xmm10 \n\t" | |||
| "mulpd %%xmm3 , %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "addpd %%xmm5 , %%xmm4 \n\t" | |||
| "mulpd %%xmm6 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| ".align 2 \n\t" | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%0,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%0,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "movups 16(%4,%8,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%8,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%8,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%8,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm0 , %%xmm8 \n\t" | |||
| "mulpd %%xmm1 , %%xmm9 \n\t" | |||
| "mulpd %%xmm2 , %%xmm10 \n\t" | |||
| "mulpd %%xmm3 , %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "addpd %%xmm5 , %%xmm4 \n\t" | |||
| "mulpd %%xmm6 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "movsd 8(%2), %%xmm13 \n\t" // x1 | |||
| "movsd 16(%2), %%xmm14 \n\t" // x2 | |||
| "movsd 24(%2), %%xmm15 \n\t" // x3 | |||
| "shufpd $0, %%xmm12, %%xmm12\n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13\n\t" | |||
| "shufpd $0, %%xmm14, %%xmm14\n\t" | |||
| "shufpd $0, %%xmm15, %%xmm15\n\t" | |||
| "movsd (%8), %%xmm6 \n\t" // alpha | |||
| "shufpd $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "movups (%6,%0,8), %%xmm10 \n\t" | |||
| "movups (%7,%0,8), %%xmm11 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "addpd %%xmm10 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" | |||
| "addpd %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%0,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%0,8), %%xmm11 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "addpd %%xmm10 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" | |||
| "addpd %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,615 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(HASWELL) | |||
| #include "dgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 2048 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
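| /* Portable C fallback for the transposed case: four dot products of columns | |||
| ap[0..3] against x. alpha is applied later (in add_y or by the caller), so | |||
| y[0..3] receives the raw sums. */ | |||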
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| FLOAT temp3 = 0.0; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||
| temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; | |||
| temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; | |||
| } | |||
| y[0] = temp0; | |||
| y[1] = temp1; | |||
| y[2] = temp2; | |||
| y[3] = temp3; | |||
| } | |||
| #endif | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm10 , %%xmm10 \n\t" | |||
| "xorpd %%xmm11 , %%xmm11 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| "movups 16(%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| "haddpd %%xmm11, %%xmm11 \n\t" | |||
| "movsd %%xmm10, (%2) \n\t" | |||
| "movsd %%xmm11,8(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| "r" (x) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm9 , %%xmm9 \n\t" | |||
| "xorpd %%xmm10 , %%xmm10 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "mulpd %%xmm11 , %%xmm12 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,8) , %%xmm14 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "movups 16(%4,%0,8) , %%xmm13 \n\t" | |||
| "mulpd %%xmm11 , %%xmm12 \n\t" | |||
| "mulpd %%xmm13 , %%xmm14 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addpd %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "addpd %%xmm9 , %%xmm10 \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| "movsd %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| : "cc", | |||
| "%xmm9", "%xmm10" , | |||
| "%xmm11", "%xmm12", "%xmm13", "%xmm14", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
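| /* dest += da * src. The strided branch stays scalar; the unit-stride path | |||
| below uses an SSE2 loop that handles two elements per iteration (the call | |||
| sites pass n as a multiple of 4). */ | |||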
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += src[i] * da; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2) , %%xmm10 \n\t" | |||
| "shufpd $0 , %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "mulpd %%xmm10 , %%xmm12 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm11 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| "movups %%xmm11, -16(%4,%0,8) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG register i; | |||
| BLASLONG register j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n0; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[4],*xbuffer; | |||
| FLOAT *ytemp; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| xbuffer = buffer; | |||
| ytemp = buffer + NBMAX; | |||
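| /* The caller-supplied work buffer is split in two: the first NBMAX entries | |||
| hold the x block copied to unit stride, the rest (ytemp) collect partial | |||
| results before add_y scales them by alpha. */ | |||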
| n0 = n / NBMAX; | |||
| n1 = (n % NBMAX) >> 2 ; | |||
| n2 = n & 3 ; | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||
| FLOAT *ap[4]; | |||
| FLOAT *yp; | |||
| BLASLONG register lda4 = 4 * lda; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( n0 > 0 ) | |||
| { | |||
| BLASLONG nb1 = NBMAX / 4; | |||
| for( j=0; j<n0; j++) | |||
| { | |||
| yp = ytemp; | |||
| for( i = 0; i < nb1 ; i++) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| add_y(nb1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += nb1 * inc_y * 4; | |||
| a_ptr += nb1 * lda4 ; | |||
| } | |||
| } | |||
| yp = ytemp; | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| if ( n1 > 0 ) | |||
| { | |||
| add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += n1 * inc_y * 4; | |||
| a_ptr += n1 * lda4 ; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); | |||
| a_ptr += lda * 2; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||
| a_ptr += lda; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if ( m3 == 3 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 3 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 2 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ; | |||
| y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ; | |||
| y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ; | |||
| aj += 8; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| aj += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 1 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j+1] += aj[j+1] * xtemp; | |||
| y_ptr[j+2] += aj[j+2] * xtemp; | |||
| y_ptr[j+3] += aj[j+3] * xtemp; | |||
| } | |||
| for ( ; j<n ; j++ ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp; | |||
| aj += lda4 ; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,127 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 384(%2,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x | |||
| // "prefetcht0 384(%4,%0,8) \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| // "prefetcht0 384(%5,%0,8) \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" | |||
| // "prefetcht0 384(%6,%0,8) \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| // "prefetcht0 384(%7,%0,8) \n\t" | |||
| "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddpd %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddpd %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovsd %%xmm4, (%3) \n\t" | |||
| "vmovsd %%xmm5, 8(%3) \n\t" | |||
| "vmovsd %%xmm6, 16(%3) \n\t" | |||
| "vmovsd %%xmm7, 24(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
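| /* Editorial note: a plain C sketch of what the FMA kernel above computes -- | |||
| four dot products of the columns ap[0..3] with x, stored to y[0..3]. The | |||
| name dgemv_kernel_4x4_ref is hypothetical and the block is disabled; like | |||
| the asm, it assumes n is a multiple of 4. */ | |||
| #if 0 | |||
| static void dgemv_kernel_4x4_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0; | |||
| for ( i = 0; i < n; i++ ) | |||
| { | |||
| t0 += ap[0][i] * x[i]; // same accumulation the vfmadd231pd chain performs | |||
| t1 += ap[1][i] * x[i]; | |||
| t2 += ap[2][i] * x[i]; | |||
| t3 += ap[3][i] * x[i]; | |||
| } | |||
| y[0] = t0; y[1] = t1; y[2] = t2; y[3] = t3; // the asm stores, it does not accumulate into y | |||
| } | |||
| #endif | |||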
| @@ -0,0 +1,591 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_n_microk_bulldozer-4.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sgemv_n_microk_nehalem-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_n_microk_sandy-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 4096 | |||
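| /* NBMAX is the row-blocking size: y is processed in chunks of at most NBMAX | |||
| elements, presumably so the active block of y stays cache resident. */ | |||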
| #ifndef HAVE_KERNEL_4x8 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT *b0,*b1,*b2,*b3; | |||
| FLOAT *x4; | |||
| FLOAT x[8]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x4 = x + 4; | |||
| for ( i=0; i<8; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; | |||
| y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; | |||
| y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; | |||
| y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT x[4]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2) , %%xmm12 \n\t" // x0 | |||
| "movss (%6) , %%xmm4 \n\t" // alpha | |||
| "movss 4(%2) , %%xmm13 \n\t" // x1 | |||
| "mulss %%xmm4 , %%xmm12 \n\t" // alpha | |||
| "mulss %%xmm4 , %%xmm13 \n\t" // alpha | |||
| "shufps $0, %%xmm12, %%xmm12 \n\t" | |||
| "shufps $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm4 \n\t" | |||
| "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
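| /* note: the guard above was HAVE_KERNEL_4x2 in the original hunk, which looks | |||
| like a copy-paste slip for the 4x1 kernel it protects. */ | |||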
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| BLASLONG register n1 = n & -8 ; | |||
| BLASLONG register n2 = n & 4 ; | |||
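| /* n arrives as a multiple of 4: n1 is the part handled 8 elements per | |||
| iteration, n2 flags one remaining group of 4. */ | |||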
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "mulss (%6), %%xmm12 \n\t" // alpha | |||
| "shufps $0, %%xmm12, %%xmm12 \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| "movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm12, %%xmm9 \n\t" | |||
| "addps %%xmm4 , %%xmm8 \n\t" | |||
| "addps %%xmm5 , %%xmm9 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y | |||
| "movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "testq $0x04, %5 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n1), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (n2), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
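| /* the SSE loop below moves 4 elements per iteration; callers pass n = NB, | |||
| which is always a multiple of 4. */ | |||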
| __asm__ __volatile__ | |||
| ( | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4) , %%xmm12 \n\t" | |||
| "movups (%3,%0,4) , %%xmm11 \n\t" | |||
| "addps %%xmm12 , %%xmm11 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "movups %%xmm11, -16(%3,%0,4) \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (src), // 2 | |||
| "r" (dest) // 3 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8],*ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
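| /* with unit inc_x the 4x8 kernel reads x directly, so columns are taken 8 | |||
| at a time; otherwise x is gathered into xbuffer and columns go 4 at a time. */ | |||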
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
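| /* row blocking: full blocks of NBMAX rows first, then one block of m2 rows; | |||
| the final m3 = m % 4 rows fall through to the scalar code further down. */ | |||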
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
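| /* ap[2] and ap[3] are deliberately left behind: the 4x2 and 4x1 remainder | |||
| kernels below use only ap[0], ap[1] and a_ptr. */ | |||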
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
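| /* tail handling: each of the last m3 = m % 4 rows is one dot product of a | |||
| row of A with x, scaled by alpha and added to y. */ | |||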
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
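| /* Editorial sketch of the driver above (outline only, not part of the patch): | |||
| for each block of NB <= NBMAX rows: | |||
| ybuffer[0..NB) += alpha * A(block, j..j+7) * x[j..j+7] via the 4x8 | |||
| kernel, then 4x4 / 4x2 / 4x1 for the leftover columns; | |||
| scatter ybuffer into y when inc_y != 1; | |||
| finish the last m % 4 rows with the scalar m3 branches. */ | |||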
| @@ -1,218 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_n_microk_bulldozer.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_n_microk_haswell.c" | |||
| #else | |||
| #include "sgemv_n_microk_sandy.c" | |||
| #endif | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG register m2; | |||
| BLASLONG register n2; | |||
| FLOAT *xbuffer,*ybuffer; | |||
| xbuffer = buffer; | |||
| ybuffer = xbuffer + 2048 + 256; | |||
| n1 = n / 512 ; | |||
| n2 = n % 512 ; | |||
| m1 = m / 64; | |||
| m2 = m % 64; | |||
| y_ptr = y; | |||
| x_ptr = x; | |||
| for (j=0; j<n1; j++) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(512,x_ptr,xbuffer,inc_x); | |||
| a_ptr = a + j * 512 * lda; | |||
| y_ptr = y; | |||
| for(i = 0; i<m1; i++ ) | |||
| { | |||
| sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(64,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 64 * inc_y; | |||
| a_ptr += 64; | |||
| } | |||
| if ( m2 & 32 ) | |||
| { | |||
| sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(32,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 32 * inc_y; | |||
| a_ptr += 32; | |||
| } | |||
| if ( m2 & 16 ) | |||
| { | |||
| sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(16,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 16 * inc_y; | |||
| a_ptr += 16; | |||
| } | |||
| if ( m2 & 8 ) | |||
| { | |||
| sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(8,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 8 * inc_y; | |||
| a_ptr += 8; | |||
| } | |||
| if ( m2 & 4 ) | |||
| { | |||
| sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(4,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 4 * inc_y; | |||
| a_ptr += 4; | |||
| } | |||
| if ( m2 & 2 ) | |||
| { | |||
| sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(2,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 2 * inc_y; | |||
| a_ptr += 2; | |||
| } | |||
| if ( m2 & 1 ) | |||
| { | |||
| sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(1,ybuffer,y_ptr,inc_y); | |||
| } | |||
| x_ptr += 512 * inc_x; | |||
| } | |||
| if ( n2 > 0 ) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(n2,x_ptr,xbuffer,inc_x); | |||
| a_ptr = a + n1 * 512 * lda; | |||
| y_ptr = y; | |||
| for(i = 0; i<m1; i++ ) | |||
| { | |||
| sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(64,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 64 * inc_y; | |||
| a_ptr += 64; | |||
| } | |||
| if ( m2 & 32 ) | |||
| { | |||
| sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(32,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 32 * inc_y; | |||
| a_ptr += 32; | |||
| } | |||
| if ( m2 & 16 ) | |||
| { | |||
| sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(16,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 16 * inc_y; | |||
| a_ptr += 16; | |||
| } | |||
| if ( m2 & 8 ) | |||
| { | |||
| sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(8,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 8 * inc_y; | |||
| a_ptr += 8; | |||
| } | |||
| if ( m2 & 4 ) | |||
| { | |||
| sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(4,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 4 * inc_y; | |||
| a_ptr += 4; | |||
| } | |||
| if ( m2 & 2 ) | |||
| { | |||
| sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(2,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 2 * inc_y; | |||
| a_ptr += 2; | |||
| } | |||
| if ( m2 & 1 ) | |||
| { | |||
| sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(1,ybuffer,y_ptr,inc_y); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,269 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
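| /* the vfmaddps below is the four-operand FMA4 form, so this is presumably | |||
| the microkernel selected for the BULLDOZER / PILEDRIVER targets. */ | |||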
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%xmm8 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" | |||
| "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" | |||
| "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%xmm8 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t" | |||
| "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -1,451 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero | |||
| "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero | |||
| "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero | |||
| "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha | |||
| "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha | |||
| "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha | |||
| "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha | |||
| "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha | |||
| "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,299 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
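| /* this variant uses the three-operand FMA3 vfmadd231ps plus vzeroupper, so | |||
| it is presumably the HASWELL microkernel from the include list above. */ | |||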
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" | |||
| "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8 , %8 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y | |||
| "subq $16, %1 \n\t" | |||
| "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
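For reference, a plain-C sketch of what this FMA kernel computes, under the assumption (standard for a single-precision build) that FLOAT is float and BLASLONG is long. The asm requires n to be a multiple of 4; the two test/jz blocks peel off the n%16 remainder before the unrolled 16-wide loop.

static void sgemv_kernel_4x8_ref(long n, float **ap, float *x, float *y, long lda4, float *alpha)
{
    for (long i = 0; i < n; i++) {
        /* dot the i-th element of the 8 packed columns with x[0..7];
           columns 4..7 sit at offset lda4 inside ap[0..3] */
        float temp = ap[0][i] * x[0] + ap[1][i] * x[1]
                   + ap[2][i] * x[2] + ap[3][i] * x[3]
                   + ap[0][lda4 + i] * x[4] + ap[1][lda4 + i] * x[5]
                   + ap[2][lda4 + i] * x[6] + ap[3][lda4 + i] * x[7];
        y[i] += *alpha * temp;   /* y is updated in place */
    }
}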
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" | |||
| "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
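The 4x4 kernel is the same scheme restricted to four columns; a hedged C reference under the same FLOAT/BLASLONG assumptions:

static void sgemv_kernel_4x4_ref(long n, float **ap, float *x, float *y, float *alpha)
{
    for (long i = 0; i < n; i++)
        y[i] += *alpha * (ap[0][i] * x[0] + ap[1][i] * x[1]
                        + ap[2][i] * x[2] + ap[3][i] * x[3]);
}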
| @@ -1,461 +0,0 @@ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*2; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
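The removed fixed-size kernels below work the other way around: they hold a fixed block of output rows in registers and sweep across all n columns, overwriting y rather than accumulating into it. A C sketch of sgemv_kernel_64 as I read the asm (the pre pointer only drives prefetch and does not affect the result):

static void sgemv_kernel_64_ref(long n, float alpha, float *a, long lda, float *x, float *y)
{
    for (long r = 0; r < 64; r++) {      /* 64 output rows live in ymm8..ymm15 */
        float temp = 0.0f;
        for (long j = 0; j < n; j++)     /* walk across the n columns */
            temp += a[j * lda + r] * x[j];
        y[r] = alpha * temp;             /* y is overwritten, not accumulated */
    }
}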
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,204 @@ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "movss 4(%2), %%xmm13 \n\t" // x1 | |||
| "movss 8(%2), %%xmm14 \n\t" // x2 | |||
| "movss 12(%2), %%xmm15 \n\t" // x3 | |||
| "shufps $0, %%xmm12, %%xmm12\n\t" | |||
| "shufps $0, %%xmm13, %%xmm13\n\t" | |||
| "shufps $0, %%xmm14, %%xmm14\n\t" | |||
| "shufps $0, %%xmm15, %%xmm15\n\t" | |||
| "movss 16(%2), %%xmm0 \n\t" // x4 | |||
| "movss 20(%2), %%xmm1 \n\t" // x5 | |||
| "movss 24(%2), %%xmm2 \n\t" // x6 | |||
| "movss 28(%2), %%xmm3 \n\t" // x7 | |||
| "shufps $0, %%xmm0 , %%xmm0 \n\t" | |||
| "shufps $0, %%xmm1 , %%xmm1 \n\t" | |||
| "shufps $0, %%xmm2 , %%xmm2 \n\t" | |||
| "shufps $0, %%xmm3 , %%xmm3 \n\t" | |||
| "movss (%9), %%xmm6 \n\t" // alpha | |||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "xorps %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| ".align 2 \n\t" | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "movups (%6,%0,4), %%xmm10 \n\t" | |||
| "movups (%7,%0,4), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "mulps %%xmm14, %%xmm10 \n\t" | |||
| "mulps %%xmm15, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "movups (%4,%8,4), %%xmm8 \n\t" | |||
| "movups (%5,%8,4), %%xmm9 \n\t" | |||
| "movups (%6,%8,4), %%xmm10 \n\t" | |||
| "movups (%7,%8,4), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulps %%xmm0 , %%xmm8 \n\t" | |||
| "mulps %%xmm1 , %%xmm9 \n\t" | |||
| "mulps %%xmm2 , %%xmm10 \n\t" | |||
| "mulps %%xmm3 , %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "addps %%xmm5 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "mulps %%xmm6 , %%xmm4 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
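A minimal harness one could compile against the kernel above to sanity-check it against a naive loop (hypothetical test code, not part of the patch; it assumes FLOAT is float, BLASLONG is long, and the static kernel sits in the same translation unit):

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    long n = 64, lda4 = 128;             /* n must be a positive multiple of 4 */
    float *cols = malloc(4 * (lda4 + n) * sizeof(float));
    float *ap[4], x[8], y1[64], y2[64], alpha = 0.7f;
    for (int k = 0; k < 4; k++) ap[k] = cols + k * (lda4 + n);
    for (long i = 0; i < 4 * (lda4 + n); i++) cols[i] = (float)rand() / RAND_MAX - 0.5f;
    for (int k = 0; k < 8; k++) x[k] = (float)rand() / RAND_MAX - 0.5f;
    for (long i = 0; i < n; i++) y1[i] = y2[i] = (float)rand() / RAND_MAX;

    sgemv_kernel_4x8(n, ap, x, y1, lda4, &alpha);

    for (long i = 0; i < n; i++) {       /* naive version of the same contract */
        float t = 0.0f;
        for (int k = 0; k < 4; k++)
            t += ap[k][i] * x[k] + ap[k][lda4 + i] * x[k + 4];
        y2[i] += alpha * t;
    }
    for (long i = 0; i < n; i++)
        if (fabsf(y1[i] - y2[i]) > 1e-4f) { printf("mismatch at %ld\n", i); return 1; }
    printf("ok\n");
    free(cols);
    return 0;
}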
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "movss 4(%2), %%xmm13 \n\t" // x1 | |||
| "movss 8(%2), %%xmm14 \n\t" // x2 | |||
| "movss 12(%2), %%xmm15 \n\t" // x3 | |||
| "shufps $0, %%xmm12, %%xmm12\n\t" | |||
| "shufps $0, %%xmm13, %%xmm13\n\t" | |||
| "shufps $0, %%xmm14, %%xmm14\n\t" | |||
| "shufps $0, %%xmm15, %%xmm15\n\t" | |||
| "movss (%8), %%xmm6 \n\t" // alpha | |||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "movups (%6,%0,4), %%xmm10 \n\t" | |||
| "movups (%7,%0,4), %%xmm11 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "mulps %%xmm14, %%xmm10 \n\t" | |||
| "mulps %%xmm15, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm4 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm10 , %%xmm4 \n\t" | |||
| "addps %%xmm4 , %%xmm11 \n\t" | |||
| "mulps %%xmm6 , %%xmm11 \n\t" | |||
| "addps %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
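The HAVE_KERNEL_4x8 / HAVE_KERNEL_4x4 defines at the top of these files tell the generic sgemv driver that a tuned kernel is present; the driver's portable fallback is typically guarded like this (a sketch of the assumed pattern, not code shown in this diff):

#ifndef HAVE_KERNEL_4x8
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    /* plain-C fallback, used when no arch-specific version was included */
    for (BLASLONG i = 0; i < n; i++)
        y[i] += *alpha * ( ap[0][i] * x[0] + ap[1][i] * x[1]
                         + ap[2][i] * x[2] + ap[3][i] * x[3]
                         + ap[0][lda4 + i] * x[4] + ap[1][lda4 + i] * x[5]
                         + ap[2][lda4 + i] * x[6] + ap[3][lda4 + i] * x[7] );
}
#endif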
| @@ -0,0 +1,370 @@ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" | |||
| "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" | |||
| "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" | |||
| "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" | |||
| "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4, %8 \n\t" | |||
| "addq $4, %0 \n\t" | |||
| "subq $4, %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8, %8 \n\t" | |||
| "addq $8, %0 \n\t" | |||
| "subq $8, %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" | |||
| "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" | |||
| "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y | |||
| "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y | |||
| "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %8 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" | |||
| "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4, %0 \n\t" | |||
| "subq $4, %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8, %0 \n\t" | |||
| "subq $8, %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" | |||
| "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" | |||
| "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
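This last pair of kernels avoids FMA on purpose, so it also runs on AVX-only CPUs such as Sandy Bridge. The split multiply/add data flow of one 16-element .L01LOOP step of the 4x4 kernel looks like this in AVX intrinsics (an illustrative sketch with hypothetical names, not the code the patch installs):

#include <immintrin.h>

/* elements [i, i+16) of: y += alpha * (columns a0..a3 dotted with x[0..3]) */
static inline void sgemv_4x4_step16(const float *a0, const float *a1,
                                    const float *a2, const float *a3,
                                    const float x[4], float alpha,
                                    float *y, long i)
{
    __m256 valpha = _mm256_set1_ps(alpha);
    for (int half = 0; half < 2; half++) {          /* the ymm4 / ymm5 halves */
        long o = i + 8 * half;
        __m256 acc = _mm256_mul_ps(_mm256_loadu_ps(a0 + o), _mm256_set1_ps(x[0]));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a1 + o), _mm256_set1_ps(x[1])));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a2 + o), _mm256_set1_ps(x[2])));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a3 + o), _mm256_set1_ps(x[3])));
        /* separate mul + add, exactly like the vmulps/vaddps pairs above */
        _mm256_storeu_ps(y + o, _mm256_add_ps(_mm256_loadu_ps(y + o),
                                              _mm256_mul_ps(valpha, acc)));
    }
}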
| @@ -1,473 +0,0 @@ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*2; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
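| For reference, all of the fixed-width kernels above share one contract; a minimal plain-C model (a hedged reading of the assembly, not part of the patch) is: | |||
| static void sgemv_kernel_w_ref(long w, long n, float alpha, | |||
|                                const float *a, long lda, | |||
|                                const float *x, float *y) | |||
| { | |||
|     // w is the fixed width (16, 8, 4, 2 or 1 above); a is column-major. | |||
|     for (long i = 0; i < w; i++) { | |||
|         float temp = 0.0f;                   // accumulator (ymm8/xmm12 above) | |||
|         for (long k = 0; k < n; k++) | |||
|             temp += a[i + k * lda] * x[k];   // walk along one row of a | |||
|         y[i] = alpha * temp;                 // these kernels overwrite y | |||
|     } | |||
| } | |||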
| @@ -0,0 +1,624 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "sgemv_t_microk_nehalem-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_t_microk_sandy-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 4096 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| FLOAT temp3 = 0.0; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||
| temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; | |||
| temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; | |||
| } | |||
| y[0] = temp0; | |||
| y[1] = temp1; | |||
| y[2] = temp2; | |||
| y[3] = temp3; | |||
| } | |||
| #endif | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| "xorps %%xmm11 , %%xmm11 \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "movups 16(%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm11, %%xmm11 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm11, %%xmm11 \n\t" | |||
| "movss %%xmm10, (%2) \n\t" | |||
| "movss %%xmm11,4(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| "r" (x) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm9 , %%xmm9 \n\t" | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "mulps %%xmm11 , %%xmm12 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,4) , %%xmm14 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" | |||
| "mulps %%xmm11 , %%xmm12 \n\t" | |||
| "mulps %%xmm13 , %%xmm14 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "addps %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "addps %%xmm9 , %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "movss %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| : "cc", | |||
| "%xmm9", "%xmm10" , | |||
| "%xmm11", "%xmm12", "%xmm13", "%xmm14", | |||
| "memory" | |||
| ); | |||
| } | |||
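| // copy_x packs a strided vector into a contiguous buffer so the unit-stride | |||
| // kernels above can also serve inc_src != 1 inputs. | |||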
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += src[i] * da; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2) , %%xmm10 \n\t" | |||
| "shufps $0 , %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "mulps %%xmm10 , %%xmm12 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm11 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "movups %%xmm11, -16(%4,%0,4) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
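| The assembly fast path of add_y is equivalent to this plain-C loop (a sketch; the callers below only reach it with inc_dest == 1 and n a multiple of 4): | |||
| static void add_y_ref(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) | |||
| { | |||
|     BLASLONG i; | |||
|     for (i = 0; i < n; i++)        // n is a multiple of 4 here | |||
|         dest[i] += src[i] * da;    // da carries the alpha scaling | |||
| } | |||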
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG register i; | |||
| BLASLONG register j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n0; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[4],*xbuffer; | |||
| FLOAT *ytemp; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| xbuffer = buffer; | |||
| ytemp = buffer + NBMAX; | |||
| n0 = n / NBMAX; | |||
| n1 = (n % NBMAX) >> 2 ; | |||
| n2 = n & 3 ; | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| BLASLONG NB = NBMAX; | |||
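| // Process the rows of a (the length-m direction, which x runs along) in | |||
| // blocks of at most NBMAX elements so xbuffer and ytemp stay cache resident; | |||
| // the final, shorter block runs with NB = m2. | |||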
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||
| FLOAT *ap[4]; | |||
| FLOAT *yp; | |||
| BLASLONG register lda4 = 4 * lda; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( n0 > 0 ) | |||
| { | |||
| BLASLONG nb1 = NBMAX / 4; | |||
| for( j=0; j<n0; j++) | |||
| { | |||
| yp = ytemp; | |||
| for( i = 0; i < nb1 ; i++) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| add_y(nb1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += nb1 * inc_y * 4; | |||
| a_ptr += nb1 * lda4 ; | |||
| } | |||
| } | |||
| yp = ytemp; | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| if ( n1 > 0 ) | |||
| { | |||
| add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += n1 * inc_y * 4; | |||
| a_ptr += n1 * lda4 ; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); | |||
| a_ptr += lda * 2; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||
| a_ptr += lda; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
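| // Tail: fold in the remaining m % 4 rows with scalar code; the branches | |||
| // below keep fast paths for a fully packed matrix (lda equal to the | |||
| // remaining row count) and for unit inc_y. | |||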
| x_ptr = x; | |||
| a_ptr = a; | |||
| if ( m3 == 3 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 3 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 2 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ; | |||
| y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ; | |||
| y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ; | |||
| aj += 8; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| aj += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 1 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j+1] += aj[j+1] * xtemp; | |||
| y_ptr[j+2] += aj[j+2] * xtemp; | |||
| y_ptr[j+3] += aj[j+3] * xtemp; | |||
| } | |||
| for ( ; j<n ; j++ ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp; | |||
| aj += lda4 ; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
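| Taken together, the new transposed driver boils down to the following structure (a simplified sketch that ignores the inc_x packing and the m % 4 tail shown above): | |||
| // for each block of at most NBMAX rows: | |||
| //     xbuffer = pack(x)                      // copy_x, skipped when inc_x == 1 | |||
| //     for each group of 4 columns of a: | |||
| //         sgemv_kernel_4x4 -> 4 unscaled dot products into ytemp | |||
| //     add_y: y += alpha * ytemp              // alpha applied once per block | |||
| //     leftover 2 / 1 columns -> sgemv_kernel_4x2 / sgemv_kernel_4x1 | |||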
| @@ -1,232 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_t_microk_bulldozer.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_t_microk_haswell.c" | |||
| #else | |||
| #include "sgemv_t_microk_sandy.c" | |||
| #endif | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y) | |||
| { | |||
| FLOAT register temp0 = 0.0; | |||
| BLASLONG i; | |||
| for ( i=0; i<n ; i++) | |||
| { | |||
| temp0 += a[i] * x[i]; | |||
| } | |||
| temp0 *= alpha ; | |||
| *y += temp0; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *a_ptrl; | |||
| BLASLONG m1; | |||
| BLASLONG register m2; | |||
| FLOAT *xbuffer; | |||
| xbuffer = buffer; | |||
| BLASLONG register Mblock; | |||
| m1 = m / 1024 ; | |||
| m2 = m % 1024 ; | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| for (j=0; j<m1; j++) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(1024,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += 1024; | |||
| x_ptr += 1024 * inc_x; | |||
| } | |||
| if ( m2 == 0 ) return(0); | |||
| Mblock = 512; | |||
| while ( Mblock >= 16 ) | |||
| { | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| } | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| xbuffer = x_ptr; | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,147 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
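| A note on the instruction choice: vfmaddps is the four-operand AMD FMA4 fused multiply-add, which is why this microkernel is reserved for BULLDOZER/PILEDRIVER. One accumulation step corresponds to the following intrinsics rendering (an illustrative sketch, assuming -mfma4): | |||
| #include <x86intrin.h> | |||
| static inline __m128 fma4_step(__m128 acc, const float *col, __m128 x4) | |||
| { | |||
|     // acc = col[0..3] * x4 + acc, i.e. one vfmaddps | |||
|     return _mm_macc_ps(_mm_loadu_ps(col), x4, acc); | |||
| } | |||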
| @@ -1,99 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
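| The epilogue above folds each 256-bit accumulator down to a single float; in AVX intrinsics the same reduction reads roughly as follows (an illustrative sketch, not part of the patch): | |||
| #include <immintrin.h> | |||
| static inline float hsum256_ps(__m256 v) | |||
| { | |||
|     __m128 lo = _mm256_castps256_ps128(v);     // lower half | |||
|     __m128 hi = _mm256_extractf128_ps(v, 1);   // vextractf128 $1 | |||
|     __m128 s  = _mm_add_ps(lo, hi);            // vaddps | |||
|     s = _mm_hadd_ps(s, s);                     // vhaddps | |||
|     s = _mm_hadd_ps(s, s);                     // vhaddps | |||
|     return _mm_cvtss_f32(s); | |||
| } | |||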
| @@ -1,100 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" | |||
| "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "xorps %%xmm5 , %%xmm5 \n\t" | |||
| "xorps %%xmm6 , %%xmm6 \n\t" | |||
| "xorps %%xmm7 , %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 | |||
| "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 | |||
| "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 | |||
| "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm12, %%xmm9 \n\t" | |||
| "mulps %%xmm12, %%xmm10 \n\t" | |||
| "mulps %%xmm12, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm10, %%xmm6 \n\t" | |||
| "addps %%xmm11, %%xmm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "haddps %%xmm5, %%xmm5 \n\t" | |||
| "haddps %%xmm6, %%xmm6 \n\t" | |||
| "haddps %%xmm7, %%xmm7 \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "haddps %%xmm5, %%xmm5 \n\t" | |||
| "haddps %%xmm6, %%xmm6 \n\t" | |||
| "haddps %%xmm7, %%xmm7 \n\t" | |||
| "movss %%xmm4, (%3) \n\t" | |||
| "movss %%xmm5, 4(%3) \n\t" | |||
| "movss %%xmm6, 8(%3) \n\t" | |||
| "movss %%xmm7, 12(%3) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" | |||
| "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" | |||
| "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" | |||
| "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" | |||
| "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" | |||
| "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" | |||
| "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" | |||
| "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
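| Sandy Bridge has AVX but no FMA, so this version issues separate vmulps/vaddps pairs and spreads the sums over eight ymm accumulators to hide the add latency. The scalar essence of that trick is just (illustrative sketch): | |||
| float acc0 = 0.0f, acc1 = 0.0f;    // two independent dependency chains | |||
| long k; | |||
| for (k = 0; k + 1 < n; k += 2) { | |||
|     acc0 += a[k]     * x[k];       // chain 1 | |||
|     acc1 += a[k + 1] * x[k + 1];   // chain 2 | |||
| } | |||
| if (k < n) acc0 += a[k] * x[k];    // odd tail | |||
| float acc = acc0 + acc1;           // combine once at the end | |||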
| @@ -1,106 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp | |||
| "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp | |||
| "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" | |||
| "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" | |||
| "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" | |||
| "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
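The kernel removed in the hunk above boils down to a single scaled dot product. A scalar equivalent, assuming (as the asm's sarq $4 implies) that n is a multiple of 16, and noting that lda only feeds the prefetch of the next row; the name sgemv_kernel_16_ref is hypothetical:

static void sgemv_kernel_16_ref(long n, float alpha, float *a, long lda,
                                float *x, float *y)
{
    float t = 0.0f;
    for (long i = 0; i < n; i++)
        t += a[i] * x[i];          /* dot(a, x) */
    *y += alpha * t;               /* accumulate into y[0] */
    (void)lda;                     /* only used for prefetching in the asm */
}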
| @@ -1,6 +1,6 @@ | |||
| Data file for testing DSGESV/DSPOSV LAPACK routines | |||
| 12 Number of values of M | |||
| 0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) | |||
| 0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) | |||
| 6 Number of values of NRHS | |||
| 1 2 14 15 16 13 Values of NRHS (number of right hand sides) | |||
| 30.0 Threshold value of test ratio | |||