A lot of optimizations for gemv kernels (tags/v0.2.12^2)
| @@ -128,6 +128,7 @@ int MAIN__(int argc, char *argv[]){ | |||
| blasint inc_x=1,inc_y=1; | |||
| blasint n=0; | |||
| int has_param_n = 0; | |||
| int has_param_m = 0; | |||
| int loops = 1; | |||
| int l; | |||
| char *p; | |||
| @@ -145,29 +146,38 @@ int MAIN__(int argc, char *argv[]){ | |||
| if (argc > 0) { to = MAX(atol(*argv), from); argc--; argv++;} | |||
| if (argc > 0) { step = atol(*argv); argc--; argv++;} | |||
| int tomax = to; | |||
| if ((p = getenv("OPENBLAS_LOOPS"))) loops = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCX"))) inc_x = atoi(p); | |||
| if ((p = getenv("OPENBLAS_INCY"))) inc_y = atoi(p); | |||
| if ((p = getenv("OPENBLAS_TRANS"))) trans=*p; | |||
| if ((p = getenv("OPENBLAS_PARAM_N"))) { | |||
| n = atoi(p); | |||
| if ((n>0) && (n<=to)) has_param_n = 1; | |||
| if ((n>0)) has_param_n = 1; | |||
| if ( n > tomax ) tomax = n; | |||
| } | |||
| if ( has_param_n == 0 ) | |||
| if ((p = getenv("OPENBLAS_PARAM_M"))) { | |||
| m = atoi(p); | |||
| if ((m>0)) has_param_m = 1; | |||
| if ( m > tomax ) tomax = m; | |||
| } | |||
| if ( has_param_n == 1 ) | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' N = %d Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,n,inc_x,inc_y,loops); | |||
| else | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * to * to * COMPSIZE)) == NULL){ | |||
| fprintf(stderr, "From : %3d To : %3d Step = %3d Trans = '%c' Inc_x = %d Inc_y = %d Loops = %d\n", from, to, step,trans,inc_x,inc_y,loops); | |||
| if (( a = (FLOAT *)malloc(sizeof(FLOAT) * tomax * tomax * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| if (( x = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_x) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * to * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| if (( y = (FLOAT *)malloc(sizeof(FLOAT) * tomax * abs(inc_y) * COMPSIZE)) == NULL){ | |||
| fprintf(stderr,"Out of Memory!!\n");exit(1); | |||
| } | |||
| @@ -177,50 +187,80 @@ int MAIN__(int argc, char *argv[]){ | |||
| fprintf(stderr, " SIZE Flops\n"); | |||
| for(m = from; m <= to; m += step) | |||
| if (has_param_m == 0) | |||
| { | |||
| timeg=0; | |||
| for(m = from; m <= to; m += step) | |||
| { | |||
| timeg=0; | |||
| if ( has_param_n == 0 ) n = m; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| if ( has_param_n == 0 ) n = m; | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| timeg /= loops; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| } | |||
| } | |||
| else | |||
| { | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| for(n = from; n <= to; n += step) | |||
| { | |||
| timeg=0; | |||
| fprintf(stderr, " %6dx%d : ", (int)m,(int)n); | |||
| for(j = 0; j < m; j++){ | |||
| for(i = 0; i < n * COMPSIZE; i++){ | |||
| a[i + j * m * COMPSIZE] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| } | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| for (l=0; l<loops; l++) | |||
| { | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_x); i++){ | |||
| x[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| timeg += time1; | |||
| for(i = 0; i < n * COMPSIZE * abs(inc_y); i++){ | |||
| y[i] = ((FLOAT) rand() / (FLOAT) RAND_MAX) - 0.5; | |||
| } | |||
| gettimeofday( &start, (struct timezone *)0); | |||
| GEMV (&trans, &m, &n, alpha, a, &m, x, &inc_x, beta, y, &inc_y ); | |||
| gettimeofday( &stop, (struct timezone *)0); | |||
| time1 = (double)(stop.tv_sec - start.tv_sec) + (double)((stop.tv_usec - start.tv_usec)) * 1.e-6; | |||
| timeg += time1; | |||
| } | |||
| } | |||
| timeg /= loops; | |||
| timeg /= loops; | |||
| fprintf(stderr, | |||
| " %10.2f MFlops\n", | |||
| COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| fprintf(stderr, " %10.2f MFlops\n", COMPSIZE * COMPSIZE * 2. * (double)m * (double)n / timeg * 1.e-6); | |||
| } | |||
| } | |||
| return 0; | |||
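For reference, the MFlops figure printed by the loops above counts 2·m·n floating-point operations per real GEMV call, scaled by COMPSIZE² for complex data. A minimal sketch of the same bookkeeping (the helper name is made up for illustration, not part of this commit):

/* MFlops as the benchmark reports it: flops / (seconds * 1e6).
   A real m x n gemv does m*n multiplies + m*n adds = 2*m*n flops;
   complex data (COMPSIZE == 2) costs roughly four times as much. */
double gemv_mflops(double m, double n, double seconds, int compsize)
{
    double flops = (double)compsize * (double)compsize * 2.0 * m * n;
    return flops / seconds * 1.0e-6;
}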
| @@ -0,0 +1,42 @@ | |||
| # ********************************************************************************** | |||
| # Copyright (c) 2014, The OpenBLAS Project | |||
| # All rights reserved. | |||
| # Redistribution and use in source and binary forms, with or without | |||
| # modification, are permitted provided that the following conditions are | |||
| # met: | |||
| # 1. Redistributions of source code must retain the above copyright | |||
| # notice, this list of conditions and the following disclaimer. | |||
| # 2. Redistributions in binary form must reproduce the above copyright | |||
| # notice, this list of conditions and the following disclaimer in | |||
| # the documentation and/or other materials provided with the | |||
| # distribution. | |||
| # 3. Neither the name of the OpenBLAS project nor the names of | |||
| # its contributors may be used to endorse or promote products | |||
| # derived from this software without specific prior written permission. | |||
| # THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| # AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| # ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| # LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| # DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| # SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| # CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| # OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| # USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| # ********************************************************************************** | |||
| set term x11 font sans; | |||
| set ylabel "MFlops"; | |||
| set xlabel "Size"; | |||
| set grid xtics; | |||
| set grid ytics; | |||
| set key left; | |||
| set timestamp "generated on %Y-%m-%d by `whoami`" | |||
| set title "Sgemv\nTRANS=T\nBulldozer" | |||
| plot '1-THREAD' smooth bezier, '2-THREADS' smooth bezier, '4-THREADS' smooth bezier; | |||
| set output "print.png"; | |||
| show title; | |||
| show plot; | |||
| show output; | |||
| @@ -46,6 +46,7 @@ | |||
| #define __volatile__ | |||
| #endif | |||
| /* | |||
| #ifdef HAVE_SSE2 | |||
| #define MB __asm__ __volatile__ ("mfence"); | |||
| #define WMB __asm__ __volatile__ ("sfence"); | |||
| @@ -53,6 +54,10 @@ | |||
| #define MB | |||
| #define WMB | |||
| #endif | |||
| */ | |||
| #define MB | |||
| #define WMB | |||
| static void __inline blas_lock(volatile BLASULONG *address){ | |||
| @@ -99,7 +104,9 @@ static __inline void cpuid(int op, int *eax, int *ebx, int *ecx, int *edx){ | |||
| : "0" (op)); | |||
| } | |||
| /* | |||
| #define WHEREAMI | |||
| */ | |||
| static inline int WhereAmI(void){ | |||
| int eax, ebx, ecx, edx; | |||
| @@ -111,6 +118,7 @@ static inline int WhereAmI(void){ | |||
| return apicid; | |||
| } | |||
| #ifdef CORE_BARCELONA | |||
| #define IFLUSH gotoblas_iflush() | |||
| #define IFLUSH_HALF gotoblas_iflush_half() | |||
| @@ -251,7 +251,11 @@ void blas_set_parameter(void){ | |||
| env_var_t p; | |||
| int factor; | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) || defined(SANDYBRIDGE) || defined(NEHALEM) || defined(HASWELL) | |||
| int size = 16; | |||
| #else | |||
| int size = get_L2_size(); | |||
| #endif | |||
| #if defined(CORE_KATMAI) || defined(CORE_COPPERMINE) || defined(CORE_BANIAS) | |||
| size >>= 7; | |||
| @@ -216,7 +216,7 @@ void CNAME(enum CBLAS_ORDER order, | |||
| int nthreads_avail = nthreads_max; | |||
| double MNK = (double) m * (double) n; | |||
| if ( MNK <= (500.0 * 100.0 * (double) GEMM_MULTITHREAD_THRESHOLD) ) | |||
| if ( MNK <= (24.0 * 24.0 * (double) (GEMM_MULTITHREAD_THRESHOLD*GEMM_MULTITHREAD_THRESHOLD) ) ) | |||
| nthreads_max = 1; | |||
| if ( nthreads_max > nthreads_avail ) | |||
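The hunk above changes the point at which interface/gemv.c falls back to a single thread. A worked comparison, assuming GEMM_MULTITHREAD_THRESHOLD = 4 (the usual Makefile.rule default; that value is an assumption, not part of this diff):

/* Old cutoff: 500.0 * 100.0 * 4     = 200000 -> roughly 447x447 and below ran single-threaded.
   New cutoff: 24.0 * 24.0 * (4 * 4) = 9216   -> roughly 96x96 and below run single-threaded,
   so the new gemv kernels are spread across threads at much smaller sizes. */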
| @@ -10,8 +10,8 @@ DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| @@ -1,8 +1,8 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| DGEMVTKERNEL = dgemv_t.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| DGEMVTKERNEL = dgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| ZGEMVTKERNEL = zgemv_t.c | |||
| @@ -9,9 +9,9 @@ DSYMV_L_KERNEL = dsymv_L.c | |||
| SSYMV_U_KERNEL = ssymv_U.c | |||
| SSYMV_L_KERNEL = ssymv_L.c | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| DGEMVNKERNEL = dgemv_n.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| DGEMVNKERNEL = dgemv_n_4.c | |||
| SGEMMKERNEL = gemm_kernel_4x8_nehalem.S | |||
| SGEMMINCOPY = gemm_ncopy_4.S | |||
| @@ -1,5 +1,5 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n_dup.S | |||
| ZGEMVTKERNEL = zgemv_t.S | |||
| @@ -1,5 +1,5 @@ | |||
| SGEMVNKERNEL = sgemv_n.c | |||
| SGEMVTKERNEL = sgemv_t.c | |||
| SGEMVNKERNEL = sgemv_n_4.c | |||
| SGEMVTKERNEL = sgemv_t_4.c | |||
| ZGEMVNKERNEL = zgemv_n.c | |||
| @@ -0,0 +1,548 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "dgemv_n_microk_nehalem-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "dgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 2048 | |||
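| /* Row-block size: m is processed in tiles of at most NBMAX rows, presumably | |||
| so the active slice of y stays cache-resident across the column passes. */ | |||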
| #ifndef HAVE_KERNEL_4x8 | |||
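| /* Portable C fallback, compiled only when no microkernel defines HAVE_KERNEL_4x8: | |||
| accumulates eight columns into y -- four addressed through ap[0..3] and four more | |||
| at ap[0..3]+lda4. x is pre-scaled by *alpha into a local buffer, and n (the row | |||
| count of the block) is a multiple of 4 by construction in the caller. */ | |||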
| static void dgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT *b0,*b1,*b2,*b3; | |||
| FLOAT *x4; | |||
| FLOAT x[8]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x4 = x + 4; | |||
| for ( i=0; i<8; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; | |||
| y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; | |||
| y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; | |||
| y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT x[4]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2) , %%xmm12 \n\t" // x0 | |||
| "movsd (%6) , %%xmm4 \n\t" // alpha | |||
| "movsd 8(%2) , %%xmm13 \n\t" // x1 | |||
| "mulsd %%xmm4 , %%xmm12 \n\t" // alpha | |||
| "mulsd %%xmm4 , %%xmm13 \n\t" // alpha | |||
| "shufpd $0, %%xmm12, %%xmm12 \n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm5 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y | |||
| "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "mulsd (%5), %%xmm12 \n\t" // alpha | |||
| "shufpd $0, %%xmm12, %%xmm12 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%4,%0,8), %%xmm8 \n\t" // 2 * a | |||
| "movups 16(%4,%0,8), %%xmm9 \n\t" // 2 * a | |||
| "movups (%3,%0,8), %%xmm4 \n\t" // 2 * y | |||
| "movups 16(%3,%0,8), %%xmm5 \n\t" // 2 * y | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm12, %%xmm9 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "movups %%xmm4 , (%3,%0,8) \n\t" // 2 * y | |||
| "movups %%xmm5 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (alpha) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
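| /* Nothing to do for inc_dest == 1: in that case the caller aliases ybuffer | |||
| to y itself, so the kernels have already accumulated into y in place. */ | |||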
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8],*ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*8); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| dgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| dgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| dgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
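To make the blocked driver above easier to follow: for the non-transposed path it computes y += alpha * A * x on a column-major A (beta is applied earlier, in the interface layer). A minimal reference sketch under those assumptions — dgemv_n_ref and the sizes are illustrative, not OpenBLAS code:

#include <stdio.h>
#include <stdlib.h>

/* Reference column-major dgemv, NoTrans: y += alpha * A * x.
   This is what the blocked kernel driver above is expected to
   produce for inc_x == inc_y == 1. */
static void dgemv_n_ref(long m, long n, double alpha, const double *a,
                        long lda, const double *x, double *y)
{
    for (long j = 0; j < n; j++)
        for (long i = 0; i < m; i++)
            y[i] += alpha * a[i + j * lda] * x[j];
}

int main(void)
{
    long m = 7, n = 5, lda = 7;
    double a[35], x[5], y[7];
    for (long k = 0; k < m * n; k++) a[k] = (double)rand() / RAND_MAX - 0.5;
    for (long k = 0; k < n; k++)     x[k] = (double)rand() / RAND_MAX - 0.5;
    for (long k = 0; k < m; k++)     y[k] = 0.0;
    dgemv_n_ref(m, n, 0.5, a, lda, x, y);
    for (long k = 0; k < m; k++) printf("y[%ld] = %f\n", k, y[k]);
    return 0;
}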
| @@ -0,0 +1,247 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
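| /* AVX2/FMA microkernel: x0..x7 and alpha are broadcast once; the testq $0x04 | |||
| peels a 4-row chunk when the row count is not a multiple of 8, then the main | |||
| loop retires 8 rows of y per iteration, gathering all eight columns with | |||
| vfmadd231pd and applying alpha when merging into y. */ | |||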
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd 32(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastsd 40(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastsd 48(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastsd 56(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastsd (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd (%4,%8,8), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vfmadd231pd 32(%4,%8,8), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%8,8), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%8,8), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%8,8), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%8,8), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%8,8), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%8,8), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "vmovupd %%ymm8,-64(%3,%0,8) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "vmovupd %%ymm9,-32(%3,%0,8) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastsd (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastsd 8(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastsd 16(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastsd 24(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastsd (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L8LABEL%= \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm7 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vaddpd %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulpd %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddpd %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovupd %%ymm5, (%3,%0,8) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L8LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L8END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovupd (%3,%0,8), %%ymm8 \n\t" // 4 * y | |||
| "vmovupd 32(%3,%0,8), %%ymm9 \n\t" // 4 * y | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%6,%0,8), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%7,%0,8), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231pd %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "vmovupd %%ymm8, (%3,%0,8) \n\t" // 4 * y | |||
| "vmovupd %%ymm9, 32(%3,%0,8) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L8END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,265 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
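| /* SSE2 variant for Nehalem (no AVX broadcast available): x0..x7 and alpha are | |||
| splatted with movsd + shufpd, and each loop iteration updates 4 rows of y in | |||
| two 2-double halves. */ | |||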
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "movsd 8(%2), %%xmm13 \n\t" // x1 | |||
| "movsd 16(%2), %%xmm14 \n\t" // x2 | |||
| "movsd 24(%2), %%xmm15 \n\t" // x3 | |||
| "shufpd $0, %%xmm12, %%xmm12\n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13\n\t" | |||
| "shufpd $0, %%xmm14, %%xmm14\n\t" | |||
| "shufpd $0, %%xmm15, %%xmm15\n\t" | |||
| "movsd 32(%2), %%xmm0 \n\t" // x4 | |||
| "movsd 40(%2), %%xmm1 \n\t" // x5 | |||
| "movsd 48(%2), %%xmm2 \n\t" // x6 | |||
| "movsd 56(%2), %%xmm3 \n\t" // x7 | |||
| "shufpd $0, %%xmm0 , %%xmm0 \n\t" | |||
| "shufpd $0, %%xmm1 , %%xmm1 \n\t" | |||
| "shufpd $0, %%xmm2 , %%xmm2 \n\t" | |||
| "shufpd $0, %%xmm3 , %%xmm3 \n\t" | |||
| "movsd (%9), %%xmm6 \n\t" // alpha | |||
| "shufpd $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| ".align 2 \n\t" | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "movups (%6,%0,8), %%xmm10 \n\t" | |||
| "movups (%7,%0,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "movups (%4,%8,8), %%xmm8 \n\t" | |||
| "movups (%5,%8,8), %%xmm9 \n\t" | |||
| "movups (%6,%8,8), %%xmm10 \n\t" | |||
| "movups (%7,%8,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm0 , %%xmm8 \n\t" | |||
| "mulpd %%xmm1 , %%xmm9 \n\t" | |||
| "mulpd %%xmm2 , %%xmm10 \n\t" | |||
| "mulpd %%xmm3 , %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "addpd %%xmm5 , %%xmm4 \n\t" | |||
| "mulpd %%xmm6 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , (%3,%0,8) \n\t" // 2 * y | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| ".align 2 \n\t" | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%0,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%0,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "movups 16(%4,%8,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%8,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%8,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%8,8), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulpd %%xmm0 , %%xmm8 \n\t" | |||
| "mulpd %%xmm1 , %%xmm9 \n\t" | |||
| "mulpd %%xmm2 , %%xmm10 \n\t" | |||
| "mulpd %%xmm3 , %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm5 \n\t" | |||
| "addpd %%xmm10, %%xmm4 \n\t" | |||
| "addpd %%xmm11, %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "addpd %%xmm5 , %%xmm4 \n\t" | |||
| "mulpd %%xmm6 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2), %%xmm12 \n\t" // x0 | |||
| "movsd 8(%2), %%xmm13 \n\t" // x1 | |||
| "movsd 16(%2), %%xmm14 \n\t" // x2 | |||
| "movsd 24(%2), %%xmm15 \n\t" // x3 | |||
| "shufpd $0, %%xmm12, %%xmm12\n\t" | |||
| "shufpd $0, %%xmm13, %%xmm13\n\t" | |||
| "shufpd $0, %%xmm14, %%xmm14\n\t" | |||
| "shufpd $0, %%xmm15, %%xmm15\n\t" | |||
| "movsd (%8), %%xmm6 \n\t" // alpha | |||
| "shufpd $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| "movups (%4,%0,8), %%xmm8 \n\t" | |||
| "movups (%5,%0,8), %%xmm9 \n\t" | |||
| "movups (%6,%0,8), %%xmm10 \n\t" | |||
| "movups (%7,%0,8), %%xmm11 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "addpd %%xmm10 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" | |||
| "addpd %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, (%3,%0,8) \n\t" // 2 * y | |||
| "xorpd %%xmm4 , %%xmm4 \n\t" | |||
| "xorpd %%xmm5 , %%xmm5 \n\t" | |||
| "movups 16(%3,%0,8), %%xmm7 \n\t" // 2 * y | |||
| "movups 16(%4,%0,8), %%xmm8 \n\t" | |||
| "movups 16(%5,%0,8), %%xmm9 \n\t" | |||
| "movups 16(%6,%0,8), %%xmm10 \n\t" | |||
| "movups 16(%7,%0,8), %%xmm11 \n\t" | |||
| "mulpd %%xmm12, %%xmm8 \n\t" | |||
| "mulpd %%xmm13, %%xmm9 \n\t" | |||
| "mulpd %%xmm14, %%xmm10 \n\t" | |||
| "mulpd %%xmm15, %%xmm11 \n\t" | |||
| "addpd %%xmm8 , %%xmm4 \n\t" | |||
| "addpd %%xmm9 , %%xmm4 \n\t" | |||
| "addpd %%xmm10 , %%xmm4 \n\t" | |||
| "addpd %%xmm4 , %%xmm11 \n\t" | |||
| "mulpd %%xmm6 , %%xmm11 \n\t" | |||
| "addpd %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, 16(%3,%0,8) \n\t" // 2 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,615 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(HASWELL) | |||
| #include "dgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 2048 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
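| /* Portable C fallback for the transposed case: four dot products of columns | |||
| ap[0..3] against x. alpha is applied later (in add_y or by the caller), so | |||
| y[0..3] receives the raw sums. */ | |||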
| static void dgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| FLOAT temp3 = 0.0; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||
| temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; | |||
| temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; | |||
| } | |||
| y[0] = temp0; | |||
| y[1] = temp1; | |||
| y[2] = temp2; | |||
| y[3] = temp3; | |||
| } | |||
| #endif | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm10 , %%xmm10 \n\t" | |||
| "xorpd %%xmm11 , %%xmm11 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| "movups 16(%5,%0,8) , %%xmm14 \n\t" // x | |||
| "movups 16(%3,%0,8) , %%xmm12 \n\t" // ap0 | |||
| "movups 16(%4,%0,8) , %%xmm13 \n\t" // ap1 | |||
| "mulpd %%xmm14 , %%xmm12 \n\t" | |||
| "mulpd %%xmm14 , %%xmm13 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "addpd %%xmm13 , %%xmm11 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| "haddpd %%xmm11, %%xmm11 \n\t" | |||
| "movsd %%xmm10, (%2) \n\t" | |||
| "movsd %%xmm11,8(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| "r" (x) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorpd %%xmm9 , %%xmm9 \n\t" | |||
| "xorpd %%xmm10 , %%xmm10 \n\t" | |||
| "testq $2 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "mulpd %%xmm11 , %%xmm12 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,8) , %%xmm14 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "movups 16(%4,%0,8) , %%xmm13 \n\t" | |||
| "mulpd %%xmm11 , %%xmm12 \n\t" | |||
| "mulpd %%xmm13 , %%xmm14 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addpd %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "addpd %%xmm9 , %%xmm10 \n\t" | |||
| "haddpd %%xmm10, %%xmm10 \n\t" | |||
| "movsd %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| : "cc", | |||
| "%xmm9", "%xmm10" , | |||
| "%xmm11", "%xmm12", "%xmm13", "%xmm14", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
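| /* dest += da * src. The strided branch stays scalar; the unit-stride path | |||
| below uses an SSE2 loop that handles two elements per iteration (the call | |||
| sites pass n as a multiple of 4). */ | |||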
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += src[i] * da; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movsd (%2) , %%xmm10 \n\t" | |||
| "shufpd $0 , %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,8) , %%xmm12 \n\t" | |||
| "movups (%4,%0,8) , %%xmm11 \n\t" | |||
| "mulpd %%xmm10 , %%xmm12 \n\t" | |||
| "addq $2 , %0 \n\t" | |||
| "addpd %%xmm12 , %%xmm11 \n\t" | |||
| "subq $2 , %1 \n\t" | |||
| "movups %%xmm11, -16(%4,%0,8) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG register i; | |||
| BLASLONG register j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n0; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[4],*xbuffer; | |||
| FLOAT *ytemp; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| xbuffer = buffer; | |||
| ytemp = buffer + NBMAX; | |||
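| /* The caller-supplied work buffer is split in two: the first NBMAX entries | |||
| hold the x block copied to unit stride, the rest (ytemp) collect partial | |||
| results before add_y scales them by alpha. */ | |||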
| n0 = n / NBMAX; | |||
| n1 = (n % NBMAX) >> 2 ; | |||
| n2 = n & 3 ; | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| BLASLONG NB = NBMAX; | |||
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||
| FLOAT *ap[4]; | |||
| FLOAT *yp; | |||
| BLASLONG register lda4 = 4 * lda; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( n0 > 0 ) | |||
| { | |||
| BLASLONG nb1 = NBMAX / 4; | |||
| for( j=0; j<n0; j++) | |||
| { | |||
| yp = ytemp; | |||
| for( i = 0; i < nb1 ; i++) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| add_y(nb1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += nb1 * inc_y * 4; | |||
| a_ptr += nb1 * lda4 ; | |||
| } | |||
| } | |||
| yp = ytemp; | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| dgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| if ( n1 > 0 ) | |||
| { | |||
| add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += n1 * inc_y * 4; | |||
| a_ptr += n1 * lda4 ; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| dgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); | |||
| a_ptr += lda * 2; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| dgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||
| a_ptr += lda; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| if ( m3 == 3 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 3 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 2 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ; | |||
| y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ; | |||
| y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ; | |||
| aj += 8; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| aj += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 1 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j+1] += aj[j+1] * xtemp; | |||
| y_ptr[j+2] += aj[j+2] * xtemp; | |||
| y_ptr[j+3] += aj[j+3] * xtemp; | |||
| } | |||
| for ( ; j<n ; j++ ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp; | |||
| aj += lda4 ; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,127 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void dgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorpd %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorpd %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorpd %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorpd %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 384(%2,%0,8) \n\t" | |||
| "vmovups (%2,%0,8), %%ymm12 \n\t" // 4 * x | |||
| "vmovups 32(%2,%0,8), %%ymm13 \n\t" // 4 * x | |||
| // "prefetcht0 384(%4,%0,8) \n\t" | |||
| "vfmadd231pd (%4,%0,8), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231pd (%5,%0,8), %%ymm12, %%ymm5 \n\t" | |||
| // "prefetcht0 384(%5,%0,8) \n\t" | |||
| "vfmadd231pd (%6,%0,8), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231pd (%7,%0,8), %%ymm12, %%ymm7 \n\t" | |||
| // "prefetcht0 384(%6,%0,8) \n\t" | |||
| "vfmadd231pd 32(%4,%0,8), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231pd 32(%5,%0,8), %%ymm13, %%ymm5 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| // "prefetcht0 384(%7,%0,8) \n\t" | |||
| "vfmadd231pd -32(%6,%0,8), %%ymm13, %%ymm6 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "vfmadd231pd -32(%7,%0,8), %%ymm13, %%ymm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddpd %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddpd %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddpd %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddpd %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddpd %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddpd %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddpd %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddpd %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovsd %%xmm4, (%3) \n\t" | |||
| "vmovsd %%xmm5, 8(%3) \n\t" | |||
| "vmovsd %%xmm6, 16(%3) \n\t" | |||
| "vmovsd %%xmm7, 24(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
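| /* Editorial note: a plain C sketch of what the FMA kernel above computes -- | |||
| four dot products of the columns ap[0..3] with x, stored to y[0..3]. The | |||
| name dgemv_kernel_4x4_ref is hypothetical and the block is disabled; like | |||
| the asm, it assumes n is a multiple of 4. */ | |||
| #if 0 | |||
| static void dgemv_kernel_4x4_ref( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT t0 = 0.0, t1 = 0.0, t2 = 0.0, t3 = 0.0; | |||
| for ( i = 0; i < n; i++ ) | |||
| { | |||
| t0 += ap[0][i] * x[i]; // same accumulation the vfmadd231pd chain performs | |||
| t1 += ap[1][i] * x[i]; | |||
| t2 += ap[2][i] * x[i]; | |||
| t3 += ap[3][i] * x[i]; | |||
| } | |||
| y[0] = t0; y[1] = t1; y[2] = t2; y[3] = t3; // the asm stores, it does not accumulate into y | |||
| } | |||
| #endif | |||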
| @@ -0,0 +1,591 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_n_microk_bulldozer-4.c" | |||
| #elif defined(NEHALEM) | |||
| #include "sgemv_n_microk_nehalem-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_n_microk_sandy-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_n_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 4096 | |||
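| /* NBMAX is the row-blocking size: y is processed in chunks of at most NBMAX | |||
| elements, presumably so the active block of y stays cache resident. */ | |||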
| #ifndef HAVE_KERNEL_4x8 | |||
| static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT *b0,*b1,*b2,*b3; | |||
| FLOAT *x4; | |||
| FLOAT x[8]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| b0 = a0 + lda4 ; | |||
| b1 = a1 + lda4 ; | |||
| b2 = a2 + lda4 ; | |||
| b3 = a3 + lda4 ; | |||
| x4 = x + 4; | |||
| for ( i=0; i<8; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| y[i] += b0[i]*x4[0] + b1[i]*x4[1] + b2[i]*x4[2] + b3[i]*x4[3]; | |||
| y[i+1] += b0[i+1]*x4[0] + b1[i+1]*x4[1] + b2[i+1]*x4[2] + b3[i+1]*x4[3]; | |||
| y[i+2] += b0[i+2]*x4[0] + b1[i+2]*x4[1] + b2[i+2]*x4[2] + b3[i+2]*x4[3]; | |||
| y[i+3] += b0[i+3]*x4[0] + b1[i+3]*x4[1] + b2[i+3]*x4[2] + b3[i+3]*x4[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *xo, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| FLOAT x[4]; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| for ( i=0; i<4; i++) | |||
| x[i] = xo[i] * *alpha; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| y[i] += a0[i]*x[0] + a1[i]*x[1] + a2[i]*x[2] + a3[i]*x[3]; | |||
| y[i+1] += a0[i+1]*x[0] + a1[i+1]*x[1] + a2[i+1]*x[2] + a3[i+1]*x[3]; | |||
| y[i+2] += a0[i+2]*x[0] + a1[i+2]*x[1] + a2[i+2]*x[2] + a3[i+2]*x[3]; | |||
| y[i+3] += a0[i+3]*x[0] + a1[i+3]*x[1] + a2[i+3]*x[2] + a3[i+3]*x[3]; | |||
| } | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x2 | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x2( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2) , %%xmm12 \n\t" // x0 | |||
| "movss (%6) , %%xmm4 \n\t" // alpha | |||
| "movss 4(%2) , %%xmm13 \n\t" // x1 | |||
| "mulss %%xmm4 , %%xmm12 \n\t" // alpha | |||
| "mulss %%xmm4 , %%xmm13 \n\t" // alpha | |||
| "shufps $0, %%xmm12, %%xmm12 \n\t" | |||
| "shufps $0, %%xmm13, %%xmm13 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm4 \n\t" | |||
| "movups %%xmm4 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| #ifndef HAVE_KERNEL_4x1 | |||
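| /* note: the guard above was HAVE_KERNEL_4x2 in the original hunk, which looks | |||
| like a copy-paste slip for the 4x1 kernel it protects. */ | |||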
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| BLASLONG register n1 = n & -8 ; | |||
| BLASLONG register n2 = n & 4 ; | |||
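| /* n arrives as a multiple of 4: n1 is the part handled 8 elements per | |||
| iteration, n2 flags one remaining group of 4. */ | |||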
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "mulss (%6), %%xmm12 \n\t" // alpha | |||
| "shufps $0, %%xmm12, %%xmm12 \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups 16(%3,%0,4), %%xmm5 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| "movups 16(%4,%0,4), %%xmm9 \n\t" // 4 * a | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm12, %%xmm9 \n\t" | |||
| "addps %%xmm4 , %%xmm8 \n\t" | |||
| "addps %%xmm5 , %%xmm9 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "movups %%xmm8 , -32(%3,%0,4) \n\t" // 4 * y | |||
| "movups %%xmm9 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "testq $0x04, %5 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "movups (%3,%0,4), %%xmm4 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "movups %%xmm4 , (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n1), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap), // 4 | |||
| "r" (n2), // 5 | |||
| "r" (alpha) // 6 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #endif | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
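| /* the SSE loop below moves 4 elements per iteration; callers pass n = NB, | |||
| which is always a multiple of 4. */ | |||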
| __asm__ __volatile__ | |||
| ( | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4) , %%xmm12 \n\t" | |||
| "movups (%3,%0,4) , %%xmm11 \n\t" | |||
| "addps %%xmm12 , %%xmm11 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "movups %%xmm11, -16(%3,%0,4) \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (src), // 2 | |||
| "r" (dest) // 3 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *ap[4]; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| BLASLONG lda4 = lda << 2; | |||
| BLASLONG lda8 = lda << 3; | |||
| FLOAT xbuffer[8],*ybuffer; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| ybuffer = buffer; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| n1 = n >> 3 ; | |||
| n2 = n & 7 ; | |||
| } | |||
| else | |||
| { | |||
| n1 = n >> 2 ; | |||
| n2 = n & 3 ; | |||
| } | |||
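| /* with unit inc_x the 4x8 kernel reads x directly, so columns are taken 8 | |||
| at a time; otherwise x is gathered into xbuffer and columns go 4 at a time. */ | |||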
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| y_ptr = y; | |||
| BLASLONG NB = NBMAX; | |||
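| /* row blocking: full blocks of NBMAX rows first, then one block of m2 rows; | |||
| the final m3 = m % 4 rows fall through to the scalar code further down. */ | |||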
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( inc_y != 1 ) | |||
| memset(ybuffer,0,NB*4); | |||
| else | |||
| ybuffer = y_ptr; | |||
| if ( inc_x == 1 ) | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x8(NB,ap,x_ptr,ybuffer,lda4,&alpha); | |||
| ap[0] += lda8; | |||
| ap[1] += lda8; | |||
| ap[2] += lda8; | |||
| ap[3] += lda8; | |||
| a_ptr += lda8; | |||
| x_ptr += 8; | |||
| } | |||
| if ( n2 & 4 ) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,x_ptr,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
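| /* ap[2] and ap[3] are deliberately left behind: the 4x2 and 4x1 remainder | |||
| kernels below use only ap[0], ap[1] and a_ptr. */ | |||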
| a_ptr += lda4; | |||
| x_ptr += 4; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda*2; | |||
| x_ptr += 2; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,x_ptr,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| x_ptr += 1; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[1] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[2] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| xbuffer[3] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,ybuffer,&alpha); | |||
| ap[0] += lda4; | |||
| ap[1] += lda4; | |||
| ap[2] += lda4; | |||
| ap[3] += lda4; | |||
| a_ptr += lda4; | |||
| } | |||
| for( i = 0; i < n2 ; i++) | |||
| { | |||
| xbuffer[0] = x_ptr[0]; | |||
| x_ptr += inc_x; | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer,&alpha); | |||
| a_ptr += lda; | |||
| } | |||
| } | |||
| a += NB; | |||
| if ( inc_y != 1 ) | |||
| { | |||
| add_y(NB,ybuffer,y_ptr,inc_y); | |||
| y_ptr += NB * inc_y; | |||
| } | |||
| else | |||
| y_ptr += NB ; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
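| /* tail handling: each of the last m3 = m % 4 rows is one dot product of a | |||
| row of A with x, scaled by alpha and added to y. */ | |||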
| if ( m3 == 3 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| if ( lda == 3 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < ( n & -4 ); i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[4] * x_ptr[1]; | |||
| temp2 += a_ptr[2] * x_ptr[0] + a_ptr[5] * x_ptr[1]; | |||
| temp0 += a_ptr[6] * x_ptr[2] + a_ptr[9] * x_ptr[3]; | |||
| temp1 += a_ptr[7] * x_ptr[2] + a_ptr[10] * x_ptr[3]; | |||
| temp2 += a_ptr[8] * x_ptr[2] + a_ptr[11] * x_ptr[3]; | |||
| a_ptr += 12; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += 3; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| temp2 += a_ptr[2] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp2; | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| if ( lda == 2 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4) ; i+=4 ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0] + a_ptr[2] * x_ptr[1]; | |||
| temp1 += a_ptr[1] * x_ptr[0] + a_ptr[3] * x_ptr[1]; | |||
| temp0 += a_ptr[4] * x_ptr[2] + a_ptr[6] * x_ptr[3]; | |||
| temp1 += a_ptr[5] * x_ptr[2] + a_ptr[7] * x_ptr[3]; | |||
| a_ptr += 8; | |||
| x_ptr += 4; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += 2; | |||
| x_ptr ++; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp0 += a_ptr[0] * x_ptr[0]; | |||
| temp1 += a_ptr[1] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp0; | |||
| y_ptr += inc_y; | |||
| y_ptr[0] += alpha * temp1; | |||
| return(0); | |||
| } | |||
| if ( m3 == 1 ) | |||
| { | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| FLOAT temp = 0.0; | |||
| if ( lda == 1 && inc_x ==1 ) | |||
| { | |||
| for( i = 0; i < (n & -4); i+=4 ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i] + a_ptr[i+1] * x_ptr[i+1] + a_ptr[i+2] * x_ptr[i+2] + a_ptr[i+3] * x_ptr[i+3]; | |||
| } | |||
| for( ; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[i] * x_ptr[i]; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for( i = 0; i < n; i++ ) | |||
| { | |||
| temp += a_ptr[0] * x_ptr[0]; | |||
| a_ptr += lda; | |||
| x_ptr += inc_x; | |||
| } | |||
| } | |||
| y_ptr[0] += alpha * temp; | |||
| return(0); | |||
| } | |||
| return(0); | |||
| } | |||
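| /* Editorial sketch of the driver above (outline only, not part of the patch): | |||
| for each block of NB <= NBMAX rows: | |||
| ybuffer[0..NB) += alpha * A(block, j..j+7) * x[j..j+7] via the 4x8 | |||
| kernel, then 4x4 / 4x2 / 4x1 for the leftover columns; | |||
| scatter ybuffer into y when inc_y != 1; | |||
| finish the last m % 4 rows with the scalar m3 branches. */ | |||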
| @@ -1,218 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_n_microk_bulldozer.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_n_microk_haswell.c" | |||
| #else | |||
| #include "sgemv_n_microk_sandy.c" | |||
| #endif | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += *src; | |||
| src++; | |||
| dest += inc_dest; | |||
| } | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG register m2; | |||
| BLASLONG register n2; | |||
| FLOAT *xbuffer,*ybuffer; | |||
| xbuffer = buffer; | |||
| ybuffer = xbuffer + 2048 + 256; | |||
| n1 = n / 512 ; | |||
| n2 = n % 512 ; | |||
| m1 = m / 64; | |||
| m2 = m % 64; | |||
| y_ptr = y; | |||
| x_ptr = x; | |||
| for (j=0; j<n1; j++) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(512,x_ptr,xbuffer,inc_x); | |||
| a_ptr = a + j * 512 * lda; | |||
| y_ptr = y; | |||
| for(i = 0; i<m1; i++ ) | |||
| { | |||
| sgemv_kernel_64(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(64,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 64 * inc_y; | |||
| a_ptr += 64; | |||
| } | |||
| if ( m2 & 32 ) | |||
| { | |||
| sgemv_kernel_32(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(32,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 32 * inc_y; | |||
| a_ptr += 32; | |||
| } | |||
| if ( m2 & 16 ) | |||
| { | |||
| sgemv_kernel_16(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(16,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 16 * inc_y; | |||
| a_ptr += 16; | |||
| } | |||
| if ( m2 & 8 ) | |||
| { | |||
| sgemv_kernel_8(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(8,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 8 * inc_y; | |||
| a_ptr += 8; | |||
| } | |||
| if ( m2 & 4 ) | |||
| { | |||
| sgemv_kernel_4(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(4,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 4 * inc_y; | |||
| a_ptr += 4; | |||
| } | |||
| if ( m2 & 2 ) | |||
| { | |||
| sgemv_kernel_2(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(2,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 2 * inc_y; | |||
| a_ptr += 2; | |||
| } | |||
| if ( m2 & 1 ) | |||
| { | |||
| sgemv_kernel_1(512,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(1,ybuffer,y_ptr,inc_y); | |||
| } | |||
| x_ptr += 512 * inc_x; | |||
| } | |||
| if ( n2 > 0 ) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(n2,x_ptr,xbuffer,inc_x); | |||
| a_ptr = a + n1 * 512 * lda; | |||
| y_ptr = y; | |||
| for(i = 0; i<m1; i++ ) | |||
| { | |||
| sgemv_kernel_64(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(64,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 64 * inc_y; | |||
| a_ptr += 64; | |||
| } | |||
| if ( m2 & 32 ) | |||
| { | |||
| sgemv_kernel_32(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(32,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 32 * inc_y; | |||
| a_ptr += 32; | |||
| } | |||
| if ( m2 & 16 ) | |||
| { | |||
| sgemv_kernel_16(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(16,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 16 * inc_y; | |||
| a_ptr += 16; | |||
| } | |||
| if ( m2 & 8 ) | |||
| { | |||
| sgemv_kernel_8(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(8,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 8 * inc_y; | |||
| a_ptr += 8; | |||
| } | |||
| if ( m2 & 4 ) | |||
| { | |||
| sgemv_kernel_4(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(4,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 4 * inc_y; | |||
| a_ptr += 4; | |||
| } | |||
| if ( m2 & 2 ) | |||
| { | |||
| sgemv_kernel_2(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(2,ybuffer,y_ptr,inc_y); | |||
| y_ptr += 2 * inc_y; | |||
| a_ptr += 2; | |||
| } | |||
| if ( m2 & 1 ) | |||
| { | |||
| sgemv_kernel_1(n2,alpha,a_ptr,lda,xbuffer,ybuffer); | |||
| add_y(1,ybuffer,y_ptr,inc_y); | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,269 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
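| /* the vfmaddps below is the four-operand FMA4 form, so this is presumably | |||
| the microkernel selected for the BULLDOZER / PILEDRIVER targets. */ | |||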
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%xmm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%xmm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%xmm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%xmm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%xmm8 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "vaddps %%xmm5 , %%xmm4, %%xmm4 \n\t" | |||
| "vfmaddps -16(%3,%0,4) , %%xmm4, %%xmm8,%%xmm6 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "vmovups %%xmm6, -16(%3,%0,4) \n\t" // 4 * y | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vmovups %%xmm4, (%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5, 16(%3,%0,4) \n\t" // 4 * y | |||
| "addq $8 , %0 \n\t" | |||
| "addq $8 , %8 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6 , %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7 , %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%4,%8,4), %%xmm0 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%5,%8,4), %%xmm1 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%6,%8,4), %%xmm2 , %%xmm5 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vfmaddps %%xmm4, (%7,%8,4), %%xmm3 , %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 32(%4,%8,4), %%xmm0 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%4,%8,4), %%xmm0 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%5,%8,4), %%xmm1 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%5,%8,4), %%xmm1 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%6,%8,4), %%xmm2 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%6,%8,4), %%xmm2 , %%xmm7 \n\t" | |||
| "vfmaddps %%xmm6, 32(%7,%8,4), %%xmm3 , %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 48(%7,%8,4), %%xmm3 , %%xmm7 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm4 \n\t" | |||
| "vfmaddps 16(%3,%0,4) , %%xmm5,%%xmm8,%%xmm5 \n\t" | |||
| "vfmaddps 32(%3,%0,4) , %%xmm6,%%xmm8,%%xmm6 \n\t" | |||
| "vfmaddps 48(%3,%0,4) , %%xmm7,%%xmm8,%%xmm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vmovups %%xmm4,-64(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm5,-48(%3,%0,4) \n\t" // 4 * y | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%xmm6,-32(%3,%0,4) \n\t" // 4 * y | |||
| "vmovups %%xmm7,-16(%3,%0,4) \n\t" // 4 * y | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vbroadcastss (%2), %%xmm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%xmm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%xmm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%xmm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%xmm8 \n\t" // alpha | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%xmm4, %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5 , %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm4, (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm5, %%xmm4 \n\t" | |||
| "vfmaddps (%3,%0,4) , %%xmm4,%%xmm8,%%xmm6 \n\t" | |||
| "vmovups %%xmm6, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -1,451 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm8 , 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm9 , 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm10, 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm11, 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm12, 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm13, 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm14, 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm15, 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%xmm8 , %%xmm8 , %%xmm8 \n\t" // set to zero | |||
| "vxorps %%xmm9 , %%xmm9 , %%xmm9 \n\t" // set to zero | |||
| "vxorps %%xmm10, %%xmm10, %%xmm10\n\t" // set to zero | |||
| "vxorps %%xmm11, %%xmm11, %%xmm11\n\t" // set to zero | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%xmm8 , 0*4(%%rsi), %%xmm0, %%xmm8 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%xmm9 , 4*4(%%rsi), %%xmm0, %%xmm9 \n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm10, 8*4(%%rsi), %%xmm0, %%xmm10\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm11, 12*4(%%rsi), %%xmm0, %%xmm11\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm12, 16*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm13, 20*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm14, 24*4(%%rsi), %%xmm0, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm15, 28*4(%%rsi), %%xmm0, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm8 , %%xmm1, %%xmm8 \n\t" // scale by alpha | |||
| "vmulps %%xmm9 , %%xmm1, %%xmm9 \n\t" // scale by alpha | |||
| "vmulps %%xmm10, %%xmm1, %%xmm10\n\t" // scale by alpha | |||
| "vmulps %%xmm11, %%xmm1, %%xmm11\n\t" // scale by alpha | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulps %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmulps %%xmm14, %%xmm1, %%xmm14\n\t" // scale by alpha | |||
| "vmulps %%xmm15, %%xmm1, %%xmm15\n\t" // scale by alpha | |||
| "vmovups %%xmm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm9 , 4*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm10, 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm11, 12*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm12, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm13, 20*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm14, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%xmm15, 28*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%ymm13, 8*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddps %%ymm12, 0*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmovups %%ymm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddps %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddss %%xmm13, 1*4(%%rsi), %%xmm0, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vfmaddss %%xmm12, 0*4(%%rsi), %%xmm0, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,299 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x8 1 | |||
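| /* this variant uses the three-operand FMA3 vfmadd231ps plus vzeroupper, so | |||
| it is presumably the HASWELL microkernel from the include list above. */ | |||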
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vxorps %%xmm4 , %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%xmm0 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%xmm1 , %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%xmm2 , %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%xmm3 , %%xmm5 \n\t" | |||
| "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" | |||
| "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %8 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8 , %8 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps (%4,%8,4), %%ymm0 , %%ymm4 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vfmadd231ps 32(%4,%8,4), %%ymm0 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%8,4), %%ymm1 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%8,4), %%ymm1 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%8,4), %%ymm2 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%8,4), %%ymm2 , %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%8,4), %%ymm3 , %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%8,4), %%ymm3 , %%ymm5 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "addq $16, %8 \n\t" | |||
| "vmovups %%ymm8,-64(%3,%0,4) \n\t" // 8 * y | |||
| "subq $16, %1 \n\t" | |||
| "vmovups %%ymm9,-32(%3,%0,4) \n\t" // 8 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
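For reference, a plain-C sketch of what this FMA kernel computes, under the assumption (standard for a single-precision build) that FLOAT is float and BLASLONG is long. The asm requires n to be a multiple of 4; the two test/jz blocks peel off the n%16 remainder before the unrolled 16-wide loop.

static void sgemv_kernel_4x8_ref(long n, float **ap, float *x, float *y, long lda4, float *alpha)
{
    for (long i = 0; i < n; i++) {
        /* dot the i-th element of the 8 packed columns with x[0..7];
           columns 4..7 sit at offset lda4 inside ap[0..3] */
        float temp = ap[0][i] * x[0] + ap[1][i] * x[1]
                   + ap[2][i] * x[2] + ap[3][i] * x[3]
                   + ap[0][lda4 + i] * x[4] + ap[1][lda4 + i] * x[5]
                   + ap[2][lda4 + i] * x[6] + ap[3][lda4 + i] * x[7];
        y[i] += *alpha * temp;   /* y is updated in place */
    }
}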
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vaddps %%xmm4 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmulps %%xmm6 , %%xmm5 , %%xmm5 \n\t" | |||
| "vaddps %%xmm7 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vaddps %%ymm4 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmulps %%ymm6 , %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm7 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm8 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm9 \n\t" // 8 * y | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm14, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm14, %%ymm5 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm15, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm15, %%ymm5 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm4 , %%ymm8 \n\t" | |||
| "vfmadd231ps %%ymm6 , %%ymm5 , %%ymm9 \n\t" | |||
| "vmovups %%ymm8, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm9, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
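The 4x4 kernel is the same scheme restricted to four columns; a hedged C reference under the same FLOAT/BLASLONG assumptions:

static void sgemv_kernel_4x4_ref(long n, float **ap, float *x, float *y, float *alpha)
{
    for (long i = 0; i < n; i++)
        y[i] += *alpha * (ap[0][i] * x[0] + ap[1][i] * x[1]
                        + ap[2][i] * x[2] + ap[3][i] * x[3]);
}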
| @@ -1,461 +0,0 @@ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*2; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 0*4(%%rsi), %%ymm0, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 8*4(%%rsi), %%ymm0, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 16*4(%%rsi), %%ymm0, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 24*4(%%rsi), %%ymm0, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 32*4(%%rsi), %%ymm0, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 40*4(%%rsi), %%ymm0, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vfmadd231ps 48*4(%%rsi), %%ymm0, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 56*4(%%rsi), %%ymm0, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
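The removed fixed-size kernels below work the other way around: they hold a fixed block of output rows in registers and sweep across all n columns, overwriting y rather than accumulating into it. A C sketch of sgemv_kernel_64 as I read the asm (the pre pointer only drives prefetch and does not affect the result):

static void sgemv_kernel_64_ref(long n, float alpha, float *a, long lda, float *x, float *y)
{
    for (long r = 0; r < 64; r++) {      /* 64 output rows live in ymm8..ymm15 */
        float temp = 0.0f;
        for (long j = 0; j < n; j++)     /* walk across the n columns */
            temp += a[j * lda + r] * x[j];
        y[r] = alpha * temp;             /* y is overwritten, not accumulated */
    }
}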
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,204 @@ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "movss 4(%2), %%xmm13 \n\t" // x1 | |||
| "movss 8(%2), %%xmm14 \n\t" // x2 | |||
| "movss 12(%2), %%xmm15 \n\t" // x3 | |||
| "shufps $0, %%xmm12, %%xmm12\n\t" | |||
| "shufps $0, %%xmm13, %%xmm13\n\t" | |||
| "shufps $0, %%xmm14, %%xmm14\n\t" | |||
| "shufps $0, %%xmm15, %%xmm15\n\t" | |||
| "movss 16(%2), %%xmm0 \n\t" // x4 | |||
| "movss 20(%2), %%xmm1 \n\t" // x5 | |||
| "movss 24(%2), %%xmm2 \n\t" // x6 | |||
| "movss 28(%2), %%xmm3 \n\t" // x7 | |||
| "shufps $0, %%xmm0 , %%xmm0 \n\t" | |||
| "shufps $0, %%xmm1 , %%xmm1 \n\t" | |||
| "shufps $0, %%xmm2 , %%xmm2 \n\t" | |||
| "shufps $0, %%xmm3 , %%xmm3 \n\t" | |||
| "movss (%9), %%xmm6 \n\t" // alpha | |||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "xorps %%xmm5 , %%xmm5 \n\t" | |||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| ".align 2 \n\t" | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "movups (%6,%0,4), %%xmm10 \n\t" | |||
| "movups (%7,%0,4), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "mulps %%xmm14, %%xmm10 \n\t" | |||
| "mulps %%xmm15, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "movups (%4,%8,4), %%xmm8 \n\t" | |||
| "movups (%5,%8,4), %%xmm9 \n\t" | |||
| "movups (%6,%8,4), %%xmm10 \n\t" | |||
| "movups (%7,%8,4), %%xmm11 \n\t" | |||
| ".align 2 \n\t" | |||
| "mulps %%xmm0 , %%xmm8 \n\t" | |||
| "mulps %%xmm1 , %%xmm9 \n\t" | |||
| "mulps %%xmm2 , %%xmm10 \n\t" | |||
| "mulps %%xmm3 , %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "addps %%xmm10, %%xmm4 \n\t" | |||
| "addps %%xmm11, %%xmm5 \n\t" | |||
| "addq $4 , %8 \n\t" | |||
| "addps %%xmm5 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "mulps %%xmm6 , %%xmm4 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm4 , %%xmm7 \n\t" | |||
| "movups %%xmm7 , -16(%3,%0,4) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
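A minimal harness one could compile against the kernel above to sanity-check it against a naive loop (hypothetical test code, not part of the patch; it assumes FLOAT is float, BLASLONG is long, and the static kernel sits in the same translation unit):

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    long n = 64, lda4 = 128;             /* n must be a positive multiple of 4 */
    float *cols = malloc(4 * (lda4 + n) * sizeof(float));
    float *ap[4], x[8], y1[64], y2[64], alpha = 0.7f;
    for (int k = 0; k < 4; k++) ap[k] = cols + k * (lda4 + n);
    for (long i = 0; i < 4 * (lda4 + n); i++) cols[i] = (float)rand() / RAND_MAX - 0.5f;
    for (int k = 0; k < 8; k++) x[k] = (float)rand() / RAND_MAX - 0.5f;
    for (long i = 0; i < n; i++) y1[i] = y2[i] = (float)rand() / RAND_MAX;

    sgemv_kernel_4x8(n, ap, x, y1, lda4, &alpha);

    for (long i = 0; i < n; i++) {       /* naive version of the same contract */
        float t = 0.0f;
        for (int k = 0; k < 4; k++)
            t += ap[k][i] * x[k] + ap[k][lda4 + i] * x[k + 4];
        y2[i] += alpha * t;
    }
    for (long i = 0; i < n; i++)
        if (fabsf(y1[i] - y2[i]) > 1e-4f) { printf("mismatch at %ld\n", i); return 1; }
    printf("ok\n");
    free(cols);
    return 0;
}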
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2), %%xmm12 \n\t" // x0 | |||
| "movss 4(%2), %%xmm13 \n\t" // x1 | |||
| "movss 8(%2), %%xmm14 \n\t" // x2 | |||
| "movss 12(%2), %%xmm15 \n\t" // x3 | |||
| "shufps $0, %%xmm12, %%xmm12\n\t" | |||
| "shufps $0, %%xmm13, %%xmm13\n\t" | |||
| "shufps $0, %%xmm14, %%xmm14\n\t" | |||
| "shufps $0, %%xmm15, %%xmm15\n\t" | |||
| "movss (%8), %%xmm6 \n\t" // alpha | |||
| "shufps $0, %%xmm6 , %%xmm6 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "movups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "movups (%4,%0,4), %%xmm8 \n\t" | |||
| "movups (%5,%0,4), %%xmm9 \n\t" | |||
| "movups (%6,%0,4), %%xmm10 \n\t" | |||
| "movups (%7,%0,4), %%xmm11 \n\t" | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm13, %%xmm9 \n\t" | |||
| "mulps %%xmm14, %%xmm10 \n\t" | |||
| "mulps %%xmm15, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm4 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm10 , %%xmm4 \n\t" | |||
| "addps %%xmm4 , %%xmm11 \n\t" | |||
| "mulps %%xmm6 , %%xmm11 \n\t" | |||
| "addps %%xmm7 , %%xmm11 \n\t" | |||
| "movups %%xmm11, -16(%3,%0,4) \n\t" // 4 * y | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
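The HAVE_KERNEL_4x8 / HAVE_KERNEL_4x4 defines at the top of these files tell the generic sgemv driver that a tuned kernel is present; the driver's portable fallback is typically guarded like this (a sketch of the assumed pattern, not code shown in this diff):

#ifndef HAVE_KERNEL_4x8
static void sgemv_kernel_4x8(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha)
{
    /* plain-C fallback, used when no arch-specific version was included */
    for (BLASLONG i = 0; i < n; i++)
        y[i] += *alpha * ( ap[0][i] * x[0] + ap[1][i] * x[1]
                         + ap[2][i] * x[2] + ap[3][i] * x[3]
                         + ap[0][lda4 + i] * x[4] + ap[1][lda4 + i] * x[5]
                         + ap[2][lda4 + i] * x[6] + ap[3][lda4 + i] * x[7] );
}
#endif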
| @@ -0,0 +1,370 @@ | |||
| #define HAVE_KERNEL_4x8 1 | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x8( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, BLASLONG lda4, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss 16(%2), %%ymm0 \n\t" // x4 | |||
| "vbroadcastss 20(%2), %%ymm1 \n\t" // x5 | |||
| "vbroadcastss 24(%2), %%ymm2 \n\t" // x6 | |||
| "vbroadcastss 28(%2), %%ymm3 \n\t" // x7 | |||
| "vbroadcastss (%9), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%xmm4 , %%xmm4 , %%xmm4 \n\t" | |||
| "vxorps %%xmm5 , %%xmm5 , %%xmm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vmulps (%4,%8,4), %%xmm0 , %%xmm8 \n\t" | |||
| "vmulps (%5,%8,4), %%xmm1 , %%xmm10 \n\t" | |||
| "vmulps (%6,%8,4), %%xmm2 , %%xmm9 \n\t" | |||
| "vmulps (%7,%8,4), %%xmm3 , %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" | |||
| "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4, %8 \n\t" | |||
| "addq $4, %0 \n\t" | |||
| "subq $4, %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8, %8 \n\t" | |||
| "addq $8, %0 \n\t" | |||
| "subq $8, %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%4,%8,4) \n\t" | |||
| "vmulps (%4,%8,4), %%ymm0 , %%ymm8 \n\t" | |||
| "vmulps 32(%4,%8,4), %%ymm0 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%8,4) \n\t" | |||
| "vmulps (%5,%8,4), %%ymm1 , %%ymm10 \n\t" | |||
| "vmulps 32(%5,%8,4), %%ymm1 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%8,4) \n\t" | |||
| "vmulps (%6,%8,4), %%ymm2 , %%ymm8 \n\t" | |||
| "vmulps 32(%6,%8,4), %%ymm2 , %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%8,4) \n\t" | |||
| "vmulps (%7,%8,4), %%ymm3 , %%ymm10 \n\t" | |||
| "vmulps 32(%7,%8,4), %%ymm3 , %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps (%3,%0,4), %%ymm4 , %%ymm4 \n\t" // 8 * y | |||
| "vaddps 32(%3,%0,4), %%ymm5 , %%ymm5 \n\t" // 8 * y | |||
| "vmovups %%ymm4, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm5, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %8 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (lda4), // 8 | |||
| "r" (alpha) // 9 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y, FLOAT *alpha) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vbroadcastss (%2), %%ymm12 \n\t" // x0 | |||
| "vbroadcastss 4(%2), %%ymm13 \n\t" // x1 | |||
| "vbroadcastss 8(%2), %%ymm14 \n\t" // x2 | |||
| "vbroadcastss 12(%2), %%ymm15 \n\t" // x3 | |||
| "vbroadcastss (%8), %%ymm6 \n\t" // alpha | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%xmm7 \n\t" // 4 * y | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm13, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm14, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm15, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm4, %%xmm9 , %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm11, %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm4 , %%xmm4 \n\t" | |||
| "vmulps %%xmm6, %%xmm4 , %%xmm5 \n\t" | |||
| "vaddps %%xmm5, %%xmm7 , %%xmm5 \n\t" | |||
| "vmovups %%xmm5, (%3,%0,4) \n\t" // 4 * y | |||
| "addq $4, %0 \n\t" | |||
| "subq $4, %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm7 \n\t" // 8 * y | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm9 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm5 \n\t" | |||
| "vaddps %%ymm5, %%ymm7 , %%ymm5 \n\t" | |||
| "vmovups %%ymm5, (%3,%0,4) \n\t" // 8 * y | |||
| "addq $8, %0 \n\t" | |||
| "subq $8, %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vxorps %%ymm4 , %%ymm4 , %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5 , %%ymm5 \n\t" | |||
| "vmovups (%3,%0,4), %%ymm0 \n\t" // 8 * y | |||
| "vmovups 32(%3,%0,4), %%ymm1 \n\t" // 8 * y | |||
| "prefetcht0 192(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "prefetcht0 192(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm13, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 192(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm14, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm14, %%ymm9 \n\t" | |||
| "prefetcht0 192(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm15, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm15, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm9 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm10, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "vmulps %%ymm6, %%ymm4 , %%ymm4 \n\t" | |||
| "vmulps %%ymm6, %%ymm5 , %%ymm5 \n\t" | |||
| "vaddps %%ymm4, %%ymm0 , %%ymm0 \n\t" | |||
| "vaddps %%ymm5, %%ymm1 , %%ymm1 \n\t" | |||
| "vmovups %%ymm0, (%3,%0,4) \n\t" // 8 * y | |||
| "vmovups %%ymm1, 32(%3,%0,4) \n\t" // 8 * y | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]), // 7 | |||
| "r" (alpha) // 8 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
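This last pair of kernels avoids FMA on purpose, so it also runs on AVX-only CPUs such as Sandy Bridge. The split multiply/add data flow of one 16-element .L01LOOP step of the 4x4 kernel looks like this in AVX intrinsics (an illustrative sketch with hypothetical names, not the code the patch installs):

#include <immintrin.h>

/* elements [i, i+16) of: y += alpha * (columns a0..a3 dotted with x[0..3]) */
static inline void sgemv_4x4_step16(const float *a0, const float *a1,
                                    const float *a2, const float *a3,
                                    const float x[4], float alpha,
                                    float *y, long i)
{
    __m256 valpha = _mm256_set1_ps(alpha);
    for (int half = 0; half < 2; half++) {          /* the ymm4 / ymm5 halves */
        long o = i + 8 * half;
        __m256 acc = _mm256_mul_ps(_mm256_loadu_ps(a0 + o), _mm256_set1_ps(x[0]));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a1 + o), _mm256_set1_ps(x[1])));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a2 + o), _mm256_set1_ps(x[2])));
        acc = _mm256_add_ps(acc, _mm256_mul_ps(_mm256_loadu_ps(a3 + o), _mm256_set1_ps(x[3])));
        /* separate mul + add, exactly like the vmulps/vaddps pairs above */
        _mm256_storeu_ps(y + o, _mm256_add_ps(_mm256_loadu_ps(y + o),
                                              _mm256_mul_ps(valpha, acc)));
    }
}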
| @@ -1,473 +0,0 @@ | |||
| static void sgemv_kernel_64( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*2; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| "vxorps %%ymm12, %%ymm12, %%ymm12\n\t" // set to zero | |||
| "vxorps %%ymm13, %%ymm13, %%ymm13\n\t" // set to zero | |||
| "vxorps %%ymm14, %%ymm14, %%ymm14\n\t" // set to zero | |||
| "vxorps %%ymm15, %%ymm15, %%ymm15\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "prefetcht0 128(%%r8)\n\t" // Prefetch | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "prefetcht0 192(%%r8)\n\t" // Prefetch | |||
| "vmulps 32*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 40*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 48*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 56*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm12, %%ymm4, %%ymm12\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm13, %%ymm5, %%ymm13\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm14, %%ymm6, %%ymm14\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm15, %%ymm7, %%ymm15\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmulps %%ymm12, %%ymm1, %%ymm12\n\t" // scale by alpha | |||
| "vmulps %%ymm13, %%ymm1, %%ymm13\n\t" // scale by alpha | |||
| "vmulps %%ymm14, %%ymm1, %%ymm14\n\t" // scale by alpha | |||
| "vmulps %%ymm15, %%ymm1, %%ymm15\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm12, 32*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm13, 40*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm14, 48*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm15, 56*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_32( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| "vxorps %%ymm10, %%ymm10, %%ymm10\n\t" // set to zero | |||
| "vxorps %%ymm11, %%ymm11, %%ymm11\n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vmulps 16*4(%%rsi), %%ymm0, %%ymm6 \n\t" // multiply a and c and add to temp | |||
| "vmulps 24*4(%%rsi), %%ymm0, %%ymm7 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm10, %%ymm6, %%ymm10\n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm11, %%ymm7, %%ymm11\n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmulps %%ymm10, %%ymm1, %%ymm10\n\t" // scale by alpha | |||
| "vmulps %%ymm11, %%ymm1, %%ymm11\n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm10, 16*4(%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm11, 24*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| float *pre = a + lda*3; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "movq %6, %%r8\n\t" // address for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "prefetcht0 64(%%r8)\n\t" // Prefetch | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| "vxorps %%ymm9 , %%ymm9 , %%ymm9 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "nop \n\t" | |||
| "leaq (%%r8 , %%rcx, 4), %%r8 \n\t" // add lda to pointer for prefetch | |||
| "prefetcht0 (%%r8)\n\t" // Prefetch | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rsi), %%ymm0, %%ymm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm9 , %%ymm5, %%ymm9 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmulps %%ymm9 , %%ymm1, %%ymm9 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| "vmovups %%ymm9 , 8*4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y), // 5 | |||
| "m" (pre) // 6 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_8( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%ymm1\n\t" // alpha -> ymm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%ymm8 , %%ymm8 , %%ymm8 \n\t" // set to zero | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%ymm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%ymm0, %%ymm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%ymm8 , %%ymm4, %%ymm8 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%ymm8 , %%ymm1, %%ymm8 \n\t" // scale by alpha | |||
| "vmovups %%ymm8 , (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vbroadcastss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vbroadcastss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulps %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovups %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_2( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "vmulps 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vmulps 1*4(%%rsi), %%xmm0, %%xmm5 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm13, %%xmm5, %%xmm13 \n\t" // multiply a and c and add to temp | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmulss %%xmm13, %%xmm1, %%xmm13\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| "vmovss %%xmm13, 4(%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_1( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovss (%%rdi), %%xmm0 \n\t" // load values of c | |||
| "addq $4 , %%rdi \n\t" // increment pointer of c | |||
| "vmulss 0*4(%%rsi), %%xmm0, %%xmm4 \n\t" // multiply a and c and add to temp | |||
| "vaddss %%xmm12, %%xmm4, %%xmm12 \n\t" // multiply a and c and add to temp | |||
| "leaq (%%rsi, %%rcx, 4), %%rsi \n\t" // add lda to pointer of a | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" // scale by alpha | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
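| For reference, all of the fixed-width kernels above share one contract; a minimal plain-C model (a hedged reading of the assembly, not part of the patch) is: | |||
| static void sgemv_kernel_w_ref(long w, long n, float alpha, | |||
|                                const float *a, long lda, | |||
|                                const float *x, float *y) | |||
| { | |||
|     // w is the fixed width (16, 8, 4, 2 or 1 above); a is column-major. | |||
|     for (long i = 0; i < w; i++) { | |||
|         float temp = 0.0f;                   // accumulator (ymm8/xmm12 above) | |||
|         for (long k = 0; k < n; k++) | |||
|             temp += a[i + k * lda] * x[k];   // walk along one row of a | |||
|         y[i] = alpha * temp;                 // these kernels overwrite y | |||
|     } | |||
| } | |||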
| @@ -0,0 +1,624 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(NEHALEM) | |||
| #include "sgemv_t_microk_nehalem-4.c" | |||
| #elif defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_t_microk_bulldozer-4.c" | |||
| #elif defined(SANDYBRIDGE) | |||
| #include "sgemv_t_microk_sandy-4.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_t_microk_haswell-4.c" | |||
| #endif | |||
| #define NBMAX 4096 | |||
| #ifndef HAVE_KERNEL_4x4 | |||
| static void sgemv_kernel_4x4(BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| FLOAT *a0,*a1,*a2,*a3; | |||
| a0 = ap[0]; | |||
| a1 = ap[1]; | |||
| a2 = ap[2]; | |||
| a3 = ap[3]; | |||
| FLOAT temp0 = 0.0; | |||
| FLOAT temp1 = 0.0; | |||
| FLOAT temp2 = 0.0; | |||
| FLOAT temp3 = 0.0; | |||
| for ( i=0; i< n; i+=4 ) | |||
| { | |||
| temp0 += a0[i]*x[i] + a0[i+1]*x[i+1] + a0[i+2]*x[i+2] + a0[i+3]*x[i+3]; | |||
| temp1 += a1[i]*x[i] + a1[i+1]*x[i+1] + a1[i+2]*x[i+2] + a1[i+3]*x[i+3]; | |||
| temp2 += a2[i]*x[i] + a2[i+1]*x[i+1] + a2[i+2]*x[i+2] + a2[i+3]*x[i+3]; | |||
| temp3 += a3[i]*x[i] + a3[i+1]*x[i+1] + a3[i+2]*x[i+2] + a3[i+3]*x[i+3]; | |||
| } | |||
| y[0] = temp0; | |||
| y[1] = temp1; | |||
| y[2] = temp2; | |||
| y[3] = temp3; | |||
| } | |||
| #endif | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x2(BLASLONG n, FLOAT *ap0, FLOAT *ap1, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| "xorps %%xmm11 , %%xmm11 \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups (%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "movups 16(%5,%0,4) , %%xmm14 \n\t" // x | |||
| "movups 16(%3,%0,4) , %%xmm12 \n\t" // ap0 | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" // ap1 | |||
| "mulps %%xmm14 , %%xmm12 \n\t" | |||
| "mulps %%xmm14 , %%xmm13 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "addps %%xmm13 , %%xmm11 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm11, %%xmm11 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm11, %%xmm11 \n\t" | |||
| "movss %%xmm10, (%2) \n\t" | |||
| "movss %%xmm11,4(%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap0), // 3 | |||
| "r" (ap1), // 4 | |||
| "r" (x) // 5 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x1(BLASLONG n, FLOAT *ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG i; | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm9 , %%xmm9 \n\t" | |||
| "xorps %%xmm10 , %%xmm10 \n\t" | |||
| "testq $4 , %1 \n\t" | |||
| "jz .L01LABEL%= \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "mulps %%xmm11 , %%xmm12 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L01LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L01END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups 16(%3,%0,4) , %%xmm14 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "movups 16(%4,%0,4) , %%xmm13 \n\t" | |||
| "mulps %%xmm11 , %%xmm12 \n\t" | |||
| "mulps %%xmm13 , %%xmm14 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm10 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "addps %%xmm14 , %%xmm9 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L01END%=: \n\t" | |||
| "addps %%xmm9 , %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "haddps %%xmm10, %%xmm10 \n\t" | |||
| "movss %%xmm10, (%2) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (y), // 2 | |||
| "r" (ap), // 3 | |||
| "r" (x) // 4 | |||
| : "cc", | |||
| "%xmm9", "%xmm10" , | |||
| "%xmm11", "%xmm12", "%xmm13", "%xmm14", | |||
| "memory" | |||
| ); | |||
| } | |||
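| // copy_x packs a strided vector into a contiguous buffer so the unit-stride | |||
| // kernels above can also serve inc_src != 1 inputs. | |||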
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) __attribute__ ((noinline)); | |||
| static void add_y(BLASLONG n, FLOAT da , FLOAT *src, FLOAT *dest, BLASLONG inc_dest) | |||
| { | |||
| BLASLONG i; | |||
| if ( inc_dest != 1 ) | |||
| { | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest += src[i] * da; | |||
| dest += inc_dest; | |||
| } | |||
| return; | |||
| } | |||
| i=0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movss (%2) , %%xmm10 \n\t" | |||
| "shufps $0 , %%xmm10 , %%xmm10 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%3,%0,4) , %%xmm12 \n\t" | |||
| "movups (%4,%0,4) , %%xmm11 \n\t" | |||
| "mulps %%xmm10 , %%xmm12 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm12 , %%xmm11 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "movups %%xmm11, -16(%4,%0,4) \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (&da), // 2 | |||
| "r" (src), // 3 | |||
| "r" (dest) // 4 | |||
| : "cc", | |||
| "%xmm10", "%xmm11", "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
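| The assembly fast path of add_y is equivalent to this plain-C loop (a sketch; the callers below only reach it with inc_dest == 1 and n a multiple of 4): | |||
| static void add_y_ref(BLASLONG n, FLOAT da, FLOAT *src, FLOAT *dest) | |||
| { | |||
|     BLASLONG i; | |||
|     for (i = 0; i < n; i++)        // n is a multiple of 4 here | |||
|         dest[i] += src[i] * da;    // da carries the alpha scaling | |||
| } | |||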
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG register i; | |||
| BLASLONG register j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| BLASLONG n0; | |||
| BLASLONG n1; | |||
| BLASLONG m1; | |||
| BLASLONG m2; | |||
| BLASLONG m3; | |||
| BLASLONG n2; | |||
| FLOAT ybuffer[4],*xbuffer; | |||
| FLOAT *ytemp; | |||
| if ( m < 1 ) return(0); | |||
| if ( n < 1 ) return(0); | |||
| xbuffer = buffer; | |||
| ytemp = buffer + NBMAX; | |||
| n0 = n / NBMAX; | |||
| n1 = (n % NBMAX) >> 2 ; | |||
| n2 = n & 3 ; | |||
| m3 = m & 3 ; | |||
| m1 = m & -4 ; | |||
| m2 = (m & (NBMAX-1)) - m3 ; | |||
| BLASLONG NB = NBMAX; | |||
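| // Process the rows of a (the length-m direction, which x runs along) in | |||
| // blocks of at most NBMAX elements so xbuffer and ytemp stay cache resident; | |||
| // the final, shorter block runs with NB = m2. | |||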
| while ( NB == NBMAX ) | |||
| { | |||
| m1 -= NB; | |||
| if ( m1 < 0) | |||
| { | |||
| if ( m2 == 0 ) break; | |||
| NB = m2; | |||
| } | |||
| y_ptr = y; | |||
| a_ptr = a; | |||
| x_ptr = x; | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(NB,x_ptr,xbuffer,inc_x); | |||
| FLOAT *ap[4]; | |||
| FLOAT *yp; | |||
| BLASLONG register lda4 = 4 * lda; | |||
| ap[0] = a_ptr; | |||
| ap[1] = a_ptr + lda; | |||
| ap[2] = ap[1] + lda; | |||
| ap[3] = ap[2] + lda; | |||
| if ( n0 > 0 ) | |||
| { | |||
| BLASLONG nb1 = NBMAX / 4; | |||
| for( j=0; j<n0; j++) | |||
| { | |||
| yp = ytemp; | |||
| for( i = 0; i < nb1 ; i++) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| add_y(nb1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += nb1 * inc_y * 4; | |||
| a_ptr += nb1 * lda4 ; | |||
| } | |||
| } | |||
| yp = ytemp; | |||
| for( i = 0; i < n1 ; i++) | |||
| { | |||
| sgemv_kernel_4x4(NB,ap,xbuffer,yp); | |||
| ap[0] += lda4 ; | |||
| ap[1] += lda4 ; | |||
| ap[2] += lda4 ; | |||
| ap[3] += lda4 ; | |||
| yp += 4; | |||
| } | |||
| if ( n1 > 0 ) | |||
| { | |||
| add_y(n1*4, alpha, ytemp, y_ptr, inc_y ); | |||
| y_ptr += n1 * inc_y * 4; | |||
| a_ptr += n1 * lda4 ; | |||
| } | |||
| if ( n2 & 2 ) | |||
| { | |||
| sgemv_kernel_4x2(NB,ap[0],ap[1],xbuffer,ybuffer); | |||
| a_ptr += lda * 2; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| *y_ptr += ybuffer[1] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| if ( n2 & 1 ) | |||
| { | |||
| sgemv_kernel_4x1(NB,a_ptr,xbuffer,ybuffer); | |||
| a_ptr += lda; | |||
| *y_ptr += ybuffer[0] * alpha; | |||
| y_ptr += inc_y; | |||
| } | |||
| a += NB; | |||
| x += NB * inc_x; | |||
| } | |||
| if ( m3 == 0 ) return(0); | |||
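| // Tail: fold in the remaining m % 4 rows with scalar code; the branches | |||
| // below keep fast paths for a fully packed matrix (lda equal to the | |||
| // remaining row count) and for unit inc_y. | |||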
| x_ptr = x; | |||
| a_ptr = a; | |||
| if ( m3 == 3 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp2 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 3 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| y_ptr[j+1] += aj[3] * xtemp0 + aj[4] * xtemp1 + aj[5] * xtemp2; | |||
| y_ptr[j+2] += aj[6] * xtemp0 + aj[7] * xtemp1 + aj[8] * xtemp2; | |||
| y_ptr[j+3] += aj[9] * xtemp0 + aj[10] * xtemp1 + aj[11] * xtemp2; | |||
| aj += 12; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 + aj[2] * xtemp2; | |||
| aj += 3; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 + *(aj+lda+2) * xtemp2; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 + *(aj+lda2+2) * xtemp2; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 + *(aj+lda3+2) * xtemp2; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 + *(aj+2) * xtemp2; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| if ( m3 == 2 ) | |||
| { | |||
| FLOAT xtemp0 = *x_ptr * alpha; | |||
| x_ptr += inc_x; | |||
| FLOAT xtemp1 = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 2 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| y_ptr[j+1] += aj[2] * xtemp0 + aj[3] * xtemp1 ; | |||
| y_ptr[j+2] += aj[4] * xtemp0 + aj[5] * xtemp1 ; | |||
| y_ptr[j+3] += aj[6] * xtemp0 + aj[7] * xtemp1 ; | |||
| aj += 8; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += aj[0] * xtemp0 + aj[1] * xtemp1 ; | |||
| aj += 2; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp0 + *(aj+lda+1) * xtemp1 ; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp0 + *(aj+lda2+1) * xtemp1 ; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp0 + *(aj+lda3+1) * xtemp1 ; | |||
| aj += lda4; | |||
| } | |||
| for ( ; j< n ; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp0 + *(aj+1) * xtemp1 ; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| FLOAT xtemp = *x_ptr * alpha; | |||
| FLOAT *aj = a_ptr; | |||
| y_ptr = y; | |||
| if ( lda == 1 && inc_y == 1 ) | |||
| { | |||
| for ( j=0; j< ( n & -4) ; j+=4 ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| y_ptr[j+1] += aj[j+1] * xtemp; | |||
| y_ptr[j+2] += aj[j+2] * xtemp; | |||
| y_ptr[j+3] += aj[j+3] * xtemp; | |||
| } | |||
| for ( ; j<n ; j++ ) | |||
| { | |||
| y_ptr[j] += aj[j] * xtemp; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| if ( inc_y == 1 ) | |||
| { | |||
| BLASLONG register lda2 = lda << 1; | |||
| BLASLONG register lda4 = lda << 2; | |||
| BLASLONG register lda3 = lda2 + lda; | |||
| for ( j=0; j< ( n & -4 ); j+=4 ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| y_ptr[j+1] += *(aj+lda) * xtemp; | |||
| y_ptr[j+2] += *(aj+lda2) * xtemp; | |||
| y_ptr[j+3] += *(aj+lda3) * xtemp; | |||
| aj += lda4 ; | |||
| } | |||
| for ( ; j<n; j++ ) | |||
| { | |||
| y_ptr[j] += *aj * xtemp; | |||
| aj += lda; | |||
| } | |||
| } | |||
| else | |||
| { | |||
| for ( j=0; j<n; j++ ) | |||
| { | |||
| *y_ptr += *aj * xtemp; | |||
| y_ptr += inc_y; | |||
| aj += lda; | |||
| } | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
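| Taken together, the new transposed driver boils down to the following structure (a simplified sketch that ignores the inc_x packing and the m % 4 tail shown above): | |||
| // for each block of at most NBMAX rows: | |||
| //     xbuffer = pack(x)                      // copy_x, skipped when inc_x == 1 | |||
| //     for each group of 4 columns of a: | |||
| //         sgemv_kernel_4x4 -> 4 unscaled dot products into ytemp | |||
| //     add_y: y += alpha * ytemp              // alpha applied once per block | |||
| //     leftover 2 / 1 columns -> sgemv_kernel_4x2 / sgemv_kernel_4x1 | |||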
| @@ -1,232 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #include "common.h" | |||
| #if defined(BULLDOZER) || defined(PILEDRIVER) | |||
| #include "sgemv_t_microk_bulldozer.c" | |||
| #elif defined(HASWELL) | |||
| #include "sgemv_t_microk_haswell.c" | |||
| #else | |||
| #include "sgemv_t_microk_sandy.c" | |||
| #endif | |||
| static void copy_x(BLASLONG n, FLOAT *src, FLOAT *dest, BLASLONG inc_src) | |||
| { | |||
| BLASLONG i; | |||
| for ( i=0; i<n; i++ ) | |||
| { | |||
| *dest = *src; | |||
| dest++; | |||
| src += inc_src; | |||
| } | |||
| } | |||
| static void sgemv_kernel_1( BLASLONG n, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, FLOAT *y) | |||
| { | |||
| FLOAT register temp0 = 0.0; | |||
| BLASLONG i; | |||
| for ( i=0; i<n ; i++) | |||
| { | |||
| temp0 += a[i] * x[i]; | |||
| } | |||
| temp0 *= alpha ; | |||
| *y += temp0; | |||
| } | |||
| int CNAME(BLASLONG m, BLASLONG n, BLASLONG dummy1, FLOAT alpha, FLOAT *a, BLASLONG lda, FLOAT *x, BLASLONG inc_x, FLOAT *y, BLASLONG inc_y, FLOAT *buffer) | |||
| { | |||
| BLASLONG i; | |||
| BLASLONG j; | |||
| FLOAT *a_ptr; | |||
| FLOAT *x_ptr; | |||
| FLOAT *y_ptr; | |||
| FLOAT *a_ptrl; | |||
| BLASLONG m1; | |||
| BLASLONG register m2; | |||
| FLOAT *xbuffer; | |||
| xbuffer = buffer; | |||
| BLASLONG register Mblock; | |||
| m1 = m / 1024 ; | |||
| m2 = m % 1024 ; | |||
| x_ptr = x; | |||
| a_ptr = a; | |||
| for (j=0; j<m1; j++) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(1024,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_16(1024,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += 1024; | |||
| x_ptr += 1024 * inc_x; | |||
| } | |||
| if ( m2 == 0 ) return(0); | |||
| Mblock = 512; | |||
| while ( Mblock >= 16 ) | |||
| { | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_16(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| } | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| if ( inc_x == 1 ) | |||
| xbuffer = x_ptr; | |||
| else | |||
| copy_x(Mblock,x_ptr,xbuffer,inc_x); | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| a_ptr += Mblock; | |||
| x_ptr += Mblock * inc_x; | |||
| } | |||
| Mblock /= 2; | |||
| if ( m2 & Mblock) | |||
| { | |||
| xbuffer = x_ptr; | |||
| y_ptr = y; | |||
| a_ptrl = a_ptr; | |||
| for(i = 0; i<n; i++ ) | |||
| { | |||
| sgemv_kernel_1(Mblock,alpha,a_ptrl,lda,xbuffer,y_ptr); | |||
| y_ptr += inc_y; | |||
| a_ptrl += lda; | |||
| } | |||
| } | |||
| return(0); | |||
| } | |||
| @@ -0,0 +1,147 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vxorps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vxorps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vxorps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vxorps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vmovups 16(%2,%0,4), %%xmm13 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm4, 16(%4,%0,4), %%xmm13, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 16(%5,%0,4), %%xmm13, %%xmm5 \n\t" | |||
| "vmovups 32(%2,%0,4), %%xmm14 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, 16(%6,%0,4), %%xmm13, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 16(%7,%0,4), %%xmm13, %%xmm7 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| ".align 2 \n\t" | |||
| "vfmaddps %%xmm4, 32(%4,%0,4), %%xmm14, %%xmm4 \n\t" | |||
| "vfmaddps %%xmm5, 32(%5,%0,4), %%xmm14, %%xmm5 \n\t" | |||
| "vmovups 48(%2,%0,4), %%xmm15 \n\t" // 4 * x | |||
| "vfmaddps %%xmm6, 32(%6,%0,4), %%xmm14, %%xmm6 \n\t" | |||
| "vfmaddps %%xmm7, 32(%7,%0,4), %%xmm14, %%xmm7 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vfmaddps %%xmm4, 48(%4,%0,4), %%xmm15, %%xmm4 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vfmaddps %%xmm5,-16(%5,%0,4), %%xmm15, %%xmm5 \n\t" | |||
| "vfmaddps %%xmm6,-16(%6,%0,4), %%xmm15, %%xmm6 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "vfmaddps %%xmm7,-16(%7,%0,4), %%xmm15, %%xmm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", | |||
| "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
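| A note on the instruction choice: vfmaddps is the four-operand AMD FMA4 fused multiply-add, which is why this microkernel is reserved for BULLDOZER/PILEDRIVER. One accumulation step corresponds to the following intrinsics rendering (an illustrative sketch, assuming -mfma4): | |||
| #include <x86intrin.h> | |||
| static inline __m128 fma4_step(__m128 acc, const float *col, __m128 x4) | |||
| { | |||
|     // acc = col[0..3] * x4 + acc, i.e. one vfmaddps | |||
|     return _mm_macc_ps(_mm_loadu_ps(col), x4, acc); | |||
| } | |||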
| @@ -1,99 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vfmaddps %%xmm12, 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm13, 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm14, 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmaddps %%xmm15, 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vfmaddss (%%rdx), %%xmm12, %%xmm1, %%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,148 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vfmadd231ps (%4,%0,4), %%xmm12, %%xmm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%xmm12, %%xmm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%xmm12, %%xmm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%xmm12, %%xmm7 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vfmadd231ps (%4,%0,4), %%ymm12, %%ymm4 \n\t" | |||
| "vfmadd231ps (%5,%0,4), %%ymm12, %%ymm5 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vfmadd231ps 32(%4,%0,4), %%ymm13, %%ymm4 \n\t" | |||
| "vfmadd231ps 32(%5,%0,4), %%ymm13, %%ymm5 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vfmadd231ps (%6,%0,4), %%ymm12, %%ymm6 \n\t" | |||
| "vfmadd231ps (%7,%0,4), %%ymm12, %%ymm7 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vfmadd231ps 32(%6,%0,4), %%ymm13, %%ymm6 \n\t" | |||
| "vfmadd231ps 32(%7,%0,4), %%ymm13, %%ymm7 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
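| The epilogue above folds each 256-bit accumulator down to a single float; in AVX intrinsics the same reduction reads roughly as follows (an illustrative sketch, not part of the patch): | |||
| #include <immintrin.h> | |||
| static inline float hsum256_ps(__m256 v) | |||
| { | |||
|     __m128 lo = _mm256_castps256_ps128(v);     // lower half | |||
|     __m128 hi = _mm256_extractf128_ps(v, 1);   // vextractf128 $1 | |||
|     __m128 s  = _mm_add_ps(lo, hi);            // vaddps | |||
|     s = _mm_hadd_ps(s, s);                     // vhaddps | |||
|     s = _mm_hadd_ps(s, s);                     // vhaddps | |||
|     return _mm_cvtss_f32(s); | |||
| } | |||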
| @@ -1,100 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vfmadd231ps 0*4(%%rdi), %%xmm4, %%xmm12\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 4*4(%%rdi), %%xmm5, %%xmm13\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 8*4(%%rdi), %%xmm6, %%xmm14\n\t" // multiply a and c and add to temp | |||
| "vfmadd231ps 12*4(%%rdi), %%xmm7, %%xmm15\n\t" // multiply a and c and add to temp | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12\n\t" | |||
| "vaddss (%%rdx), %%xmm12,%%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,99 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "xorps %%xmm4 , %%xmm4 \n\t" | |||
| "xorps %%xmm5 , %%xmm5 \n\t" | |||
| "xorps %%xmm6 , %%xmm6 \n\t" | |||
| "xorps %%xmm7 , %%xmm7 \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "movups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "movups (%4,%0,4), %%xmm8 \n\t" // 4 * a0 | |||
| "movups (%5,%0,4), %%xmm9 \n\t" // 4 * a1 | |||
| "movups (%6,%0,4), %%xmm10 \n\t" // 4 * a2 | |||
| "movups (%7,%0,4), %%xmm11 \n\t" // 4 * a3 | |||
| "mulps %%xmm12, %%xmm8 \n\t" | |||
| "mulps %%xmm12, %%xmm9 \n\t" | |||
| "mulps %%xmm12, %%xmm10 \n\t" | |||
| "mulps %%xmm12, %%xmm11 \n\t" | |||
| "addps %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "addps %%xmm9 , %%xmm5 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "addps %%xmm10, %%xmm6 \n\t" | |||
| "addps %%xmm11, %%xmm7 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "haddps %%xmm5, %%xmm5 \n\t" | |||
| "haddps %%xmm6, %%xmm6 \n\t" | |||
| "haddps %%xmm7, %%xmm7 \n\t" | |||
| "haddps %%xmm4, %%xmm4 \n\t" | |||
| "haddps %%xmm5, %%xmm5 \n\t" | |||
| "haddps %%xmm6, %%xmm6 \n\t" | |||
| "haddps %%xmm7, %%xmm7 \n\t" | |||
| "movss %%xmm4, (%3) \n\t" | |||
| "movss %%xmm5, 4(%3) \n\t" | |||
| "movss %%xmm6, 8(%3) \n\t" | |||
| "movss %%xmm7, 12(%3) \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", | |||
| "memory" | |||
| ); | |||
| } | |||
| @@ -0,0 +1,174 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| #define HAVE_KERNEL_4x4 1 | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) __attribute__ ((noinline)); | |||
| static void sgemv_kernel_4x4( BLASLONG n, FLOAT **ap, FLOAT *x, FLOAT *y) | |||
| { | |||
| BLASLONG register i = 0; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "vzeroupper \n\t" | |||
| "vxorps %%ymm0 , %%ymm0, %%ymm0 \n\t" | |||
| "vxorps %%ymm1 , %%ymm1, %%ymm1 \n\t" | |||
| "vxorps %%ymm2 , %%ymm2, %%ymm2 \n\t" | |||
| "vxorps %%ymm3 , %%ymm3, %%ymm3 \n\t" | |||
| "vxorps %%ymm4 , %%ymm4, %%ymm4 \n\t" | |||
| "vxorps %%ymm5 , %%ymm5, %%ymm5 \n\t" | |||
| "vxorps %%ymm6 , %%ymm6, %%ymm6 \n\t" | |||
| "vxorps %%ymm7 , %%ymm7, %%ymm7 \n\t" | |||
| "testq $0x04, %1 \n\t" | |||
| "jz .L08LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%xmm12 \n\t" // 4 * x | |||
| "vmulps (%4,%0,4), %%xmm12, %%xmm8 \n\t" | |||
| "vmulps (%5,%0,4), %%xmm12, %%xmm10 \n\t" | |||
| "vmulps (%6,%0,4), %%xmm12, %%xmm9 \n\t" | |||
| "vmulps (%7,%0,4), %%xmm12, %%xmm11 \n\t" | |||
| "vaddps %%xmm4, %%xmm8 , %%xmm4 \n\t" | |||
| "addq $4 , %0 \n\t" | |||
| "vaddps %%xmm5, %%xmm10, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm9 , %%xmm6 \n\t" | |||
| "subq $4 , %1 \n\t" | |||
| "vaddps %%xmm7, %%xmm11, %%xmm7 \n\t" | |||
| ".L08LABEL%=: \n\t" | |||
| "testq $0x08, %1 \n\t" | |||
| "jz .L16LABEL%= \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps (%6,%0,4), %%ymm12, %%ymm9 \n\t" | |||
| "vmulps (%7,%0,4), %%ymm12, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "addq $8 , %0 \n\t" | |||
| "vaddps %%ymm5, %%ymm10, %%ymm5 \n\t" | |||
| "vaddps %%ymm6, %%ymm9 , %%ymm6 \n\t" | |||
| "subq $8 , %1 \n\t" | |||
| "vaddps %%ymm7, %%ymm11, %%ymm7 \n\t" | |||
| ".L16LABEL%=: \n\t" | |||
| "cmpq $0, %1 \n\t" | |||
| "je .L16END%= \n\t" | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| "prefetcht0 384(%2,%0,4) \n\t" | |||
| "vmovups (%2,%0,4), %%ymm12 \n\t" // 8 * x | |||
| "vmovups 32(%2,%0,4), %%ymm13 \n\t" // 8 * x | |||
| "prefetcht0 384(%4,%0,4) \n\t" | |||
| "vmulps (%4,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%4,%0,4), %%ymm13, %%ymm9 \n\t" | |||
| "prefetcht0 384(%5,%0,4) \n\t" | |||
| "vmulps (%5,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps 32(%5,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm4, %%ymm8 , %%ymm4 \n\t" | |||
| "vaddps %%ymm0, %%ymm9 , %%ymm0 \n\t" | |||
| "vaddps %%ymm1, %%ymm10, %%ymm1 \n\t" | |||
| "vaddps %%ymm5, %%ymm11, %%ymm5 \n\t" | |||
| "prefetcht0 384(%6,%0,4) \n\t" | |||
| "vmulps (%6,%0,4), %%ymm12, %%ymm8 \n\t" | |||
| "vmulps 32(%6,%0,4), %%ymm13, %%ymm9 \n\t" | |||
| "prefetcht0 384(%7,%0,4) \n\t" | |||
| "vmulps (%7,%0,4), %%ymm12, %%ymm10 \n\t" | |||
| "vmulps 32(%7,%0,4), %%ymm13, %%ymm11 \n\t" | |||
| "vaddps %%ymm6, %%ymm8 , %%ymm6 \n\t" | |||
| "addq $16, %0 \n\t" | |||
| "vaddps %%ymm2, %%ymm9 , %%ymm2 \n\t" | |||
| "vaddps %%ymm7, %%ymm10, %%ymm7 \n\t" | |||
| "subq $16, %1 \n\t" | |||
| "vaddps %%ymm3, %%ymm11, %%ymm3 \n\t" | |||
| "jnz .L01LOOP%= \n\t" | |||
| ".L16END%=: \n\t" | |||
| "vaddps %%ymm4, %%ymm0, %%ymm4 \n\t" | |||
| "vaddps %%ymm5, %%ymm1, %%ymm5 \n\t" | |||
| "vaddps %%ymm6, %%ymm2, %%ymm6 \n\t" | |||
| "vaddps %%ymm7, %%ymm3, %%ymm7 \n\t" | |||
| "vextractf128 $1 , %%ymm4, %%xmm12 \n\t" | |||
| "vextractf128 $1 , %%ymm5, %%xmm13 \n\t" | |||
| "vextractf128 $1 , %%ymm6, %%xmm14 \n\t" | |||
| "vextractf128 $1 , %%ymm7, %%xmm15 \n\t" | |||
| "vaddps %%xmm4, %%xmm12, %%xmm4 \n\t" | |||
| "vaddps %%xmm5, %%xmm13, %%xmm5 \n\t" | |||
| "vaddps %%xmm6, %%xmm14, %%xmm6 \n\t" | |||
| "vaddps %%xmm7, %%xmm15, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vhaddps %%xmm4, %%xmm4, %%xmm4 \n\t" | |||
| "vhaddps %%xmm5, %%xmm5, %%xmm5 \n\t" | |||
| "vhaddps %%xmm6, %%xmm6, %%xmm6 \n\t" | |||
| "vhaddps %%xmm7, %%xmm7, %%xmm7 \n\t" | |||
| "vmovss %%xmm4, (%3) \n\t" | |||
| "vmovss %%xmm5, 4(%3) \n\t" | |||
| "vmovss %%xmm6, 8(%3) \n\t" | |||
| "vmovss %%xmm7, 12(%3) \n\t" | |||
| "vzeroupper \n\t" | |||
| : | |||
| : | |||
| "r" (i), // 0 | |||
| "r" (n), // 1 | |||
| "r" (x), // 2 | |||
| "r" (y), // 3 | |||
| "r" (ap[0]), // 4 | |||
| "r" (ap[1]), // 5 | |||
| "r" (ap[2]), // 6 | |||
| "r" (ap[3]) // 7 | |||
| : "cc", | |||
| "%xmm0", "%xmm1", "%xmm2", "%xmm3", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
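| Sandy Bridge has AVX but no FMA, so this version issues separate vmulps/vaddps pairs and spreads the sums over eight ymm accumulators to hide the add latency. The scalar essence of that trick is just (illustrative sketch): | |||
| float acc0 = 0.0f, acc1 = 0.0f;    // two independent dependency chains | |||
| long k; | |||
| for (k = 0; k + 1 < n; k += 2) { | |||
|     acc0 += a[k]     * x[k];       // chain 1 | |||
|     acc1 += a[k + 1] * x[k + 1];   // chain 2 | |||
| } | |||
| if (k < n) acc0 += a[k] * x[k];    // odd tail | |||
| float acc = acc0 + acc1;           // combine once at the end | |||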
| @@ -1,106 +0,0 @@ | |||
| /*************************************************************************** | |||
| Copyright (c) 2014, The OpenBLAS Project | |||
| All rights reserved. | |||
| Redistribution and use in source and binary forms, with or without | |||
| modification, are permitted provided that the following conditions are | |||
| met: | |||
| 1. Redistributions of source code must retain the above copyright | |||
| notice, this list of conditions and the following disclaimer. | |||
| 2. Redistributions in binary form must reproduce the above copyright | |||
| notice, this list of conditions and the following disclaimer in | |||
| the documentation and/or other materials provided with the | |||
| distribution. | |||
| 3. Neither the name of the OpenBLAS project nor the names of | |||
| its contributors may be used to endorse or promote products | |||
| derived from this software without specific prior written permission. | |||
| THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" | |||
| AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE | |||
| IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE | |||
| ARE DISCLAIMED. IN NO EVENT SHALL THE OPENBLAS PROJECT OR CONTRIBUTORS BE | |||
| LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL | |||
| DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR | |||
| SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER | |||
| CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, | |||
| OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE | |||
| USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. | |||
| *****************************************************************************/ | |||
| static void sgemv_kernel_16( long n, float alpha, float *a, long lda, float *x, float *y) | |||
| { | |||
| //n = n / 16; | |||
| __asm__ __volatile__ | |||
| ( | |||
| "movq %0, %%rax\n\t" // n -> rax | |||
| "vmovss %1, %%xmm1\n\t" // alpha -> xmm1 | |||
| "movq %2, %%rsi\n\t" // adress of a -> rsi | |||
| "movq %3, %%rcx\n\t" // value of lda > rcx | |||
| "movq %4, %%rdi\n\t" // adress of x -> rdi | |||
| "movq %5, %%rdx\n\t" // adress of y -> rdx | |||
| "leaq (, %%rcx,4), %%rcx \n\t" // scale lda by size of float | |||
| "leaq (%%rsi,%%rcx,1), %%r8 \n\t" // pointer to next line | |||
| "vxorps %%xmm12, %%xmm12, %%xmm12\n\t" // set to zero | |||
| "vxorps %%xmm13, %%xmm13, %%xmm13\n\t" // set to zero | |||
| "vxorps %%xmm14, %%xmm14, %%xmm14\n\t" // set to zero | |||
| "vxorps %%xmm15, %%xmm15, %%xmm15\n\t" // set to zero | |||
| "sarq $4, %%rax \n\t" // n = n / 16 | |||
| ".align 16 \n\t" | |||
| ".L01LOOP%=: \n\t" | |||
| // "prefetcht0 512(%%rsi) \n\t" | |||
| "prefetcht0 (%%r8) \n\t" //prefetch next line of a | |||
| "vmovups (%%rsi), %%xmm4 \n\t" | |||
| "vmovups 4*4(%%rsi), %%xmm5 \n\t" | |||
| "vmovups 8*4(%%rsi), %%xmm6 \n\t" | |||
| "vmovups 12*4(%%rsi), %%xmm7 \n\t" | |||
| "vmulps 0*4(%%rdi), %%xmm4, %%xmm8 \n\t" // multiply a and c and add to temp | |||
| "vmulps 4*4(%%rdi), %%xmm5, %%xmm9 \n\t" // multiply a and c and add to temp | |||
| "vmulps 8*4(%%rdi), %%xmm6, %%xmm10\n\t" // multiply a and c and add to temp | |||
| "vmulps 12*4(%%rdi), %%xmm7, %%xmm11\n\t" // multiply a and c and add to temp | |||
| "vaddps %%xmm12, %%xmm8 , %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm9 , %%xmm13\n\t" | |||
| "vaddps %%xmm14, %%xmm10, %%xmm14\n\t" | |||
| "vaddps %%xmm15, %%xmm11, %%xmm15\n\t" | |||
| "addq $16*4 , %%r8 \n\t" // increment prefetch pointer | |||
| "addq $16*4 , %%rsi \n\t" // increment pointer of a | |||
| "addq $16*4 , %%rdi \n\t" // increment pointer of c | |||
| "dec %%rax \n\t" // n = n -1 | |||
| "jnz .L01LOOP%= \n\t" | |||
| "vaddps %%xmm12, %%xmm14, %%xmm12\n\t" | |||
| "vaddps %%xmm13, %%xmm15, %%xmm13\n\t" | |||
| "vaddps %%xmm12, %%xmm13, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vhaddps %%xmm12, %%xmm12, %%xmm12\n\t" | |||
| "vmulss %%xmm12, %%xmm1, %%xmm12 \n\t" | |||
| "vaddss (%%rdx), %%xmm12, %%xmm12\n\t" | |||
| "vmovss %%xmm12, (%%rdx) \n\t" // store temp -> y | |||
| : | |||
| : | |||
| "m" (n), // 0 | |||
| "m" (alpha), // 1 | |||
| "m" (a), // 2 | |||
| "m" (lda), // 3 | |||
| "m" (x), // 4 | |||
| "m" (y) // 5 | |||
| : "%rax", "%rcx", "%rdx", "%rsi", "%rdi", "%r8", "cc", | |||
| "%xmm0", "%xmm1", | |||
| "%xmm4", "%xmm5", "%xmm6", "%xmm7", | |||
| "%xmm8", "%xmm9", "%xmm10", "%xmm11", | |||
| "%xmm12", "%xmm13", "%xmm14", "%xmm15", | |||
| "memory" | |||
| ); | |||
| } | |||
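The kernel removed in the hunk above boils down to a single scaled dot product. A scalar equivalent, assuming (as the asm's sarq $4 implies) that n is a multiple of 16, and noting that lda only feeds the prefetch of the next row; the name sgemv_kernel_16_ref is hypothetical:

static void sgemv_kernel_16_ref(long n, float alpha, float *a, long lda,
                                float *x, float *y)
{
    float t = 0.0f;
    for (long i = 0; i < n; i++)
        t += a[i] * x[i];          /* dot(a, x) */
    *y += alpha * t;               /* accumulate into y[0] */
    (void)lda;                     /* only used for prefetching in the asm */
}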
| @@ -1,6 +1,6 @@ | |||
| Data file for testing DSGESV/DSPOSV LAPACK routines | |||
| 12 Number of values of M | |||
| 0 1 2 13 17 45 78 91 101 119 120 132 values of M (row dimension) | |||
| 0 1 2 13 17 45 78 91 101 119 112 132 values of M (row dimension) | |||
| 6 Number of values of NRHS | |||
| 1 2 14 15 16 13 Values of NRHS (number of right hand sides) | |||
| 30.0 Threshold value of test ratio | |||